import json
import time

import mysql.connector
import requests
from bs4 import BeautifulSoup
from datetime import datetime

from config import get_db_connection, preload_source_urls

def insert_video_to_db(data):
    """Insert one scraped video row; duplicates are skipped via INSERT IGNORE."""
    cursor = db.cursor()
    try:
        sql = """
            INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
            VALUES (%s, %s, %s, %s, NOW())
        """
        values = (
            data['username'],
            data['date'],
            data['embed_link'],
            data['source_url'],
        )
        cursor.execute(sql, values)
        db.commit()

        # INSERT IGNORE reports rowcount == 0 when the row was a duplicate.
        if cursor.rowcount == 0:
            print("❌ Video already exists in DB")
        else:
            print("✅ Inserted into DB!")
    except mysql.connector.Error as err:
        print(f"❌ Failed to insert: {err}")
    finally:
        cursor.close()
def crawl_user_page(full_url):
    """Fetch a single video page and return its parsed metadata (or None)."""
    response = requests.get(full_url, timeout=30)
    if response.status_code != 200:
        print(f"❌ Failed to load {full_url}")
        return None
    soup = BeautifulSoup(response.text, "html.parser")

    data = parse_data(soup)
    if data is None:
        return None
    data["source_url"] = full_url
    return data
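
# Not wired into the crawl; a hypothetical hardening sketch. Both
# crawl_user_page and crawl_all call requests.get directly, so a retry
# wrapper like this (names are illustrative, not from the original) would
# survive transient network errors:
def fetch_with_retry(url, retries=3, backoff=2.0):
    """Hypothetical helper: GET with a linearly growing pause between tries."""
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, timeout=30)
            if response.status_code == 200:
                return response
        except requests.RequestException as err:
            print(f"⚠️ Request error on {url}: {err}")
        if attempt < retries:
            time.sleep(backoff * attempt)
    return None
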
def parse_data(soup):
    """Extract username, upload date, and embed link from a video page."""
    title_tag = soup.find("h1", class_="entry-title")
    if title_tag is None or not title_tag.contents:
        print("⚠️ No entry title found on page.")
        return None
    username = title_tag.contents[0].strip()

    date_tag = soup.find("span", class_="entry-date")
    date = date_tag.text.strip() if date_tag else None
    if date:
        try:
            # Site dates look like DD/MM/YYYY; normalise to MySQL's YYYY-MM-DD.
            date_obj = datetime.strptime(date, "%d/%m/%Y")
            date = date_obj.strftime("%Y-%m-%d")
        except ValueError:
            print(f"⚠️ Failed to parse date: {date}")
            date = None

    # Take the first xpornium iframe; its src is already absolute, so no urljoin needed.
    embed_link = None
    for iframe in soup.find_all("iframe", src=True):
        src = iframe["src"]
        if "xpornium.net" in src:
            embed_link = src
            break

    print(f"\n✅ Scraped {username} — {date}")
    return {
        "username": username,
        "date": date,
        "embed_link": embed_link,
    }
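
# Worked example of parse_data on a minimal snippet (values are illustrative):
#
#     html = """
#     <h1 class="entry-title">alice</h1>
#     <span class="entry-date">25/12/2024</span>
#     <iframe src="https://xpornium.net/embed/abc123"></iframe>
#     """
#     parse_data(BeautifulSoup(html, "html.parser"))
#     # -> {"username": "alice", "date": "2024-12-25",
#     #     "embed_link": "https://xpornium.net/embed/abc123"}
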
def crawl_all(init_url):
    """Crawl page by page and extract user data as we go."""
    page = 1
    all_data = []
    while True:
        url = f"{init_url}?p={page}"
        print(f"\n🕷️ Crawling index page {page}: {url}")
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print(f"❌ Page {page} returned {response.status_code}, stopping.")
            break
        soup = BeautifulSoup(response.text, "html.parser")

        video_pages = soup.find_all("a", class_="thumbnail-link", href=True)
        video_pages = [link["href"] for link in video_pages]
        if not video_pages:
            print("⚠️ No user links found — reached end of site.")
            break

        # Refresh the set of already-stored URLs so re-runs (and rows inserted
        # earlier in this run) are skipped without re-fetching their pages.
        cursor = db.cursor()
        seen_urls = preload_source_urls(cursor)
        cursor.close()

        for link in video_pages:
            full_url = init_url + link
            if full_url in seen_urls:
                print(f"⚠️ Skipping {link} - already seen.")
                continue

            user_data = crawl_user_page(full_url)  # network-bound: one request per video page
            if not user_data:
                print("⚠️ Skipping empty user_data.")
                continue
            if not user_data["embed_link"]:
                print(f"⚠️ Skipping {user_data['username']} - no embed link found.")
                continue

            insert_video_to_db(user_data)
            all_data.append(user_data)

        page += 1

    print(f"\n✅ Finished crawling all pages. Total users: {len(all_data)}")
    return all_data
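
# For reference: preload_source_urls (imported from config) is assumed to
# return the set of source_url values already stored in the videos table,
# roughly equivalent to:
#
#     def preload_source_urls(cursor):
#         cursor.execute("SELECT source_url FROM videos")
#         return {row[0] for row in cursor.fetchall()}
#
# A sketch of the assumed behaviour, not the actual config.py implementation.
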
if __name__ == "__main__":
    db = get_db_connection()

    BASE_URL = "https://webcamrips.to"
    results = crawl_all(BASE_URL)

    # Persist a local JSON snapshot alongside the DB rows.
    with open("users_data.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print("💾 All data saved to users_data.json")

    db.close()