Crawler still broken: run page links through a ThreadPoolExecutor, preload seen source URLs, and give each worker its own DB connection

main
Your Name 12 hours ago
parent c51464d68e
commit f90cbd7b53

@@ -3,18 +3,16 @@ import requests, time, mysql.connector
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
-def insert_video_to_db(data):
+def insert_video_to_db(data, db):
     try:
         cursor = db.cursor()
         sql = """
            INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
            VALUES (%s, %s, %s, %s, NOW())
        """
         values = (
             data['username'],
             data['date'],
@@ -27,29 +25,31 @@ def insert_video_to_db(data):
             print("❌ Video already exists in DB")
         else:
             print("✅ Inserted into DB!")
     except mysql.connector.Error as err:
         print(f"❌ Failed to insert: {err}")
+    finally:
+        cursor.close()
 
 def crawl_user_page(full_url):
-    response = requests.get(full_url)
-    if response.status_code != 200:
-        print(f"❌ Failed to load {full_url}")
-        return None
-    soup = BeautifulSoup(response.text, "html.parser")
-    data = parse_data(soup)
-    data["source_url"] = full_url
-    return data
+    try:
+        response = requests.get(full_url)
+        if response.status_code != 200:
+            print(f"❌ Failed to load {full_url}")
+            return None
+        soup = BeautifulSoup(response.text, "html.parser")
+        data = parse_data(soup)
+        data["source_url"] = full_url
+        return data
+    except Exception as e:
+        print(f"❌ Exception while crawling {full_url}: {e}")
+        return None
 
 def parse_data(soup):
     title_tag = soup.find("h1", class_="entry-title")
-    username = title_tag.contents[0].strip()
+    username = title_tag.contents[0].strip() if title_tag else "unknown"
     date_tag = soup.find("span", class_="entry-date")
     date = date_tag.text.strip() if date_tag else None
@@ -61,28 +61,49 @@ def parse_data(soup):
         print(f"⚠️ Failed to parse date: {date}")
         date = None
 
     embed_link = None
     for iframe in soup.find_all("iframe", src=True):
         src = iframe["src"]
         if "xpornium.net" in src:
-            embed_link = src  # no urljoin needed!
-            break  # stop after finding the first match
+            embed_link = src
+            break
 
+    print(f"\n✅ Scraped {username}: — {date}")
+
+    return {
+        "username": username,
+        "date": date,
+        "embed_link": embed_link,
+    }
+
+def process_link(link, seen_urls, init_url):
+    full_url = init_url + link
+    if full_url in seen_urls:
+        print(f"⚠️ Skipping {link} - already seen.")
+        return
-    # --- print info after crawling this user ---
-    print(f"\n✅ Scraped {username}: — {date}")
-    # -------------------------------------------
+    user_data = crawl_user_page(full_url)
+    if not user_data:
+        print("⚠️ Skipping empty user_data.")
+        return
+    if not user_data["embed_link"]:
+        print(f"⚠️ Skipping {user_data['username']} - no embed link found.")
+        return
-    return {
-        "username": username,
-        "date": date,
-        "embed_link": embed_link,
-    }
+    local_db = get_db_connection()
+    insert_video_to_db(user_data, local_db)
+    local_db.close()
 
 def crawl_all(init_url):
-    """Crawl page by page and extract user data as we go."""
     page = 1
-    all_data = []
+    db = get_db_connection()
+    cursor = db.cursor()
+    seen_urls = preload_source_urls(cursor)
+    cursor.close()
+    db.close()
 
     while True:
         url = f"{init_url}?p={page}"
@@ -95,44 +116,21 @@ def crawl_all(init_url):
         soup = BeautifulSoup(response.text, "html.parser")
         video_pages = soup.find_all("a", class_="thumbnail-link", href=True)
         video_pages = [link['href'] for link in video_pages]
 
         if not video_pages:
             print("⚠️ No user links found — reached end of site.")
             break
 
-        cursor = db.cursor()
-        seen_urls = preload_source_urls(cursor)
-
-        for link in video_pages:
-            full_url = init_url + link
-            if full_url in seen_urls:
-                print(f"⚠️ Skipping {link} - already seen.")
-                continue
-
-            user_data = crawl_user_page(full_url)  # slow: blocks on each request
-            if not user_data:
-                print("⚠️ Skipping empty user_data.")
-                continue
-            if not user_data["embed_link"]:
-                print(f"⚠️ Skipping {user_data['username']} - no embed link found.")
-                continue
-
-            insert_video_to_db(user_data)
+        with ThreadPoolExecutor(max_workers=50) as executor:
+            futures = [executor.submit(process_link, link, seen_urls, init_url) for link in video_pages]
+            for _ in as_completed(futures):
+                pass  # we already log inside the functions
 
         page += 1
 
-    print(f"\n✅ Finished crawling all pages. Total users: {len(all_data)}")
-    return all_data
+    print("\n✅ Finished crawling all pages.")
 
 if __name__ == "__main__":
-    db = get_db_connection()
     BASE_URL = "https://webcamrips.to"
-    results = crawl_all(BASE_URL)
-    print("💾 All data saved to users_data.json")
-    cursor = db.cursor()
-    cursor.close()
-    db.close()
+    crawl_all(BASE_URL)
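For reference, the diff calls two helpers that live elsewhere in the file and are not shown here: get_db_connection() and preload_source_urls(cursor). Below is a minimal sketch of what they presumably look like, assuming mysql-connector-python, local credentials, and a source_url column on the videos table; the connection parameters are illustrative, not the project's actual configuration.

import mysql.connector

def get_db_connection():
    # Open a fresh connection per caller; process_link() opens and closes its
    # own connection, so no connection object is shared across worker threads.
    return mysql.connector.connect(
        host="localhost",        # assumed credentials, adjust to the real config
        user="scraper",
        password="secret",
        database="webcamrips",
    )

def preload_source_urls(cursor):
    # Load every source_url already stored in the videos table into a set so
    # the crawler can skip pages it has inserted before.
    cursor.execute("SELECT source_url FROM videos")
    return {row[0] for row in cursor.fetchall()}

This also reflects the threading design in the commit: workers only read the preloaded seen_urls set and each opens its own connection, which avoids sharing a MySQL Connector/Python connection between threads (those connections are not thread-safe).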
