@@ -1,136 +1,118 @@
-from config import get_db_connection, preload_source_urls
-import requests, mysql.connector, time
+import requests, time, mysql.connector
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 from datetime import datetime
 from concurrent.futures import ThreadPoolExecutor, as_completed
+
+from config import get_db_connection, preload_source_urls


-def insert_video_to_db(data, db):
+# ────────────────────────────────────────────────────────────────
+def insert_video_to_db(data):
     try:
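+        # Opens its own short-lived connection, since this can be called from worker threads.
+        # INSERT IGNORE drops duplicates silently; this assumes the videos table has a unique key (e.g. on source_url).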
+        db = get_db_connection()
         cursor = db.cursor()
-        sql = """
+        cursor.execute("""
             INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
             VALUES (%s, %s, %s, %s, NOW())
-        """
-        values = (
-            data['username'],
-            data['date'],
-            data['embed_link'],
-            data['source_url']
-        )
-        cursor.execute(sql, values)
+        """, (data['username'], data['date'], data['embed_link'], data['source_url']))
         db.commit()
-        if cursor.rowcount == 0:
-            print("❌ Video already exists in DB")
-        else:
-            print("✅ Inserted into DB!")
+        if cursor.rowcount > 0:
+            print(f"✅ New: {data['username']} — {data['date']}")
     except mysql.connector.Error as err:
-        print(f"❌ Failed to insert: {err}")
+        print(f"❌ DB insert error: {err}")
     finally:
         cursor.close()
+        db.close()


-def crawl_user_page(full_url):
-    try:
-        response = requests.get(full_url)
-        if response.status_code != 200:
-            print(f"❌ Failed to load {full_url}")
-            return None
-
-        soup = BeautifulSoup(response.text, "html.parser")
-        data = parse_data(soup)
-        data["source_url"] = full_url
-        return data
-    except Exception as e:
-        print(f"❌ Exception while crawling {full_url}: {e}")
-        return None
-
-
+# ────────────────────────────────────────────────────────────────
 def parse_data(soup):
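+    # Returns a dict with the username, the upload date normalised to YYYY-MM-DD, and the first xpornium.net iframe src.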
-    title_tag = soup.find("h1", class_="entry-title")
-    username = title_tag.contents[0].strip() if title_tag else "unknown"
-    date_tag = soup.find("span", class_="entry-date")
+    username = soup.select_one("h1.entry-title")
+    username = username.contents[0].strip() if username and username.contents else None
+
+    date_tag = soup.select_one("span.entry-date")
     date = date_tag.text.strip() if date_tag else None
     if date:
         try:
-            date_obj = datetime.strptime(date, "%d/%m/%Y")
-            date = date_obj.strftime("%Y-%m-%d")
+            date = datetime.strptime(date, "%d/%m/%Y").strftime("%Y-%m-%d")
         except ValueError:
-            print(f"⚠️ Failed to parse date: {date}")
             date = None

-    embed_link = None
-    for iframe in soup.find_all("iframe", src=True):
-        src = iframe["src"]
-        if "xpornium.net" in src:
-            embed_link = src
-            break
-
-    print(f"\n✅ Scraped {username}: — {date}")
-
-    return {
-        "username": username,
-        "date": date,
-        "embed_link": embed_link,
-    }
+    embed_link = next((i["src"] for i in soup.find_all("iframe", src=True) if "xpornium.net" in i["src"]), None)
+    return {"username": username, "date": date, "embed_link": embed_link}


-def process_link(link, seen_urls, init_url):
-    full_url = init_url + link
-    if full_url in seen_urls:
-        print(f"⚠️ Skipping {link} - already seen.")
-        return
-
-    user_data = crawl_user_page(full_url)
-    if not user_data:
-        print("⚠️ Skipping empty user_data.")
-        return
-
-    if not user_data["embed_link"]:
-        print(f"⚠️ Skipping {user_data['username']} - no embed link found.")
-        return
-
-    local_db = get_db_connection()
-    insert_video_to_db(user_data, local_db)
-    local_db.close()
+# ────────────────────────────────────────────────────────────────
+def crawl_user_page(url):
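+    # Fetches and parses a single video page; returns None on any request error or when no embed link is found.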
+    try:
+        res = requests.get(url, timeout=15)
+        if res.ok:
+            soup = BeautifulSoup(res.text, "html.parser")
+            data = parse_data(soup)
+            data["source_url"] = url
+            return data if data["embed_link"] else None
+    except Exception:
+        pass
+    return None
+
+
+# ────────────────────────────────────────────────────────────────
+def crawl_index_page(base_url, page_num, seen, cursor, db):
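+    # Crawls one index page: unseen video links are scraped in parallel and inserted; returns the number of new videos.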
+    url = f"{base_url}?p={page_num}"
+    print(f"📄 Page {page_num}")
+    try:
+        res = requests.get(url, timeout=15)
+        if not res.ok:
+            return 0
+        soup = BeautifulSoup(res.text, "html.parser")
+        links = [urljoin(base_url, a["href"]) for a in soup.select("a.thumbnail-link[href]")]
+        links = [link for link in links if link not in seen]
+        if not links:
+            return 0
+
+        new_count = 0
+        with ThreadPoolExecutor(max_workers=50) as pool:
+            for f in as_completed(pool.submit(crawl_user_page, l) for l in links):
+                data = f.result()
+                if data:
+                    insert_video_to_db(data)
+                    new_count += 1
+        return new_count
+    except Exception:
+        return 0


-def crawl_all(init_url):
-    page = 1
+# ────────────────────────────────────────────────────────────────
+def crawl_all(base_url):
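+    # Walks the index in batches of 10 pages, crawling each batch's pages concurrently.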
     db = get_db_connection()
     cursor = db.cursor()
-    seen_urls = preload_source_urls(cursor)
-    cursor.close()
-    db.close()
-
-    QUIT_LOGIC = True
+    seen = preload_source_urls(cursor)

+    page, total, empty_results = 1000, 0, 0
     while True:
-        url = f"{init_url}?p={page}"
-        print(f"\n🕷️ Crawling index page {page}: {url}")
-        response = requests.get(url)
-        if response.status_code != 200:
-            print(f"❌ Page {page} returned {response.status_code}, stopping.")
-            break
-
-        soup = BeautifulSoup(response.text, "html.parser")
-        video_pages = soup.find_all("a", class_="thumbnail-link", href=True)
-        video_pages = [link['href'] for link in video_pages]
-
-        if not video_pages:
-            print("⚠️ No user links found — reached end of site.")
-            break
-
-        with ThreadPoolExecutor(max_workers=50) as executor:
-            if not QUIT_LOGIC:
-                futures = [executor.submit(process_link, link, seen_urls, init_url) for link in video_pages]
-                for _ in as_completed(futures):
-                    pass  # we already log inside the functions
-
-        page += 1
-
-    print("\n✅ Finished crawling all pages.")
+        batch = range(page, page + 10)
+        print(f"\n🚀 Batch {page}–{page + 9}")
+
+        with ThreadPoolExecutor(max_workers=10) as pool:
+            results = [f.result() for f in as_completed(pool.submit(crawl_index_page, base_url, p, seen, cursor, db) for p in batch)]
+
+        batch_total = sum(results)
+        total += batch_total
+        print(f"📦 Batch complete — {batch_total} new videos (total: {total})")
+
+        if batch_total == 0:
+            empty_results += 1
+
+        if empty_results >= 10:
+            print("\n🛑 No new videos in the last 10 batches. Stopping.")
+            break
+
+        page += 10
+
+    cursor.close()
+    db.close()
+    print(f"\n✅ Done! Total new videos: {total}")


+# ────────────────────────────────────────────────────────────────
 if __name__ == "__main__":
-    crawl_all("https://webcamrips.to")
+    BASE_URL = "https://webcamrips.to"
+    crawl_all(BASE_URL)