import requests, mysql.connector, time
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from config import get_db_connection, preload_source_urls
# ────────────────────────────────────────────────────────────────
def insert_video_to_db(data, cursor=None, db=None):
    """Insert one scraped video into the `videos` table, skipping duplicates.

    Args:
        data: dict with keys 'username', 'date', 'embed_link', 'source_url'.
        cursor: optional existing cursor to reuse (backward-compatible
            extension — crawl_index_page already passes its handles).
        db: optional existing connection matching `cursor`.

    Duplicates are suppressed by INSERT IGNORE; a message is printed only
    when a row was actually inserted (rowcount > 0). When no cursor/db pair
    is supplied, a private connection is opened and always closed.

    NOTE(review): reusing one cursor from many threads is not safe with
    mysql.connector — concurrent callers should omit cursor/db and let this
    function open its own connection.
    """
    own_handles = cursor is None or db is None
    try:
        if own_handles:
            db = get_db_connection()
            cursor = db.cursor()
        cursor.execute("""
        INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
        VALUES (%s, %s, %s, %s, NOW())
        """, (data['username'], data['date'], data['embed_link'], data['source_url']))
        db.commit()
        if cursor.rowcount > 0:
            print(f"✅ New: {data['username']}{data['date']}")
    except mysql.connector.Error as err:
        print(f"❌ DB insert error: {err}")
    finally:
        # Guard: if get_db_connection() raised, cursor/db were never bound —
        # the original unconditionally called cursor.close() here (NameError).
        if own_handles:
            if cursor is not None:
                cursor.close()
            if db is not None:
                db.close()
# ────────────────────────────────────────────────────────────────
def parse_data(soup):
    """Extract username, ISO-formatted date, and embed link from a video page.

    Args:
        soup: parsed BeautifulSoup document of a single video page.

    Returns:
        dict with keys 'username', 'date' (YYYY-MM-DD or None), and
        'embed_link' (first xpornium.net iframe src, or None). Any field
        that cannot be extracted is None.
    """
    title_tag = soup.select_one("h1.entry-title")
    if title_tag and title_tag.contents:
        # First child of the title node is the raw username text.
        username = title_tag.contents[0].strip()
    else:
        username = None

    date_node = soup.select_one("span.entry-date")
    date = date_node.text.strip() if date_node else None
    if date:
        # Pages render dates as DD/MM/YYYY; normalize to ISO for the DB.
        try:
            parsed = datetime.strptime(date, "%d/%m/%Y")
        except ValueError:
            date = None
        else:
            date = parsed.strftime("%Y-%m-%d")

    embed_link = None
    for frame in soup.find_all("iframe", src=True):
        if "xpornium.net" in frame["src"]:
            embed_link = frame["src"]
            break

    return {"username": username, "date": date, "embed_link": embed_link}
# ────────────────────────────────────────────────────────────────
def crawl_user_page(url):
    """Fetch one video page and return its parsed data, or None.

    Args:
        url: absolute URL of a single video page.

    Returns:
        The parse_data() dict plus a 'source_url' key when the page loads
        and contains an embed link; None on HTTP failure, network error, or
        a page with no xpornium embed. Errors are swallowed by design —
        this runs fanned-out across many threads and one bad page must not
        abort the batch.
    """
    try:
        res = requests.get(url, timeout=15)
        if res.ok:
            soup = BeautifulSoup(res.text, "html.parser")
            data = parse_data(soup)
            data["source_url"] = url
            return data if data["embed_link"] else None
    except requests.RequestException:
        # Narrowed from bare `except Exception`: only network-layer errors
        # (timeout, connection reset, DNS) are expected and safe to skip.
        pass
    return None
# ────────────────────────────────────────────────────────────────
def crawl_index_page(base_url, page_num, seen, cursor, db):
    """Crawl one paginated index page and insert every new video found.

    Args:
        base_url: site root; the page is fetched from f"{base_url}?p={page_num}".
        page_num: index-page number to fetch.
        seen: set of source URLs already in the DB — skipped without fetching.
        cursor, db: shared DB handles; forwarding them to concurrent inserts
            is not thread-safe with mysql.connector, so inserts open their
            own connection instead (parameters kept for interface stability).

    Returns:
        Number of newly inserted videos; 0 on any failure.
    """
    url = f"{base_url}?p={page_num}"
    print(f"📄 Page {page_num}")
    try:
        res = requests.get(url, timeout=15)
        if not res.ok:
            return 0
        soup = BeautifulSoup(res.text, "html.parser")
        links = [urljoin(base_url, a["href"]) for a in soup.select("a.thumbnail-link[href]")]
        links = [link for link in links if link not in seen]
        if not links:
            return 0
        new_count = 0
        with ThreadPoolExecutor(max_workers=50) as pool:
            for f in as_completed(pool.submit(crawl_user_page, l) for l in links):
                data = f.result()
                if data:
                    # BUG FIX: the original called insert_video_to_db(data,
                    # cursor, db) against a one-parameter definition; the
                    # TypeError was swallowed by the broad except below, so
                    # every productive page silently reported 0 inserts.
                    insert_video_to_db(data)
                    new_count += 1
        return new_count
    except Exception as err:
        # Best-effort page crawl: report the failure instead of hiding it,
        # but still return 0 so one broken page never aborts the batch.
        print(f"❌ Page {page_num} failed: {err}")
        return 0
# ────────────────────────────────────────────────────────────────
def crawl_all(base_url):
    """Crawl index pages in parallel batches of 10 until the stop rule fires.

    Args:
        base_url: site root passed through to crawl_index_page.

    Preloads already-seen source URLs once, then walks pages in batches of
    10 with a 10-worker pool. With QUIT_LOGIC set to True the stop rule is
    disabled and the loop runs until interrupted.
    """
    db = get_db_connection()
    cursor = db.cursor()
    seen = preload_source_urls(cursor)
    # Kill-switch: True disables the empty-batch stop check below, making
    # the crawl run indefinitely. Flip to False to re-enable auto-stop.
    QUIT_LOGIC = True

    # NOTE(review): crawl starts at page 1000, not 1 — confirm intended.
    page, total, empty_results = 1000, 0, 0

    while True:
        batch = range(page, page + 10)
        print(f"\n🚀 Batch {page}{page + 9}")

        with ThreadPoolExecutor(max_workers=10) as pool:
            results = [f.result() for f in as_completed(
                pool.submit(crawl_index_page, base_url, p, seen, cursor, db)
                for p in batch)]

        batch_total = sum(results)
        total += batch_total
        print(f"📦 Batch complete — {batch_total} new videos (total: {total})")

        if not QUIT_LOGIC:
            if batch_total == 0:
                empty_results += 1
            # NOTE(review): counter never resets on a productive batch —
            # confirm cumulative (not consecutive) counting is intended.
            if empty_results >= 10:
                # BUG FIX (message only): threshold is 10 empty batches of
                # 10 pages (100 pages), not the "30 pages" the old text claimed.
                print("\n🛑 No new videos found for 10 empty batches (100 pages). Stopping.")
                break

        page += 10

    cursor.close()
    db.close()
    print(f"\n✅ Done! Total new videos: {total}")
# ────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Entry point: crawl the whole site starting from the index root.
    crawl_all("https://webcamrips.to")