From f5d91117d892611501bd850a0c82333fa5d24f85 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Mon, 27 Oct 2025 04:27:30 -0700
Subject: [PATCH] wcr: crawl index pages in parallel batches and simplify DB inserts

---
 wcr/crawl_wcr.py | 188 +++++++++++++++++++++--------------------------
 1 file changed, 85 insertions(+), 103 deletions(-)

diff --git a/wcr/crawl_wcr.py b/wcr/crawl_wcr.py
index f7a053e..d43a9c2 100644
--- a/wcr/crawl_wcr.py
+++ b/wcr/crawl_wcr.py
@@ -1,136 +1,118 @@
-from config import get_db_connection, preload_source_urls
-import requests, time, mysql.connector
+import requests, mysql.connector, time
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 from datetime import datetime
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from config import get_db_connection, preload_source_urls
 
-
-def insert_video_to_db(data, db):
-    try:
-        cursor = db.cursor()
-        sql = """
-            INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
-            VALUES (%s, %s, %s, %s, NOW())
-        """
-        values = (
-            data['username'],
-            data['date'],
-            data['embed_link'],
-            data['source_url']
-        )
-        cursor.execute(sql, values)
+# ────────────────────────────────────────────────────────────────
+def insert_video_to_db(data):
+    # One short-lived connection per call, so worker threads never share a connection.
+    db = get_db_connection()
+    cursor = db.cursor()
+    try:
+        cursor.execute("""
+            INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
+            VALUES (%s, %s, %s, %s, NOW())
+        """, (data['username'], data['date'], data['embed_link'], data['source_url']))
         db.commit()
-        if cursor.rowcount == 0:
-            print("❌ Video already exists in DB")
-        else:
-            print("✅ Inserted into DB!")
+        if cursor.rowcount > 0:
+            print(f"✅ New: {data['username']} — {data['date']}")
     except mysql.connector.Error as err:
-        print(f"❌ Failed to insert: {err}")
+        print(f"❌ DB insert error: {err}")
     finally:
         cursor.close()
+        db.close()
 
-def crawl_user_page(full_url):
-    try:
-        response = requests.get(full_url)
-        if response.status_code != 200:
-            print(f"❌ Failed to load {full_url}")
-            return None
-
-        soup = BeautifulSoup(response.text, "html.parser")
-        data = parse_data(soup)
-        data["source_url"] = full_url
-        return data
-    except Exception as e:
-        print(f"❌ Exception while crawling {full_url}: {e}")
-        return None
-
-
+# ────────────────────────────────────────────────────────────────
 def parse_data(soup):
-    title_tag = soup.find("h1", class_="entry-title")
-    username = title_tag.contents[0].strip() if title_tag else "unknown"
-    date_tag = soup.find("span", class_="entry-date")
-    date = date_tag.text.strip() if date_tag else None
+    username = soup.select_one("h1.entry-title")
+    username = username.contents[0].strip() if username and username.contents else None
 
+    date_tag = soup.select_one("span.entry-date")
+    date = date_tag.text.strip() if date_tag else None
     if date:
         try:
-            date_obj = datetime.strptime(date, "%d/%m/%Y")
-            date = date_obj.strftime("%Y-%m-%d")
+            date = datetime.strptime(date, "%d/%m/%Y").strftime("%Y-%m-%d")
         except ValueError:
-            print(f"⚠️ Failed to parse date: {date}")
             date = None
 
-    embed_link = None
-    for iframe in soup.find_all("iframe", src=True):
-        src = iframe["src"]
-        if "xpornium.net" in src:
-            embed_link = src
-            break
-
-    print(f"\n✅ Scraped {username}: — {date}")
-
-    return {
-        "username": username,
-        "date": date,
-        "embed_link": embed_link,
-    }
-
+    embed_link = next((i["src"] for i in soup.find_all("iframe", src=True) if "xpornium.net" in i["src"]), None)
+    return {"username": username, "date": date, "embed_link": embed_link}
 
-def process_link(link, seen_urls, init_url):
-    full_url = init_url + link
-    if full_url in seen_urls:
print(f"⚠️ Skipping {link} - already seen.") - return - - user_data = crawl_user_page(full_url) - if not user_data: - print("⚠️ Skipping empty user_data.") - return - - if not user_data["embed_link"]: - print(f"⚠️ Skipping {user_data['username']} - no embed link found.") - return - - local_db = get_db_connection() - insert_video_to_db(user_data, local_db) - local_db.close() - - -def crawl_all(init_url): - page = 1 +# ──────────────────────────────────────────────────────────────── +def crawl_user_page(url): + try: + res = requests.get(url, timeout=15) + if res.ok: + soup = BeautifulSoup(res.text, "html.parser") + data = parse_data(soup) + data["source_url"] = url + return data if data["embed_link"] else None + except Exception: + pass + return None + +# ──────────────────────────────────────────────────────────────── +def crawl_index_page(base_url, page_num, seen, cursor, db): + url = f"{base_url}?p={page_num}" + print(f"📄 Page {page_num}") + try: + res = requests.get(url, timeout=15) + if not res.ok: + return 0 + + soup = BeautifulSoup(res.text, "html.parser") + links = [urljoin(base_url, a["href"]) for a in soup.select("a.thumbnail-link[href]")] + links = [link for link in links if link not in seen] + if not links: + return 0 + + new_count = 0 + with ThreadPoolExecutor(max_workers=50) as pool: + for f in as_completed(pool.submit(crawl_user_page, l) for l in links): + data = f.result() + if data: + insert_video_to_db(data, cursor, db) + new_count += 1 + return new_count + except Exception: + return 0 + +# ──────────────────────────────────────────────────────────────── +def crawl_all(base_url): db = get_db_connection() cursor = db.cursor() - seen_urls = preload_source_urls(cursor) - cursor.close() - db.close() + seen = preload_source_urls(cursor) + + QUIT_LOGIC = True + page, total, empty_results = 1000, 0, 0 while True: - url = f"{init_url}?p={page}" - print(f"\n🕷️ Crawling index page {page}: {url}") - response = requests.get(url) - if response.status_code != 200: - print(f"❌ Page {page} returned {response.status_code}, stopping.") - break + batch = range(page, page + 10) + print(f"\n🚀 Batch {page}–{page + 9}") - soup = BeautifulSoup(response.text, "html.parser") - video_pages = soup.find_all("a", class_="thumbnail-link", href=True) - video_pages = [link['href'] for link in video_pages] + with ThreadPoolExecutor(max_workers=10) as pool: + results = [f.result() for f in as_completed(pool.submit(crawl_index_page, base_url, p, seen, cursor, db) for p in batch)] - if not video_pages: - print("⚠️ No user links found — reached end of site.") - break + batch_total = sum(results) + total += batch_total + print(f"📦 Batch complete — {batch_total} new videos (total: {total})") - with ThreadPoolExecutor(max_workers=50) as executor: - futures = [executor.submit(process_link, link, seen_urls, init_url) for link in video_pages] - for _ in as_completed(futures): - pass # we already log inside the functions + if not QUIT_LOGIC: + if batch_total == 0: + empty_results += 1 - page += 1 + if empty_results >= 10: + print("\n🛑 No new videos found for 30 consecutive pages. Stopping.") + break - print("\n✅ Finished crawling all pages.") + page += 10 + cursor.close() + db.close() + print(f"\n✅ Done! Total new videos: {total}") +# ──────────────────────────────────────────────────────────────── if __name__ == "__main__": - BASE_URL = "https://webcamrips.to" - crawl_all(BASE_URL) + crawl_all("https://webcamrips.to")