diff --git a/wcr/crawl_wcr.py b/wcr/crawl_wcr.py
index 3c470ec..f7a053e 100644
--- a/wcr/crawl_wcr.py
+++ b/wcr/crawl_wcr.py
@@ -3,18 +3,16 @@
 import requests, time, mysql.connector
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
-def insert_video_to_db(data):
-
-
+def insert_video_to_db(data, db):
     try:
         cursor = db.cursor()
         sql = """
             INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
             VALUES (%s, %s, %s, %s, NOW())
-        """
-
+        """
         values = (
             data['username'],
             data['date'],
@@ -27,29 +25,31 @@ def insert_video_to_db(data):
             print("❌ Video already exists in DB")
         else:
             print("✅ Inserted into DB!")
-
     except mysql.connector.Error as err:
         print(f"❌ Failed to insert: {err}")
+    finally:
+        cursor.close()
 
-
 
 def crawl_user_page(full_url):
-    response = requests.get(full_url)
-    if response.status_code != 200:
-        print(f"❌ Failed to load {full_url}")
-        return None
+    try:
+        response = requests.get(full_url)
+        if response.status_code != 200:
+            print(f"❌ Failed to load {full_url}")
+            return None
 
-    soup = BeautifulSoup(response.text, "html.parser")
+        soup = BeautifulSoup(response.text, "html.parser")
+        data = parse_data(soup)
+        data["source_url"] = full_url
+        return data
+    except Exception as e:
+        print(f"❌ Exception while crawling {full_url}: {e}")
+        return None
 
-    data = parse_data(soup)
-    data["source_url"] = full_url
-    return data
-
 
 def parse_data(soup):
-
     title_tag = soup.find("h1", class_="entry-title")
-    username = title_tag.contents[0].strip()
+    username = title_tag.contents[0].strip() if title_tag else "unknown"
 
     date_tag = soup.find("span", class_="entry-date")
     date = date_tag.text.strip() if date_tag else None
@@ -61,28 +61,49 @@ def parse_data(soup):
         print(f"⚠️ Failed to parse date: {date}")
         date = None
 
-    embed_link = None
-    for iframe in soup.find_all("iframe", src=True):
-        src = iframe["src"]
-        if "xpornium.net" in src:
-            embed_link = src # no urljoin needed!
-            break # stop after finding the first match
+    embed_link = None
+    for iframe in soup.find_all("iframe", src=True):
+        src = iframe["src"]
+        if "xpornium.net" in src:
+            embed_link = src
+            break
+
+    print(f"\n✅ Scraped {username}: — {date}")
+
+    return {
+        "username": username,
+        "date": date,
+        "embed_link": embed_link,
+    }
+
+
+def process_link(link, seen_urls, init_url):
+    full_url = init_url + link
+    if full_url in seen_urls:
+        print(f"⚠️ Skipping {link} - already seen.")
+        return
 
-    # --- print info after crawling this user ---
-    print(f"\n✅ Scraped {username}: — {date}")
+    user_data = crawl_user_page(full_url)
+    if not user_data:
+        print("⚠️ Skipping empty user_data.")
+        return
 
-    # -------------------------------------------
+    if not user_data["embed_link"]:
+        print(f"⚠️ Skipping {user_data['username']} - no embed link found.")
+        return
+
+    local_db = get_db_connection()
+    insert_video_to_db(user_data, local_db)
+    local_db.close()
 
-    return {
-        "username": username,
-        "date": date,
-        "embed_link": embed_link,
-    }
 
 def crawl_all(init_url):
-    """Crawl page by page and extract user data as we go."""
     page = 1
-    all_data = []
+    db = get_db_connection()
+    cursor = db.cursor()
+    seen_urls = preload_source_urls(cursor)
+    cursor.close()
+    db.close()
 
     while True:
         url = f"{init_url}?p={page}"
@@ -95,44 +116,21 @@ def crawl_all(init_url):
         soup = BeautifulSoup(response.text, "html.parser")
         video_pages = soup.find_all("a", class_="thumbnail-link", href=True)
         video_pages = [link['href'] for link in video_pages]
+
         if not video_pages:
            print("⚠️ No user links found — reached end of site.")
            break
 
-        cursor = db.cursor()
-        seen_urls = preload_source_urls(cursor)
-
-        for link in video_pages:
-            full_url = init_url + link
-
-            if full_url in seen_urls:
-                print(f"⚠️ Skipping {link} - already seen.")
-                continue
-
-
-
-            user_data = crawl_user_page(full_url) # slow as fuk
-            if not user_data:
-                print("⚠️ Skipping empty user_data.")
-                continue
-
-            if not user_data["embed_link"]:
-                print(f"⚠️ Skipping {user_data['username']} - no embed link found.")
-                continue
-
-            insert_video_to_db(user_data)
-
+
+        with ThreadPoolExecutor(max_workers=50) as executor:
+            futures = [executor.submit(process_link, link, seen_urls, init_url) for link in video_pages]
+            for _ in as_completed(futures):
+                pass # we already log inside the functions
         page += 1
-
-
-    print(f"\n✅ Finished crawling all pages. Total users: {len(all_data)}")
-    return all_data
+
+    print("\n✅ Finished crawling all pages.")
+
 
 if __name__ == "__main__":
-    db = get_db_connection()
     BASE_URL = "https://webcamrips.to"
-    results = crawl_all(BASE_URL)
-    print("💾 All data saved to users_data.json")
-    cursor = db.cursor()
-    cursor.close()
-    db.close()
\ No newline at end of file
+    crawl_all(BASE_URL)
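The patch calls two helpers, get_db_connection() and preload_source_urls(cursor), that appear in both old and new lines but are never defined in any hunk. A minimal sketch of what they would have to provide is below, assuming a local MySQL instance and the videos table used by insert_video_to_db; the connection parameters and both function bodies are illustrative guesses, not code from the repository.

# Hypothetical sketch of the two helpers the patch assumes but does not define.
# Names match the calls in crawl_wcr.py; credentials and bodies are assumptions.
import mysql.connector


def get_db_connection():
    """Open a fresh MySQL connection.

    process_link() opens one connection per task because a single
    mysql.connector connection should not be shared across threads.
    """
    return mysql.connector.connect(
        host="localhost",      # assumed credentials, adjust for the real deployment
        user="wcr",
        password="secret",
        database="wcr",
    )


def preload_source_urls(cursor):
    """Return the set of source_url values already stored in the videos table.

    crawl_all() builds this set once up front so process_link() can skip
    pages that were inserted on a previous run without hitting the DB again.
    """
    cursor.execute("SELECT source_url FROM videos")
    return {row[0] for row in cursor.fetchall()}

Giving each process_link() call its own short-lived connection, instead of reusing the old module-level db, fits the new ThreadPoolExecutor design in crawl_all(): MySQL Connector/Python connections are not thread-safe when shared between threads, so a per-task connection (or a connection pool) is the safer choice.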