from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from urllib.parse import urljoin

import mysql.connector
import requests
from bs4 import BeautifulSoup

from config import get_db_connection, preload_source_urls

REQUEST_TIMEOUT = 15  # seconds; keeps worker threads from hanging on a stalled request


def insert_video_to_db(data, db):
    """Insert one scraped video row; INSERT IGNORE makes duplicates a no-op."""
    cursor = db.cursor()  # created outside try so `finally` can always close it
    try:
        sql = """
            INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
            VALUES (%s, %s, %s, %s, NOW())
        """
        values = (
            data["username"],
            data["date"],
            data["embed_link"],
            data["source_url"],
        )
        cursor.execute(sql, values)
        db.commit()
        if cursor.rowcount == 0:
            print("❌ Video already exists in DB")
        else:
            print("✅ Inserted into DB!")
    except mysql.connector.Error as err:
        print(f"❌ Failed to insert: {err}")
    finally:
        cursor.close()


def crawl_user_page(full_url):
    """Fetch a single video page and parse it; returns None on any failure."""
    try:
        response = requests.get(full_url, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(f"❌ Failed to load {full_url}")
            return None
        soup = BeautifulSoup(response.text, "html.parser")
        data = parse_data(soup)
        data["source_url"] = full_url
        return data
    except Exception as e:
        print(f"❌ Exception while crawling {full_url}: {e}")
        return None


def parse_data(soup):
    """Extract username, upload date, and embed link from a video page."""
    # The username is the first text node of the <h1>, before any nested tags.
    title_tag = soup.find("h1", class_="entry-title")
    username = title_tag.contents[0].strip() if title_tag and title_tag.contents else "unknown"

    # Normalize the site's DD/MM/YYYY dates to ISO (YYYY-MM-DD) for the DATE column.
    date_tag = soup.find("span", class_="entry-date")
    date = date_tag.text.strip() if date_tag else None
    if date:
        try:
            date_obj = datetime.strptime(date, "%d/%m/%Y")
            date = date_obj.strftime("%Y-%m-%d")
        except ValueError:
            print(f"⚠️ Failed to parse date: {date}")
            date = None

    # The player is embedded as an <iframe>; only the xpornium.net host is wanted.
    embed_link = None
    for iframe in soup.find_all("iframe", src=True):
        src = iframe["src"]
        if "xpornium.net" in src:
            embed_link = src
            break

    print(f"\n✅ Scraped {username} — {date}")
    return {
        "username": username,
        "date": date,
        "embed_link": embed_link,
    }


def process_link(link, seen_urls, init_url):
    """Worker body: resolve one thumbnail href, scrape it, and store the result."""
    # urljoin handles both relative hrefs and absolute URLs, unlike naive
    # string concatenation.
    full_url = urljoin(init_url, link)
    if full_url in seen_urls:
        print(f"⚠️ Skipping {link} - already seen.")
        return

    user_data = crawl_user_page(full_url)
    if not user_data:
        print("⚠️ Skipping empty user_data.")
        return
    if not user_data["embed_link"]:
        print(f"⚠️ Skipping {user_data['username']} - no embed link found.")
        return

    # mysql.connector connections are not thread-safe, so each worker opens
    # (and closes) its own short-lived connection.
    local_db = get_db_connection()
    try:
        insert_video_to_db(user_data, local_db)
    finally:
        local_db.close()


def crawl_all(init_url):
    """Walk the paginated index, fanning each page's video links out to a thread pool."""
    page = 1

    # Preload every source_url already in the DB so reruns skip known pages
    # without a network round-trip.
    db = get_db_connection()
    cursor = db.cursor()
    seen_urls = preload_source_urls(cursor)
    cursor.close()
    db.close()

    while True:
        url = f"{init_url}?p={page}"
        print(f"\n🕷️ Crawling index page {page}: {url}")
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(f"❌ Page {page} returned {response.status_code}, stopping.")
            break

        soup = BeautifulSoup(response.text, "html.parser")
        video_pages = [a["href"] for a in soup.find_all("a", class_="thumbnail-link", href=True)]
        if not video_pages:
            print("⚠️ No video links found — reached end of site.")
            break

        with ThreadPoolExecutor(max_workers=50) as executor:
            futures = [
                executor.submit(process_link, link, seen_urls, init_url)
                for link in video_pages
            ]
            for _ in as_completed(futures):
                pass  # results are logged inside the workers; just drain the futures

        page += 1

    print("\n✅ Finished crawling all pages.")


if __name__ == "__main__":
    BASE_URL = "https://webcamrips.to"
    crawl_all(BASE_URL)
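
# ---------------------------------------------------------------------------
# For reference, a minimal sketch of the `config` module this script imports.
# The connection parameters and query below are assumptions, not the real
# config.py; they only reflect how the helpers are used above:
# get_db_connection() must return a fresh mysql.connector connection, and
# preload_source_urls(cursor) must return the set of source_url values
# already stored in `videos`. Kept commented out so this file stays runnable.
#
#   # config.py (hypothetical sketch)
#   import mysql.connector
#
#   def get_db_connection():
#       # Credentials are placeholders; read them from env vars in practice.
#       return mysql.connector.connect(
#           host="localhost",
#           user="scraper",
#           password="secret",
#           database="videos_db",
#       )
#
#   def preload_source_urls(cursor):
#       # A set gives O(1) membership checks in process_link().
#       cursor.execute("SELECT source_url FROM videos")
#       return {row[0] for row in cursor.fetchall()}
# ---------------------------------------------------------------------------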