"""Crawl webcamrips.to index pages, scrape each video page, and store rows in MySQL.

Run as a script: ``python crawler.py``. Relies on a module-level ``db``
connection created in the ``__main__`` block (script-only usage).
"""
from config import get_db_connection, preload_source_urls
import requests, time, mysql.connector
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime

# Seconds before an HTTP request is abandoned instead of hanging forever.
REQUEST_TIMEOUT = 15


def insert_video_to_db(data):
    """Insert one scraped video row.

    Duplicates are silenced by INSERT IGNORE and reported via rowcount == 0,
    not treated as errors. Uses the module-level ``db`` connection.

    Args:
        data: dict with keys 'username', 'date', 'embed_link', 'source_url'.
    """
    try:
        cursor = db.cursor()
        try:
            sql = """ INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at) VALUES (%s, %s, %s, %s, NOW()) """
            values = (
                data['username'],
                data['date'],
                data['embed_link'],
                data['source_url'],
            )
            cursor.execute(sql, values)
            db.commit()
            if cursor.rowcount == 0:
                print("❌ Video already exists in DB")
            else:
                print("✅ Inserted into DB!")
        finally:
            cursor.close()  # was leaked on every call
    except mysql.connector.Error as err:
        print(f"❌ Failed to insert: {err}")


def crawl_user_page(full_url):
    """Fetch one video page and return its parsed data dict, or None on failure.

    The returned dict is ``parse_data``'s result plus a 'source_url' key.
    """
    # was: no timeout — a stalled connection would hang the crawler indefinitely
    response = requests.get(full_url, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        print(f"❌ Failed to load {full_url}")
        return None
    soup = BeautifulSoup(response.text, "html.parser")
    data = parse_data(soup)
    if data is None:
        # malformed page (no title) — propagate the skip
        return None
    data["source_url"] = full_url
    return data


def parse_data(soup):
    """Extract username, upload date (normalized to YYYY-MM-DD) and embed link.

    Returns a dict with keys 'username', 'date', 'embed_link' (any of the last
    two may be None), or None when the page has no recognizable title.
    """
    title_tag = soup.find("h1", class_="entry-title")
    if title_tag is None or not title_tag.contents:
        # was: unguarded title_tag.contents[0] → AttributeError killed the crawl
        print("⚠️ No title found on page.")
        return None
    username = title_tag.contents[0].strip()

    date_tag = soup.find("span", class_="entry-date")
    date = date_tag.text.strip() if date_tag else None
    if date:
        try:
            # site shows DD/MM/YYYY; store ISO format for the DATE column
            date_obj = datetime.strptime(date, "%d/%m/%Y")
            date = date_obj.strftime("%Y-%m-%d")
        except ValueError:
            print(f"⚠️ Failed to parse date: {date}")
            date = None

    embed_link = None
    for iframe in soup.find_all("iframe", src=True):
        src = iframe["src"]
        if "xpornium.net" in src:
            embed_link = src  # already an absolute URL; no urljoin needed
            break  # stop after finding the first match

    # --- print info after crawling this user ---
    print(f"\n✅ Scraped {username}: — {date}")
    # -------------------------------------------
    return {
        "username": username,
        "date": date,
        "embed_link": embed_link,
    }


def crawl_all(init_url):
    """Crawl page by page and extract user data as we go.

    Returns the list of every newly-inserted video's data dict.
    """
    page = 1
    all_data = []
    while True:
        url = f"{init_url}?p={page}"
        print(f"\n🕷️ Crawling index page {page}: {url}")
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(f"❌ Page {page} returned {response.status_code}, stopping.")
            break
        soup = BeautifulSoup(response.text, "html.parser")
        video_pages = [link['href'] for link in
                       soup.find_all("a", class_="thumbnail-link", href=True)]
        if not video_pages:
            print("⚠️ No user links found — reached end of site.")
            break

        cursor = db.cursor()
        try:
            # refresh the dedup set each page so restarts / concurrent inserts are seen
            seen_urls = preload_source_urls(cursor)
        finally:
            cursor.close()  # was leaked once per index page

        for link in video_pages:
            # was: naive init_url + link — breaks on hrefs without a leading slash;
            # urljoin handles relative and absolute hrefs alike
            full_url = urljoin(init_url, link)
            if full_url in seen_urls:
                print(f"⚠️ Skipping {link} - already seen.")
                continue
            user_data = crawl_user_page(full_url)  # slow as fuk
            if not user_data:
                print("⚠️ Skipping empty user_data.")
                continue
            if not user_data["embed_link"]:
                print(f"⚠️ Skipping {user_data['username']} - no embed link found.")
                continue
            insert_video_to_db(user_data)
            # was: never appended — the final count was always 0 and the
            # returned list always empty
            all_data.append(user_data)
        page += 1

    print(f"\n✅ Finished crawling all pages. Total users: {len(all_data)}")
    return all_data


if __name__ == "__main__":
    db = get_db_connection()
    BASE_URL = "https://webcamrips.to"
    try:
        results = crawl_all(BASE_URL)
        # was: claimed "saved to users_data.json" but no JSON was ever written
        print(f"💾 Done — {len(results)} new videos stored in the database.")
    finally:
        db.close()  # was: unconditional close after a throwaway cursor open/close