from config import get_db_connection

import json
import time
from datetime import datetime
from urllib.parse import urljoin

import mysql.connector
import requests
from bs4 import BeautifulSoup


def insert_video_to_db(data):
    """Insert one scraped video record into the `videos` table (duplicates are ignored)."""
    db = get_db_connection()
    cursor = None
    try:
        cursor = db.cursor()
        sql = """
            INSERT IGNORE INTO videos
                (username, url, title, date, embed_link, source_url, created_at)
            VALUES (%s, %s, %s, %s, %s, %s, NOW())
        """
        values = (
            data['username'],
            data['url'],
            data['title'],
            data['date'],
            data['embed_link'],
            data['source_url'],
        )
        cursor.execute(sql, values)
        db.commit()
        print("✅ Inserted into DB!")
    except mysql.connector.Error as err:
        print(f"❌ Failed to insert: {err}")
    finally:
        if cursor is not None:
            cursor.close()
        db.close()


def crawl_user_page(base_url, user_path):
    """Fetch a single user page and return the parsed record, or None on failure."""
    full_url = urljoin(base_url, user_path)
    response = requests.get(full_url, timeout=15)
    if response.status_code != 200:
        print(f"❌ Failed to load {full_url}")
        return None
    soup = BeautifulSoup(response.text, "html.parser")

    username = user_path.strip("/")

    title_tag = soup.find("h1", class_="entry-title")
    title = title_tag.text.strip() if title_tag else "(no title)"

    # Normalise the post date from DD/MM/YYYY to ISO YYYY-MM-DD for MySQL.
    date_tag = soup.find("span", class_="entry-date")
    date = date_tag.text.strip() if date_tag else None
    if date:
        try:
            date_obj = datetime.strptime(date, "%d/%m/%Y")
            date = date_obj.strftime("%Y-%m-%d")
        except ValueError:
            print(f"⚠️ Failed to parse date: {date}")
            date = None

    # Grab the first xpornium embed iframe; its src is already absolute,
    # so no urljoin is needed.
    embed_link = None
    for iframe in soup.find_all("iframe", src=True):
        src = iframe["src"]
        if "xpornium.net" in src:
            embed_link = src
            break  # stop after the first match

    # --- print info after crawling this user ---
    print(f"\n✅ Scraped {username}: {title} — {date}")

    return {
        "username": username,
        "url": full_url,         # page URL; assumed to double as the video URL
        "title": title,
        "date": date,
        "embed_link": embed_link,
        "source_url": full_url,  # page the data was scraped from
    }


def crawl_all(init_url):
    """Crawl index pages one by one and extract user data as we go."""
    page = 1
    all_data = []
    while True:
        url = f"{init_url}?p={page}"
        print(f"\n🕷️ Crawling index page {page}: {url}")
        response = requests.get(url, timeout=15)
        if response.status_code != 200:
            print(f"❌ Page {page} returned {response.status_code}, stopping.")
            break

        soup = BeautifulSoup(response.text, "html.parser")
        user_links = soup.find_all("a", class_="thumbnail-link", href=True)
        if not user_links:
            print("⚠️ No user links found — reached end of site.")
            break

        for link in user_links:
            user_path = link["href"]
            user_data = crawl_user_page(init_url, user_path)
            if not user_data:
                print("⚠️ Skipping empty user_data.")
                continue
            if not user_data["embed_link"]:
                print(f"⚠️ Skipping {user_data['username']} - no embed link found.")
                continue
            insert_video_to_db(user_data)
            all_data.append(user_data)
            time.sleep(0.5)  # be polite between user pages

        page += 1
        time.sleep(1)  # and between index pages

    print(f"\n✅ Finished crawling all pages. Total users: {len(all_data)}")
    return all_data


if __name__ == "__main__":
    BASE_URL = "https://webcamrips.to"
    results = crawl_all(BASE_URL)
    with open("users_data.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print("💾 All data saved to users_data.json")
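

# ---------------------------------------------------------------------------
# Reference only (not part of the original script): a sketch of the `videos`
# table this script assumes, reconstructed from the INSERT statement above,
# and a minimal `config.get_db_connection()` stub. Column types, key choice,
# and credentials are assumptions — adjust them to your own setup.
#
#   CREATE TABLE IF NOT EXISTS videos (
#       id          INT AUTO_INCREMENT PRIMARY KEY,
#       username    VARCHAR(255),
#       url         VARCHAR(1024),
#       title       VARCHAR(512),
#       date        DATE,
#       embed_link  VARCHAR(512),
#       source_url  VARCHAR(1024),
#       created_at  DATETIME,
#       UNIQUE KEY uniq_embed (embed_link)  -- lets INSERT IGNORE skip duplicates
#   );
#
#   # config.py (sketch)
#   import mysql.connector
#
#   def get_db_connection():
#       return mysql.connector.connect(
#           host="localhost", user="root", password="", database="scraper"
#       )
# ---------------------------------------------------------------------------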