import requests
import mysql.connector
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from config import get_db_connection, preload_source_urls


# ────────────────────────────────────────────────────────────────
def insert_video_to_db(data):
    """Insert one parsed video record; duplicates are skipped via INSERT IGNORE."""
    db = None
    cursor = None
    try:
        # Each call opens its own connection so inserts stay thread-safe.
        db = get_db_connection()
        cursor = db.cursor()
        cursor.execute("""
            INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
            VALUES (%s, %s, %s, %s, NOW())
        """, (data['username'], data['date'], data['embed_link'], data['source_url']))
        db.commit()
        if cursor.rowcount > 0:
            print(f"✅ New: {data['username']} — {data['date']}")
    except mysql.connector.Error as err:
        print(f"❌ DB insert error: {err}")
    finally:
        if cursor:
            cursor.close()
        if db:
            db.close()


# ────────────────────────────────────────────────────────────────
def parse_data(soup):
    """Extract username, upload date, and embed link from a video page."""
    username = soup.select_one("h1.entry-title")
    # Keep only the first text node of the title, skipping any nested tags.
    username = username.contents[0].strip() if username and username.contents else None

    date_tag = soup.select_one("span.entry-date")
    date = date_tag.text.strip() if date_tag else None
    if date:
        try:
            # The site shows dates as DD/MM/YYYY; store them as ISO YYYY-MM-DD.
            date = datetime.strptime(date, "%d/%m/%Y").strftime("%Y-%m-%d")
        except ValueError:
            date = None

    # The player is embedded via an xpornium.net iframe; keep the first match.
    embed_link = next(
        (i["src"] for i in soup.find_all("iframe", src=True) if "xpornium.net" in i["src"]),
        None,
    )
    return {"username": username, "date": date, "embed_link": embed_link}


# ────────────────────────────────────────────────────────────────
def crawl_user_page(url):
    """Fetch a single video page and return its parsed data, or None on failure."""
    try:
        res = requests.get(url, timeout=15)
        if res.ok:
            soup = BeautifulSoup(res.text, "html.parser")
            data = parse_data(soup)
            data["source_url"] = url
            return data if data["embed_link"] else None
    except Exception:
        pass
    return None


# ────────────────────────────────────────────────────────────────
def crawl_index_page(base_url, page_num, seen):
    """Crawl one listing page, fetch its unseen video pages, and insert new records."""
    url = f"{base_url}?p={page_num}"
    print(f"📄 Page {page_num}")
    try:
        res = requests.get(url, timeout=15)
        if not res.ok:
            return 0
        soup = BeautifulSoup(res.text, "html.parser")
        links = [urljoin(base_url, a["href"]) for a in soup.select("a.thumbnail-link[href]")]
        links = [link for link in links if link not in seen]
        if not links:
            return 0

        new_count = 0
        with ThreadPoolExecutor(max_workers=50) as pool:
            for f in as_completed(pool.submit(crawl_user_page, l) for l in links):
                data = f.result()
                if data:
                    insert_video_to_db(data)
                    new_count += 1
        return new_count
    except Exception:
        return 0


# ────────────────────────────────────────────────────────────────
def crawl_all(base_url):
    """Crawl listing pages in batches of 10 until no new videos turn up."""
    db = get_db_connection()
    cursor = db.cursor()
    seen = preload_source_urls(cursor)  # source URLs already in the database

    QUIT_LOGIC = True  # stop once enough consecutive batches come back empty
    page, total, empty_batches = 1000, 0, 0

    while True:
        batch = range(page, page + 10)
        print(f"\n🚀 Batch {page}–{page + 9}")
        with ThreadPoolExecutor(max_workers=10) as pool:
            results = [
                f.result()
                for f in as_completed(pool.submit(crawl_index_page, base_url, p, seen) for p in batch)
            ]
        batch_total = sum(results)
        total += batch_total
        print(f"📦 Batch complete — {batch_total} new videos (total: {total})")

        if QUIT_LOGIC:
            if batch_total == 0:
                empty_batches += 1
                # Three empty batches of 10 pages each = 30 consecutive empty pages.
                if empty_batches >= 3:
                    print("\n🛑 No new videos found for 30 consecutive pages. Stopping.")
                    break
            else:
                empty_batches = 0

        page += 10

    cursor.close()
    db.close()
    print(f"\n✅ Done! Total new videos: {total}")


# ────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    crawl_all("https://webcamrips.to")