import requests
import mysql.connector
import time
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from config import get_db_connection, preload_source_urls
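# Script overview: crawl_all() walks the site's index pages in batches of ten,
# crawl_index_page() collects video links that have not been seen before,
# crawl_user_page() fetches and parses each video page in a thread pool, and
# insert_video_to_db() stores new rows in the `videos` table, relying on
# INSERT IGNORE to skip duplicates.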
# ────────────────────────────────────────────────────────────────
def insert_video_to_db(data):
    """Insert one parsed video record; duplicates are skipped via INSERT IGNORE."""
    db = None
    cursor = None
    try:
        db = get_db_connection()
        cursor = db.cursor()
        cursor.execute("""
            INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
            VALUES (%s, %s, %s, %s, NOW())
        """, (data['username'], data['date'], data['embed_link'], data['source_url']))
        db.commit()
        if cursor.rowcount > 0:
            print(f"✅ New: {data['username']} - {data['date']}")
    except mysql.connector.Error as err:
        print(f"❌ DB insert error: {err}")
    finally:
        # Guard against get_db_connection() failing before cursor/db were assigned.
        if cursor:
            cursor.close()
        if db:
            db.close()
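# The INSERT IGNORE above only deduplicates if the `videos` table has a unique
# key. The schema is not defined in this file; a minimal sketch that would make
# the statement behave that way (the unique key on source_url is an assumption)
# could look like:
#
#   CREATE TABLE IF NOT EXISTS videos (
#       id INT AUTO_INCREMENT PRIMARY KEY,
#       username VARCHAR(255),
#       date DATE,
#       embed_link TEXT,
#       source_url VARCHAR(512),
#       created_at DATETIME,
#       UNIQUE KEY uq_source_url (source_url)
#   );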
# ────────────────────────────────────────────────────────────────
def parse_data(soup):
    """Extract the username, upload date, and embed link from a video page."""
    username = soup.select_one("h1.entry-title")
    username = username.contents[0].strip() if username and username.contents else None
    date_tag = soup.select_one("span.entry-date")
    date = date_tag.text.strip() if date_tag else None
    if date:
        try:
            # The site shows dates as day/month/year; store them as YYYY-MM-DD.
            date = datetime.strptime(date, "%d/%m/%Y").strftime("%Y-%m-%d")
        except ValueError:
            date = None
    # Only iframes hosted on xpornium.net are treated as the video embed.
    embed_link = next((i["src"] for i in soup.find_all("iframe", src=True) if "xpornium.net" in i["src"]), None)
    return {"username": username, "date": date, "embed_link": embed_link}
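# Hypothetical example (values are illustrative, not taken from the site):
# a page containing <h1 class="entry-title">some_user</h1>,
# <span class="entry-date">25/12/2023</span> and an xpornium.net iframe would
# yield {"username": "some_user", "date": "2023-12-25",
#        "embed_link": "https://xpornium.net/embed/abc123"}.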
# ────────────────────────────────────────────────────────────────
def crawl_user_page(url):
    """Fetch one video page and return its parsed data, or None on any failure."""
    try:
        res = requests.get(url, timeout=15)
        if res.ok:
            soup = BeautifulSoup(res.text, "html.parser")
            data = parse_data(soup)
            data["source_url"] = url
            # Pages without a recognised embed are treated as misses.
            return data if data["embed_link"] else None
    except Exception:
        pass
    return None
# ────────────────────────────────────────────────────────────────
def crawl_index_page(base_url, page_num, seen, cursor, db):
    """Crawl one index page; returns the number of newly inserted videos."""
    url = f"{base_url}?p={page_num}"
    print(f"📄 Page {page_num}")
    try:
        res = requests.get(url, timeout=15)
        if not res.ok:
            return 0
        soup = BeautifulSoup(res.text, "html.parser")
        links = [urljoin(base_url, a["href"]) for a in soup.select("a.thumbnail-link[href]")]
        # Skip URLs that are already in the database.
        links = [link for link in links if link not in seen]
        if not links:
            return 0
        new_count = 0
        with ThreadPoolExecutor(max_workers=50) as pool:
            for f in as_completed(pool.submit(crawl_user_page, l) for l in links):
                data = f.result()
                if data:
                    # insert_video_to_db() opens its own connection, so it only
                    # needs the parsed data, not the shared cursor/db handles.
                    insert_video_to_db(data)
                    new_count += 1
        return new_count
    except Exception:
        return 0
# ────────────────────────────────────────────────────────────────
def crawl_all(base_url):
    """Crawl index pages in batches of ten until interrupted (or until the stop check fires)."""
    db = get_db_connection()
    cursor = db.cursor()
    seen = preload_source_urls(cursor)
    # With QUIT_LOGIC set to True the early-stop check below is skipped and the
    # crawler pages forever; set it to False to stop after ten empty batches.
    QUIT_LOGIC = True
    page, total, empty_results = 1000, 0, 0
    while True:
        batch = range(page, page + 10)
        print(f"\n🚀 Batch {page}-{page + 9}")
        with ThreadPoolExecutor(max_workers=10) as pool:
            results = [f.result() for f in as_completed(pool.submit(crawl_index_page, base_url, p, seen, cursor, db) for p in batch)]
        batch_total = sum(results)
        total += batch_total
        print(f"📦 Batch complete — {batch_total} new videos (total: {total})")
        if not QUIT_LOGIC:
            if batch_total == 0:
                empty_results += 1
                if empty_results >= 10:
                    print("\n🛑 No new videos found for 10 consecutive batches. Stopping.")
                    break
            else:
                empty_results = 0
        page += 10
    cursor.close()
    db.close()
    print(f"\n✅ Done! Total new videos: {total}")
# ────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    crawl_all("https://webcamrips.to")