import requests, mysql.connector, time
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from config import get_db_connection, preload_source_urls
# ────────────────────────────────────────────────────────────────
def insert_video_to_db(data, cursor=None, db=None):
    """Insert one scraped video into the `videos` table, skipping duplicates.

    Args:
        data: dict with keys 'username', 'date', 'embed_link', 'source_url'.
        cursor: optional existing cursor to reuse (backward-compatible
            extension — crawl_index_page already passes its handles).
        db: optional existing connection matching `cursor`.

    Duplicates are suppressed by INSERT IGNORE; a message is printed only
    when a row was actually inserted (rowcount > 0). When no cursor/db pair
    is supplied, a private connection is opened and always closed.

    NOTE(review): reusing one cursor from many threads is not safe with
    mysql.connector — concurrent callers should omit cursor/db and let this
    function open its own connection.
    """
    own_handles = cursor is None or db is None
    try:
        if own_handles:
            db = get_db_connection()
            cursor = db.cursor()
        cursor.execute("""
        INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
        VALUES (%s, %s, %s, %s, NOW())
        """, (data['username'], data['date'], data['embed_link'], data['source_url']))
        db.commit()
        if cursor.rowcount > 0:
            print(f"✅ New: {data['username']}{data['date']}")
    except mysql.connector.Error as err:
        print(f"❌ DB insert error: {err}")
    finally:
        # Guard: if get_db_connection() raised, cursor/db were never bound —
        # the original unconditionally called cursor.close() here (NameError).
        if own_handles:
            if cursor is not None:
                cursor.close()
            if db is not None:
                db.close()
# ────────────────────────────────────────────────────────────────
def parse_data(soup):
    """Extract username, ISO-formatted date, and embed link from a video page.

    Args:
        soup: parsed BeautifulSoup document of a single video page.

    Returns:
        dict with keys 'username', 'date' (YYYY-MM-DD or None), and
        'embed_link' (first xpornium.net iframe src, or None). Any field
        that cannot be extracted is None.
    """
    title_tag = soup.select_one("h1.entry-title")
    if title_tag and title_tag.contents:
        # First child of the title node is the raw username text.
        username = title_tag.contents[0].strip()
    else:
        username = None

    date_node = soup.select_one("span.entry-date")
    date = date_node.text.strip() if date_node else None
    if date:
        # Pages render dates as DD/MM/YYYY; normalize to ISO for the DB.
        try:
            parsed = datetime.strptime(date, "%d/%m/%Y")
        except ValueError:
            date = None
        else:
            date = parsed.strftime("%Y-%m-%d")

    embed_link = None
    for frame in soup.find_all("iframe", src=True):
        if "xpornium.net" in frame["src"]:
            embed_link = frame["src"]
            break

    return {"username": username, "date": date, "embed_link": embed_link}
# ────────────────────────────────────────────────────────────────
def crawl_user_page(url):
    """Fetch one video page and return its parsed data, or None.

    Args:
        url: absolute URL of a single video page.

    Returns:
        The parse_data() dict plus a 'source_url' key when the page loads
        and contains an embed link; None on HTTP failure, network error, or
        a page with no xpornium embed. Errors are swallowed by design —
        this runs fanned-out across many threads and one bad page must not
        abort the batch.
    """
    try:
        res = requests.get(url, timeout=15)
        if res.ok:
            soup = BeautifulSoup(res.text, "html.parser")
            data = parse_data(soup)
            data["source_url"] = url
            return data if data["embed_link"] else None
    except requests.RequestException:
        # Narrowed from bare `except Exception`: only network-layer errors
        # (timeout, connection reset, DNS) are expected and safe to skip.
        pass
    return None
# ────────────────────────────────────────────────────────────────
def crawl_index_page(base_url, page_num, seen, cursor, db):
    """Crawl one paginated index page and insert every new video found.

    Args:
        base_url: site root; the page is fetched from f"{base_url}?p={page_num}".
        page_num: index-page number to fetch.
        seen: set of source URLs already in the DB — skipped without fetching.
        cursor, db: shared DB handles; forwarding them to concurrent inserts
            is not thread-safe with mysql.connector, so inserts open their
            own connection instead (parameters kept for interface stability).

    Returns:
        Number of newly inserted videos; 0 on any failure.
    """
    url = f"{base_url}?p={page_num}"
    print(f"📄 Page {page_num}")
    try:
        res = requests.get(url, timeout=15)
        if not res.ok:
            return 0
        soup = BeautifulSoup(res.text, "html.parser")
        links = [urljoin(base_url, a["href"]) for a in soup.select("a.thumbnail-link[href]")]
        links = [link for link in links if link not in seen]
        if not links:
            return 0
        new_count = 0
        with ThreadPoolExecutor(max_workers=50) as pool:
            for f in as_completed(pool.submit(crawl_user_page, l) for l in links):
                data = f.result()
                if data:
                    # BUG FIX: the original called insert_video_to_db(data,
                    # cursor, db) against a one-parameter definition; the
                    # TypeError was swallowed by the broad except below, so
                    # every productive page silently reported 0 inserts.
                    insert_video_to_db(data)
                    new_count += 1
        return new_count
    except Exception as err:
        # Best-effort page crawl: report the failure instead of hiding it,
        # but still return 0 so one broken page never aborts the batch.
        print(f"❌ Page {page_num} failed: {err}")
        return 0
# ────────────────────────────────────────────────────────────────
def crawl_all(base_url):
    """Crawl index pages in parallel batches of 10 until the stop rule fires.

    Args:
        base_url: site root passed through to crawl_index_page.

    Preloads already-seen source URLs once, then walks pages in batches of
    10 with a 10-worker pool. With QUIT_LOGIC set to True the stop rule is
    disabled and the loop runs until interrupted.
    """
    db = get_db_connection()
    cursor = db.cursor()
    seen = preload_source_urls(cursor)
    # Kill-switch: True disables the empty-batch stop check below, making
    # the crawl run indefinitely. Flip to False to re-enable auto-stop.
    QUIT_LOGIC = True

    # NOTE(review): crawl starts at page 1000, not 1 — confirm intended.
    page, total, empty_results = 1000, 0, 0

    while True:
        batch = range(page, page + 10)
        print(f"\n🚀 Batch {page}{page + 9}")

        with ThreadPoolExecutor(max_workers=10) as pool:
            results = [f.result() for f in as_completed(
                pool.submit(crawl_index_page, base_url, p, seen, cursor, db)
                for p in batch)]

        batch_total = sum(results)
        total += batch_total
        print(f"📦 Batch complete — {batch_total} new videos (total: {total})")

        if not QUIT_LOGIC:
            if batch_total == 0:
                empty_results += 1
            # NOTE(review): counter never resets on a productive batch —
            # confirm cumulative (not consecutive) counting is intended.
            if empty_results >= 10:
                # BUG FIX (message only): threshold is 10 empty batches of
                # 10 pages (100 pages), not the "30 pages" the old text claimed.
                print("\n🛑 No new videos found for 10 empty batches (100 pages). Stopping.")
                break

        page += 10

    cursor.close()
    db.close()
    print(f"\n✅ Done! Total new videos: {total}")
# ────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Entry point: crawl the whole site starting from the index root.
    crawl_all("https://webcamrips.to")