Compare commits

No commits in common. 'f5d91117d892611501bd850a0c82333fa5d24f85' and 'c79a86eddb8f53020b0331673b641848f46b0b25' have entirely different histories.

Binary file not shown.

@@ -15,7 +15,3 @@ def get_db_connection():
         print(f"❌ Failed to connect to DB: {err}")
         return  # dont continue if DB failed
     return db
-
-def preload_source_urls(cursor):
-    cursor.execute("SELECT source_url FROM videos")
-    return set(row[0] for row in cursor.fetchall())
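
For reference, the removed preload_source_urls helper fed the dedup path of the old crawler in the next hunk: the set of already-stored source_url values was loaded once so index links could be skipped before any request was made, whereas the rewritten crawler relies on INSERT IGNORE alone. A minimal sketch of that usage, assuming the pre-removal config module is still importable (the filter_new_links wrapper is hypothetical and not part of either commit):

from config import get_db_connection, preload_source_urls

def filter_new_links(links):
    # Load every source_url already stored in the videos table, then keep
    # only the links that have not been crawled yet.
    db = get_db_connection()
    cursor = db.cursor()
    seen = preload_source_urls(cursor)  # set of source_url strings
    cursor.close()
    db.close()
    return [link for link in links if link not in seen]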

@@ -1,118 +1,135 @@
-import requests, mysql.connector, time
+from config import get_db_connection
+import requests, time, mysql.connector
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 from datetime import datetime
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from config import get_db_connection, preload_source_urls

-# ────────────────────────────────────────────────────────────────
 def insert_video_to_db(data):
     db = get_db_connection()
     try:
         cursor = db.cursor()
-        cursor.execute("""
-            INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
-            VALUES (%s, %s, %s, %s, NOW())
-        """, (data['username'], data['date'], data['embed_link'], data['source_url']))
+        sql = """
+            INSERT IGNORE INTO videos (username, url, title, date, embed_link, source_url, created_at)
+            VALUES (%s, %s, %s, %s, %s, %s, NOW())
+        """
+        values = (
+            data['username'],
+            data['url'],
+            data['title'],
+            data['date'],
+            data['embed_link'],
+            data['source_url']
+        )
+        cursor.execute(sql, values)
         db.commit()
-        if cursor.rowcount > 0:
-            print(f"✅ New: {data['username']}{data['date']}")
+        print("✅ Inserted into DB!")
     except mysql.connector.Error as err:
-        print(f"❌ DB insert error: {err}")
+        print(f"❌ Failed to insert: {err}")
     finally:
         cursor.close()
         db.close()

-# ────────────────────────────────────────────────────────────────
-def parse_data(soup):
-    username = soup.select_one("h1.entry-title")
-    username = username.contents[0].strip() if username and username.contents else None
-    date_tag = soup.select_one("span.entry-date")
-    date = date_tag.text.strip() if date_tag else None
-    if date:
-        try:
-            date = datetime.strptime(date, "%d/%m/%Y").strftime("%Y-%m-%d")
-        except ValueError:
-            date = None
-    embed_link = next((i["src"] for i in soup.find_all("iframe", src=True) if "xpornium.net" in i["src"]), None)
-    return {"username": username, "date": date, "embed_link": embed_link}
-
-# ────────────────────────────────────────────────────────────────
-def crawl_user_page(url):
-    try:
-        res = requests.get(url, timeout=15)
-        if res.ok:
-            soup = BeautifulSoup(res.text, "html.parser")
-            data = parse_data(soup)
-            data["source_url"] = url
-            return data if data["embed_link"] else None
-    except Exception:
-        pass
-    return None
+def crawl_user_page(base_url, user_path):
+    full_url = urljoin(base_url, user_path)
+    response = requests.get(full_url)
+    if response.status_code != 200:
+        print(f"❌ Failed to load {full_url}")
+        return None
+    soup = BeautifulSoup(response.text, "html.parser")
+
+    username = user_path.strip("/")
+    title_tag = soup.find("h1", class_="entry-title")
+    title = title_tag.text.strip() if title_tag else "(no title)"
+    date_tag = soup.find("span", class_="entry-date")
+    date = date_tag.text.strip() if date_tag else None
+    if date:
+        try:
+            date_obj = datetime.strptime(date, "%d/%m/%Y")
+            date = date_obj.strftime("%Y-%m-%d")
+        except ValueError:
+            print(f"⚠️ Failed to parse date: {date}")
+            date = None
+    embed_link = None
+    for iframe in soup.find_all("iframe", src=True):
+        src = iframe["src"]
+        if "xpornium.net" in src:
+            embed_link = src  # no urljoin needed!
+            break  # stop after finding the first match
+
+    # --- print info after crawling this user ---
+    print(f"\n✅ Scraped {username}: — {date}")
+    # -------------------------------------------
+
+    return {
+        "username": username,
+        "title": title,
+        "date": date,
+        "embed_link": embed_link,
+    }

-# ────────────────────────────────────────────────────────────────
-def crawl_index_page(base_url, page_num, seen, cursor, db):
-    url = f"{base_url}?p={page_num}"
-    print(f"📄 Page {page_num}")
-    try:
-        res = requests.get(url, timeout=15)
-        if not res.ok:
-            return 0
-        soup = BeautifulSoup(res.text, "html.parser")
-        links = [urljoin(base_url, a["href"]) for a in soup.select("a.thumbnail-link[href]")]
-        links = [link for link in links if link not in seen]
-        if not links:
-            return 0
-        new_count = 0
-        with ThreadPoolExecutor(max_workers=50) as pool:
-            for f in as_completed(pool.submit(crawl_user_page, l) for l in links):
-                data = f.result()
-                if data:
-                    insert_video_to_db(data, cursor, db)
-                    new_count += 1
-        return new_count
-    except Exception:
-        return 0
-
-# ────────────────────────────────────────────────────────────────
-def crawl_all(base_url):
-    db = get_db_connection()
-    cursor = db.cursor()
-    seen = preload_source_urls(cursor)
-    QUIT_LOGIC = True
-    page, total, empty_results = 1000, 0, 0
+def crawl_all(init_url):
+    """Crawl page by page and extract user data as we go."""
+    page = 1
+    all_data = []
     while True:
-        batch = range(page, page + 10)
-        print(f"\n🚀 Batch {page}{page + 9}")
-        with ThreadPoolExecutor(max_workers=10) as pool:
-            results = [f.result() for f in as_completed(pool.submit(crawl_index_page, base_url, p, seen, cursor, db) for p in batch)]
-        batch_total = sum(results)
-        total += batch_total
-        print(f"📦 Batch complete — {batch_total} new videos (total: {total})")
-        if not QUIT_LOGIC:
-            if batch_total == 0:
-                empty_results += 1
-                if empty_results >= 10:
-                    print("\n🛑 No new videos found for 30 consecutive pages. Stopping.")
-                    break
-        page += 10
-        time.sleep(1)
-    cursor.close()
-    db.close()
-    print(f"\n✅ Done! Total new videos: {total}")
+        url = f"{init_url}?p={page}"
+        print(f"\n🕷️ Crawling index page {page}: {url}")
+        response = requests.get(url)
+        if response.status_code != 200:
+            print(f"❌ Page {page} returned {response.status_code}, stopping.")
+            break
+        soup = BeautifulSoup(response.text, "html.parser")
+        user_links = soup.find_all("a", class_="thumbnail-link", href=True)
+        if not user_links:
+            print("⚠️ No user links found — reached end of site.")
+            break
+        for link in user_links:
+            user_path = link["href"]
+            user_data = crawl_user_page(init_url, user_path)
+            if not user_data:
+                print("⚠️ Skipping empty user_data.")
+                continue
+            if not user_data["embed_link"]:
+                print(f"⚠️ Skipping {user_data['username']} - no embed link found.")
+                continue
+            insert_video_to_db(user_data)
+            time.sleep(0.5)
+        page += 1
+    print(f"\n✅ Finished crawling all pages. Total users: {len(all_data)}")
+    return all_data

-# ────────────────────────────────────────────────────────────────
 if __name__ == "__main__":
-    crawl_all("https://webcamrips.to")
+    BASE_URL = "https://webcamrips.to"
+    results = crawl_all(BASE_URL)
+    print("💾 All data saved to users_data.json")
