Compare commits


No commits in common. 'f5d91117d892611501bd850a0c82333fa5d24f85' and 'c79a86eddb8f53020b0331673b641848f46b0b25' have entirely different histories.

Binary file not shown.

@@ -14,8 +14,4 @@ def get_db_connection():
     except mysql.connector.Error as err:
         print(f"❌ Failed to connect to DB: {err}")
         return  # dont continue if DB failed
     return db
-
-def preload_source_urls(cursor):
-    cursor.execute("SELECT source_url FROM videos")
-    return set(row[0] for row in cursor.fetchall())
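Only the tail of get_db_connection is visible in this hunk. For context, a minimal sketch of the surrounding function, assuming mysql.connector and placeholder connection settings (the host, user, password, and database names below are assumptions, not values from the repo):

import mysql.connector

def get_db_connection():
    try:
        db = mysql.connector.connect(
            host="localhost",        # placeholder, not the repo's real setting
            user="db_user",          # placeholder
            password="db_password",  # placeholder
            database="videos",       # placeholder
        )
    except mysql.connector.Error as err:
        print(f"❌ Failed to connect to DB: {err}")
        return  # don't continue if DB failed
    return db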

@@ -1,118 +1,135 @@
-import requests, mysql.connector, time
-from bs4 import BeautifulSoup
-from urllib.parse import urljoin
-from datetime import datetime
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from config import get_db_connection, preload_source_urls
-
-# ────────────────────────────────────────────────────────────────
-def insert_video_to_db(data):
-    try:
-        db = get_db_connection()
-        cursor = db.cursor()
-        cursor.execute("""
-            INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
-            VALUES (%s, %s, %s, %s, NOW())
-        """, (data['username'], data['date'], data['embed_link'], data['source_url']))
-        db.commit()
-        if cursor.rowcount > 0:
-            print(f"✅ New: {data['username']} — {data['date']}")
-    except mysql.connector.Error as err:
-        print(f"❌ DB insert error: {err}")
-    finally:
-        cursor.close()
-        db.close()
-
-# ────────────────────────────────────────────────────────────────
-def parse_data(soup):
-    username = soup.select_one("h1.entry-title")
-    username = username.contents[0].strip() if username and username.contents else None
-    date_tag = soup.select_one("span.entry-date")
-    date = date_tag.text.strip() if date_tag else None
-    if date:
-        try:
-            date = datetime.strptime(date, "%d/%m/%Y").strftime("%Y-%m-%d")
-        except ValueError:
-            date = None
-    embed_link = next((i["src"] for i in soup.find_all("iframe", src=True) if "xpornium.net" in i["src"]), None)
-    return {"username": username, "date": date, "embed_link": embed_link}
-
-# ────────────────────────────────────────────────────────────────
-def crawl_user_page(url):
-    try:
-        res = requests.get(url, timeout=15)
-        if res.ok:
-            soup = BeautifulSoup(res.text, "html.parser")
-            data = parse_data(soup)
-            data["source_url"] = url
-            return data if data["embed_link"] else None
-    except Exception:
-        pass
-    return None
-
-# ────────────────────────────────────────────────────────────────
-def crawl_index_page(base_url, page_num, seen, cursor, db):
-    url = f"{base_url}?p={page_num}"
-    print(f"📄 Page {page_num}")
-    try:
-        res = requests.get(url, timeout=15)
-        if not res.ok:
-            return 0
-        soup = BeautifulSoup(res.text, "html.parser")
-        links = [urljoin(base_url, a["href"]) for a in soup.select("a.thumbnail-link[href]")]
-        links = [link for link in links if link not in seen]
-        if not links:
-            return 0
-        new_count = 0
-        with ThreadPoolExecutor(max_workers=50) as pool:
-            for f in as_completed(pool.submit(crawl_user_page, l) for l in links):
-                data = f.result()
-                if data:
-                    insert_video_to_db(data, cursor, db)
-                    new_count += 1
-        return new_count
-    except Exception:
-        return 0
-
-# ────────────────────────────────────────────────────────────────
-def crawl_all(base_url):
-    db = get_db_connection()
-    cursor = db.cursor()
-    seen = preload_source_urls(cursor)
-    QUIT_LOGIC = True
-    page, total, empty_results = 1000, 0, 0
-    while True:
-        batch = range(page, page + 10)
-        print(f"\n🚀 Batch {page}–{page + 9}")
-        with ThreadPoolExecutor(max_workers=10) as pool:
-            results = [f.result() for f in as_completed(pool.submit(crawl_index_page, base_url, p, seen, cursor, db) for p in batch)]
-        batch_total = sum(results)
-        total += batch_total
-        print(f"📦 Batch complete — {batch_total} new videos (total: {total})")
-        if not QUIT_LOGIC:
-            if batch_total == 0:
-                empty_results += 1
-                if empty_results >= 10:
-                    print("\n🛑 No new videos found for 30 consecutive pages. Stopping.")
-                    break
-        page += 10
-    cursor.close()
-    db.close()
-    print(f"\n✅ Done! Total new videos: {total}")
-
-# ────────────────────────────────────────────────────────────────
-if __name__ == "__main__":
-    crawl_all("https://webcamrips.to")
+from config import get_db_connection
+import requests, time, mysql.connector
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+from datetime import datetime
+
+def insert_video_to_db(data):
+    db = get_db_connection()
+    try:
+        cursor = db.cursor()
+        sql = """
+            INSERT IGNORE INTO videos (username, url, title, date, embed_link, source_url, created_at)
+            VALUES (%s, %s, %s, %s, %s, %s, NOW())
+        """
+        values = (
+            data['username'],
+            data['url'],
+            data['title'],
+            data['date'],
+            data['embed_link'],
+            data['source_url']
+        )
+        cursor.execute(sql, values)
+        db.commit()
+        print("✅ Inserted into DB!")
+    except mysql.connector.Error as err:
+        print(f"❌ Failed to insert: {err}")
+    finally:
+        cursor.close()
+        db.close()
+
+def crawl_user_page(base_url, user_path):
+    full_url = urljoin(base_url, user_path)
+    response = requests.get(full_url)
+    if response.status_code != 200:
+        print(f"❌ Failed to load {full_url}")
+        return None
+    soup = BeautifulSoup(response.text, "html.parser")
+
+    username = user_path.strip("/")
+    title_tag = soup.find("h1", class_="entry-title")
+    title = title_tag.text.strip() if title_tag else "(no title)"
+    date_tag = soup.find("span", class_="entry-date")
+    date = date_tag.text.strip() if date_tag else None
+    if date:
+        try:
+            date_obj = datetime.strptime(date, "%d/%m/%Y")
+            date = date_obj.strftime("%Y-%m-%d")
+        except ValueError:
+            print(f"⚠️ Failed to parse date: {date}")
+            date = None
+
+    embed_link = None
+    for iframe in soup.find_all("iframe", src=True):
+        src = iframe["src"]
+        if "xpornium.net" in src:
+            embed_link = src  # no urljoin needed!
+            break  # stop after finding the first match
+
+    # --- print info after crawling this user ---
+    print(f"\n✅ Scraped {username}: — {date}")
+    # -------------------------------------------
+
+    return {
+        "username": username,
+        "title": title,
+        "date": date,
+        "embed_link": embed_link,
+    }
+
+def crawl_all(init_url):
+    """Crawl page by page and extract user data as we go."""
+    page = 1
+    all_data = []
+    while True:
+        url = f"{init_url}?p={page}"
+        print(f"\n🕷️ Crawling index page {page}: {url}")
+        response = requests.get(url)
+        if response.status_code != 200:
+            print(f"❌ Page {page} returned {response.status_code}, stopping.")
+            break
+
+        soup = BeautifulSoup(response.text, "html.parser")
+        user_links = soup.find_all("a", class_="thumbnail-link", href=True)
+        if not user_links:
+            print("⚠️ No user links found — reached end of site.")
+            break
+
+        for link in user_links:
+            user_path = link["href"]
+            user_data = crawl_user_page(init_url, user_path)
+            if not user_data:
+                print("⚠️ Skipping empty user_data.")
+                continue
+            if not user_data["embed_link"]:
+                print(f"⚠️ Skipping {user_data['username']} - no embed link found.")
+                continue
+            insert_video_to_db(user_data)
+            time.sleep(0.5)
+
+        page += 1
+        time.sleep(1)
+
+    print(f"\n✅ Finished crawling all pages. Total users: {len(all_data)}")
+    return all_data
+
+if __name__ == "__main__":
+    BASE_URL = "https://webcamrips.to"
+    results = crawl_all(BASE_URL)
+    print("💾 All data saved to users_data.json")
