Crawler still broken: run page links through a ThreadPoolExecutor, preload seen source URLs, and give each worker its own DB connection

main
Your Name 12 hours ago
parent c51464d68e
commit f90cbd7b53

@@ -3,18 +3,16 @@ import requests, time, mysql.connector
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
-def insert_video_to_db(data):
+def insert_video_to_db(data, db):
     try:
         cursor = db.cursor()
         sql = """
            INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
            VALUES (%s, %s, %s, %s, NOW())
        """
         values = (
             data['username'],
             data['date'],
@@ -27,29 +25,31 @@ def insert_video_to_db(data):
             print("❌ Video already exists in DB")
         else:
             print("✅ Inserted into DB!")
     except mysql.connector.Error as err:
         print(f"❌ Failed to insert: {err}")
+    finally:
+        cursor.close()
 
 def crawl_user_page(full_url):
-    response = requests.get(full_url)
-    if response.status_code != 200:
-        print(f"❌ Failed to load {full_url}")
-        return None
-    soup = BeautifulSoup(response.text, "html.parser")
-    data = parse_data(soup)
-    data["source_url"] = full_url
-    return data
+    try:
+        response = requests.get(full_url)
+        if response.status_code != 200:
+            print(f"❌ Failed to load {full_url}")
+            return None
+        soup = BeautifulSoup(response.text, "html.parser")
+        data = parse_data(soup)
+        data["source_url"] = full_url
+        return data
+    except Exception as e:
+        print(f"❌ Exception while crawling {full_url}: {e}")
+        return None
 
 def parse_data(soup):
     title_tag = soup.find("h1", class_="entry-title")
-    username = title_tag.contents[0].strip()
+    username = title_tag.contents[0].strip() if title_tag else "unknown"
     date_tag = soup.find("span", class_="entry-date")
     date = date_tag.text.strip() if date_tag else None
@@ -61,28 +61,49 @@ def parse_data(soup):
         print(f"⚠️ Failed to parse date: {date}")
         date = None
 
     embed_link = None
     for iframe in soup.find_all("iframe", src=True):
         src = iframe["src"]
         if "xpornium.net" in src:
-            embed_link = src  # no urljoin needed!
-            break  # stop after finding the first match
+            embed_link = src
+            break
 
+    print(f"\n✅ Scraped {username}: — {date}")
+
+    return {
+        "username": username,
+        "date": date,
+        "embed_link": embed_link,
+    }
+
+def process_link(link, seen_urls, init_url):
+    full_url = init_url + link
+    if full_url in seen_urls:
+        print(f"⚠️ Skipping {link} - already seen.")
+        return
-    # --- print info after crawling this user ---
-    print(f"\n✅ Scraped {username}: — {date}")
-    # -------------------------------------------
+    user_data = crawl_user_page(full_url)
+    if not user_data:
+        print("⚠️ Skipping empty user_data.")
+        return
+    if not user_data["embed_link"]:
+        print(f"⚠️ Skipping {user_data['username']} - no embed link found.")
+        return
-    return {
-        "username": username,
-        "date": date,
-        "embed_link": embed_link,
-    }
+    local_db = get_db_connection()
+    insert_video_to_db(user_data, local_db)
+    local_db.close()
 
 def crawl_all(init_url):
-    """Crawl page by page and extract user data as we go."""
     page = 1
-    all_data = []
+    db = get_db_connection()
+    cursor = db.cursor()
+    seen_urls = preload_source_urls(cursor)
+    cursor.close()
+    db.close()
 
     while True:
         url = f"{init_url}?p={page}"
@@ -95,44 +116,21 @@ def crawl_all(init_url):
         soup = BeautifulSoup(response.text, "html.parser")
         video_pages = soup.find_all("a", class_="thumbnail-link", href=True)
         video_pages = [link['href'] for link in video_pages]
 
         if not video_pages:
             print("⚠️ No user links found — reached end of site.")
             break
 
-        cursor = db.cursor()
-        seen_urls = preload_source_urls(cursor)
-
-        for link in video_pages:
-            full_url = init_url + link
-            if full_url in seen_urls:
-                print(f"⚠️ Skipping {link} - already seen.")
-                continue
-
-            user_data = crawl_user_page(full_url)  # slow: blocks on each request
-            if not user_data:
-                print("⚠️ Skipping empty user_data.")
-                continue
-            if not user_data["embed_link"]:
-                print(f"⚠️ Skipping {user_data['username']} - no embed link found.")
-                continue
-
-            insert_video_to_db(user_data)
+        with ThreadPoolExecutor(max_workers=50) as executor:
+            futures = [executor.submit(process_link, link, seen_urls, init_url) for link in video_pages]
+            for _ in as_completed(futures):
+                pass  # we already log inside the functions
 
         page += 1
 
-    print(f"\n✅ Finished crawling all pages. Total users: {len(all_data)}")
-    return all_data
+    print("\n✅ Finished crawling all pages.")
 
 if __name__ == "__main__":
-    db = get_db_connection()
     BASE_URL = "https://webcamrips.to"
-    results = crawl_all(BASE_URL)
-    print("💾 All data saved to users_data.json")
-    cursor = db.cursor()
-    cursor.close()
-    db.close()
+    crawl_all(BASE_URL)
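For reference, the diff calls two helpers that live elsewhere in the file and are not shown here: get_db_connection() and preload_source_urls(cursor). Below is a minimal sketch of what they presumably look like, assuming mysql-connector-python, local credentials, and a source_url column on the videos table; the connection parameters are illustrative, not the project's actual configuration.

import mysql.connector

def get_db_connection():
    # Open a fresh connection per caller; process_link() opens and closes its
    # own connection, so no connection object is shared across worker threads.
    return mysql.connector.connect(
        host="localhost",        # assumed credentials, adjust to the real config
        user="scraper",
        password="secret",
        database="webcamrips",
    )

def preload_source_urls(cursor):
    # Load every source_url already stored in the videos table into a set so
    # the crawler can skip pages it has inserted before.
    cursor.execute("SELECT source_url FROM videos")
    return {row[0] for row in cursor.fetchall()}

This also reflects the threading design in the commit: workers only read the preloaded seen_urls set and each opens its own connection, which avoids sharing a MySQL Connector/Python connection between threads (those connections are not thread-safe).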
