MY ASS BROKEN!!!!

main
Your Name 10 hours ago
parent c79a86eddb
commit c51464d68e

Binary file not shown.

@@ -14,4 +14,8 @@ def get_db_connection():
     except mysql.connector.Error as err:
         print(f"❌ Failed to connect to DB: {err}")
         return  # dont continue if DB failed
     return db
+
+def preload_source_urls(cursor):
+    cursor.execute("SELECT source_url FROM videos")
+    return set(row[0] for row in cursor.fetchall())

@@ -1,4 +1,4 @@
-from config import get_db_connection
+from config import get_db_connection, preload_source_urls
 import requests, time, mysql.connector
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
@@ -6,36 +6,34 @@ from datetime import datetime
 
 def insert_video_to_db(data):
+    db = get_db_connection()
     try:
         cursor = db.cursor()
         sql = """
-            INSERT IGNORE INTO videos (username, url, title, date, embed_link, source_url, created_at)
-            VALUES (%s, %s, %s, %s, %s, %s, NOW())
+            INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
+            VALUES (%s, %s, %s, %s, NOW())
         """
         values = (
             data['username'],
-            data['url'],
-            data['title'],
             data['date'],
             data['embed_link'],
             data['source_url']
         )
         cursor.execute(sql, values)
         db.commit()
-        print("✅ Inserted into DB!")
+        if cursor.rowcount == 0:
+            print("❌ Video already exists in DB")
+        else:
+            print("✅ Inserted into DB!")
     except mysql.connector.Error as err:
         print(f"❌ Failed to insert: {err}")
-    finally:
-        cursor.close()
-        db.close()
 
-def crawl_user_page(base_url, user_path):
-    full_url = urljoin(base_url, user_path)
+def crawl_user_page(full_url):
     response = requests.get(full_url)
     if response.status_code != 200:
         print(f"❌ Failed to load {full_url}")
@@ -43,13 +41,15 @@ def crawl_user_page(base_url, user_path):
     soup = BeautifulSoup(response.text, "html.parser")
+    data = parse_data(soup)
+    data["source_url"] = full_url
+    return data
 
 def parse_data(soup):
-    username = user_path.strip("/")
     title_tag = soup.find("h1", class_="entry-title")
-    title = title_tag.text.strip() if title_tag else "(no title)"
+    username = title_tag.contents[0].strip()
 
     date_tag = soup.find("span", class_="entry-date")
     date = date_tag.text.strip() if date_tag else None
@@ -75,7 +75,6 @@ def parse_data(soup):
     return {
         "username": username,
-        "title": title,
         "date": date,
         "embed_link": embed_link,
     }
@@ -94,14 +93,24 @@ def crawl_all(init_url):
             break
         soup = BeautifulSoup(response.text, "html.parser")
-        user_links = soup.find_all("a", class_="thumbnail-link", href=True)
-        if not user_links:
+        video_pages = soup.find_all("a", class_="thumbnail-link", href=True)
+        video_pages = [link['href'] for link in video_pages]
+        if not video_pages:
             print("⚠️ No user links found — reached end of site.")
             break
 
-        for link in user_links:
-            user_path = link["href"]
-            user_data = crawl_user_page(init_url, user_path)
+        cursor = db.cursor()
+        seen_urls = preload_source_urls(cursor)
+
+        for link in video_pages:
+            full_url = init_url + link
+            if full_url in seen_urls:
+                print(f"⚠️ Skipping {link} - already seen.")
+                continue
+
+            user_data = crawl_user_page(full_url) # slow as fuk
             if not user_data:
                 print("⚠️ Skipping empty user_data.")
                 continue
@@ -111,25 +120,19 @@ def crawl_all(init_url):
                 continue
 
             insert_video_to_db(user_data)
-            time.sleep(0.5)
 
         page += 1
-        time.sleep(1)
 
     print(f"\n✅ Finished crawling all pages. Total users: {len(all_data)}")
     return all_data
 
 if __name__ == "__main__":
-    db = get_db_connection()
     BASE_URL = "https://webcamrips.to"
     results = crawl_all(BASE_URL)
     print("💾 All data saved to users_data.json")
-    cursor = db.cursor()
-    cursor.close()
-    db.close()
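For reference, a minimal standalone sketch of the dedup flow this commit wires in: preload the source_urls already stored, skip known pages in memory, and let INSERT IGNORE plus cursor.rowcount catch anything that slips through. Only the function and column names come from the diff; the connection settings, the sample row values, and the UNIQUE index on source_url are assumptions for illustration.

# Sketch only: assumes a reachable MySQL instance and a UNIQUE index on
# videos.source_url, without which INSERT IGNORE cannot signal duplicates
# through rowcount. Connection settings and row values are placeholders.
import mysql.connector

def preload_source_urls(cursor):
    cursor.execute("SELECT source_url FROM videos")
    return set(row[0] for row in cursor.fetchall())

db = mysql.connector.connect(host="localhost", user="root", password="", database="videos_db")
cursor = db.cursor()
seen_urls = preload_source_urls(cursor)  # one query up front instead of per-page checks

page_url = "https://webcamrips.to/example-video"  # hypothetical source_url
if page_url not in seen_urls:
    cursor.execute(
        "INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at) "
        "VALUES (%s, %s, %s, %s, NOW())",
        ("example_user", "2024-01-01", "https://player.example/embed/1", page_url),
    )
    db.commit()
    # rowcount == 0 means the UNIQUE key absorbed the insert: the row already existed
    print("✅ Inserted into DB!" if cursor.rowcount else "❌ Video already exists in DB")

cursor.close()
db.close()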