Compare commits

...

3 Commits

Author SHA1 Message Date
Your Name f5d91117d8 Still broken after more fixes 9 hours ago
Your Name f90cbd7b53 Still broken, continuing to debug 10 hours ago
Your Name c51464d68e Crawler broken, start debugging 10 hours ago

Binary file not shown.

@@ -14,4 +14,8 @@ def get_db_connection():
     except mysql.connector.Error as err:
         print(f"❌ Failed to connect to DB: {err}")
         return # dont continue if DB failed
-        return db
+    return db
+
+def preload_source_urls(cursor):
+    cursor.execute("SELECT source_url FROM videos")
+    return set(row[0] for row in cursor.fetchall())
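
Only the tail of get_db_connection appears in this hunk. For context, a minimal sketch of what config.py presumably looks like after the change; the connection parameters below are placeholders, not values taken from the repo:

import mysql.connector

def get_db_connection():
    try:
        # Placeholder credentials: the real values are not part of this diff.
        db = mysql.connector.connect(
            host="localhost",
            user="crawler",
            password="changeme",
            database="videos",
        )
    except mysql.connector.Error as err:
        print(f"❌ Failed to connect to DB: {err}")
        return  # bail out: callers receive None on a failed connection
    return db

def preload_source_urls(cursor):
    # Load every source_url already stored so the crawler can skip known pages.
    cursor.execute("SELECT source_url FROM videos")
    return set(row[0] for row in cursor.fetchall())

Note that callers still have to handle the None returned on a failed connection; the new crawl_all in the second file calls db.cursor() on the result without checking it.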

@@ -1,135 +1,118 @@
-from config import get_db_connection
-import requests, time, mysql.connector
+import requests, mysql.connector, time
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from config import get_db_connection, preload_source_urls
+# ────────────────────────────────────────────────────────────────
 def insert_video_to_db(data):
+    db = get_db_connection()
     try:
-        db = get_db_connection()
         cursor = db.cursor()
-        sql = """
-            INSERT IGNORE INTO videos (username, url, title, date, embed_link, source_url, created_at)
-            VALUES (%s, %s, %s, %s, %s, %s, NOW())
-        """
-        values = (
-            data['username'],
-            data['url'],
-            data['title'],
-            data['date'],
-            data['embed_link'],
-            data['source_url']
-        )
-        cursor.execute(sql, values)
+        cursor.execute("""
+            INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
+            VALUES (%s, %s, %s, %s, NOW())
+        """, (data['username'], data['date'], data['embed_link'], data['source_url']))
         db.commit()
-        print("✅ Inserted into DB!")
+        if cursor.rowcount > 0:
+            print(f"✅ New: {data['username']}{data['date']}")
     except mysql.connector.Error as err:
-        print(f"❌ Failed to insert: {err}")
+        print(f"❌ DB insert error: {err}")
     finally:
         cursor.close()
         db.close()
-def crawl_user_page(base_url, user_path):
-    full_url = urljoin(base_url, user_path)
-    response = requests.get(full_url)
-    if response.status_code != 200:
-        print(f"❌ Failed to load {full_url}")
-        return None
-    soup = BeautifulSoup(response.text, "html.parser")
+# ────────────────────────────────────────────────────────────────
+def parse_data(soup):
-    username = user_path.strip("/")
-    title_tag = soup.find("h1", class_="entry-title")
-    title = title_tag.text.strip() if title_tag else "(no title)"
+    username = soup.select_one("h1.entry-title")
+    username = username.contents[0].strip() if username and username.contents else None
-    date_tag = soup.find("span", class_="entry-date")
+    date_tag = soup.select_one("span.entry-date")
     date = date_tag.text.strip() if date_tag else None
     if date:
         try:
-            date_obj = datetime.strptime(date, "%d/%m/%Y")
-            date = date_obj.strftime("%Y-%m-%d")
+            date = datetime.strptime(date, "%d/%m/%Y").strftime("%Y-%m-%d")
         except ValueError:
             print(f"⚠️ Failed to parse date: {date}")
            date = None
-    embed_link = None
-    for iframe in soup.find_all("iframe", src=True):
-        src = iframe["src"]
-        if "xpornium.net" in src:
-            embed_link = src # no urljoin needed!
-            break # stop after finding the first match
-    # --- print info after crawling this user ---
-    print(f"\n✅ Scraped {username}: — {date}")
+    embed_link = next((i["src"] for i in soup.find_all("iframe", src=True) if "xpornium.net" in i["src"]), None)
+    return {"username": username, "date": date, "embed_link": embed_link}
-    # -------------------------------------------
-    return {
-        "username": username,
-        "title": title,
-        "date": date,
-        "embed_link": embed_link,
-    }
-def crawl_all(init_url):
-    """Crawl page by page and extract user data as we go."""
-    page = 1
-    all_data = []
+# ────────────────────────────────────────────────────────────────
+def crawl_user_page(url):
+    try:
+        res = requests.get(url, timeout=15)
+        if res.ok:
+            soup = BeautifulSoup(res.text, "html.parser")
+            data = parse_data(soup)
+            data["source_url"] = url
+            return data if data["embed_link"] else None
+    except Exception:
+        pass
+    return None
+# ────────────────────────────────────────────────────────────────
+def crawl_index_page(base_url, page_num, seen, cursor, db):
+    url = f"{base_url}?p={page_num}"
+    print(f"📄 Page {page_num}")
+    try:
+        res = requests.get(url, timeout=15)
+        if not res.ok:
+            return 0
+        soup = BeautifulSoup(res.text, "html.parser")
+        links = [urljoin(base_url, a["href"]) for a in soup.select("a.thumbnail-link[href]")]
+        links = [link for link in links if link not in seen]
+        if not links:
+            return 0
+        new_count = 0
+        with ThreadPoolExecutor(max_workers=50) as pool:
+            for f in as_completed(pool.submit(crawl_user_page, l) for l in links):
+                data = f.result()
+                if data:
+                    insert_video_to_db(data, cursor, db)
+                    new_count += 1
+        return new_count
+    except Exception:
+        return 0
+# ────────────────────────────────────────────────────────────────
+def crawl_all(base_url):
+    db = get_db_connection()
+    cursor = db.cursor()
+    seen = preload_source_urls(cursor)
+    QUIT_LOGIC = True
+    page, total, empty_results = 1000, 0, 0
     while True:
-        url = f"{init_url}?p={page}"
-        print(f"\n🕷️ Crawling index page {page}: {url}")
-        response = requests.get(url)
-        if response.status_code != 200:
-            print(f"❌ Page {page} returned {response.status_code}, stopping.")
-            break
-        soup = BeautifulSoup(response.text, "html.parser")
-        user_links = soup.find_all("a", class_="thumbnail-link", href=True)
-        if not user_links:
-            print("⚠️ No user links found — reached end of site.")
-            break
-        for link in user_links:
-            user_path = link["href"]
-            user_data = crawl_user_page(init_url, user_path)
-            if not user_data:
-                print("⚠️ Skipping empty user_data.")
-                continue
-            if not user_data["embed_link"]:
-                print(f"⚠️ Skipping {user_data['username']} - no embed link found.")
-                continue
-            insert_video_to_db(user_data)
-            time.sleep(0.5)
-        page += 1
-        time.sleep(1)
-    print(f"\n✅ Finished crawling all pages. Total users: {len(all_data)}")
-    return all_data
+        batch = range(page, page + 10)
+        print(f"\n🚀 Batch {page}{page + 9}")
-if __name__ == "__main__":
-    BASE_URL = "https://webcamrips.to"
-    results = crawl_all(BASE_URL)
-    print("💾 All data saved to users_data.json")
+        with ThreadPoolExecutor(max_workers=10) as pool:
+            results = [f.result() for f in as_completed(pool.submit(crawl_index_page, base_url, p, seen, cursor, db) for p in batch)]
+        batch_total = sum(results)
+        total += batch_total
+        print(f"📦 Batch complete — {batch_total} new videos (total: {total})")
+        if not QUIT_LOGIC:
+            if batch_total == 0:
+                empty_results += 1
+                if empty_results >= 10:
+                    print("\n🛑 No new videos found for 30 consecutive pages. Stopping.")
+                    break
+        page += 10
+    cursor.close()
+    db.close()
+    print(f"\n✅ Done! Total new videos: {total}")
+# ────────────────────────────────────────────────────────────────
+if __name__ == "__main__":
+    crawl_all("https://webcamrips.to")
