Compare commits

...

3 Commits

Author SHA1 Message Date
Your Name f5d91117d8 Still broken after latest fix 11 hours ago
Your Name f90cbd7b53 Still broken, continuing to debug 12 hours ago
Your Name c51464d68e Crawler broken, start debugging 13 hours ago

Binary file not shown.

@@ -14,4 +14,8 @@ def get_db_connection():
    except mysql.connector.Error as err:
        print(f"❌ Failed to connect to DB: {err}")
        return  # don't continue if DB failed
    return db

def preload_source_urls(cursor):
    cursor.execute("SELECT source_url FROM videos")
    return set(row[0] for row in cursor.fetchall())
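Usage sketch (not part of the diff): the two helpers above are meant to be used together, mirroring what crawl_all() does in the crawler file below.

db = get_db_connection()
if db:  # get_db_connection() returns nothing on failure
    cursor = db.cursor()
    seen = preload_source_urls(cursor)  # set of source_url values already stored
    print(f"{len(seen)} URLs already in the videos table")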

@@ -1,135 +1,118 @@
import requests, mysql.connector, time
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from config import get_db_connection, preload_source_urls
# ────────────────────────────────────────────────────────────────
def insert_video_to_db(data, cursor, db):
    # cursor/db are shared and passed in by the caller (see crawl_index_page)
    try:
        cursor.execute("""
            INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
            VALUES (%s, %s, %s, %s, NOW())
        """, (data['username'], data['date'], data['embed_link'], data['source_url']))
        db.commit()
        if cursor.rowcount > 0:
            print(f"✅ New: {data['username']} ({data['date']})")
    except mysql.connector.Error as err:
        print(f"❌ DB insert error: {err}")
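# Note on the rowcount check above: INSERT IGNORE silently drops rows that collide
# with a UNIQUE key, leaving cursor.rowcount at 0, so rowcount > 0 really does mean
# "new row". This assumes the videos table has a unique index on source_url (or an
# equivalent column) — the index itself is not shown anywhere in this diff.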
# ────────────────────────────────────────────────────────────────
def parse_data(soup):
    # username: first text node of the entry title
    username = soup.select_one("h1.entry-title")
    username = username.contents[0].strip() if username and username.contents else None
    # date: shown as DD/MM/YYYY on the page, stored as YYYY-MM-DD
    date_tag = soup.select_one("span.entry-date")
    date = date_tag.text.strip() if date_tag else None
    if date:
        try:
            date = datetime.strptime(date, "%d/%m/%Y").strftime("%Y-%m-%d")
        except ValueError:
            print(f"⚠️ Failed to parse date: {date}")
            date = None
    # embed link: first iframe pointing at xpornium.net, if any
    embed_link = next((i["src"] for i in soup.find_all("iframe", src=True) if "xpornium.net" in i["src"]), None)
    return {"username": username, "date": date, "embed_link": embed_link}
# ────────────────────────────────────────────────────────────────
def crawl_user_page(url):
    """Fetch one user page and return its parsed data, or None on failure."""
    try:
        res = requests.get(url, timeout=15)
        if res.ok:
            soup = BeautifulSoup(res.text, "html.parser")
            data = parse_data(soup)
            data["source_url"] = url
            # only keep pages that actually have an embed link
            return data if data["embed_link"] else None
    except Exception:
        pass
    return None
# ────────────────────────────────────────────────────────────────
def crawl_index_page(base_url, page_num, seen, cursor, db):
    """Fetch one index page, crawl its unseen user links in parallel, insert new videos."""
    url = f"{base_url}?p={page_num}"
    print(f"📄 Page {page_num}")
    try:
        res = requests.get(url, timeout=15)
        if not res.ok:
            return 0
        soup = BeautifulSoup(res.text, "html.parser")
        links = [urljoin(base_url, a["href"]) for a in soup.select("a.thumbnail-link[href]")]
        links = [link for link in links if link not in seen]
        if not links:
            return 0
        new_count = 0
        with ThreadPoolExecutor(max_workers=50) as pool:
            for f in as_completed(pool.submit(crawl_user_page, link) for link in links):
                data = f.result()
                if data:
                    insert_video_to_db(data, cursor, db)
                    new_count += 1
        return new_count
    except Exception:
        return 0
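# Caveat: crawl_index_page fans out to 50 worker threads but funnels every insert
# through the single cursor/connection it was given, and mysql.connector connections
# are not thread-safe. One possible variation (a sketch, not what this diff does) is
# to open a short-lived connection per insert:
def insert_video_threadsafe(data):
    db = get_db_connection()  # slower, but safe to call from any worker thread
    if not db:
        return
    try:
        cursor = db.cursor()
        cursor.execute("""
            INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
            VALUES (%s, %s, %s, %s, NOW())
        """, (data['username'], data['date'], data['embed_link'], data['source_url']))
        db.commit()
        cursor.close()
    finally:
        db.close()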
# ────────────────────────────────────────────────────────────────
def crawl_all(base_url):
    """Crawl index pages in batches of 10 and insert new videos as we go."""
    db = get_db_connection()
    cursor = db.cursor()
    seen = preload_source_urls(cursor)
    QUIT_LOGIC = True  # stop after repeated empty batches
    page, total, empty_batches = 1000, 0, 0  # hard-coded starting page
    while True:
        batch = range(page, page + 10)
        print(f"\n🚀 Batch {page}-{page + 9}")
        with ThreadPoolExecutor(max_workers=10) as pool:
            futures = [pool.submit(crawl_index_page, base_url, p, seen, cursor, db) for p in batch]
            results = [f.result() for f in as_completed(futures)]
        batch_total = sum(results)
        total += batch_total
        print(f"📦 Batch complete — {batch_total} new videos (total: {total})")
        if QUIT_LOGIC:
            if batch_total == 0:
                empty_batches += 1
                if empty_batches >= 10:
                    print("\n🛑 No new videos for 10 consecutive batches (100 pages). Stopping.")
                    break
            else:
                empty_batches = 0
        page += 10
    cursor.close()
    db.close()
    print(f"\n✅ Done! Total new videos: {total}")
# ────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    crawl_all("https://webcamrips.to")
