ASS STILL HURT

main
Your Name 9 hours ago
parent f90cbd7b53
commit f5d91117d8

@@ -1,136 +1,118 @@
-from config import get_db_connection, preload_source_urls
-import requests, time, mysql.connector
+import requests, mysql.connector, time
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 from datetime import datetime
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from config import get_db_connection, preload_source_urls
 
+# ────────────────────────────────────────────────────────────────
-def insert_video_to_db(data, db):
+def insert_video_to_db(data):
     try:
+        db = get_db_connection()
         cursor = db.cursor()
-        sql = """
+        cursor.execute("""
             INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
             VALUES (%s, %s, %s, %s, NOW())
-        """
-        values = (
-            data['username'],
-            data['date'],
-            data['embed_link'],
-            data['source_url']
-        )
-        cursor.execute(sql, values)
+        """, (data['username'], data['date'], data['embed_link'], data['source_url']))
         db.commit()
-        if cursor.rowcount == 0:
-            print("❌ Video already exists in DB")
-        else:
-            print("✅ Inserted into DB!")
+        if cursor.rowcount > 0:
+            print(f"✅ New: {data['username']} - {data['date']}")
     except mysql.connector.Error as err:
-        print(f"Failed to insert: {err}")
+        print(f"❌ DB insert error: {err}")
     finally:
         cursor.close()
+        db.close()
 
-def crawl_user_page(full_url):
-    try:
-        response = requests.get(full_url)
-        if response.status_code != 200:
-            print(f"❌ Failed to load {full_url}")
-            return None
-        soup = BeautifulSoup(response.text, "html.parser")
-        data = parse_data(soup)
-        data["source_url"] = full_url
-        return data
-    except Exception as e:
-        print(f"❌ Exception while crawling {full_url}: {e}")
-        return None
-
+# ────────────────────────────────────────────────────────────────
 def parse_data(soup):
-    title_tag = soup.find("h1", class_="entry-title")
-    username = title_tag.contents[0].strip() if title_tag else "unknown"
-    date_tag = soup.find("span", class_="entry-date")
-    date = date_tag.text.strip() if date_tag else None
+    username = soup.select_one("h1.entry-title")
+    username = username.contents[0].strip() if username and username.contents else None
+
+    date_tag = soup.select_one("span.entry-date")
+    date = date_tag.text.strip() if date_tag else None
     if date:
         try:
-            date_obj = datetime.strptime(date, "%d/%m/%Y")
-            date = date_obj.strftime("%Y-%m-%d")
+            date = datetime.strptime(date, "%d/%m/%Y").strftime("%Y-%m-%d")
         except ValueError:
-            print(f"⚠️ Failed to parse date: {date}")
             date = None
-    embed_link = None
-    for iframe in soup.find_all("iframe", src=True):
-        src = iframe["src"]
-        if "xpornium.net" in src:
-            embed_link = src
-            break
-    print(f"\n✅ Scraped {username}: — {date}")
-    return {
-        "username": username,
-        "date": date,
-        "embed_link": embed_link,
-    }
+
+    embed_link = next((i["src"] for i in soup.find_all("iframe", src=True) if "xpornium.net" in i["src"]), None)
+    return {"username": username, "date": date, "embed_link": embed_link}
 
-def process_link(link, seen_urls, init_url):
-    full_url = init_url + link
-    if full_url in seen_urls:
-        print(f"⚠️ Skipping {link} - already seen.")
-        return
-
-    user_data = crawl_user_page(full_url)
-    if not user_data:
-        print("⚠️ Skipping empty user_data.")
-        return
-
-    if not user_data["embed_link"]:
-        print(f"⚠️ Skipping {user_data['username']} - no embed link found.")
-        return
-
-    local_db = get_db_connection()
-    insert_video_to_db(user_data, local_db)
-    local_db.close()
-
+# ────────────────────────────────────────────────────────────────
+def crawl_user_page(url):
+    try:
+        res = requests.get(url, timeout=15)
+        if res.ok:
+            soup = BeautifulSoup(res.text, "html.parser")
+            data = parse_data(soup)
+            data["source_url"] = url
+            return data if data["embed_link"] else None
+    except Exception:
+        pass
+    return None
+
+# ────────────────────────────────────────────────────────────────
+def crawl_index_page(base_url, page_num, seen, cursor, db):
+    url = f"{base_url}?p={page_num}"
+    print(f"📄 Page {page_num}")
+    try:
+        res = requests.get(url, timeout=15)
+        if not res.ok:
+            return 0
+
+        soup = BeautifulSoup(res.text, "html.parser")
+        links = [urljoin(base_url, a["href"]) for a in soup.select("a.thumbnail-link[href]")]
+        links = [link for link in links if link not in seen]
+        if not links:
+            return 0
+
+        new_count = 0
+        with ThreadPoolExecutor(max_workers=50) as pool:
+            for f in as_completed(pool.submit(crawl_user_page, l) for l in links):
+                data = f.result()
+                if data:
+                    insert_video_to_db(data)  # opens and closes its own DB connection
+                    new_count += 1
+        return new_count
+    except Exception:
+        return 0
 
-def crawl_all(init_url):
-    page = 1
+# ────────────────────────────────────────────────────────────────
+def crawl_all(base_url):
     db = get_db_connection()
     cursor = db.cursor()
-    seen_urls = preload_source_urls(cursor)
-    cursor.close()
-    db.close()
+    seen = preload_source_urls(cursor)
+
+    QUIT_LOGIC = True
+    page, total, empty_results = 1000, 0, 0
 
     while True:
-        url = f"{init_url}?p={page}"
-        print(f"\n🕷️ Crawling index page {page}: {url}")
-        response = requests.get(url)
-        if response.status_code != 200:
-            print(f"❌ Page {page} returned {response.status_code}, stopping.")
-            break
-        soup = BeautifulSoup(response.text, "html.parser")
-        video_pages = soup.find_all("a", class_="thumbnail-link", href=True)
-        video_pages = [link['href'] for link in video_pages]
-        if not video_pages:
-            print("⚠️ No user links found — reached end of site.")
-            break
-        with ThreadPoolExecutor(max_workers=50) as executor:
-            futures = [executor.submit(process_link, link, seen_urls, init_url) for link in video_pages]
-            for _ in as_completed(futures):
-                pass  # we already log inside the functions
-        page += 1
-
-    print("\n✅ Finished crawling all pages.")
+        batch = range(page, page + 10)
+        print(f"\n🚀 Batch {page}-{page + 9}")
+
+        with ThreadPoolExecutor(max_workers=10) as pool:
+            results = [f.result() for f in as_completed(pool.submit(crawl_index_page, base_url, p, seen, cursor, db) for p in batch)]
+
+        batch_total = sum(results)
+        total += batch_total
+        print(f"📦 Batch complete — {batch_total} new videos (total: {total})")
+
+        if not QUIT_LOGIC:
+            if batch_total == 0:
+                empty_results += 1
+
+            if empty_results >= 10:
+                print("\n🛑 No new videos in 10 consecutive batches. Stopping.")
+                break
+
+        page += 10
+
+    cursor.close()
+    db.close()
+    print(f"\n✅ Done! Total new videos: {total}")
 
+# ────────────────────────────────────────────────────────────────
 if __name__ == "__main__":
-    BASE_URL = "https://webcamrips.to"
-    crawl_all(BASE_URL)
+    crawl_all("https://webcamrips.to")
