ASS STILL HURT

main
Your Name 11 hours ago
parent f90cbd7b53
commit f5d91117d8

@@ -1,136 +1,118 @@
-from config import get_db_connection, preload_source_urls
-import requests, time, mysql.connector
+import requests, mysql.connector, time
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 from datetime import datetime
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from config import get_db_connection, preload_source_urls
+# ────────────────────────────────────────────────────────────────
-def insert_video_to_db(data, db):
+def insert_video_to_db(data):
     try:
+        db = get_db_connection()
         cursor = db.cursor()
-        sql = """
+        cursor.execute("""
             INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
             VALUES (%s, %s, %s, %s, NOW())
-        """
-        values = (
-            data['username'],
-            data['date'],
-            data['embed_link'],
-            data['source_url']
-        )
-        cursor.execute(sql, values)
+        """, (data['username'], data['date'], data['embed_link'], data['source_url']))
         db.commit()
-        if cursor.rowcount == 0:
-            print("❌ Video already exists in DB")
-        else:
-            print("✅ Inserted into DB!")
+        if cursor.rowcount > 0:
+            print(f"✅ New: {data['username']}{data['date']}")
     except mysql.connector.Error as err:
-        print(f"Failed to insert: {err}")
+        print(f"❌ DB insert error: {err}")
     finally:
         cursor.close()
+        db.close()
-def crawl_user_page(full_url):
-    try:
-        response = requests.get(full_url)
-        if response.status_code != 200:
-            print(f"❌ Failed to load {full_url}")
-            return None
-        soup = BeautifulSoup(response.text, "html.parser")
-        data = parse_data(soup)
-        data["source_url"] = full_url
-        return data
-    except Exception as e:
-        print(f"❌ Exception while crawling {full_url}: {e}")
-        return None
+# ────────────────────────────────────────────────────────────────
 def parse_data(soup):
-    title_tag = soup.find("h1", class_="entry-title")
-    username = title_tag.contents[0].strip() if title_tag else "unknown"
-    date_tag = soup.find("span", class_="entry-date")
-    date = date_tag.text.strip() if date_tag else None
+    username = soup.select_one("h1.entry-title")
+    username = username.contents[0].strip() if username and username.contents else None
+    date_tag = soup.select_one("span.entry-date")
+    date = date_tag.text.strip() if date_tag else None
     if date:
         try:
-            date_obj = datetime.strptime(date, "%d/%m/%Y")
-            date = date_obj.strftime("%Y-%m-%d")
+            date = datetime.strptime(date, "%d/%m/%Y").strftime("%Y-%m-%d")
         except ValueError:
-            print(f"⚠️ Failed to parse date: {date}")
             date = None
-    embed_link = None
-    for iframe in soup.find_all("iframe", src=True):
-        src = iframe["src"]
-        if "xpornium.net" in src:
-            embed_link = src
-            break
-    print(f"\n✅ Scraped {username}: — {date}")
-    return {
-        "username": username,
-        "date": date,
-        "embed_link": embed_link,
-    }
+    embed_link = next((i["src"] for i in soup.find_all("iframe", src=True) if "xpornium.net" in i["src"]), None)
+    return {"username": username, "date": date, "embed_link": embed_link}
-def process_link(link, seen_urls, init_url):
-    full_url = init_url + link
-    if full_url in seen_urls:
-        print(f"⚠️ Skipping {link} - already seen.")
-        return
-    user_data = crawl_user_page(full_url)
-    if not user_data:
-        print("⚠️ Skipping empty user_data.")
-        return
-    if not user_data["embed_link"]:
-        print(f"⚠️ Skipping {user_data['username']} - no embed link found.")
-        return
-    local_db = get_db_connection()
-    insert_video_to_db(user_data, local_db)
-    local_db.close()
+# ────────────────────────────────────────────────────────────────
+def crawl_user_page(url):
+    try:
+        res = requests.get(url, timeout=15)
+        if res.ok:
+            soup = BeautifulSoup(res.text, "html.parser")
+            data = parse_data(soup)
+            data["source_url"] = url
+            return data if data["embed_link"] else None
+    except Exception:
+        pass
+    return None
-def crawl_all(init_url):
-    page = 1
+# ────────────────────────────────────────────────────────────────
+def crawl_index_page(base_url, page_num, seen, cursor, db):
+    url = f"{base_url}?p={page_num}"
+    print(f"📄 Page {page_num}")
+    try:
+        res = requests.get(url, timeout=15)
+        if not res.ok:
+            return 0
+        soup = BeautifulSoup(res.text, "html.parser")
+        links = [urljoin(base_url, a["href"]) for a in soup.select("a.thumbnail-link[href]")]
+        links = [link for link in links if link not in seen]
+        if not links:
+            return 0
+        new_count = 0
+        with ThreadPoolExecutor(max_workers=50) as pool:
+            for f in as_completed(pool.submit(crawl_user_page, l) for l in links):
+                data = f.result()
+                if data:
+                    insert_video_to_db(data, cursor, db)
+                    new_count += 1
+        return new_count
+    except Exception:
+        return 0
+# ────────────────────────────────────────────────────────────────
+def crawl_all(base_url):
     db = get_db_connection()
     cursor = db.cursor()
-    seen_urls = preload_source_urls(cursor)
+    seen = preload_source_urls(cursor)
+    cursor.close()
+    db.close()
+    QUIT_LOGIC = True
+    page, total, empty_results = 1000, 0, 0
     while True:
-        url = f"{init_url}?p={page}"
-        print(f"\n🕷️ Crawling index page {page}: {url}")
-        response = requests.get(url)
-        if response.status_code != 200:
-            print(f"❌ Page {page} returned {response.status_code}, stopping.")
-            break
-        soup = BeautifulSoup(response.text, "html.parser")
-        video_pages = soup.find_all("a", class_="thumbnail-link", href=True)
-        video_pages = [link['href'] for link in video_pages]
-        if not video_pages:
-            print("⚠️ No user links found — reached end of site.")
-            break
-        with ThreadPoolExecutor(max_workers=50) as executor:
-            futures = [executor.submit(process_link, link, seen_urls, init_url) for link in video_pages]
-            for _ in as_completed(futures):
-                pass # we already log inside the functions
-        page += 1
-    print("\n✅ Finished crawling all pages.")
-    cursor.close()
-    db.close()
+        batch = range(page, page + 10)
+        print(f"\n🚀 Batch {page}{page + 9}")
+        with ThreadPoolExecutor(max_workers=10) as pool:
+            results = [f.result() for f in as_completed(pool.submit(crawl_index_page, base_url, p, seen, cursor, db) for p in batch)]
+        batch_total = sum(results)
+        total += batch_total
+        print(f"📦 Batch complete — {batch_total} new videos (total: {total})")
+        if not QUIT_LOGIC:
+            if batch_total == 0:
+                empty_results += 1
+                if empty_results >= 10:
+                    print("\n🛑 No new videos found for 30 consecutive pages. Stopping.")
+                    break
+        page += 10
+    print(f"\n✅ Done! Total new videos: {total}")
+# ────────────────────────────────────────────────────────────────
 if __name__ == "__main__":
-    BASE_URL = "https://webcamrips.to"
-    crawl_all(BASE_URL)
+    crawl_all("https://webcamrips.to")
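
Note on the new version: crawl_index_page still calls insert_video_to_db(data, cursor, db), while the rewritten insert_video_to_db takes only data and opens its own connection, and crawl_all closes cursor and db before the workers run, so the insert would fail at call time. Below is a minimal, self-contained sketch of how the worker fan-out could line up with the one-argument signature; insert_links is a hypothetical helper, and crawl_user_page / insert_video_to_db are stubbed purely for illustration, not the committed code.

from concurrent.futures import ThreadPoolExecutor, as_completed

def crawl_user_page(url):
    # stand-in for the real scraper: returns a dict like parse_data() does, or None
    return {"username": "demo", "date": "2024-01-01",
            "embed_link": "https://xpornium.net/embed/demo", "source_url": url}

def insert_video_to_db(data):
    # stand-in for the real insert: the new version opens and closes its own connection
    print(f"would insert {data['source_url']}")

def insert_links(links):
    # hypothetical helper mirroring the worker loop inside crawl_index_page
    new_count = 0
    with ThreadPoolExecutor(max_workers=8) as pool:
        for f in as_completed(pool.submit(crawl_user_page, link) for link in links):
            data = f.result()
            if data:
                insert_video_to_db(data)  # one argument, matching the new definition
                new_count += 1
    return new_count

if __name__ == "__main__":
    print(insert_links([f"https://example.com/video/{i}" for i in range(3)]))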
