import json
import os
import time

import mysql.connector
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from dotenv import load_dotenv
from urllib.parse import urljoin

from xpornium import get_file_info, upload_video, get_upload_url, remote_upload

load_dotenv()
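
# Overall flow: crawl the paginated index (?p=N), visit each user page, store the
# scraped row in `videos`, remote-upload the Xpornium embed found there, and record
# the re-hosted file in `xpornium_uploads`.
# Expected .env keys (read via os.getenv below): DB_HOST, DB_USER, DB_PASS, DB_NAME.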

def insert_video_to_db(data):
    """Insert one scraped video row into the `videos` table (duplicates are ignored)."""
    try:
        db_host = os.getenv("DB_HOST")
        db_user = os.getenv("DB_USER")
        db_pass = os.getenv("DB_PASS")
        db_name = os.getenv("DB_NAME")
        db = mysql.connector.connect(host=db_host, user=db_user, password=db_pass, database=db_name)
    except mysql.connector.Error as err:
        print(f"❌ Failed to connect to DB: {err}")
        return  # don't continue if the DB connection failed

    try:
        cursor = db.cursor()
        sql = """
            INSERT IGNORE INTO videos (username, url, title, date, embed_link, source_url, created_at)
            VALUES (%s, %s, %s, %s, %s, %s, NOW())
        """
        values = (
            data['username'],
            data['url'],
            data['title'],
            data['date'],
            data['embed_link'],
            data['source_url']
        )
        cursor.execute(sql, values)
        db.commit()
        print("✅ Inserted into DB!")
    except mysql.connector.Error as err:
        print(f"❌ Failed to insert: {err}")
    finally:
        cursor.close()
        db.close()


def save_xpornium_upload(embed_link, fileid, xpornium_url, title, cat_id, duration, thumbnail):
    """Record a re-hosted Xpornium file in the `xpornium_uploads` table."""
    db = None
    cursor = None
    try:
        db = mysql.connector.connect(
            host=os.getenv("DB_HOST"),
            user=os.getenv("DB_USER"),
            password=os.getenv("DB_PASS"),
            database=os.getenv("DB_NAME")
        )
        cursor = db.cursor()
        new_embed_link = f"https://xpornium.net/embed/{fileid}"
        sql = """
            INSERT INTO xpornium_uploads
            (original_embed_link, xpornium_fileid, xpornium_url, new_embed_link, title, category_id, uploaded_at, duration, thumbnail)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        values = (
            embed_link,
            fileid,
            xpornium_url,
            new_embed_link,
            title,
            cat_id,
            datetime.now(),
            duration,
            thumbnail
        )
        cursor.execute(sql, values)
        db.commit()
        print(f"✅ Saved to DB: {fileid} | duration: {duration}s | thumb: {thumbnail}")
    except Exception as e:
        print(f"❌ DB Save Failed: {e}")
    finally:
        # `db`/`cursor` start as None, so this guard also works if the connection never opened
        if db and db.is_connected():
            cursor.close()
            db.close()


def crawl_user_page(base_url, user_path):
    """Visits one 'url/username' page and extracts info."""
    full_url = urljoin(base_url, user_path)
    response = requests.get(full_url, timeout=30)
    if response.status_code != 200:
        print(f"❌ Failed to load {full_url}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    username = user_path.strip("/")

    title_tag = soup.find("h1", class_="entry-title")
    title = title_tag.text.strip() if title_tag else "(no title)"
    source_url = full_url

    date_tag = soup.find("span", class_="entry-date")
    date = date_tag.text.strip() if date_tag else None

    # Convert DD/MM/YYYY → YYYY-MM-DD
    if date:
        try:
            date_obj = datetime.strptime(date, "%d/%m/%Y")
            date = date_obj.strftime("%Y-%m-%d")
        except ValueError:
            print(f"⚠️ Failed to parse date: {date}")
            date = None

    # Grab the first Xpornium iframe on the page, if any
    embed_link = None
    for iframe in soup.find_all("iframe", src=True):
        src = iframe["src"]
        if "xpornium.net" in src:
            embed_link = src  # already absolute, no urljoin needed
            break  # stop after finding the first match

    # --- print info after crawling this user ---
    print(f"\n✅ Scraped {username}: {title} — {date}")
    # -------------------------------------------

    return {
        "username": username,
        "url": full_url,
        "title": title,
        "date": date,
        "embed_link": embed_link,
        "source_url": source_url
    }


def crawl_all(init_url):
    """Crawl page by page and extract user data as we go."""
    page = 1
    all_data = []

    while True:
        url = f"{init_url}?p={page}"
        print(f"\n🕷 Crawling index page {page}: {url}")
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print(f"❌ Page {page} returned {response.status_code}, stopping.")
            break

        soup = BeautifulSoup(response.text, "html.parser")
        user_links = soup.find_all("a", class_="thumbnail-link", href=True)
        if not user_links:
            print("⚠️ No user links found — reached end of site.")
            break

        for link in user_links:
            user_path = link["href"]
            user_data = crawl_user_page(init_url, user_path)
            if not user_data:
                print("⚠️ Skipping empty user_data.")
                continue
            if not user_data["embed_link"]:
                print(f"⚠️ Skipping {user_data['username']} - no embed link found.")
                continue

            insert_video_to_db(user_data)
            all_data.append(user_data)  # keep the record so the final count/export is accurate

            # Get fileid and xpornium url, then trigger the remote upload
            fileid = user_data["embed_link"].split("/")[-1]
            xpornium_url = f"https://xpornium.net/embed/{fileid}"
            remote_upload(xpornium_url)

            # Get file info (duration, thumb)
            info_response = get_file_info(fileid)
            info_json = info_response.json()
            if info_json.get("status") != 200 or not info_json.get("result"):
                print(f"❌ Failed to get file info for {fileid}")
                continue
            info = info_json["result"][0]
            duration = info.get("duration") or 0
            thumbnail = info.get("thumbnail") or ""

            # Save to DB
            save_xpornium_upload(user_data["embed_link"], fileid, xpornium_url, user_data["title"], 127, duration, thumbnail)

            time.sleep(0.5)

        page += 1
        time.sleep(1)

    print(f"\n✅ Finished crawling all pages. Total users: {len(all_data)}")
    return all_data


if __name__ == "__main__":
    BASE_URL = "https://webcamrips.to"
    results = crawl_all(BASE_URL)
    # Write the scraped records out so the final message matches reality
    with open("users_data.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print("💾 All data saved to users_data.json")