import os
import time
import json

from datetime import datetime
from urllib.parse import urljoin

import requests
import mysql.connector
from bs4 import BeautifulSoup
from dotenv import load_dotenv

from xpornium import get_file_info, upload_video, get_upload_url, remote_upload

load_dotenv()
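
# load_dotenv() reads the DB credentials below from a local .env file.
# Expected keys (the values shown are placeholders, not real settings):
#
#   DB_HOST=localhost
#   DB_USER=scraper
#   DB_PASS=change-me
#   DB_NAME=videos_db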


def insert_video_to_db(data):
    """Insert one scraped video record into the `videos` table."""
    try:
        db = mysql.connector.connect(
            host=os.getenv("DB_HOST"),
            user=os.getenv("DB_USER"),
            password=os.getenv("DB_PASS"),
            database=os.getenv("DB_NAME"),
        )
    except mysql.connector.Error as err:
        print(f"❌ Failed to connect to DB: {err}")
        return  # don't continue if the DB connection failed

    try:
        cursor = db.cursor()
        sql = """
            INSERT IGNORE INTO videos
                (username, url, title, date, embed_link, source_url, created_at)
            VALUES (%s, %s, %s, %s, %s, %s, NOW())
        """
        values = (
            data["username"],
            data["url"],
            data["title"],
            data["date"],
            data["embed_link"],
            data["source_url"],
        )
        cursor.execute(sql, values)
        db.commit()
        print("✅ Inserted into DB!")
    except mysql.connector.Error as err:
        print(f"❌ Failed to insert: {err}")
    finally:
        cursor.close()
        db.close()
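
# Hypothetical schema sketch for the `videos` table. The column list matches
# the INSERT IGNORE above; the types and the UNIQUE key are assumptions, since
# INSERT IGNORE only deduplicates when a unique index exists (assumed on `url`):
#
#   CREATE TABLE videos (
#       id         INT AUTO_INCREMENT PRIMARY KEY,
#       username   VARCHAR(255),
#       url        VARCHAR(512) UNIQUE,
#       title      VARCHAR(512),
#       date       DATE,
#       embed_link VARCHAR(512),
#       source_url VARCHAR(512),
#       created_at DATETIME
#   );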


def save_xpornium_upload(embed_link, fileid, xpornium_url, title, cat_id, duration, thumbnail):
    """Record a completed Xpornium upload in the `xpornium_uploads` table."""
    db = None
    cursor = None  # defined up front so `finally` can't hit a NameError
    try:
        db = mysql.connector.connect(
            host=os.getenv("DB_HOST"),
            user=os.getenv("DB_USER"),
            password=os.getenv("DB_PASS"),
            database=os.getenv("DB_NAME"),
        )
        cursor = db.cursor()

        new_embed_link = f"https://xpornium.net/embed/{fileid}"

        sql = """
            INSERT INTO xpornium_uploads
                (original_embed_link, xpornium_fileid, xpornium_url, new_embed_link,
                 title, category_id, uploaded_at, duration, thumbnail)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        values = (
            embed_link,
            fileid,
            xpornium_url,
            new_embed_link,
            title,
            cat_id,
            datetime.now(),
            duration,
            thumbnail,
        )
        cursor.execute(sql, values)
        db.commit()
        print(f"✅ Saved to DB: {fileid} | duration: {duration}s | thumb: {thumbnail}")
    except Exception as e:
        print(f"❌ DB Save Failed: {e}")
    finally:
        if db and db.is_connected():
            cursor.close()
            db.close()
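
# Hypothetical schema sketch for `xpornium_uploads`, with the same caveat: the
# columns match the INSERT above, but the types are assumptions:
#
#   CREATE TABLE xpornium_uploads (
#       id                  INT AUTO_INCREMENT PRIMARY KEY,
#       original_embed_link VARCHAR(512),
#       xpornium_fileid     VARCHAR(64),
#       xpornium_url        VARCHAR(512),
#       new_embed_link      VARCHAR(512),
#       title               VARCHAR(512),
#       category_id         INT,
#       uploaded_at         DATETIME,
#       duration            INT,
#       thumbnail           VARCHAR(512)
#   );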


def crawl_user_page(base_url, user_path):
    """Visit one '<base_url>/<username>' page and extract video info."""
    full_url = urljoin(base_url, user_path)
    response = requests.get(full_url, timeout=30)  # requests has no default timeout
    if response.status_code != 200:
        print(f"❌ Failed to load {full_url}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    username = user_path.strip("/")
    title_tag = soup.find("h1", class_="entry-title")
    title = title_tag.text.strip() if title_tag else "(no title)"
    source_url = full_url

    date_tag = soup.find("span", class_="entry-date")
    date = date_tag.text.strip() if date_tag else None

    # Convert DD/MM/YYYY → YYYY-MM-DD so MySQL accepts it as a DATE
    if date:
        try:
            date = datetime.strptime(date, "%d/%m/%Y").strftime("%Y-%m-%d")
        except ValueError:
            print(f"⚠️ Failed to parse date: {date}")
            date = None

    # Take the first Xpornium iframe; its src is already absolute, so no urljoin needed
    embed_link = None
    for iframe in soup.find_all("iframe", src=True):
        if "xpornium.net" in iframe["src"]:
            embed_link = iframe["src"]
            break

    print(f"\n✅ Scraped {username}: {title} — {date}")

    return {
        "username": username,
        "url": full_url,
        "title": title,
        "date": date,
        "embed_link": embed_link,
        "source_url": source_url,
    }
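
# Rough shape of the pages this scraper expects (a sketch inferred from the
# selectors used above, not verified against the live site):
#
#   index page (BASE_URL?p=N):
#       <a class="thumbnail-link" href="/some-username">...</a>
#   user page (BASE_URL/some-username):
#       <h1 class="entry-title">Video title</h1>
#       <span class="entry-date">DD/MM/YYYY</span>
#       <iframe src="https://xpornium.net/embed/<fileid>"></iframe>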


def crawl_all(init_url):
    """Crawl index pages one by one and process each user's page as we go."""
    page = 1
    all_data = []

    while True:
        url = f"{init_url}?p={page}"
        print(f"\n🕷️ Crawling index page {page}: {url}")
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print(f"❌ Page {page} returned {response.status_code}, stopping.")
            break

        soup = BeautifulSoup(response.text, "html.parser")
        user_links = soup.find_all("a", class_="thumbnail-link", href=True)
        if not user_links:
            print("⚠️ No user links found — reached end of site.")
            break

        for link in user_links:
            user_data = crawl_user_page(init_url, link["href"])
            if not user_data:
                print("⚠️ Skipping empty user_data.")
                continue

            if not user_data["embed_link"]:
                print(f"⚠️ Skipping {user_data['username']} - no embed link found.")
                continue

            insert_video_to_db(user_data)

            # Derive the file id from the embed link and trigger a remote upload
            fileid = user_data["embed_link"].split("/")[-1]
            xpornium_url = f"https://xpornium.net/embed/{fileid}"
            remote_upload(xpornium_url)

            # Fetch file info (duration, thumbnail) for this file id
            info_response = get_file_info(fileid)
            info_json = info_response.json()

            if info_json.get("status") != 200 or not info_json.get("result"):
                print(f"❌ Failed to get file info for {fileid}")
                continue

            info = info_json["result"][0]
            duration = info.get("duration") or 0
            thumbnail = info.get("thumbnail") or ""

            # Save the upload record (category id 127) to the DB
            save_xpornium_upload(
                user_data["embed_link"], fileid, xpornium_url,
                user_data["title"], 127, duration, thumbnail,
            )

            all_data.append(user_data)  # keep the record so the final count and JSON dump are accurate
            time.sleep(0.5)  # small delay between user pages

        page += 1
        time.sleep(1)  # small delay between index pages

    print(f"\n✅ Finished crawling all pages. Total users: {len(all_data)}")
    return all_data


if __name__ == "__main__":
    BASE_URL = "https://webcamrips.to"
    results = crawl_all(BASE_URL)

    # Write the scraped records to the file the final message refers to
    with open("users_data.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print("💾 All data saved to users_data.json")