@@ -1,136 +1,118 @@
-from config import get_db_connection, preload_source_urls
-import requests, mysql.connector, time
+import requests, time, mysql.connector
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 from datetime import datetime
 from concurrent.futures import ThreadPoolExecutor, as_completed
+
+from config import get_db_connection, preload_source_urls


-def insert_video_to_db(data, db):
+# ────────────────────────────────────────────────────────────────
+def insert_video_to_db(data):
     try:
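+        # Opens its own short-lived connection, since this can be called from worker threads.
+        # INSERT IGNORE drops duplicates silently; this assumes the videos table has a unique key (e.g. on source_url).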
+        db = get_db_connection()
         cursor = db.cursor()
-        sql = """
+        cursor.execute("""
             INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
             VALUES (%s, %s, %s, %s, NOW())
-        """
-        values = (
-            data['username'],
-            data['date'],
-            data['embed_link'],
-            data['source_url']
-        )
-        cursor.execute(sql, values)
+        """, (data['username'], data['date'], data['embed_link'], data['source_url']))
         db.commit()
-        if cursor.rowcount == 0:
-            print("❌ Video already exists in DB")
-        else:
-            print("✅ Inserted into DB!")
+        if cursor.rowcount > 0:
+            print(f"✅ New: {data['username']} — {data['date']}")
     except mysql.connector.Error as err:
-        print(f"❌ Failed to insert: {err}")
+        print(f"❌ DB insert error: {err}")
     finally:
         cursor.close()
+        db.close()


-def crawl_user_page(full_url):
-    try:
-        response = requests.get(full_url)
-        if response.status_code != 200:
-            print(f"❌ Failed to load {full_url}")
-            return None
-
-        soup = BeautifulSoup(response.text, "html.parser")
-        data = parse_data(soup)
-        data["source_url"] = full_url
-        return data
-    except Exception as e:
-        print(f"❌ Exception while crawling {full_url}: {e}")
-        return None
-
-
+# ────────────────────────────────────────────────────────────────
 def parse_data(soup):
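+    # Returns a dict with the username, the upload date normalised to YYYY-MM-DD, and the first xpornium.net iframe src.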
-    title_tag = soup.find("h1", class_="entry-title")
-    username = title_tag.contents[0].strip() if title_tag else "unknown"
-    date_tag = soup.find("span", class_="entry-date")
+    username = soup.select_one("h1.entry-title")
+    username = username.contents[0].strip() if username and username.contents else None
+
+    date_tag = soup.select_one("span.entry-date")
     date = date_tag.text.strip() if date_tag else None
     if date:
         try:
-            date_obj = datetime.strptime(date, "%d/%m/%Y")
-            date = date_obj.strftime("%Y-%m-%d")
+            date = datetime.strptime(date, "%d/%m/%Y").strftime("%Y-%m-%d")
         except ValueError:
-            print(f"⚠️ Failed to parse date: {date}")
             date = None

-    embed_link = None
-    for iframe in soup.find_all("iframe", src=True):
-        src = iframe["src"]
-        if "xpornium.net" in src:
-            embed_link = src
-            break
-
-    print(f"\n✅ Scraped {username}: — {date}")
-
-    return {
-        "username": username,
-        "date": date,
-        "embed_link": embed_link,
-    }
+    embed_link = next((i["src"] for i in soup.find_all("iframe", src=True) if "xpornium.net" in i["src"]), None)
+    return {"username": username, "date": date, "embed_link": embed_link}


-def process_link(link, seen_urls, init_url):
-    full_url = init_url + link
-    if full_url in seen_urls:
-        print(f"⚠️ Skipping {link} - already seen.")
-        return
-
-    user_data = crawl_user_page(full_url)
-    if not user_data:
-        print("⚠️ Skipping empty user_data.")
-        return
-
-    if not user_data["embed_link"]:
-        print(f"⚠️ Skipping {user_data['username']} - no embed link found.")
-        return
-
-    local_db = get_db_connection()
-    insert_video_to_db(user_data, local_db)
-    local_db.close()
+# ────────────────────────────────────────────────────────────────
+def crawl_user_page(url):
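+    # Fetches and parses a single video page; returns None on any request error or when no embed link is found.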
+    try:
+        res = requests.get(url, timeout=15)
+        if res.ok:
+            soup = BeautifulSoup(res.text, "html.parser")
+            data = parse_data(soup)
+            data["source_url"] = url
+            return data if data["embed_link"] else None
+    except Exception:
+        pass
+    return None
+
+
+# ────────────────────────────────────────────────────────────────
+def crawl_index_page(base_url, page_num, seen, cursor, db):
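+    # Crawls one index page: unseen video links are scraped in parallel and inserted; returns the number of new videos.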
+    url = f"{base_url}?p={page_num}"
+    print(f"📄 Page {page_num}")
+    try:
+        res = requests.get(url, timeout=15)
+        if not res.ok:
+            return 0
+        soup = BeautifulSoup(res.text, "html.parser")
+        links = [urljoin(base_url, a["href"]) for a in soup.select("a.thumbnail-link[href]")]
+        links = [link for link in links if link not in seen]
+        if not links:
+            return 0
+
+        new_count = 0
+        with ThreadPoolExecutor(max_workers=50) as pool:
+            for f in as_completed(pool.submit(crawl_user_page, l) for l in links):
+                data = f.result()
+                if data:
+                    insert_video_to_db(data)
+                    new_count += 1
+        return new_count
+    except Exception:
+        return 0


-def crawl_all(init_url):
-    page = 1
+# ────────────────────────────────────────────────────────────────
+def crawl_all(base_url):
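+    # Walks the index in batches of 10 pages, crawling each batch's pages concurrently.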
     db = get_db_connection()
     cursor = db.cursor()
-    seen_urls = preload_source_urls(cursor)
-    cursor.close()
-    db.close()
-
-    QUIT_LOGIC = True
+    seen = preload_source_urls(cursor)

+    page, total, empty_results = 1000, 0, 0
     while True:
-        url = f"{init_url}?p={page}"
-        print(f"\n🕷️ Crawling index page {page}: {url}")
-        response = requests.get(url)
-        if response.status_code != 200:
-            print(f"❌ Page {page} returned {response.status_code}, stopping.")
-            break
-
-        soup = BeautifulSoup(response.text, "html.parser")
-        video_pages = soup.find_all("a", class_="thumbnail-link", href=True)
-        video_pages = [link['href'] for link in video_pages]
-
-        if not video_pages:
-            print("⚠️ No user links found — reached end of site.")
-            break
-
-        with ThreadPoolExecutor(max_workers=50) as executor:
-            if not QUIT_LOGIC:
-                futures = [executor.submit(process_link, link, seen_urls, init_url) for link in video_pages]
-                for _ in as_completed(futures):
-                    pass  # we already log inside the functions
-
-        page += 1
-
-    print("\n✅ Finished crawling all pages.")
+        batch = range(page, page + 10)
+        print(f"\n🚀 Batch {page}–{page + 9}")
+
+        with ThreadPoolExecutor(max_workers=10) as pool:
+            results = [f.result() for f in as_completed(pool.submit(crawl_index_page, base_url, p, seen, cursor, db) for p in batch)]
+
+        batch_total = sum(results)
+        total += batch_total
+        print(f"📦 Batch complete — {batch_total} new videos (total: {total})")
+
+        if batch_total == 0:
+            empty_results += 1
+
+        if empty_results >= 10:
+            print("\n🛑 No new videos in the last 10 batches. Stopping.")
+            break
+
+        page += 10
+
+    cursor.close()
+    db.close()
+    print(f"\n✅ Done! Total new videos: {total}")


+# ────────────────────────────────────────────────────────────────
 if __name__ == "__main__":
-    crawl_all("https://webcamrips.to")
+    BASE_URL = "https://webcamrips.to"
+    crawl_all(BASE_URL)