|
|
import requests
|
|
|
from bs4 import BeautifulSoup
|
|
|
import time
|
|
|
|
|
|
# Only <iframe> sources beginning with this prefix are collected as video embeds.
MXDROP_PREFIX = "https://mxdrop.to/e/"
|
|
|
|
|
|
def build_session():
    """Create a requests.Session preloaded with browser-like headers.

    Returns:
        requests.Session: session whose default headers mimic a desktop
        browser, to reduce the chance of being served bot-blocked pages.
    """
    session = requests.Session()
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
    }
    session.headers.update(browser_headers)
    return session
|
|
|
|
|
|
|
|
|
def fetch_video_links(session, page_url, base_domain="https://striphub.cam"):
    """Collect /play/ links from one paginated list page.

    Args:
        session: requests.Session used to fetch the page.
        page_url: absolute URL of the listing page to scrape.
        base_domain: prefix joined to each relative /play/ href.

    Returns:
        list[str]: absolute /play/ URLs found on the page; empty list
        when the request fails (the error is printed, not raised).
    """
    try:
        response = session.get(page_url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ {page_url} failed: {e}")
        return []

    page = BeautifulSoup(response.text, "html.parser")
    # Keep only anchors whose href is a relative /play/ path.
    found = [
        base_domain + anchor["href"]
        for anchor in page.find_all("a", href=True)
        if anchor["href"].startswith("/play/")
    ]
    print(f"✅ {page_url}: {len(found)} /play/ links")
    return found
|
|
|
|
|
|
|
|
|
def extract_iframe_links(session, play_url):
    """Open a /play/ page and collect all <iframe src="https://mxdrop.to/e/...">.

    Args:
        session: requests.Session used to fetch the page.
        play_url: absolute URL of the /play/ page (also sent as Referer).

    Returns:
        list[str]: iframe src values starting with MXDROP_PREFIX; empty
        list when the request fails (the error is printed, not raised).
    """
    try:
        resp = session.get(play_url, timeout=12, headers={"Referer": play_url})
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ could not fetch {play_url}: {e}")
        return []

    page = BeautifulSoup(resp.text, "html.parser")
    embeds = []
    for frame in page.find_all("iframe", src=True):
        src = frame["src"]
        if src.startswith(MXDROP_PREFIX):
            embeds.append(src)

    if not embeds:
        # Surface the page title to help diagnose why no embed was found.
        if page.title and page.title.string:
            title = page.title.string.strip()
        else:
            title = "(no title)"
        print(f"🚫 {play_url}: no iframes, title={title}")
    else:
        print(f"🔗 {play_url}: found {len(embeds)} mxdrop embeds")
    return embeds
|
|
|
|
|
|
|
|
|
def crawl_all_pages(base_url, total_pages, base_domain="https://striphub.cam",
                    video_out="video_links.txt", embed_out="embedLinks.txt"):
    """Crawl every listing page, then every /play/ page, saving results.

    Args:
        base_url: site root; pages are fetched from ``{base_url}/page/{n}``.
        total_pages: number of listing pages to crawl (1..total_pages).
        base_domain: prefix for relative /play/ hrefs.
        video_out: output file for the sorted unique /play/ URLs.
        embed_out: output file for the sorted unique mxdrop embed URLs.

    Returns:
        tuple[list[str], list[str]]: (sorted /play/ links, sorted embeds).
    """
    session = build_session()

    # 1️⃣ gather all /play/ links
    play_links = []
    for page_no in range(1, total_pages + 1):
        listing_url = f"{base_url}/page/{page_no}"
        print(f"\n🌐 Crawling {listing_url}")
        play_links.extend(fetch_video_links(session, listing_url, base_domain))
        # Brief pause between requests to avoid hammering the server.
        time.sleep(0.5)

    play_links = sorted(set(play_links))
    with open(video_out, "w", encoding="utf-8") as fh:
        fh.writelines(f"{link}\n" for link in play_links)
    print(f"\n✅ saved {len(play_links)} /play/ links → {video_out}")

    # 2️⃣ visit each /play/ page and collect mxdrop iframes
    embed_set = set()
    total = len(play_links)
    for idx, play_url in enumerate(play_links, 1):
        embed_set.update(extract_iframe_links(session, play_url))
        print(f"[{idx}/{total}] total embeds: {len(embed_set)}")
        time.sleep(0.5)

    with open(embed_out, "w", encoding="utf-8") as fh:
        fh.writelines(f"{e}\n" for e in sorted(embed_set))
    print(f"\n✅ saved {len(embed_set)} mxdrop embeds → {embed_out}")
    return play_links, sorted(embed_set)
|
|
|
|
|
|
|
|
|
# Example usage: run this file directly to crawl the first five listing pages.
if __name__ == "__main__":
    # Guard prevents the crawl from firing as a side effect of importing
    # this module; behavior when run as a script is unchanged.
    crawl_all_pages("https://striphub.cam", total_pages=5)
|