import requests
from bs4 import BeautifulSoup
import time

MXDROP_PREFIX = "https://mxdrop.to/e/"


def build_session():
    """Return a requests.Session with browser-like headers, so the site
    serves the same HTML it would give a regular visitor."""
    s = requests.Session()
    s.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
    })
    return s
def fetch_video_links(session, page_url, base_domain="https://striphub.cam"):
    """Collect /play/ links from one paginated list page."""
    try:
        r = session.get(page_url, timeout=10)
        r.raise_for_status()
    except requests.RequestException as e:
        print(f"{page_url} failed: {e}")
        return []
    soup = BeautifulSoup(r.text, "html.parser")
    links = []
    for a in soup.find_all("a", href=True):
        # the startswith check guarantees the href is site-relative,
        # so prefixing the domain yields an absolute URL
        if a["href"].startswith("/play/"):
            links.append(base_domain + a["href"])
    print(f"{page_url}: {len(links)} /play/ links")
    return links
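

# Quick sanity check of a single listing page (illustrative only, not run
# by this script; the page number is an example and assumes the site
# paginates as /page/<n>):
#
#   s = build_session()
#   print(fetch_video_links(s, "https://striphub.cam/page/1"))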


def extract_iframe_links(session, play_url):
    """Open a /play/ page and collect all <iframe src="https://mxdrop.to/e/...">."""
    try:
        # send the page's own URL as Referer so the request looks like
        # in-site navigation
        r = session.get(play_url, timeout=12, headers={"Referer": play_url})
        r.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ could not fetch {play_url}: {e}")
        return []
    soup = BeautifulSoup(r.text, "html.parser")
    embeds = [
        iframe["src"]
        for iframe in soup.find_all("iframe", src=True)
        if iframe["src"].startswith(MXDROP_PREFIX)
    ]
    if embeds:
        print(f"🔗 {play_url}: found {len(embeds)} mxdrop embeds")
    else:
        # log the page title so blocked or empty pages are easy to spot
        title = soup.title.string.strip() if soup.title and soup.title.string else "(no title)"
        print(f"🚫 {play_url}: no iframes, title={title}")
    return embeds
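

# Some players inject the iframe via JavaScript and keep the URL in a
# data-src attribute until the player scrolls into view. Whether this site
# does that is an assumption; if embeds come back empty, a variant like this
# sketch, which checks both attributes, may help. It is not used by
# crawl_all_pages below.
def extract_iframe_links_lazy(session, play_url):
    """Like extract_iframe_links, but also reads lazy-loaded data-src URLs."""
    try:
        r = session.get(play_url, timeout=12, headers={"Referer": play_url})
        r.raise_for_status()
    except requests.RequestException:
        return []
    soup = BeautifulSoup(r.text, "html.parser")
    urls = set()
    for iframe in soup.find_all("iframe"):
        for attr in ("src", "data-src"):
            val = iframe.get(attr, "")
            if val.startswith(MXDROP_PREFIX):
                urls.add(val)
    return sorted(urls)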


def crawl_all_pages(base_url, total_pages, base_domain="https://striphub.cam",
                    video_out="video_links.txt", embed_out="embedLinks.txt"):
    session = build_session()

    # 1) gather all /play/ links
    all_play = []
    for p in range(1, total_pages + 1):
        page_url = f"{base_url}/page/{p}"
        print(f"\n🌐 Crawling {page_url}")
        all_play.extend(fetch_video_links(session, page_url, base_domain))
        time.sleep(0.5)

    all_play = sorted(set(all_play))
    with open(video_out, "w", encoding="utf-8") as f:
        f.writelines(link + "\n" for link in all_play)
    print(f"\n✅ saved {len(all_play)} /play/ links → {video_out}")

    # 2) visit each /play/ page and collect mxdrop iframes
    all_embeds = set()
    for i, link in enumerate(all_play, 1):
        embeds = extract_iframe_links(session, link)
        all_embeds.update(embeds)
        print(f"[{i}/{len(all_play)}] total embeds: {len(all_embeds)}")
        time.sleep(0.5)

    with open(embed_out, "w", encoding="utf-8") as f:
        f.writelines(e + "\n" for e in sorted(all_embeds))
    print(f"\n✅ saved {len(all_embeds)} mxdrop embeds → {embed_out}")
    return all_play, sorted(all_embeds)
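

# Optional hardening: a retry wrapper with exponential backoff. This is a
# sketch and is not wired into the functions above; the attempt count and
# delays are arbitrary example values.
def get_with_retries(session, url, attempts=3, timeout=10, **kwargs):
    """GET a URL, retrying transient failures with exponential backoff."""
    for n in range(attempts):
        try:
            r = session.get(url, timeout=timeout, **kwargs)
            r.raise_for_status()
            return r
        except requests.RequestException:
            if n == attempts - 1:
                raise
            time.sleep(2 ** n)  # back off 1s, 2s, 4s, ...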


# Example usage:
if __name__ == "__main__":
    crawl_all_pages("https://striphub.cam", total_pages=5)