import requests
from bs4 import BeautifulSoup
import time


def fetch_video_links(page_url):
    """Fetch one listing page and return the full URLs of every /play/ video link on it."""
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    try:
        response = requests.get(page_url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ Failed to fetch {page_url}: {e}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    base_url = "https://striphub.cam"
    video_links = []
    for a_tag in soup.find_all("a", href=True):
        if a_tag["href"].startswith("/play/"):
            # Build the full URL from the relative /play/... path
            video_links.append(base_url + a_tag["href"])

    print(f"✅ Found {len(video_links)} videos on {page_url}")
    return video_links


def crawl_all_pages(base_url, total_pages, output_file="video_links.txt"):
    """Crawl /page/1 through /page/total_pages, deduplicate the links and save them to a file."""
    all_links = []
    for page in range(1, total_pages + 1):
        page_url = f"{base_url}/page/{page}"
        print(f"\n🌐 Crawling page {page_url}...")
        links = fetch_video_links(page_url)
        all_links.extend(links)
        time.sleep(1)  # polite delay so you don't hammer the server

    # Remove duplicates
    all_links = list(set(all_links))

    # Save all links to file, one per line
    with open(output_file, "w", encoding="utf-8") as f:
        for link in all_links:
            f.write(link + "\n")

    print(f"\n✅ Done! Saved {len(all_links)} unique video links to {output_file}")
    return all_links
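

# --- Optional hardening (a sketch, not part of the original script) ---
# Instead of bare requests.get calls, the functions above could share a single
# requests.Session configured with automatic retries and backoff. Retry and
# HTTPAdapter are standard requests/urllib3 APIs; the retry counts and status
# codes below are illustrative assumptions, not values taken from the original.
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def make_session():
    """Return a requests.Session that retries transient failures with backoff."""
    session = requests.Session()
    retries = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    session.mount("https://", HTTPAdapter(max_retries=retries))
    session.headers.update({"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"})
    return session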


# Example usage: crawl the first 5 listing pages, then fetch each collected video page
if __name__ == "__main__":
    all_links = crawl_all_pages("https://striphub.cam", total_pages=5)
    for link in all_links:
        r = requests.get(link, timeout=10)  # fetch each video page (response currently unused)