diff --git a/__pycache__/test1.cpython-314.pyc b/__pycache__/test1.cpython-314.pyc
new file mode 100644
index 0000000..f3c1e76
Binary files /dev/null and b/__pycache__/test1.cpython-314.pyc differ
diff --git a/wcr/__pycache__/config.cpython-313.pyc b/wcr/__pycache__/config.cpython-313.pyc
new file mode 100644
index 0000000..7b7cdaa
Binary files /dev/null and b/wcr/__pycache__/config.cpython-313.pyc differ
diff --git a/wcr/__pycache__/config.cpython-314.pyc b/wcr/__pycache__/config.cpython-314.pyc
index 05bd58a..329c90a 100644
Binary files a/wcr/__pycache__/config.cpython-314.pyc and b/wcr/__pycache__/config.cpython-314.pyc differ
diff --git a/wcr/config.py b/wcr/config.py
index 2244b8f..c628d6f 100644
--- a/wcr/config.py
+++ b/wcr/config.py
@@ -14,4 +14,8 @@ def get_db_connection():
     except mysql.connector.Error as err:
         print(f"❌ Failed to connect to DB: {err}")
         return  # don’t continue if DB failed
-    return db
\ No newline at end of file
+    return db
+
+def preload_source_urls(cursor):
+    cursor.execute("SELECT source_url FROM videos")
+    return set(row[0] for row in cursor.fetchall())
diff --git a/wcr/crawl_wcr.py b/wcr/crawl_wcr.py
index 713d66b..3c470ec 100644
--- a/wcr/crawl_wcr.py
+++ b/wcr/crawl_wcr.py
@@ -1,4 +1,4 @@
-from config import get_db_connection
+from config import get_db_connection, preload_source_urls
 import requests, time, mysql.connector
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
@@ -6,36 +6,34 @@ from datetime import datetime
 
 def insert_video_to_db(data):
-    db = get_db_connection()
+
     try:
         cursor = db.cursor()
         sql = """
-            INSERT IGNORE INTO videos (username, url, title, date, embed_link, source_url, created_at)
-            VALUES (%s, %s, %s, %s, %s, %s, NOW())
+            INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
+            VALUES (%s, %s, %s, %s, NOW())
         """
         values = (
             data['username'],
-            data['url'],
-            data['title'],
            data['date'],
            data['embed_link'],
            data['source_url']
        )
        cursor.execute(sql, values)
        db.commit()
-        print("✅ Inserted into DB!")
+        if cursor.rowcount == 0:
+            print("❌ Video already exists in DB")
+        else:
+            print("✅ Inserted into DB!")
     except mysql.connector.Error as err:
         print(f"❌ Failed to insert: {err}")
-    finally:
-        cursor.close()
-        db.close()
+
 
-def crawl_user_page(base_url, user_path):
-    full_url = urljoin(base_url, user_path)
+def crawl_user_page(full_url):
     response = requests.get(full_url)
     if response.status_code != 200:
         print(f"❌ Failed to load {full_url}")
@@ -43,13 +41,15 @@ def crawl_user_page(base_url, user_path):
 
     soup = BeautifulSoup(response.text, "html.parser")
 
+    data = parse_data(soup)
+    data["source_url"] = full_url
+    return data
 
 def parse_data(soup):
-    username = user_path.strip("/")
+
     title_tag = soup.find("h1", class_="entry-title")
-    title = title_tag.text.strip() if title_tag else "(no title)"
-
+    username = title_tag.contents[0].strip()
     date_tag = soup.find("span", class_="entry-date")
     date = date_tag.text.strip() if date_tag else None
@@ -75,7 +75,6 @@ def parse_data(soup):
 
     return {
         "username": username,
-        "title": title,
         "date": date,
         "embed_link": embed_link,
     }
@@ -94,14 +93,24 @@ def crawl_all(init_url):
             break
 
         soup = BeautifulSoup(response.text, "html.parser")
-        user_links = soup.find_all("a", class_="thumbnail-link", href=True)
-        if not user_links:
+        video_pages = soup.find_all("a", class_="thumbnail-link", href=True)
+        video_pages = [link['href'] for link in video_pages]
+        if not video_pages:
             print("⚠️ No user links found — reached end of site.")
             break
-
-        for link in user_links:
-            user_path = link["href"]
-            user_data = crawl_user_page(init_url, user_path)
+        cursor = db.cursor()
+        seen_urls = preload_source_urls(cursor)
+
+        for link in video_pages:
+            full_url = init_url + link
+
+            if full_url in seen_urls:
+                print(f"⚠️ Skipping {link} - already seen.")
+                continue
+
+
+
+            user_data = crawl_user_page(full_url)  # slow as fuk
             if not user_data:
                 print("⚠️ Skipping empty user_data.")
                 continue
@@ -111,25 +120,19 @@ def crawl_all(init_url):
                 continue
 
             insert_video_to_db(user_data)
-            time.sleep(0.5)
+
         page += 1
-        time.sleep(1)
+
 
     print(f"\n✅ Finished crawling all pages. Total users: {len(all_data)}")
     return all_data
 
 if __name__ == "__main__":
+    db = get_db_connection()
     BASE_URL = "https://webcamrips.to"
     results = crawl_all(BASE_URL)
     print("💾 All data saved to users_data.json")
-
-
-
-
-
-
-
-
-
-
+    cursor = db.cursor()
+    cursor.close()
+    db.close()
\ No newline at end of file
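
For reference, a minimal sketch of the dedup flow this diff introduces, written under one assumption the diff itself does not show: that the videos table has a UNIQUE (or primary) key covering source_url. Existing source_url values are preloaded into a set once per result page, candidate links are filtered against that set before any HTTP request is made, and INSERT IGNORE plus cursor.rowcount acts as a second guard for rows added after the preload. The insert_if_new helper below is hypothetical and only illustrates the rowcount check; the real code inlines it in insert_video_to_db.

    def preload_source_urls(cursor):
        # Same helper as wcr/config.py: one SELECT up front instead of an
        # existence check per candidate link.
        cursor.execute("SELECT source_url FROM videos")
        return set(row[0] for row in cursor.fetchall())

    def insert_if_new(db, cursor, data):
        # Hypothetical helper showing the rowcount check. INSERT IGNORE only
        # suppresses duplicate-key errors, so rowcount == 0 means "already
        # exists" only when source_url is covered by a unique index.
        cursor.execute(
            "INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at) "
            "VALUES (%s, %s, %s, %s, NOW())",
            (data["username"], data["date"], data["embed_link"], data["source_url"]),
        )
        db.commit()
        return cursor.rowcount > 0  # True only when a new row was written

    # Usage, with the connection coming from wcr/config.get_db_connection():
    # db = get_db_connection(); cursor = db.cursor()
    # seen = preload_source_urls(cursor)
    # if url not in seen and insert_if_new(db, cursor, data): seen.add(url)

Without that unique constraint, INSERT IGNORE inserts duplicates silently and the "already exists" branch never fires, leaving the preloaded set as the only dedup layer.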