Skip already-crawled videos: preload stored source_urls and dedupe before inserting

main
Your Name 7 hours ago
parent c79a86eddb
commit c51464d68e

Binary file not shown.

@@ -14,4 +14,8 @@ def get_db_connection():
except mysql.connector.Error as err:
print(f"❌ Failed to connect to DB: {err}")
return  # don't continue if the DB connection failed
return db
return db
def preload_source_urls(cursor):
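# load every source_url already stored so the crawler can skip pages it has processed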
cursor.execute("SELECT source_url FROM videos")
return set(row[0] for row in cursor.fetchall())

@@ -1,4 +1,4 @@
from config import get_db_connection
from config import get_db_connection, preload_source_urls
import requests, time, mysql.connector
from bs4 import BeautifulSoup
from urllib.parse import urljoin
@@ -6,36 +6,34 @@ from datetime import datetime
def insert_video_to_db(data):
db = get_db_connection()
try:
cursor = db.cursor()
sql = """
INSERT IGNORE INTO videos (username, url, title, date, embed_link, source_url, created_at)
VALUES (%s, %s, %s, %s, %s, %s, NOW())
INSERT IGNORE INTO videos (username, date, embed_link, source_url, created_at)
VALUES (%s, %s, %s, %s, NOW())
"""
values = (
data['username'],
data['url'],
data['title'],
data['date'],
data['embed_link'],
data['source_url']
)
cursor.execute(sql, values)
db.commit()
print("✅ Inserted into DB!")
if cursor.rowcount == 0:
print("❌ Video already exists in DB")
else:
print("✅ Inserted into DB!")
except mysql.connector.Error as err:
print(f"❌ Failed to insert: {err}")
finally:
cursor.close()
db.close()
def crawl_user_page(base_url, user_path):
full_url = urljoin(base_url, user_path)
def crawl_user_page(full_url):
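# the absolute URL is now built by the caller (crawl_all), so this function no longer joins paths itself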
response = requests.get(full_url)
if response.status_code != 200:
print(f"❌ Failed to load {full_url}")
@@ -43,13 +41,15 @@ def crawl_user_page(base_url, user_path):
soup = BeautifulSoup(response.text, "html.parser")
data = parse_data(soup)
data["source_url"] = full_url
return data
def parse_data(soup):
username = user_path.strip("/")
title_tag = soup.find("h1", class_="entry-title")
title = title_tag.text.strip() if title_tag else "(no title)"
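# assumes the first text node of the <h1> title holds the username (replaces deriving it from the URL path)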
username = title_tag.contents[0].strip() if title_tag and title_tag.contents else "(no username)"
date_tag = soup.find("span", class_="entry-date")
date = date_tag.text.strip() if date_tag else None
@@ -75,7 +75,6 @@ def parse_data(soup):
return {
"username": username,
"title": title,
"date": date,
"embed_link": embed_link,
}
@@ -94,14 +93,24 @@ def crawl_all(init_url):
break
soup = BeautifulSoup(response.text, "html.parser")
user_links = soup.find_all("a", class_="thumbnail-link", href=True)
if not user_links:
video_pages = soup.find_all("a", class_="thumbnail-link", href=True)
video_pages = [link['href'] for link in video_pages]
if not video_pages:
print("⚠️ No user links found — reached end of site.")
break
for link in user_links:
user_path = link["href"]
user_data = crawl_user_page(init_url, user_path)
cursor = db.cursor()
seen_urls = preload_source_urls(cursor)
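# rebuilds the seen-URL set from the DB on every listing page; it could be loaded once before the loop instead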
for link in video_pages:
full_url = init_url + link
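# assumes each href is a site-relative path (e.g. "/some-video/") that can be appended directly to init_url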
if full_url in seen_urls:
print(f"⚠️ Skipping {link} - already seen.")
continue
user_data = crawl_user_page(full_url)  # fetches and parses the video page; the slowest step in this loop
if not user_data:
print("⚠️ Skipping empty user_data.")
continue
@@ -111,25 +120,19 @@ def crawl_all(init_url):
continue
insert_video_to_db(user_data)
time.sleep(0.5)
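# throttle: 0.5 s between video pages, 1 s between listing pages (below)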
page += 1
time.sleep(1)
print(f"\n✅ Finished crawling all pages. Total users: {len(all_data)}")
return all_data
if __name__ == "__main__":
db = get_db_connection()
BASE_URL = "https://webcamrips.to"
results = crawl_all(BASE_URL)
print("💾 All data saved to users_data.json")
cursor = db.cursor()
cursor.close()
db.close()