updated and optimized and cleaned up

main
oscar 3 weeks ago
parent d440a25e1d
commit 42afcdc539

BIN
.DS_Store vendored

Binary file not shown.

@@ -10,7 +10,7 @@ def gen_connection():
print("Connecting to database")
newDB = mysql.connector.connect(host=host, user=username, password=password, database=database, port=port)
print("Connected to database")
return newDB, newDB.cursor()
return newDB, newDB.cursor(dictionary=True)
def get_storage():
from BunnyCDN.Storage import Storage
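
Review note: cursor(dictionary=True) is a stock mysql-connector-python option — fetched rows come back as dicts keyed by column name instead of positional tuples, which is what lets the reworked cache queries later in this commit read row['media_id'] and row['id']. A minimal sketch of the difference (query and values illustrative):

    cur = newDB.cursor()
    cur.execute("SELECT id, media_id FROM media LIMIT 1")
    cur.fetchone()   # tuple, e.g. (42, '123')  -> indexed as row[1]

    cur = newDB.cursor(dictionary=True)
    cur.execute("SELECT id, media_id FROM media LIMIT 1")
    cur.fetchone()   # dict,  e.g. {'id': 42, 'media_id': '123'}  -> row['media_id']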

@@ -1,7 +1,6 @@
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer
import shutil
import time
import os
from funcs import get_media_dimensions
@@ -13,20 +12,6 @@ os.makedirs(stories_dir, exist_ok=True)
os.makedirs(posts_dir, exist_ok=True)
def wait_for_complete(file_path, timeout=10):
prev_size = -1
for _ in range(timeout * 2): # check every 0.5 sec
try:
size = os.path.getsize(file_path)
except FileNotFoundError:
return False
if size == prev_size:
return True
prev_size = size
time.sleep(0.5)
return False
def is_story(width, height, tolerance=0.02):
if width == 0 or height == 0:
return False
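
Review note: the rest of is_story falls outside this hunk; judging by the name and the zero guard, it presumably classifies portrait (9:16) media as stories by aspect ratio. A hypothetical sketch of that kind of check — not the file's actual body:

    def is_story_sketch(width, height, tolerance=0.02):
        # Hypothetical: treat media as a story when its aspect ratio is
        # within `tolerance` of 9:16 portrait. The real function may differ.
        if width == 0 or height == 0:
            return False
        return abs((width / height) - (9 / 16)) <= tolerance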
@@ -50,16 +35,13 @@ class DownloadHandler(FileSystemEventHandler):
def process_file(self, file_path):
file = os.path.basename(file_path)
# Ignore incomplete or weird temp names
if "crdownload" in file or file.count("~") != 3:
return
if not os.path.exists(file_path):
return
if not wait_for_complete(file_path):
print(f"File {file_path} did not stabilize. Skipping.")
return
post_type = determine_post_type(file_path)
if post_type == "posts":
dest_dir = posts_dir
@@ -91,6 +73,13 @@ class DownloadHandler(FileSystemEventHandler):
if __name__ == "__main__":
download_path = os.path.join(os.path.expanduser("~"), "Downloads")
event_handler = DownloadHandler()
# Initial scan for files already in Downloads
for f in os.listdir(download_path):
full_path = os.path.join(download_path, f)
if os.path.isfile(full_path):
event_handler.process_file(full_path)
observer = Observer()
observer.schedule(event_handler, download_path, recursive=False)
observer.start()
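
Review note: the hunk ends at observer.start(), so the shutdown half of the script is outside the diff. For reference, the conventional watchdog run loop looks like this (assuming time is still imported in this file):

    try:
        while True:
            time.sleep(1)      # keep the main thread alive
    except KeyboardInterrupt:
        observer.stop()        # ask the watcher thread to exit
    observer.join()            # wait for it to shut down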

@@ -12,20 +12,20 @@ directory = 'media'
os.makedirs(temp_directory, exist_ok=True)
media_types = {
'stories' : 'story',
'posts' : 'post',
'profile' : 'profile'
'stories': 'story',
'posts': 'post',
'profile': 'profile'
}
for media_type, _ in media_types.items():
os.makedirs(os.path.join(directory, media_type), exist_ok=True)
existing_media_ids = {}
existing_media_ids = set()
UPLOAD_CUSTOM = False
CACHE_FILE = os.path.join(temp_directory, 'existing_media_ids.json')
CACHE_TTL = timedelta(hours=48)
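
Review note: the {} -> set() change above is a real fix, not cosmetics — {} is an empty dict, so the later existing_media_ids.add(media_id) call would raise on first use:

    ids = {}        # empty dict, not a set
    ids.add('x')    # AttributeError: 'dict' object has no attribute 'add'
    ids = set()
    ids.add('x')    # OK — and `in` membership checks stay O(1)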
def UploadMedia(media):
username = media['username']
user_id = media['user_id']
@@ -37,12 +37,12 @@ def UploadMedia(media):
post_type = media['post_type']
thumbnail_url = None
phash = None
if media_id and media_id in existing_media_ids:
print('Duplicate file detected. Removing...')
os.remove(filepath)
return True
file_size = os.path.getsize(filepath)
filename = os.path.basename(filepath)
file_extension = os.path.splitext(filename)[1].lower()
@@ -56,14 +56,16 @@ def UploadMedia(media):
print(f'Error determining media type for {filename}. Skipping...')
return False
try:post_date = datetime.fromtimestamp(int(timestamp))
except:post_date = datetime.fromtimestamp(os.path.getctime(filepath))
try:
post_date = datetime.fromtimestamp(int(timestamp))
except:
post_date = datetime.fromtimestamp(os.path.getctime(filepath))
width, height = funcs.get_media_dimensions(filepath)
if 0 in (width, height):
print(f'Error getting dimensions for {filename}. Skipping...')
return False
duration = funcs.get_video_duration(filepath)
if media_type == 'image':
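
Review note: the reflowed fallback above still uses a bare except:, which also swallows KeyboardInterrupt and SystemExit. A narrower variant covering the failures a bad timestamp can actually raise (same names, behavior otherwise unchanged):

    try:
        post_date = datetime.fromtimestamp(int(timestamp))
    except (TypeError, ValueError, OverflowError, OSError):
        # int() fails on junk input; fromtimestamp() on out-of-range values.
        post_date = datetime.fromtimestamp(os.path.getctime(filepath))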
@@ -71,7 +73,7 @@ def UploadMedia(media):
elif media_type == 'video':
try:
thumb_path = generate_thumbnail(filepath)
obj_storage.PutFile(thumb_path, f'thumbnails/{file_hash}.jpg') # this might be a problem in case of duplicate hashes
obj_storage.PutFile(thumb_path, f'thumbnails/{file_hash}.jpg')
thumbnail_url = f"https://cdn.altpins.com/thumbnails/{file_hash}.jpg"
phash = funcs.generate_phash(thumb_path)
os.remove(thumb_path)
@@ -81,18 +83,17 @@ def UploadMedia(media):
custom_filename = media_id if media_id else file_hash
newFilename = f'{custom_filename}{file_extension}'
server_path = f'media/{post_type}/{username}/{newFilename}'
file_url = f"https://cdn.altpins.com/{server_path}"
obj_storage.PutFile(filepath, server_path)
if highlight_id:
newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)", (highlight_id, user_id, media_id))
newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)",
(highlight_id, user_id, media_id))
newDB.commit()
print(f'[{newCursor.rowcount}] added highlight {highlight_id} to user {user_id}')
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, hash, filename, duration, thumbnail, phash, platform, file_size) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
values = (username, media_type, file_url, width, height, media_id, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash, platform, file_size)
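
Review note: INSERT IGNORE is what makes both inserts idempotent — on a duplicate key MySQL skips the row instead of raising, and cursor.rowcount comes back 0, which is why the log line above prints [0] for already-known highlights. Illustration with hypothetical values:

    newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)",
                      (hl_id, uid, mid))
    newCursor.rowcount   # 1 on first insert, 0 when the row already existed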
@@ -104,14 +105,13 @@ def UploadMedia(media):
print(f'File: {filename}')
print(f'URL: {file_url}')
print(f'Pin URL: https://altpins.com/pin/{newCursor.lastrowid}')
print("="*100)
print("=" * 100)
os.remove(filepath)
existing_media_ids.add(media_id)
return newCursor.lastrowid
def generate_thumbnail(filepath):
thumb_path = os.path.join(temp_directory, f'{uuid4()}.jpg')
cap = cv2.VideoCapture(filepath)
@@ -120,16 +120,16 @@ def generate_thumbnail(filepath):
cap.release()
return thumb_path
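
Review note: the middle of generate_thumbnail sits between hunks, so only the setup and teardown are visible. The usual OpenCV first-frame grab it presumably performs — a hypothetical sketch, not the file's actual code:

    ok, frame = cap.read()              # decode the first frame
    if ok:
        cv2.imwrite(thumb_path, frame)  # write it out as the .jpg thumbnail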
def get_user_id(username):
username = username.lower()
if username in existing_users:
return existing_users[username]
return None
def get_media_data(filepath):
filename = os.path.basename(filepath)
parts = filename.split('~')
if len(parts) != 4:
return False
@@ -141,7 +141,7 @@ def get_media_data(filepath):
platform = 'instagram'
highlight_id = user_id.replace('highlight', '') if 'highlight' in user_id else None
if user_id.isdigit():
user_id = int(user_id)
else:
@@ -152,17 +152,17 @@ def get_media_data(filepath):
else:
media_id = None
data = {'username': username, 'timestamp': timestamp, 'media_id': media_id, 'user_id': user_id, 'filepath': filepath, 'highlight_id': highlight_id, 'platform': platform}
data = {'username': username, 'timestamp': timestamp, 'media_id': media_id, 'user_id': user_id,
'filepath': filepath, 'highlight_id': highlight_id, 'platform': platform}
return data
def get_media():
medias = []
failed_medias = []
for media_type, post_type in media_types.items():
media_folder_path = os.path.join(directory, media_type)
if not os.path.exists(media_folder_path):
continue
@@ -172,26 +172,23 @@ def get_media():
if not data:
failed_medias.append(filepath)
continue
data['post_type'] = post_type
medias.append(data)
return medias, failed_medias
def get_custom_media(failed_medias):
medias = []
for media_type, post_type in media_types.items():
folder_path = os.path.join(directory, media_type)
user_dirs = [d for d in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, d))]
for username in user_dirs:
user_folder_path = os.path.join(folder_path, username)
for filename in os.listdir(user_folder_path):
if filename.startswith('.'):
continue
filepath = os.path.join(user_folder_path, filename)
if filepath not in failed_medias:
continue
@@ -199,7 +196,7 @@ def get_custom_media(failed_medias):
user_id = get_user_id(username)
timestamp = int(os.path.getctime(filepath))
media_id = os.path.splitext(filename)[0]
if media_id.isdigit():
media_id = int(media_id)
if media_id < 10000000:
@@ -217,40 +214,35 @@ def get_custom_media(failed_medias):
"highlight_id": None,
"post_type": post_type
}
medias.append(data)
return medias
def save_highlight_data(highlights):
filename = f'{uuid4()}.json'
filepath = os.path.join('highlight_data', filename)
with open(filepath, 'w') as f:
json.dump(highlights, f)
def dump_instagram():
medias, failed_medias = get_media()
medias = clean_dupes(medias)
failed_medias = get_custom_media(failed_medias)
medias.sort(key=lambda x: (x['username'].lower(), x['timestamp']))
# Update new user ids and existing user ids
new_user_ids = {}
for media in medias:
user_id = media['user_id']
username = media['username']
if not media['user_id']:
continue
if username in existing_users:
continue
existing_users[username] = user_id
new_user_ids[username] = user_id
# Assign user ids
for media in medias:
if media['user_id']:
continue
@@ -262,13 +254,12 @@ def dump_instagram():
if not media['highlight_id']:
continue
highlights.append({
"media_id": media["media_id"],
"user_id": media["user_id"],
"highlight_id": media['highlight_id'],
"username": media['username'],
})
"media_id": media["media_id"],
"user_id": media["user_id"],
"highlight_id": media['highlight_id'],
"username": media['username'],
})
# save highlight data into the highlight_data folder
if highlights:
save_highlight_data(highlights)
@@ -280,85 +271,97 @@ def dump_instagram():
for media in failed_medias:
pinid = UploadMedia(media)
def clean_dupes(medias):
removed_count = 0
new_medias = []
for media in medias:
media_id = media['media_id']
filepath = media['filepath']
if not media_id:
print(f'Invalid media_id for file {filepath}. Skipping...')
continue
if media_id in existing_media_ids:
removed_count += 1
print(f'Found duplicate file {filepath}. Removing...')
os.remove(filepath)
continue
if re.search(r'\(\d+\)', filepath):
removed_count += 1
print(f'Found duplicate file {filepath}. Removing...')
os.remove(filepath)
continue
new_medias.append(media)
print(f'Removed {removed_count} duplicate files.')
return new_medias
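
Review note: the \(\d+\) pattern targets the " (1)", " (2)" suffixes browsers append to re-downloaded files. Quick check with hypothetical names:

    import re
    dupe = re.compile(r'\(\d+\)')
    bool(dupe.search('photo (1).jpg'))   # True  — browser duplicate, removed
    bool(dupe.search('photo.jpg'))       # False — original, kept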
# -------------------- CACHE SYSTEM --------------------
def get_cached_data():
if not os.path.exists(CACHE_FILE):
print('No cache file found. Generating new cache…')
return None, None
return None, None, None
try:
with open(CACHE_FILE, 'r') as f:
cache_data = json.load(f)
timestamp = datetime.fromisoformat(cache_data.get('timestamp', ''))
if datetime.now() - timestamp < CACHE_TTL:
print('Using cached data…')
return set(tuple(x) for x in cache_data.get('existing_media_ids', [])), cache_data.get('existing_users', {})
cache = json.load(f)
media_ids = set(cache.get('media_ids', []))
users = {k.lower(): v for k, v in cache.get('existing_users', {}).items()}
last_id = cache.get('last_id', 0)
return media_ids, users, last_id
except Exception as e:
print(f"Cache read error: {e}")
return None, None, None
return None, None
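
Review note: unlike the old reader it replaces, the new get_cached_data never compares the stored timestamp against CACHE_TTL, so the 48-hour expiry defined at the top of the file is now dead code. If expiry is still wanted, a sketch of restoring it inside the try block:

    written = datetime.fromisoformat(cache.get('timestamp', ''))
    if datetime.now() - written >= CACHE_TTL:
        return None, None, None   # stale cache -> force a full refresh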
def save_cached_data(existing_media_ids, existing_users):
def save_cached_data(media_ids, existing_users, last_id):
with open(CACHE_FILE, 'w') as f:
json.dump({'timestamp': datetime.now().isoformat(), 'existing_media_ids': list(existing_media_ids), 'existing_users': existing_users}, f)
json.dump({
'timestamp': datetime.now().isoformat(),
'media_ids': list(media_ids),
'existing_users': existing_users,
'last_id': last_id
}, f)
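
Review note: with this change the file at CACHE_FILE carries the incremental-sync watermark (last_id) alongside the id set and user map. Illustrative payload (all values made up):

    {
      "timestamp": "2024-06-01T12:00:00",
      "media_ids": ["3141592653589", "2718281828459"],
      "existing_users": {"someuser": 123456789},
      "last_id": 48213
    }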
def get_user_ids(cur):
cur.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL AND platform='instagram'")
rows = cur.fetchall()
return {user['username'].lower(): user['user_id'] for user in rows}
def get_existing_medias(newCursor):
existing_media_ids, existing_users = get_cached_data()
def get_existing_media_ids(cur):
cur.execute("SELECT id, media_id FROM media WHERE media_id IS NOT NULL AND platform='instagram' AND status='public'")
rows = cur.fetchall()
media_ids = {row['media_id'] for row in rows}
last_id = max((row['id'] for row in rows), default=0)
return media_ids, last_id
if existing_media_ids and existing_users:
newest_id = max(existing_media_ids, key=lambda x: x[0])[0]
existing_media_ids = {image[1] for image in existing_media_ids}
def get_existing_medias(cur):
media_ids, users, last_id = get_cached_data()
newCursor.execute("SELECT id, media_id FROM media WHERE media_id IS NOT NULL AND platform = 'instagram' AND status = 'public' AND id > %s ORDER BY id DESC", (newest_id,))
new_media_ids = {image[1] for image in newCursor.fetchall()}
if not media_ids or not users:
print('Cold cache → pulling full data...')
media_ids, last_id = get_existing_media_ids(cur)
users = get_user_ids(cur)
save_cached_data(media_ids, users, last_id)
return media_ids, users
for media_id in new_media_ids:
existing_media_ids.add(media_id)
return existing_media_ids, existing_users
print('Getting existing files and users...')
newCursor.execute("SELECT id, media_id FROM media WHERE media_id IS NOT NULL AND platform = 'instagram' AND status = 'public';")
existing_media_ids = {image for image in newCursor.fetchall()}
cur.execute("SELECT id, media_id FROM media WHERE media_id IS NOT NULL AND platform='instagram' AND status='public' AND id > %s ORDER BY id ASC", (last_id,))
rows = cur.fetchall()
print('Getting existing users...')
newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL AND platform = 'instagram'")
existing_users = {user[0].lower(): user[1].lower() for user in newCursor.fetchall()}
for r in rows:
media_ids.add(r['media_id'])
last_id = max(last_id, r['id'])
cache_file = os.path.join(temp_directory, 'existing_media_ids.json')
with open(cache_file, 'w') as f:
json.dump({'timestamp': datetime.now().isoformat(), 'existing_media_ids': list(existing_media_ids), 'existing_users': existing_users}, f)
return existing_media_ids, existing_users
if rows:
save_cached_data(media_ids, users, last_id)
return media_ids, users
# -------------------- MAIN --------------------
if __name__ == '__main__':
print('Starting processing...')
@@ -366,16 +369,11 @@ if __name__ == '__main__':
if not funcs.get_files(directory):
print('No files to process. Exiting...')
exit()
newDB, newCursor = config.gen_connection()
newDB, newCursor = config.gen_connection()
obj_storage = config.get_storage()
existing_media_ids, existing_users = get_existing_medias(newCursor)
dump_instagram()
print("Processing completed.")
# for mediatype, _ in media_types.items():
# funcs.clean_empty_folders(os.path.join(directory, mediatype))
print("Processing completed.")