parent 89c8e35e3b
commit 1d8bb3c85f
@ -0,0 +1,95 @@
import os
import json
import config
import imagehash
from PIL import Image
from funcs import get_files, calculate_file_hash


def generate_image_phash(filepath, hash_size=8):
    try:
        # Open the image using PIL
        pil_image = Image.open(filepath)

        # Compute pHash using the imagehash library
        phash = imagehash.phash(pil_image, hash_size=hash_size)
        return phash
    except Exception as e:
        print(f"Error processing image {filepath}: {e}")
        return None

def are_phashes_duplicates(phash1, phash2, threshold=5):
    try:
        # Compute the Hamming distance between the pHashes
        distance = phash1 - phash2
        return distance <= threshold
    except TypeError as e:
        print(f"Error comparing pHashes: {e}")
        return False

def get_media_by_phash(phash, username, existing_medias, threshold=5):
    for media in existing_medias:
        existing_phash_str = media[1]
        existing_username = media[2]

        # Convert stored pHash string to ImageHash object
        existing_phash = imagehash.hex_to_hash(existing_phash_str)

        # Check if the current pHash is a duplicate
        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
            return media
    return None

def get_media_by_hash(hash, existing_medias):
    for media in existing_medias:
        existing_hash = media[1]
        if hash == existing_hash:
            return media
    return None

def get_media_by_id(media_id, existing_medias):
    for media in existing_medias:
        existing_media_id = media[1]
        if media_id == existing_media_id:
            return media
    return None

def get_data_by_filename(filename, data):
    for item in data:
        if filename in item['filepath']:
            return item
    return None

directory = 'check_if_exists'  # Directory containing user images

# Database connection
db, cursor = config.gen_connection()

# Fetch existing media with pHashes (assuming media are images, adjust media_type if needed)
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL", ['image'])
existing_medias = cursor.fetchall()

usernames = os.listdir(directory)

for username in usernames:
    files = get_files(os.path.join(directory, username))
    for filepath in files:
        image_filename = os.path.basename(filepath)
        print(f'Processing {image_filename}...')

        # Generate pHash for the image
        phash = generate_image_phash(filepath, hash_size=8)
        if phash is None:
            continue  # Skip this image if there's an issue

        phash_str = str(phash)

        # Check if the image is a duplicate of any in the database
        duplicate_media = get_media_by_phash(phash, username, existing_medias, threshold=5)
        if duplicate_media:
            print(f'Duplicate found: https://altpins.com/pin/{duplicate_media[0]}')
            print(f'Duplicate image path: {filepath}')
            newpath = os.path.join('duplicates', duplicate_media[2], image_filename)
            os.makedirs(os.path.dirname(newpath), exist_ok=True)
            os.rename(filepath, newpath)
            print(f'Moved {image_filename} to duplicates/')
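A minimal sketch (the image paths are placeholders) of the imagehash behaviour the helpers above rely on: str() serializes a pHash to hex, imagehash.hex_to_hash() restores it, and subtracting two hashes gives the bit-level Hamming distance that are_phashes_duplicates() compares against the threshold.

import imagehash
from PIL import Image

# Placeholder files, for illustration only.
a = imagehash.phash(Image.open('a.jpg'), hash_size=8)
b = imagehash.phash(Image.open('b.jpg'), hash_size=8)

a_hex = str(a)                             # hex string, as stored in the database
assert imagehash.hex_to_hash(a_hex) == a   # round-trip is lossless

distance = a - b                           # Hamming distance in bits (0..64 for hash_size=8)
print(distance, distance <= 5)             # 5 is the threshold used throughout these scripts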
@ -0,0 +1,68 @@
import os
from funcs import generate_phash  # Assuming this function computes the pHash and returns a string
import imagehash

def get_files(directory):
    # Recursively get all files in the directory
    file_list = []
    for root, dirs, files in os.walk(directory):
        for filename in files:
            file_list.append(os.path.join(root, filename))
    return file_list

# Function to compute pHashes for all images in a directory
def compute_phashes(image_paths):
    phash_dict = {}
    for image_path in image_paths:
        try:
            # Compute pHash and get it as a string
            phash_str = generate_phash(image_path)
            # Convert the hash string to an ImageHash object
            phash = imagehash.hex_to_hash(phash_str)
            phash_dict[image_path] = phash
        except Exception as e:
            print(f"Error processing {image_path}: {e}")
    return phash_dict

# Get all image files from 'ready_to_upload' and 'sorted' directories
ready_images = get_files('ready_to_upload')
ready_images = [image for image in ready_images if not image.lower().endswith('.mp4')]

sorted_images = get_files('sorted')
sorted_images = [image for image in sorted_images if not image.lower().endswith('.mp4')]

# Compute pHashes for images in 'ready_to_upload'
print("Computing pHashes for 'ready_to_upload' images...")
ready_image_phashes = compute_phashes(ready_images)

# Compute pHashes for images in 'sorted'
print("Computing pHashes for 'sorted' images...")
sorted_image_phashes = compute_phashes(sorted_images)

# Prepare the 'already_processed' directory
os.makedirs('already_processed', exist_ok=True)

# Set a Hamming distance threshold for considering images as duplicates
threshold = 5  # Adjust this value as needed

# Find and move duplicates
for sorted_image, sorted_phash in sorted_image_phashes.items():
    duplicate_found = False
    for ready_image, ready_phash in ready_image_phashes.items():
        # Compute Hamming distance between the two pHashes
        try:
            distance = sorted_phash - ready_phash
        except TypeError as e:
            print(f"Error comparing hashes for {sorted_image} and {ready_image}: {e}")
            continue

        if distance <= threshold:
            # Duplicate found
            newpath = sorted_image.replace('sorted', 'already_processed')
            os.makedirs(os.path.dirname(newpath), exist_ok=True)
            print(f"Moving {sorted_image} (duplicate of {ready_image}) to 'already_processed'")
            os.rename(sorted_image, newpath)
            duplicate_found = True
            break  # Exit the loop since a duplicate is found
    if not duplicate_found:
        print(f"No duplicate found for {sorted_image}")
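The nested loop above compares every 'sorted' image against every 'ready_to_upload' image. A possible shortcut, sketched below with the same path-to-ImageHash dicts (the helper name is illustrative, not part of the script), is to try an exact hex match through a lookup table before falling back to the threshold scan.

def find_duplicate(sorted_phash, ready_image_phashes, threshold=5):
    """Return the path of a matching 'ready' image, or None."""
    # Exact pHash match first (distance 0), via a hex lookup table.
    by_hex = {str(h): p for p, h in ready_image_phashes.items()}
    exact = by_hex.get(str(sorted_phash))
    if exact is not None:
        return exact
    # Otherwise scan for a near-duplicate within the Hamming threshold.
    for ready_image, ready_phash in ready_image_phashes.items():
        if sorted_phash - ready_phash <= threshold:
            return ready_image
    return None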
@ -0,0 +1,61 @@
import os
import funcs
import config

# Function to find the closest perceptual hash (phash) match
def find_almost_identical_phash(phash, usernames, max_distance=1):
    """
    Find a username whose phash is nearly identical to the given phash.
    :param phash: The phash to compare (e.g., from the 'unknown' image).
    :param usernames: List of tuples containing (username, phash).
    :param max_distance: Maximum Hamming distance to consider as "identical".
    :return: The matching username and phash, or None if no match is found.
    """
    for username in usernames:
        dist = hamming_distance(phash, username[1])
        if dist <= max_distance:
            return username
    return None

def hamming_distance(phash1, phash2):
    """
    Calculate the distance between two equal-length hash strings,
    compared character by character.
    """
    if len(phash1) != len(phash2):
        raise ValueError("Hashes must be of the same length")
    return sum(c1 != c2 for c1, c2 in zip(phash1, phash2))


# Establish database connection
db, cursor = config.gen_connection()

# Fetch all images with an 'unknown' username
cursor.execute("SELECT id, username, phash FROM media WHERE username = 'unknown'")
rows = cursor.fetchall()

# Fetch all non-unknown usernames and their associated phash
cursor.execute("SELECT username, phash FROM media WHERE username != 'unknown' AND phash IS NOT NULL AND status = 'public'")
usernames = cursor.fetchall()

# Ensure there are valid usernames to compare against
if not usernames:
    print("No known usernames found in the database.")
    exit()

# Match each 'unknown' image against the known usernames
for row in rows:
    id = row[0]
    phash = row[2]

    # Find a nearly identical phash match
    closest = find_almost_identical_phash(phash, usernames, max_distance=2)

    if closest:
        print(f"Found match for image {id}: {closest[0]} with phash {closest[1]}")
        cursor.execute(
            "UPDATE media SET username = %s WHERE id = %s",
            (closest[0], id),
        )
        db.commit()
    else:
        print(f"No nearly identical match found for image {id}.")
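Note that hamming_distance() above compares the stored hash strings character by character, so one differing hex digit counts as 1 even if all four of its bits changed. If bit-level distance is wanted, matching the scripts that subtract ImageHash objects, a small sketch using the same imagehash library:

import imagehash

def bit_hamming_distance(phash_hex1, phash_hex2):
    # hex_to_hash() rebuilds the ImageHash objects; subtraction counts differing bits.
    return imagehash.hex_to_hash(phash_hex1) - imagehash.hex_to_hash(phash_hex2)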
@ -0,0 +1,87 @@
import os
import config
import cv2
import imagehash
from PIL import Image

def generate_thumbnail_phash(filepath, hash_size=8):  # Set hash_size to 8
    cap = cv2.VideoCapture(filepath)
    ret, frame = cap.read()
    cap.release()

    if not ret:
        print(f"Error reading frame from {filepath}")
        return None

    # Resize frame to a standard size
    standard_size = (320, 240)
    resized_frame = cv2.resize(frame, standard_size, interpolation=cv2.INTER_AREA)

    # Convert OpenCV image (BGR) to PIL Image (RGB)
    image_rgb = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(image_rgb)

    # Compute pHash
    phash = imagehash.phash(pil_image, hash_size=hash_size)

    return phash

def are_phashes_duplicates(phash1, phash2, threshold=5):
    # Compute Hamming distance between the pHashes
    try:
        distance = phash1 - phash2
    except TypeError as e:
        print(f"Error comparing pHashes: {e}")
        return False

    return distance <= threshold

def get_media_by_phash(phash, username, existing_medias, threshold=5):
    for media in existing_medias:
        existing_phash_str = media[1]
        existing_username = media[2]
        if existing_username != username:
            continue

        # Convert stored phash string to ImageHash object
        existing_phash = imagehash.hex_to_hash(existing_phash_str)

        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
            return media
    return None

# Database connection
db, cursor = config.gen_connection()

# Directory containing user videos
directory = 'check_if_exists'

# Fetch existing videos with pHashes
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL", ['video'])
existing_medias = cursor.fetchall()

users = os.listdir(directory)  # Assuming 'check_if_exists' contains user videos
for username in users:
    user_videos_path = os.path.join(directory, username)
    if not os.path.isdir(user_videos_path):
        continue

    videos = [video for video in os.listdir(user_videos_path) if video.endswith(('.mp4', '.avi', '.mov'))]
    for video in videos:
        print(f'Processing {video}...')
        filepath = os.path.join(user_videos_path, video)

        phash = generate_thumbnail_phash(filepath, hash_size=8)  # Use hash_size=8
        if phash is None:
            continue

        phash_str = str(phash)

        duplicate_media = get_media_by_phash(phash, username, existing_medias, threshold=5)
        if duplicate_media:
            print(f'Duplicate url found: https://altpins.com/pin/{duplicate_media[0]}')
            print(f'Duplicate video path: {filepath}')
            newpath = filepath.replace(directory, 'duplicates')
            os.makedirs(os.path.dirname(newpath), exist_ok=True)
            os.rename(filepath, newpath)
            print(f'Moved {video} to duplicates/')
@ -0,0 +1,58 @@
import os
from funcs import generate_phash

def find_duplicates(source_dir, target_dir, extensions, max_distance):
    """Remove duplicates in target_dir that are found in source_dir based on Hamming distance."""
    source_files = {}
    target_files = {}

    # Helper function to filter files by extension
    def filter_files(files):
        return [f for f in files if os.path.splitext(f)[1].lower() in extensions]

    # Build hash map of source directory
    for dirpath, _, filenames in os.walk(source_dir):
        for filename in filter_files(filenames):
            filepath = os.path.join(dirpath, filename)
            filehash = generate_phash(filepath, str=False)
            if filehash:
                source_files[filehash] = filepath

    # Build hash map of target directory and compare
    for dirpath, _, filenames in os.walk(target_dir):
        for filename in filter_files(filenames):
            filepath = os.path.join(dirpath, filename)
            filehash = generate_phash(filepath, str=False)
            if not filehash:
                continue

            # Check if this file is similar to any of the source files
            is_duplicate = False
            for source_hash in source_files.keys():
                distance = filehash - source_hash  # Hamming distance
                if distance <= max_distance:
                    is_duplicate = True
                    break  # Found a duplicate

            if is_duplicate:
                newpath = os.path.join('duplicates', filename)
                os.makedirs(os.path.dirname(newpath), exist_ok=True)
                os.rename(filepath, newpath)
                print(f"Moved duplicate: {filepath} to duplicates/ (distance: {distance})")
            else:
                target_files[filehash] = filepath

if __name__ == '__main__':
    # Paths to the directories
    source_dir = 'D:/Crawlers/media/Coomer/sadierayxo'
    target_dir = 'sorted/sadierayxo'

    # List of accepted extensions
    extensions = {'.png', '.jpg', '.jpeg', '.webp', '.gif'}

    # Maximum Hamming distance to consider as duplicates
    MAX_DISTANCE = 5  # Adjust this threshold as needed

    find_duplicates(source_dir, target_dir, extensions, MAX_DISTANCE)

    print("Duplicate removal process completed.")
@ -0,0 +1,34 @@
from concurrent.futures import ThreadPoolExecutor
from BunnyCDN.Storage import Storage
import config, os

def DownloadFile(serverPath, cacheDir):
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if os.path.exists(localFilePath):
        print(f"File already exists: {localFilePath}")
        return localFilePath

    obj_storage.DownloadFile(storage_path=serverPath, download_path=cacheDir)
    print(f"Downloaded {serverPath} to {localFilePath}")
    return localFilePath

def ImportMedias(results):
    with ThreadPoolExecutor(max_workers=10) as executor:
        for video in results:
            serverPath = video[2].replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
            executor.submit(DownloadFile, serverPath, cacheDir)


obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

db, cursor = config.gen_connection()

cursor.execute("SELECT id, media_id, media_url FROM media WHERE file_size = 0;")
results = cursor.fetchall()

cacheDir = 'cache'

print(f"Found {len(results)} files to process.")

ImportMedias(results)
@ -0,0 +1,32 @@
import os, funcs
from funcs import generate_phash

def get_username(image, ready_images):
    for ready_image in ready_images:
        if os.path.basename(image) in ready_image:
            ready_image = ready_image.replace('\\', '/')
            return ready_image.split('/')[1]
    return None

ready_images = funcs.get_files('ready_to_upload')
ready_images = [image for image in ready_images if not image.endswith('.mp4')]

sorted_images = funcs.get_files('sorted')
sorted_images = [image for image in sorted_images if not image.endswith('.mp4')]

os.makedirs('already_processed', exist_ok=True)

for image in sorted_images:
    image = image.replace('\\', '/')
    username = image.split('/')[1]
    filename = os.path.basename(image)

    for ready_image in ready_images:
        if filename in ready_image:
            username = get_username(image, ready_images)
            newpath = ready_image.replace('ready_to_upload', 'already_processed')
            os.makedirs(os.path.dirname(newpath), exist_ok=True)
            print(f'Moving {image} which is a match for {ready_image} to already_processed')
            os.rename(image, newpath)
            print(f'Moved {image} to already_processed')
            break
@ -0,0 +1,40 @@
import config, os, json
from PIL import Image
import imagehash

def find_file(filename, directory):
    filename = filename.lower().split('.')[0]
    for root, dirs, files in os.walk(directory):
        for file in files:
            if filename in file:
                return os.path.join(root, file)
    return None

def generate_phash(image_path):
    image = Image.open(image_path)
    return str(imagehash.phash(image))

count = 0

cacheDir = 'sorted'
dataPath = 'pins.json'

os.makedirs(cacheDir, exist_ok=True)

medias = json.load(open(dataPath))

for item in medias:
    count += 1

    filepath = item['filepath']
    if os.path.exists(filepath):
        continue

    newfilepath = find_file(os.path.basename(filepath), cacheDir)
    if newfilepath:
        print(f"Found file {newfilepath} for {filepath}")
        item['filepath'] = newfilepath


with open(dataPath, 'w') as f:
    json.dump(medias, f)
@ -0,0 +1,28 @@
import os, json
from funcs import generate_phash

count = 0
cacheDir = '_sort'
dataPath = 'pins.json'

os.makedirs(cacheDir, exist_ok=True)

medias = json.load(open(dataPath))

for item in medias:
    count += 1
    if item['type'] == 'image':
        filepath = item['filepath']
        if 'phash' in item:
            print(f"Skipping {count}/{len(medias)}: already processed.")
            continue

        if not os.path.exists(filepath):
            print(f"File {filepath} does not exist, skipping.")
            continue
        phash = generate_phash(filepath)
        item['phash'] = phash
        print(f"Processed {count}/{len(medias)}: with pHash {phash}")

with open(dataPath, 'w') as f:
    json.dump(medias, f)
@ -0,0 +1,36 @@
import config
from funcs import generate_phash

count = 0

storage = config.get_storage()

db, cursor = config.gen_connection()

generate_for = 'media_url'
media_type = 'image'

cursor.execute(f"SELECT id, {generate_for} FROM media WHERE media_type = %s AND phash IS NULL;", [media_type])
medias = cursor.fetchall()

for item in medias:
    count += 1

    itemID = item[0]
    media_url = item[1]

    server_path = media_url.replace('https://storysave.b-cdn.net/', '').replace('\\', '/')
    filepath = storage.DownloadFile(server_path, 'temp')
    if not filepath:
        print(f"Error downloading {server_path}")
        continue

    phash = generate_phash(filepath)
    if not phash:
        print(f"Error generating pHash for {filepath}")
        continue

    cursor.execute("UPDATE media SET phash = %s WHERE id = %s", [phash, itemID])
    db.commit()

    print(f"[{cursor.rowcount}] Processed {count}/{len(medias)}: with pHash {phash}")
@ -0,0 +1,39 @@
import config, os
from funcs import generate_phash

db, cursor = config.gen_connection()

cursor.execute("SELECT id, media_id, media_url FROM media WHERE media_type = 'image' AND phash = 0;")
results = cursor.fetchall()

count = 0
cacheDir = 'cache'
os.makedirs(cacheDir, exist_ok=True)
print(f"Found {len(results)} files to process.")


for result in results:
    count += 1
    itemID = result[0]
    mediaID = result[1]
    if not mediaID:
        print("Media ID is null, skipping.")
        continue
    mediaURL = result[2]

    serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if not os.path.exists(localFilePath):
        print(f"File {localFilePath} does not exist, skipping.")
        continue

    phash = generate_phash(localFilePath)
    if not phash:
        print(f"Error generating pHash for {localFilePath}, skipping.")
        continue

    cursor.execute("UPDATE media SET phash = %s WHERE id = %s", (phash, itemID))
    db.commit()

    print(f"Processed {count}/{len(results)}: {mediaID} with pHash {phash}")
@ -0,0 +1,74 @@
import config, os, threading, queue
from funcs import generate_phash

# Initialize database connection
db, cursor = config.gen_connection()

# Query the media table for unprocessed images
cursor.execute("SELECT id, media_id, media_url FROM media WHERE media_type = 'image' AND phash = '0';")
results = cursor.fetchall()

# Setup cache directory
cacheDir = 'cache'
os.makedirs(cacheDir, exist_ok=True)

print(f"Found {len(results)} files to process.")

# Thread-safe queue for processed media
processed_media_queue = queue.Queue()

def process_media():
    """Thread function to update database with processed pHash values."""
    while True:
        try:
            item = processed_media_queue.get(timeout=10)  # Timeout prevents infinite blocking
            if item is None:  # Sentinel value to exit the loop
                break

            itemID, phash = item
            cursor.execute("UPDATE media SET phash = %s WHERE id = %s", (phash, itemID))
            db.commit()
            print(f"Updated database for ID {itemID} with pHash {phash}.")
        except queue.Empty:
            continue

# Start the database update thread
update_thread = threading.Thread(target=process_media, daemon=True)
update_thread.start()

# Main processing loop for generating pHash
count = 0

for result in results:
    count += 1
    itemID = result[0]
    mediaID = result[1]

    if not mediaID:
        print("Media ID is null, skipping.")
        continue

    mediaURL = result[2]
    serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if not os.path.exists(localFilePath):
        print(f"File {localFilePath} does not exist, skipping.")
        continue

    phash = generate_phash(localFilePath)
    if not phash:
        print(f"Error generating pHash for {localFilePath}, skipping.")
        continue

    # Add the processed media to the queue
    processed_media_queue.put((itemID, phash))
    print(f"Processed {count}/{len(results)}: {mediaID} with pHash {phash}")

# Signal the update thread to stop
processed_media_queue.put(None)

# Wait for the update thread to finish
update_thread.join()

print("Processing completed.")
@ -0,0 +1,51 @@
import os
import config
import cv2
from funcs import generate_phash
from BunnyCDN.Storage import Storage

db, cursor = config.gen_connection()
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

cursor.execute("SELECT id, media_id, media_url FROM media WHERE media_type = 'video' AND phash = '0';")
results = cursor.fetchall()

count = 0
cacheDir = 'cache'
os.makedirs(cacheDir, exist_ok=True)
print(f"Found {len(results)} files to process.")

for result in results:
    count += 1
    itemID = result[0]
    media_id = result[1]

    if not media_id:
        print("Media ID is null, skipping.")
        continue

    mediaURL = result[2]

    serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if not os.path.exists(localFilePath):
        print(f"File {localFilePath} does not exist, skipping.")
        continue

    # Grab the first frame as a thumbnail and hash it
    thumbPath = f'temp/{media_id}.jpg'
    cap = cv2.VideoCapture(localFilePath)
    ret, frame = cap.read()
    cap.release()
    if not ret:
        print(f"Error reading frame from {localFilePath}, skipping.")
        continue
    cv2.imwrite(thumbPath, frame)
    phash = generate_phash(thumbPath)
    os.remove(thumbPath)

    if not phash:
        print(f"Error generating pHash for {localFilePath}, skipping.")
        continue

    cursor.execute("UPDATE media SET phash = %s WHERE id = %s", (phash, itemID))
    db.commit()

    print(f"Processed {count}/{len(results)}: {media_id} with pHash {phash}")
@ -0,0 +1,43 @@
import os
import json
import config

# Establish database connection
db, cursor = config.gen_connection()

# Fetch rows with file_size = 0
cursor.execute("SELECT id, media_id, media_url FROM media WHERE file_size = 0;")
results = cursor.fetchall()

cacheDir = 'cache'
os.makedirs(cacheDir, exist_ok=True)
print(f"Found {len(results)} files to process.")

update_data = []
for result in results:
    itemID = result[0]
    media_id = result[1]

    if not media_id:
        print(f"Media ID is null for ID {itemID}, skipping.")
        continue

    mediaURL = result[2]
    serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if not os.path.exists(localFilePath):
        print(f"File {localFilePath} does not exist for ID {itemID}, skipping.")
        continue

    file_size = os.path.getsize(localFilePath)
    update_data.append({"id": itemID, "file_size": file_size})

# Save the results to a JSON file
output_file = "update_data.json"
with open(output_file, 'w') as f:
    json.dump(update_data, f, indent=4)

print(f"Saved {len(update_data)} updates to {output_file}.")
cursor.close()
db.close()
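For reference, the records this script writes to update_data.json (and which the follow-up update script consumes) form a flat list of id/file_size pairs; the values below are invented for illustration.

# Illustrative contents of update_data.json (ids and byte sizes are made up):
# [
#     {"id": 101, "file_size": 2048576},
#     {"id": 102, "file_size": 734003}
# ]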
File diff suppressed because it is too large
@ -0,0 +1,29 @@
import json
import config

# Establish database connection
db, cursor = config.gen_connection()

# Load update data from the JSON file
input_file = "update_data.json"
with open(input_file, 'r') as f:
    update_data = json.load(f)

print(f"Loaded {len(update_data)} records to update.")

# Process each record one by one
for count, item in enumerate(update_data, start=1):
    item_id = item["id"]
    file_size = item["file_size"]

    try:
        cursor.execute("UPDATE media SET file_size = %s WHERE id = %s", (file_size, item_id))
        db.commit()
        print(f"Processed {count}/{len(update_data)}: ID {item_id} updated with file size {file_size}.")
    except Exception as e:
        print(f"Error updating ID {item_id}: {e}")
        db.rollback()

print("All updates completed.")
cursor.close()
db.close()
@ -0,0 +1,31 @@
from BunnyCDN.Storage import Storage
import config, os

db, cursor = config.gen_connection()
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

cursor.execute("SELECT id, media_id, media_url FROM media WHERE file_size = 0;")
results = cursor.fetchall()
print(f"Found {len(results)} files to process.")

cacheDir = 'cache'

for result in results:
    itemID = result[0]

    mediaURL = result[2]
    serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if not os.path.exists(localFilePath):
        continue

    file_size = os.path.getsize(localFilePath)

    cursor.execute("UPDATE media SET file_size = %s WHERE id = %s;", (file_size, itemID))
    db.commit()

    print(f"Processed ID {itemID}: updated with file size {file_size}.")

cursor.close()
db.close()
@ -0,0 +1,181 @@
import os
import json
import config
import imagehash
from PIL import Image
from funcs import get_files, calculate_file_hash, remove_empty_folders  # Assuming this is defined elsewhere

def generate_image_phash(filepath, hash_size=8):
    try:
        # Open the image using PIL
        pil_image = Image.open(filepath)

        # Compute pHash using the imagehash library
        phash = imagehash.phash(pil_image, hash_size=hash_size)
        return phash
    except Exception as e:
        print(f"Error processing image {filepath}: {e}")
        return None

def are_phashes_duplicates(phash1, phash2, threshold=5):
    try:
        # Compute the Hamming distance between the pHashes
        distance = phash1 - phash2
        return distance <= threshold
    except TypeError as e:
        print(f"Error comparing pHashes: {e}")
        return False

def get_media_by_phash(phash, username, existing_medias, threshold=5):
    for media in existing_medias:
        existing_phash_str = media[1]
        existing_username = media[2]

        if existing_username != username:
            continue

        # Convert stored pHash string to ImageHash object
        existing_phash = imagehash.hex_to_hash(existing_phash_str)

        # Check if the current pHash is a duplicate
        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
            return media
    return None

def get_media_by_hash(hash, existing_medias):
    for media in existing_medias:
        existing_hash = media[1]
        if hash == existing_hash:
            return media
    return None

def get_media_by_id(media_id, existing_medias):
    for media in existing_medias:
        existing_media_id = media[1]
        if media_id == existing_media_id:
            return media
    return None

def get_data_by_filename(filename, data):
    for item in data:
        if filename in item['filepath']:
            return item
    return None

directory = 'images'
data = json.load(open('pins.json'))

files = get_files(directory)
knownExtensions = ['jpg', 'png', 'jpeg', 'gif', 'webp']
for file in files:
    fileExt = file.split('.')[-1].lower()
    if fileExt not in knownExtensions:
        print(f"Unknown extension for {file}, renaming by file hash")
        filehash = calculate_file_hash(file)
        newfilename = f"{filehash}.jpg"
        currentDir = os.path.dirname(file)
        newfilepath = os.path.join(currentDir, newfilename)
        os.rename(file, newfilepath)

files = get_files(directory)

# Sort files by username and move them into the directory folder where each subfolder is a username
for file in files:
    item_data = get_data_by_filename(os.path.basename(file).split('.')[0], data)

    if not item_data:
        print(f"Data not found for {file}")
        continue

    username = item_data['username']
    newpath = os.path.join(directory, username, os.path.basename(file))

    if newpath == file:
        continue

    os.makedirs(os.path.dirname(newpath), exist_ok=True)
    os.rename(file, newpath)

# Database connection
db, cursor = config.gen_connection()

# now find dupes by media_id
cursor.execute("SELECT id, media_id, username FROM media WHERE media_type = %s AND media_id IS NOT NULL", ['image'])
items = cursor.fetchall()

media_ids = [item[1] for item in items]

files = get_files(directory)

for file in files:
    try:
        media_id = os.path.basename(file).split('.')[0]
        media_id = int(media_id)
    except:
        print(f"Error parsing media_id from {file}")
        continue

    if media_id in media_ids:
        media_item = get_media_by_id(media_id, items)
        print(f"Duplicate found: https://altpins.com/pin/{media_item[0]}")
        print(f"Duplicate file: {file}")
        newpath = os.path.join('duplicates', media_item[2], os.path.basename(file))
        os.makedirs(os.path.dirname(newpath), exist_ok=True)
        os.rename(file, newpath)
    else:
        print(f"Unique file: {file}")


cursor.execute("SELECT id, hash, username FROM media WHERE media_type = %s AND hash IS NOT NULL", ['image'])
items = cursor.fetchall()

hashes = [item[1] for item in items]

files = get_files(directory)

for file in files:
    hash = calculate_file_hash(file)
    if hash in hashes:
        media_item = get_media_by_hash(hash, items)
        print(f"Duplicate found: https://altpins.com/pin/{media_item[0]}")
        print(f"Duplicate file: {file}")
        newpath = os.path.join('duplicates', media_item[2], os.path.basename(file))
        os.makedirs(os.path.dirname(newpath), exist_ok=True)
        os.rename(file, newpath)
    else:
        print(f"Unique file: {file}")


# Fetch existing media with pHashes (assuming media are images, adjust media_type if needed)
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL", ['image'])
existing_medias = cursor.fetchall()

# Go through the directory folder where each subfolder is a username
files = get_files(directory)

for filepath in files:
    image_filename = os.path.basename(filepath)
    print(f'Processing {image_filename}...')

    # Generate pHash for the image
    phash = generate_image_phash(filepath, hash_size=8)
    if phash is None:
        continue  # Skip this image if there's an issue

    phash_str = str(phash)

    item_data = get_data_by_filename(image_filename, data)
    if not item_data:
        print(f"Data not found for {image_filename}")
        continue
    username = item_data['username']

    # Check if the image is a duplicate of any in the database
    duplicate_media = get_media_by_phash(phash, username, existing_medias, threshold=5)
    if duplicate_media:
        print(f'Duplicate found: https://altpins.com/pin/{duplicate_media[0]}')
        print(f'Duplicate image path: {filepath}')
        newpath = os.path.join('duplicates', duplicate_media[2], image_filename)
        os.makedirs(os.path.dirname(newpath), exist_ok=True)
        os.rename(filepath, newpath)
        print(f'Moved {image_filename} to duplicates/')
@ -0,0 +1,17 @@
import os, config, funcs

db, cursor = config.gen_connection()

cursor.execute("SELECT phash FROM media WHERE phash IS NOT NULL")
phashes = set([x[0] for x in cursor.fetchall()])

files = funcs.get_files("check_if_exists")

for file in files:
    image_phash = funcs.generate_phash(file)

    if image_phash in phashes:
        print(f"File {file} exists in the database")
        os.remove(file)

funcs.cleanEmptyFolders("check_if_exists")
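The set lookup above only removes files whose pHash string is byte-for-byte identical to a stored one. If near-duplicates should be caught as well (the other scripts use a Hamming threshold of 5), a sketch of the same check with imagehash; phashes and funcs.generate_phash are the objects defined above.

import imagehash

def is_near_duplicate(phash_str, phashes, threshold=5):
    """True if phash_str is within `threshold` bits of any stored pHash."""
    candidate = imagehash.hex_to_hash(phash_str)
    return any(candidate - imagehash.hex_to_hash(existing) <= threshold
               for existing in phashes)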
@ -0,0 +1,57 @@
import os
import hashlib

# Directories
fucked_dir = 'tiktoks/fucked/aleksandra'
source_dir = 'tiktoks/waiting_for_process/aleksandraverse'

def hash_file(filepath):
    """Generate MD5 hash of a file."""
    hash_md5 = hashlib.md5()
    with open(filepath, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

def get_file_hashes(directory):
    """Generate a dictionary of file hashes for all files in a directory."""
    file_hashes = {}
    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            file_hashes[file_path] = hash_file(file_path)
    return file_hashes

def files_are_identical(file1, file2):
    """Compare two files byte-by-byte."""
    with open(file1, "rb") as f1, open(file2, "rb") as f2:
        while True:
            chunk1 = f1.read(4096)
            chunk2 = f2.read(4096)
            if chunk1 != chunk2:
                return False
            if not chunk1:  # End of file
                return True

def remove_duplicates(fucked_dir, source_files):
    """Remove files in 'fucked' that are identical to those in 'source_files'."""
    for root, _, files in os.walk(fucked_dir):
        for file in files:
            file_path = os.path.join(root, file)
            for source_file in source_files:
                if files_are_identical(file_path, source_file):
                    print(f"Duplicate found. Removing: {file_path}")
                    os.remove(file_path)
                    break

def main():
    print("Scanning source directory for hashes...")
    source_hashes = get_file_hashes(source_dir)

    print("Scanning 'fucked' directory for duplicates...")
    remove_duplicates(fucked_dir, source_hashes)

    print("Cleanup complete.")

if __name__ == "__main__":
    main()
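remove_duplicates() above walks the dictionary's keys and still compares files byte by byte, so the MD5 values computed by get_file_hashes() are never actually consulted. A hash-first variant, sketched under the same helpers defined in this script:

def remove_duplicates_by_hash(fucked_dir, source_hashes):
    """Delete files in fucked_dir whose MD5 matches any hash in source_hashes."""
    known = set(source_hashes.values())
    for root, _, files in os.walk(fucked_dir):
        for file in files:
            file_path = os.path.join(root, file)
            if hash_file(file_path) in known:
                print(f"Duplicate found. Removing: {file_path}")
                os.remove(file_path)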
File diff suppressed because it is too large
@ -0,0 +1,70 @@
import os, requests, json
from bs4 import BeautifulSoup
from funcs import download_file

def get_data(username):
    url = f"https://www.snapchat.com/add/{username}"
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    data = soup.find("script", id="__NEXT_DATA__")
    data = json.loads(data.string)
    return data

def parse_stories(stories):
    parsed_stories = []
    for story in stories:
        snap_id = story['snapId']['value']
        snap_url = story['snapUrls']['mediaUrl']
        timestamp = story['timestampInSec']['value']
        parsed_stories.append({"media_id": snap_id, "url": snap_url, "timestamp": timestamp})

    return parsed_stories

def get_stories(data):
    stories = data['props']['pageProps']['story']['snapList']

    stories = parse_stories(stories)

    return stories

def get_highlights(data):
    highlights = data['props']['pageProps']['curatedHighlights']
    return highlights

def get_highlight_stories(data):
    highlights = get_highlights(data)
    stories = []
    for highlight in highlights:
        stories.extend(parse_stories(highlight['snapList']))
    return stories

def main():
    directory = "snapchat_stories"
    usernames = ['little.warren1', 'neiima22', 'awesome.nads', 'noordabash', 'aleximarianna']

    for username in usernames:
        print(f"Getting stories for {username}...")

        data = get_data(username)

        print("Getting stories...")
        stories = get_stories(data)

        print("Getting highlights...")
        stories.extend(get_highlight_stories(data))

        for story in stories:
            media_id = story['media_id']
            url = story['url']
            timestamp = story['timestamp']

            filename = f"{media_id}.jpg"
            filepath = os.path.join(directory, filename)

            download_file(url, filepath)

            print(f"Downloaded {filename} at {timestamp}")

if __name__ == "__main__":
    main()
@ -0,0 +1,164 @@
import os
import requests
import json
from bs4 import BeautifulSoup

def get_data(username):
    url = f"https://www.snapchat.com/add/{username}"
    headers = {
        "user-agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/129.0.0.0 Safari/537.36")
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    data_script = soup.find("script", id="__NEXT_DATA__")
    if not data_script:
        print(f"No data found for {username}.")
        return None
    data = json.loads(data_script.string)
    return data

def parse_stories(stories):
    parsed_stories = []
    for story in stories:
        snap_id = story.get('snapId', {}).get('value', '')
        snap_url = story.get('snapUrls', {}).get('mediaUrl', '')
        timestamp = story.get('timestampInSec', {}).get('value', '')
        if snap_url and timestamp and snap_id:
            parsed_stories.append({
                "media_id": snap_id,
                "url": snap_url,
                "timestamp": timestamp
            })
    return parsed_stories

def get_stories(data):
    try:
        stories = data['props']['pageProps']['story']['snapList']
        return parse_stories(stories)
    except KeyError:
        return []

def get_highlights(data):
    highlights = []
    page_props = data.get('props', {}).get('pageProps', {})
    # Possible keys that might contain highlights
    possible_highlight_keys = ['curatedHighlights', 'savedHighlights', 'highlights']
    for key in possible_highlight_keys:
        highlight_data = page_props.get(key, [])
        if highlight_data:
            highlights.extend(highlight_data)
    return highlights

def get_highlight_stories(data):
    stories = []
    highlights = get_highlights(data)
    for highlight in highlights:
        snap_list = highlight.get('snapList', [])

        for snap in snap_list:
            timestamp = snap.get('timestampInSec', {}).get('value', '')
            snap_url = snap.get('snapUrls', {}).get('mediaUrl', '')
            stories.append({
                "media_id": snap.get('snapId', {}).get('value', ''),
                "url": snap_url,
                "timestamp": timestamp
            })

    return stories

def get_existing_media_ids(directory):
    # Get each file's base filename without extension, split it by '~' and take the third field (the media id)
    existing_media_ids = set()
    for root, _, files in os.walk(directory):
        for file in files:
            if '~' not in file:
                continue

            filename, _ = os.path.splitext(file)
            media_id = filename.split('~')[2]
            existing_media_ids.add(media_id)
    return existing_media_ids

def main():
    directory = "snapchat"
    if not os.path.exists(directory):
        os.makedirs(directory)

    usernames = [
        'aleximarianna', 'little.warren1', 'neiima22', 'awesome.nads', 'noordabash',
        'jaynagirl', 'sierracannon', 'stefaniedra6',
        'ciaoxxw', 'nadia-stone', 'c.aitknight', 'aimeejaiii',
        'leonanaomii', 'ratskelet0n',
    ]

    existing_media_ids = get_existing_media_ids(directory)

    for username in usernames:
        print(f"Getting stories for {username}...")
        data = get_data(username)
        if not data:
            continue

        print("Getting stories...")
        stories = get_stories(data)

        print("Getting highlights...")
        stories.extend(get_highlight_stories(data))

        for story in stories:
            media_id = story['media_id']
            url = story['url']
            timestamp = story['timestamp']

            # Check if media already exists
            if media_id in existing_media_ids:
                print(f"Media {media_id} already exists. Skipping download.")
                continue

            # Determine file extension using HEAD request
            response = requests.head(url)
            if response.status_code != 200:
                print(f"Failed to access media {media_id}")
                continue

            content_type = response.headers.get('Content-Type', '')
            if 'image' in content_type:
                extension = '.jpg'
            elif 'video' in content_type:
                extension = '.mp4'
            else:
                print(f"Unknown content type for media {media_id}")
                continue

            if media_id:
                filename = f"{username}~{timestamp}~{media_id}{extension}"
                filepath = os.path.join(directory, filename)
            else:
                media_url_filename = url.split('/')[-1].split('?')[0]
                etag = response.headers.get('ETag', '').replace('"', '')
                filename = f"{username}~{timestamp}-{media_url_filename}~{etag}{extension}"
                filepath = os.path.join(directory, 'highlights', filename)

            # Check if file already exists
            if os.path.exists(filepath):
                print(f"File {filename} already exists. Skipping download.")
                continue

            # Download the media
            response = requests.get(url, stream=True)
            if response.status_code != 200:
                print(f"Failed to download media {media_id}")
                continue

            # Save the file
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)

            print(f"Downloaded {filename} at {timestamp}")

if __name__ == "__main__":
    main()
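A small illustration of the filename convention main() writes and get_existing_media_ids() parses: the media id is the third '~'-separated field of the stem. The values are invented.

import os

filename = "little.warren1~1714000000~abc123.jpg"   # invented example
stem, _ = os.path.splitext(filename)
print(stem.split('~')[2])                           # -> 'abc123'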
@ -0,0 +1,120 @@
from BunnyCDN.Storage import Storage
from datetime import datetime
import os, config, funcs, cv2
from PIL import Image


def UploadMedia(media):
    username = media['username']
    timestamp = media['timestamp']
    filepath = media['filepath']
    thumbnail_url = None
    phash = None

    filename = os.path.basename(filepath)
    file_extension = os.path.splitext(filename)[1].lower()

    if filename in existing_files:
        print('Duplicate file detected. Removing...')
        os.remove(filepath)
        return True


    media_type = funcs.get_media_type(filename)

    file_hash = funcs.calculate_file_hash(filepath)

    if '-' in timestamp:
        timestamp = timestamp.split('-')[0]
    post_date = datetime.fromtimestamp(int(timestamp)) if timestamp else datetime.now()

    width, height = funcs.get_video_dimensions(filepath) if media_type == 'video' else Image.open(filepath).size

    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0  # slower

    if media_type == 'video':
        try:
            thumbPath = f'temp/{file_hash}.jpg'
            cap = cv2.VideoCapture(filepath)
            ret, frame = cap.read()
            cv2.imwrite(thumbPath, frame)
            cap.release()
            obj_storage.PutFile(thumbPath, f'thumbnails/{file_hash}.jpg')  # slower
            thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{file_hash}.jpg"
            phash = funcs.generate_phash(thumbPath)
            os.remove(thumbPath)
        except:
            print('Error generating thumbnail. Skipping...')
            return False
    elif media_type == 'image':
        phash = funcs.generate_phash(filepath)

    newFilename = f'{file_hash}{file_extension}'
    server_path = f'media/snaps/{username}/{newFilename}'

    file_url = f"https://storysave.b-cdn.net/{server_path}"

    obj_storage.PutFile(filepath, server_path)  # slow as fuck

    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, hash, filename, duration, thumbnail, phash, platform) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    values = (username, media_type, file_url, width, height, 'story', post_date, file_hash, filename, duration, thumbnail_url, phash, 'snapchat')

    newCursor.execute(query, values)  # slower
    newDB.commit()
    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')

    os.remove(filepath)

    return True

def get_media_data(filepath):
    filename = os.path.basename(filepath)
    parts = filename.split('~')
    if len(parts) < 3:
        return False

    username = parts[0]
    timestamp = parts[1]

    data = {'username': username, 'timestamp': timestamp, 'filepath': filepath}

    return data

def get_media(folder_path):
    medias = []

    for root, dirs, files in os.walk(folder_path):
        for filename in files:
            filepath = os.path.join(root, filename)

            data = get_media_data(filepath)
            if data:
                medias.append(data)

    return medias

def dump(folder_path):
    medias = get_media(folder_path)

    for media in medias:
        UploadMedia(media)

if __name__ == '__main__':
    print('Starting processing...')

    directory = 'snapchat/'

    if not os.listdir(directory):
        print('No files to process. Exiting...')
        exit()

    newDB, newCursor = config.gen_connection()

    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

    newCursor.execute("SELECT filename FROM media WHERE filename IS NOT NULL AND platform = 'snapchat'")
    existing_files = [image[0] for image in newCursor.fetchall()]

    dump(directory)

    print("Processing completed.")
@ -0,0 +1,141 @@
from BunnyCDN.Storage import Storage
from datetime import datetime
import os, config, funcs, cv2
from PIL import Image


def UploadMedia(media):
    media_id = media['media_id']
    username = media['username']
    post_date = media['timestamp']
    user_id = media['user_id']
    filepath = media['filepath']
    highlight_id = media['highlight_id']
    post_type = media['post_type']
    thumbnail_url = None
    phash = None

    if media_id and int(media_id) in existing_files:
        print('Duplicate file detected. Removing...')
        os.remove(filepath)
        return True

    filename = os.path.basename(filepath)
    file_extension = os.path.splitext(filename)[1].lower()

    media_type = funcs.get_media_type(filename)

    file_hash = funcs.calculate_file_hash(filepath)

    width, height = funcs.get_video_dimensions(filepath) if media_type == 'video' else Image.open(filepath).size

    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0  # slower

    if media_type == 'video':
        try:
            thumbPath = f'temp/{media_id}.jpg'
            cap = cv2.VideoCapture(filepath)
            ret, frame = cap.read()
            cv2.imwrite(thumbPath, frame)
            cap.release()
            obj_storage.PutFile(thumbPath, f'thumbnails/{media_id}.jpg')  # slower
            thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{media_id}.jpg"
            phash = funcs.generate_phash(thumbPath)
            os.remove(thumbPath)
        except:
            print('Error generating thumbnail. Skipping...')
            return False
    elif media_type == 'image':
        phash = funcs.generate_phash(filepath)

    if media_id:
        newFilename = f'{media_id}{file_extension}'
    else:
        newFilename = f'{file_hash}{file_extension}'

    server_path = f'media/{post_type}/{username}/{newFilename}'

    file_url = f"https://storysave.b-cdn.net/{server_path}"

    obj_storage.PutFile(filepath, server_path)  # slow as fuck

    if highlight_id:
        newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)", (highlight_id, user_id, media_id))
        newDB.commit()
        print(f'[{newCursor.rowcount}] added highlight {highlight_id} to user {user_id}')

    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, hash, filename, duration, thumbnail, phash, platform) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    values = (username, media_type, file_url, width, height, media_id, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash, 'instagram')

    newCursor.execute(query, values)  # slower
    newDB.commit()
    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')

    os.remove(filepath)

    return True

def get_user_id(username):
    username = username.lower()
    if username in existing_users:
        return existing_users[username]

    return None

def get_media():
    medias = []
    post_types = {
        'posts': 'post',
        'stories': 'story',
        'profile': 'profile',
    }

    for post_type in os.listdir('media'):
        users = os.listdir(f'media/{post_type}')
        for user in users:
            user_path = f'media/{post_type}/{user}'
            for filename in os.listdir(user_path):
                data = {}
                filepath = os.path.join(user_path, filename)

                data['post_type'] = post_types[post_type]
                data['username'] = user
                data['timestamp'] = filename.split('__')[-1].split('.')[0] if 'com.instagram.android__' in filename else datetime.now()
                if 'com.instagram.android__' in filename:
                    data['timestamp'] = datetime.strptime(data['timestamp'], '%Y%m%d%H%M%S%f')
                data['filepath'] = filepath
                data['media_id'] = None
                data['user_id'] = get_user_id(data['username'])
                data['highlight_id'] = None
                medias.append(data)

    return medias

def dump_instagram():
    medias = get_media()

    for media in medias:
        UploadMedia(media)
        existing_files.append(media['media_id'])


if __name__ == '__main__':
    print('Starting processing...')

    if not os.listdir('storysaver/'):
        print('No files to process. Exiting...')
        exit()

    newDB, newCursor = config.gen_connection()

    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

    newCursor.execute("SELECT media_id FROM media WHERE media_id IS NOT NULL")
    existing_files = [image[0] for image in newCursor.fetchall()]

    newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL")
    existing_users = {user[0].lower(): user[1].lower() for user in newCursor.fetchall()}

    dump_instagram()

    print("Processing completed.")
@ -0,0 +1,38 @@
import os


# file name : masstik_caammmyyy_1310_655_going blonde wednesdayyyy.mp4
# file name : masstiktok_aleksandraverse__#fyp #trending #viral #foryou.mp4
# where the first item is prefix, second is username and after those is the tiktok title

processed_dir = 'processed_tiktoks'
os.makedirs(processed_dir, exist_ok=True)

users = os.listdir('tiktoks')

for user in users:
    files = os.path.join('tiktoks', user)
    for file in os.listdir(files):
        if 'masstik' not in file and 'masstiktok' not in file:
            print(f"Skipping {file}")
            continue

        filepath = os.path.join(files, file)
        file_ext = os.path.splitext(file)[1]
        data = file.split('_')
        prefix = data[0]
        username = data[1]
        username = username.replace('@', '')
        title = ' '.join(data[2:])
        title = os.path.splitext(title)[0]

        print("="*100)
        title = title.encode('utf-8', 'ignore').decode('utf-8')
        print(f"Prefix: {prefix}\nUsername: {username}\nTitle: {title}")
        print("="*100)

        # file_ext already includes the leading dot, so don't add another one
        new_filename = f"{username}~{title}{file_ext}"
        new_filepath = os.path.join(processed_dir, new_filename)

        os.rename(filepath, new_filepath)
        print(f"Renamed {file} to {new_filename}")
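Tracing the first example filename from the comments above through the split logic (and the corrected rename format):

import os

file = "masstik_caammmyyy_1310_655_going blonde wednesdayyyy.mp4"
data = file.split('_')
prefix = data[0]                                    # 'masstik'
username = data[1].replace('@', '')                 # 'caammmyyy'
title = os.path.splitext(' '.join(data[2:]))[0]     # '1310 655 going blonde wednesdayyyy'
file_ext = os.path.splitext(file)[1]                # '.mp4'
print(f"{username}~{title}{file_ext}")              # caammmyyy~1310 655 going blonde wednesdayyyy.mp4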
@ -0,0 +1,109 @@
from BunnyCDN.Storage import Storage
from datetime import datetime
import os, config, funcs, cv2
from PIL import Image

directory = 'ready_to_upload/'

def UploadMedia(username, user_id, filepath):
    thumbnail_url = None
    phash = None

    filename = os.path.basename(filepath)
    file_extension = os.path.splitext(filename)[1].lower()

    media_type = funcs.get_media_type(filename)

    post_type = funcs.determine_post_type(filepath, media_type)
    if not post_type:
        print(f'Error determining post type for {filename}. Skipping...')
        return False

    file_hash = funcs.calculate_file_hash(filepath)

    post_date = datetime.now()

    width, height = funcs.get_video_dimensions(filepath) if media_type == 'video' else Image.open(filepath).size

    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0  # slower

    if media_type == 'video':
        try:
            thumbPath = f'temp/{file_hash}.jpg'
            cap = cv2.VideoCapture(filepath)
            ret, frame = cap.read()
            cv2.imwrite(thumbPath, frame)
            cap.release()
            obj_storage.PutFile(thumbPath, f'thumbnails/{file_hash}.jpg')  # slower
            thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{file_hash}.jpg"
            phash = funcs.generate_phash(thumbPath)
            os.remove(thumbPath)
        except:
            print('Error generating thumbnail. Skipping...')
            return False
    elif media_type == 'image':
        phash = funcs.generate_phash(filepath)

    newFilename = f'{file_hash}{file_extension}'
    server_path = f'media/{post_type}/{username}/{newFilename}'

    file_url = f"https://storysave.b-cdn.net/{server_path}"

    obj_storage.PutFile(filepath, server_path)  # slow as fuck

    post_type = 'story' if post_type == 'stories' else 'post'
    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, user_id, hash, filename, duration, thumbnail, phash, platform) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    values = (username, media_type, file_url, width, height, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash, 'instagram')

    newCursor.execute(query, values)  # slower
    newDB.commit()
    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')

    os.remove(filepath)

    return True

def get_user_id(username):
    username = username.lower()
    if username in existing_users:
        return existing_users[username]

    return None

def get_media(folder_path):
    medias = []

    for user_folder in os.listdir(folder_path):
        files = os.listdir(os.path.join(folder_path, user_folder))
        for filename in files:
            filepath = os.path.join(folder_path, user_folder, filename)
            media = {
                'username': user_folder,
                'filepath': filepath,
                'user_id': get_user_id(user_folder)
            }

            medias.append(media)

    return medias

def dump_instagram(folder_path):
    medias = get_media(folder_path)

    for media in medias:
        UploadMedia(media['username'], media['user_id'], media['filepath'])


if __name__ == '__main__':
    print('Starting processing...')

    newDB, newCursor = config.gen_connection()

    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

    newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL")
    existing_users = {user[0].lower(): user[1].lower() for user in newCursor.fetchall()}

    dump_instagram(directory)

    print("Processing completed.")