main
oscar 9 months ago
parent 4d23278033
commit 48d2330193

@@ -0,0 +1,35 @@
import os
import json
import gzip
data_dir = 'data'
data_compressed_dir = 'data_compressed'
os.makedirs(data_compressed_dir, exist_ok=True)
def compress_file(filepath, output_file):
    # Load the JSON file and write it back out gzip-compressed
    with open(filepath, 'r') as f:
        data = json.load(f)
    compress_data(data, output_file)
    return output_file

def compress_data(data, output_file):
    with gzip.open(output_file, 'wb') as f:
        f.write(json.dumps(data).encode('utf-8'))
    return output_file

data_files = os.listdir(data_dir)

for file in data_files:
    if not file.endswith('.json'):
        continue

    filepath = f'{data_dir}/{file}'
    output_file = f'{data_compressed_dir}/{file}.gz'

    output_file = compress_file(filepath, output_file)
    if output_file:
        print(f'Compressed {file} to {output_file}')
        os.remove(filepath)
    else:
        print(f'Failed to compress {file}')
print('Data compression completed')
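For reference, a minimal sketch of how one of these archives could be read back, assuming the same JSON-then-gzip layout used above (the function name is illustrative, not part of the commit):

import gzip
import json

def read_compressed(path):
    # Open the gzip archive in text mode and decode the JSON payload written by compress_data()
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        return json.load(f)

# Example: data = read_compressed('data_compressed/example.json.gz')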

@@ -0,0 +1,87 @@
from funcs import get_files
from PIL import Image
import imagehash
import config
import os
def generate_image_phash(filepath, hash_size=8):
    try:
        # Open the image using PIL
        pil_image = Image.open(filepath)

        # Compute pHash using the imagehash library
        phash = imagehash.phash(pil_image, hash_size=hash_size)
        return phash
    except Exception as e:
        print(f"Error processing image {filepath}: {e}")
        return None

def are_phashes_duplicates(phash1, phash2, threshold=5):
    try:
        # Compute the Hamming distance between the pHashes
        distance = phash1 - phash2
        return distance <= threshold
    except TypeError as e:
        print(f"Error comparing pHashes: {e}")
        return False

def find_duplicate_phash(phash, existing_medias, threshold=5):
    for media in existing_medias:
        existing_phash_str = media[1]
        existing_username = media[2]

        # Convert the stored pHash string to an ImageHash object
        existing_phash = imagehash.hex_to_hash(existing_phash_str)

        # Check if the current pHash is a duplicate
        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
            return media
    return None

def get_media_by_hash(hash, existing_medias):
    for media in existing_medias:
        existing_hash = media[1]
        if hash == existing_hash:
            return media
    return None

def get_media_by_id(media_id, existing_medias):
    for media in existing_medias:
        existing_media_id = media[1]
        if media_id == existing_media_id:
            return media
    return None

def get_data_by_filename(filename, data):
    for item in data:
        if filename in item['filepath']:
            return item
    return None

# Database connection
db, cursor = config.gen_connection()

# Fetch existing media with pHashes (assuming media are images, adjust media_type if needed)
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL AND media_id IS NULL;", ['image'])
existing_medias = cursor.fetchall()

directory = 'check_if_exists/'  # Directory containing user images

files = [file for file in get_files(directory) if file.endswith(('.jpg', '.jpeg', '.png'))]

for filepath in files:
    image_filename = os.path.basename(filepath)

    # Generate pHash for the image
    phash = generate_image_phash(filepath, hash_size=8)
    if phash is None:
        continue

    # Check if the image is a duplicate of any in the database
    duplicate_media = find_duplicate_phash(phash, existing_medias)
    if duplicate_media:
        print(f'Duplicate found: https://altpins.com/pin/{duplicate_media[0]}')
        print(f'Duplicate image path: {filepath}')

        newpath = os.path.join('duplicates', duplicate_media[2], image_filename)
        os.makedirs(os.path.dirname(newpath), exist_ok=True)
        os.rename(filepath, newpath)
        print(f'Moved {image_filename} to {newpath}')
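As a quick illustration of the comparison these helpers rely on: an ImageHash can be rebuilt from its stored hex string and subtracted from another to get a Hamming distance (the hash values below are made up):

import imagehash

h1 = imagehash.hex_to_hash('fa5c1c3a0e78d2b1')
h2 = imagehash.hex_to_hash('fa5c1c3a0e78d2b3')
print(h1 - h2)  # Hamming distance between the two 64-bit pHashes; <= 5 counts as a duplicate here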

@@ -0,0 +1,79 @@
from funcs import get_files
from PIL import Image
import imagehash
import config
import cv2
import os
def get_video_phash(filepath, hash_size=8):
    cap = cv2.VideoCapture(filepath)
    ret, frame = cap.read()
    cap.release()

    if not ret:
        print(f"Error reading frame from {filepath}")
        return None

    # Resize the frame to a standard size
    standard_size = (320, 240)
    resized_frame = cv2.resize(frame, standard_size, interpolation=cv2.INTER_AREA)

    # Convert OpenCV image (BGR) to PIL Image (RGB)
    image_rgb = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(image_rgb)

    # Compute pHash
    phash = imagehash.phash(pil_image, hash_size=hash_size)
    return phash

def are_phashes_duplicates(phash1, phash2, threshold=5):
    # Compute the Hamming distance between the pHashes
    try:
        distance = phash1 - phash2
    except TypeError as e:
        print(f"Error comparing pHashes: {e}")
        return False
    return distance <= threshold

def get_media_by_phash(phash, existing_medias, threshold=5):
    for media in existing_medias:
        existing_phash_str = media[1]
        existing_username = media[2]
        existing_phash = imagehash.hex_to_hash(existing_phash_str)
        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
            return media
    return None

# Database connection
db, cursor = config.gen_connection()

# Directory containing user videos
directory = 'check_if_exists/'

# Fetch existing videos with pHashes
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL;", ['video'])
existing_medias = cursor.fetchall()

# Make a list of all video files
files = [file for file in get_files(directory) if file.endswith(('.mp4', '.avi', '.mov'))]

for filepath in files:
    video_filename = os.path.basename(filepath)

    phash = get_video_phash(filepath, hash_size=8)
    if phash is None:
        continue

    duplicate_media = get_media_by_phash(phash, existing_medias, threshold=5)
    if duplicate_media:
        print(f'Duplicate url found: https://altpins.com/pin/{duplicate_media[0]}')
        print(f'Duplicate video path: {filepath}')

        newpath = os.path.join('duplicates', duplicate_media[2], video_filename)
        os.makedirs(os.path.dirname(newpath), exist_ok=True)
        os.rename(filepath, newpath)
        print(f'Moved {video_filename} to {newpath}')
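Note that get_video_phash hashes only the first frame. If that ever proves too fragile (for example, clips that open on a black frame), a variant that seeks to the middle of the video before grabbing a frame might look like this; an untested sketch using the same dependencies, not part of the commit:

import cv2
import imagehash
from PIL import Image

def get_video_phash_midframe(filepath, hash_size=8):
    cap = cv2.VideoCapture(filepath)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if frame_count > 0:
        # Jump to the middle frame before decoding
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_count // 2)
    ret, frame = cap.read()
    cap.release()
    if not ret:
        return None
    image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    return imagehash.phash(Image.fromarray(image_rgb), hash_size=hash_size)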

@@ -0,0 +1,19 @@
import config, storysave_api
db, cursor = config.gen_connection()
usernames = []
with open('usernames.txt', 'r') as f:
    for line in f:
        username = line.strip()
        if username:
            usernames.append(username)

for username in usernames:
    print(f"Username: {username}")
    user_id = storysave_api.get_user_id(username)
    if not user_id:
        print(f"Could not resolve a user_id for {username}, skipping.")
        continue

    # Update the user_id in the database
    cursor.execute("UPDATE media SET user_id = %s WHERE username = %s AND user_id IS NULL;", [user_id, username])
    db.commit()
    print(f"[{cursor.rowcount}] Updated user_id for {username}")
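The script expects usernames.txt to hold one username per line; the handles below are placeholders:

exampleuser1
exampleuser2
exampleuser3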

@@ -0,0 +1,32 @@
import config
import os
temp_directory = "cache"
os.makedirs(temp_directory, exist_ok=True)
obj_storage = config.get_storage()
db, cursor = config.gen_connection()
cursor.execute("SELECT id, media_url FROM media WHERE file_size = 0;")
results = cursor.fetchall()
count = 0
print(f"Found {len(results)} files to process.")
for result in results:
    count += 1
    id, media_url = result

    # Map the CDN URL back to the locally cached copy of the file
    serverPath = media_url.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(os.getcwd(), temp_directory, os.path.basename(serverPath))
    if not os.path.exists(localFilePath):
        continue

    file_size = os.path.getsize(localFilePath)

    cursor.execute("UPDATE media SET file_size = %s WHERE id = %s;", (file_size, id))
    db.commit()
    print(f"[{count}/{len(results)}] {media_url}: {file_size}, {cursor.rowcount}")

@@ -0,0 +1,154 @@
from datetime import datetime
import config
import funcs
import cv2
import os
directory = 'media/instagram/'
def UploadMedia(media):
    media_id = media['media_id']
    username = media['username']
    post_date = media['timestamp']
    user_id = media['user_id']
    filepath = media['filepath']
    highlight_id = media['highlight_id']
    post_type = media['post_type']

    thumbnail_url = None
    phash = None

    if media_id and int(media_id) in existing_files:
        print('Duplicate file detected. Removing...')
        os.remove(filepath)
        return True

    filename = os.path.basename(filepath)
    file_extension = os.path.splitext(filename)[1].lower()
    media_type = funcs.get_media_type(filename)

    file_hash = funcs.calculate_file_hash(filepath)
    width, height = funcs.get_media_dimensions(filepath)
    duration = funcs.get_video_duration(filepath)

    if media_type == 'video':
        try:
            # Grab the first frame as a thumbnail
            os.makedirs('temp', exist_ok=True)
            thumbPath = f'temp/{media_id}.jpg'
            cap = cv2.VideoCapture(filepath)
            ret, frame = cap.read()
            cv2.imwrite(thumbPath, frame)
            cap.release()

            obj_storage.PutFile(thumbPath, f'thumbnails/{media_id}.jpg')  # slower
            thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{media_id}.jpg"

            phash = funcs.generate_phash(thumbPath)
            os.remove(thumbPath)
        except Exception as e:
            print(f'Error generating thumbnail: {e}. Skipping...')
            return False
    elif media_type == 'image':
        phash = funcs.generate_phash(filepath)

    if media_id:
        newFilename = f'{media_id}{file_extension}'
    else:
        newFilename = f'{file_hash}{file_extension}'

    server_path = f'media/{post_type}/{username}/{newFilename}'
    file_url = f"https://storysave.b-cdn.net/{server_path}"

    obj_storage.PutFile(filepath, server_path)  # slow

    if highlight_id:
        newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)", (highlight_id, user_id, media_id))
        newDB.commit()
        print(f'[{newCursor.rowcount}] added highlight {highlight_id} to user {user_id}')

    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, hash, filename, duration, thumbnail, phash, platform) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    values = (username, media_type, file_url, width, height, media_id, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash, 'instagram')

    newCursor.execute(query, values)
    newDB.commit()
    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')

    os.remove(filepath)
    return True
def get_user_id(username):
    username = username.lower()
    if username in existing_users:
        return existing_users[username]
    return None
def get_media():
    medias = []

    post_types = {
        'posts': 'post',
        'stories': 'story',
        'profile': 'profile',
    }

    for post_type in os.listdir(directory):
        # Skip anything that is not one of the known post-type folders
        if post_type not in post_types:
            continue

        users_dir = os.path.join(directory, post_type)
        if not os.path.isdir(users_dir):
            continue

        users = os.listdir(users_dir)
        for username in users:
            user_path = os.path.join(directory, post_type, username)
            if not os.path.isdir(user_path):
                continue

            for filename in os.listdir(user_path):
                if filename.startswith('.'):
                    continue

                data = {}
                filepath = os.path.join(user_path, filename)

                if 'com.instagram.android__' in filename:
                    timestamp_str = filename.split('__')[-1].split('.')[0]
                    data['timestamp'] = datetime.strptime(timestamp_str, '%Y%m%d%H%M%S%f')
                else:
                    data['timestamp'] = datetime.now()

                data['post_type'] = post_types[post_type]
                data['username'] = username
                data['filepath'] = filepath
                data['media_id'] = None
                data['user_id'] = get_user_id(data['username'])
                data['highlight_id'] = None

                medias.append(data)

    return medias
def dump_instagram():
    medias = get_media()
    for media in medias:
        UploadMedia(media)
        existing_files.append(media['media_id'])
if __name__ == '__main__':
    print('Starting processing...')

    if not os.listdir(directory):
        print('No files to process. Exiting...')
        exit()

    newDB, newCursor = config.gen_connection()
    obj_storage = config.get_storage()

    newCursor.execute("SELECT media_id FROM media WHERE media_id IS NOT NULL")
    existing_files = [image[0] for image in newCursor.fetchall()]

    # Map lowercased usernames to their user_id
    newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL")
    existing_users = {user[0].lower(): user[1] for user in newCursor.fetchall()}

    dump_instagram()

    print("Processing completed.")
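funcs.generate_phash and funcs.calculate_file_hash are not included in this commit; assuming they follow the same imagehash conventions as the duplicate-check scripts above, they presumably look roughly like this (illustrative sketch only; the exact hash algorithm and chunk size are assumptions):

import hashlib
import imagehash
from PIL import Image

def generate_phash(filepath, hash_size=8):
    # Perceptual hash of an image file, returned as a hex string
    return str(imagehash.phash(Image.open(filepath), hash_size=hash_size))

def calculate_file_hash(filepath, chunk_size=65536):
    # Hash of the raw file bytes, streamed in chunks
    digest = hashlib.md5()
    with open(filepath, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()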