diff --git a/archived/compress_data.py b/archived/compress_data.py
new file mode 100644
index 0000000..eaa762d
--- /dev/null
+++ b/archived/compress_data.py
@@ -0,0 +1,39 @@
+import os
+import json
+import gzip
+
+data_dir = 'data'
+data_compressed_dir = 'data_compressed'
+os.makedirs(data_compressed_dir, exist_ok=True)
+
+def compress_file(filepath, output_file):
+    # Return the output path on success, or None if the source file cannot be read or parsed.
+    try:
+        with open(filepath, 'r') as f:
+            data = json.load(f)
+    except (OSError, json.JSONDecodeError) as e:
+        print(f'Error reading {filepath}: {e}')
+        return None
+    return compress_data(data, output_file)
+
+def compress_data(data, output_file):
+    with gzip.open(output_file, 'wb') as f:
+        f.write(json.dumps(data).encode('utf-8'))
+    return output_file
+
+
+data_files = os.listdir(data_dir)
+for file in data_files:
+    if not file.endswith('.json'):
+        continue
+
+    filepath = f'{data_dir}/{file}'
+    output_file = f'{data_compressed_dir}/{file}.gz'
+    output_file = compress_file(filepath, output_file)
+    if output_file:
+        print(f'Compressed {file} to {output_file}')
+        os.remove(filepath)
+    else:
+        print(f'Failed to compress {file}')
+
+print('Data compression completed')
\ No newline at end of file
diff --git a/archived/dedupe_scripts/deduper_new.py b/archived/dedupe_scripts/deduper_new.py
new file mode 100644
index 0000000..87ffdd1
--- /dev/null
+++ b/archived/dedupe_scripts/deduper_new.py
@@ -0,0 +1,87 @@
+from funcs import get_files
+from PIL import Image
+import imagehash
+import config
+import os
+
+def generate_image_phash(filepath, hash_size=8):
+    try:
+        # Open the image using PIL
+        pil_image = Image.open(filepath)
+
+        # Compute pHash using the imagehash library
+        phash = imagehash.phash(pil_image, hash_size=hash_size)
+        return phash
+    except Exception as e:
+        print(f"Error processing image {filepath}: {e}")
+        return None
+
+def are_phashes_duplicates(phash1, phash2, threshold=5):
+    try:
+        # Compute the Hamming distance between the pHashes
+        distance = phash1 - phash2
+        return distance <= threshold
+    except TypeError as e:
+        print(f"Error comparing pHashes: {e}")
+        return False
+
+def find_duplicate_phash(phash, existing_medias, threshold=5):
+    for media in existing_medias:
+        existing_phash_str = media[1]
+        existing_username = media[2]
+
+        # Convert stored pHash string to ImageHash object
+        existing_phash = imagehash.hex_to_hash(existing_phash_str)
+
+        # Check if the current pHash is a duplicate
+        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
+            return media
+    return None
+
+def get_media_by_hash(hash, existing_medias):
+    for media in existing_medias:
+        existing_hash = media[1]
+        if hash == existing_hash:
+            return media
+    return None
+
+def get_media_by_id(media_id, existing_medias):
+    for media in existing_medias:
+        existing_media_id = media[1]
+        if media_id == existing_media_id:
+            return media
+    return None
+
+def get_data_by_filename(filename, data):
+    for item in data:
+        if filename in item['filepath']:
+            return item
+    return None
+
+
+# Database connection
+db, cursor = config.gen_connection()
+
+# Fetch existing media with pHashes (assuming media are images, adjust media_type if needed)
+cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL AND media_id IS NULL;", ['image'])
+existing_medias = cursor.fetchall()
+
+directory = 'check_if_exists/' # Directory containing user images
+files = [file for file in get_files(directory) if file.endswith(('.jpg', '.jpeg', '.png'))]
+for filepath in files:
+    image_filename = os.path.basename(filepath)
+
+    # Generate pHash for the image
+    phash = generate_image_phash(filepath, hash_size=8)
+    if phash is None:
+        continue
+
+    # Check if the image is a duplicate of any in the database
+    duplicate_media = find_duplicate_phash(phash, existing_medias)
+    if duplicate_media:
+        print(f'Duplicate found: https://altpins.com/pin/{duplicate_media[0]}')
+        print(f'Duplicate image path: {filepath}')
+        newpath = os.path.join('duplicates', duplicate_media[2], image_filename)
+        os.makedirs(os.path.dirname(newpath), exist_ok=True)
+        os.rename(filepath, newpath)
+        print(f'Moved {image_filename} to duplicates/')
\ No newline at end of file
diff --git a/archived/dedupe_scripts/find_duplicates_by_phash_videos_new.py b/archived/dedupe_scripts/find_duplicates_by_phash_videos_new.py
new file mode 100644
index 0000000..7af3432
--- /dev/null
+++ b/archived/dedupe_scripts/find_duplicates_by_phash_videos_new.py
@@ -0,0 +1,79 @@
+from funcs import get_files
+from PIL import Image
+import imagehash
+import config
+import cv2
+import os
+
+def get_video_phash(filepath, hash_size=8):
+    cap = cv2.VideoCapture(filepath)
+    ret, frame = cap.read()
+    cap.release()
+
+    if not ret:
+        print(f"Error reading frame from {filepath}")
+        return None
+
+    # Resize frame to a standard size
+    standard_size = (320, 240)
+    resized_frame = cv2.resize(frame, standard_size, interpolation=cv2.INTER_AREA)
+
+    # Convert OpenCV image (BGR) to PIL Image (RGB)
+    image_rgb = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
+    pil_image = Image.fromarray(image_rgb)
+
+    # Compute pHash
+    phash = imagehash.phash(pil_image, hash_size=hash_size)
+
+    return phash
+
+def are_phashes_duplicates(phash1, phash2, threshold=5):
+    # Compute Hamming distance between the pHashes
+    try:
+        distance = phash1 - phash2
+    except TypeError as e:
+        print(f"Error comparing pHashes: {e}")
+        return False
+
+    return distance <= threshold
+
+def get_media_by_phash(phash, existing_medias, threshold=5):
+    for media in existing_medias:
+        existing_phash_str = media[1]
+        existing_username = media[2]
+
+        existing_phash = imagehash.hex_to_hash(existing_phash_str)
+
+        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
+            return media
+    return None
+
+# Database connection
+db, cursor = config.gen_connection()
+
+# Directory containing user videos
+directory = 'check_if_exists/'
+
+# Fetch existing videos with pHashes
+cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL;", ['video'])
+existing_medias = cursor.fetchall()
+
+# make a list of all video files
+files = [file for file in get_files(directory) if file.endswith(('.mp4', '.avi', '.mov'))]
+
+
+for filepath in files:
+    video_filename = os.path.basename(filepath)
+
+    phash = get_video_phash(filepath, hash_size=8) # Use hash_size=8
+    if phash is None:
+        continue
+
+    duplicate_media = get_media_by_phash(phash, existing_medias, threshold=5)
+    if duplicate_media:
+        print(f'Duplicate url found: https://altpins.com/pin/{duplicate_media[0]}')
+        print(f'Duplicate video path: {filepath}')
+        newpath = os.path.join('duplicates', duplicate_media[2], video_filename)
+        os.makedirs(os.path.dirname(newpath), exist_ok=True)
+        os.rename(filepath, newpath)
+        print(f'Moved {filepath} to duplicates/')
\ No newline at end of file
diff --git a/archived/fixes/fix_user_id_api.py b/archived/fixes/fix_user_id_api.py
new file mode 100644
index 0000000..07ec342
--- /dev/null
+++ b/archived/fixes/fix_user_id_api.py
@@ -0,0 +1,22 @@
+import config, storysave_api
+
+
+db, cursor = config.gen_connection()
+
+usernames = []
+with open('usernames.txt', 'r') as f:
+    for line in f:
+        usernames.append(line.strip())
+
+for username in usernames:
+    print(f"Username: {username}")
+
+    user_id = storysave_api.get_user_id(username)
+    if not user_id:
+        print(f"Could not find user_id for {username}, skipping.")
+        continue
+
+    # Update the user_id in the database
+    cursor.execute("UPDATE media SET user_id = %s WHERE username = %s AND user_id IS NULL;", [user_id, username])
+    db.commit()
+    print(f"[{cursor.rowcount}] Updated user_id for {username}")
\ No newline at end of file
diff --git a/archived/fixes/fixsize.py b/archived/fixes/fixsize.py
new file mode 100644
index 0000000..4d3e922
--- /dev/null
+++ b/archived/fixes/fixsize.py
@@ -0,0 +1,32 @@
+import config
+import os
+
+temp_directory = "cache"
+os.makedirs(temp_directory, exist_ok=True)
+
+obj_storage = config.get_storage()
+db, cursor = config.gen_connection()
+
+cursor.execute("SELECT id, media_url FROM media WHERE file_size = 0;")
+results = cursor.fetchall()
+
+count = 0
+print(f"Found {len(results)} files to process.")
+
+for result in results:
+    count += 1
+
+    id, media_url = result
+
+    serverPath = media_url.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
+    localFilePath = os.path.join(os.getcwd(), temp_directory, os.path.basename(serverPath))
+
+    if not os.path.exists(localFilePath):
+        continue
+
+    file_size = os.path.getsize(localFilePath)
+
+    cursor.execute("UPDATE media SET file_size = %s WHERE id = %s;", (file_size, id))
+    db.commit()
+
+    print(f"[{count}/{len(results)}] {media_url}: {file_size}, {cursor.rowcount}")
\ No newline at end of file
diff --git a/archived/storysave_dump_media.py b/archived/storysave_dump_media.py
new file mode 100644
index 0000000..b22aeb3
--- /dev/null
+++ b/archived/storysave_dump_media.py
@@ -0,0 +1,160 @@
+from datetime import datetime
+import config
+import funcs
+import cv2
+import os
+
+directory = 'media/instagram/'
+
+def UploadMedia(media):
+    media_id = media['media_id']
+    username = media['username']
+    post_date = media['timestamp']
+    user_id = media['user_id']
+    filepath = media['filepath']
+    highlight_id = media['highlight_id']
+    post_type = media['post_type']
+    thumbnail_url = None
+    phash = None
+
+    if media_id and int(media_id) in existing_files:
+        print('Duplicate file detected. Removing...')
+        os.remove(filepath)
+        return True
+
+    filename = os.path.basename(filepath)
+    file_extension = os.path.splitext(filename)[1].lower()
+
+    media_type = funcs.get_media_type(filename)
+
+    file_hash = funcs.calculate_file_hash(filepath)
+
+    width, height = funcs.get_media_dimensions(filepath)
+
+    duration = funcs.get_video_duration(filepath)
+
+    if media_type == 'video':
+        try:
+            os.makedirs('temp', exist_ok=True)
+            # Fall back to the file hash so videos without a media_id don't all write to temp/None.jpg
+            thumb_name = media_id if media_id else file_hash
+            thumbPath = f'temp/{thumb_name}.jpg'
+            cap = cv2.VideoCapture(filepath)
+            ret, frame = cap.read()
+            cv2.imwrite(thumbPath, frame)
+            cap.release()
+            obj_storage.PutFile(thumbPath, f'thumbnails/{thumb_name}.jpg') # slower
+            thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{thumb_name}.jpg"
+            phash = funcs.generate_phash(thumbPath)
+            os.remove(thumbPath)
+        except Exception as e:
+            print(f'Error generating thumbnail: {e}. Skipping...')
+            return False
+    elif media_type == 'image':
+        phash = funcs.generate_phash(filepath)
+
+    if media_id:
+        newFilename = f'{media_id}{file_extension}'
+    else:
+        newFilename = f'{file_hash}{file_extension}'
+
+    server_path = f'media/{post_type}/{username}/{newFilename}'
+
+    file_url = f"https://storysave.b-cdn.net/{server_path}"
+
+    obj_storage.PutFile(filepath, server_path) # slow: full-file upload
+
+    if highlight_id:
+        newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)", (highlight_id, user_id, media_id))
+        newDB.commit()
+        print(f'[{newCursor.rowcount}] added highlight {highlight_id} to user {user_id}')
+
+    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, hash, filename, duration, thumbnail, phash, platform) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
+    values = (username, media_type, file_url, width, height, media_id, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash, 'instagram')
+
+    newCursor.execute(query, values) # slower
+    newDB.commit()
+    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
+
+    os.remove(filepath)
+
+    return True
+
+def get_user_id(username):
+    username = username.lower()
+    if username in existing_users:
+        return existing_users[username]
+
+    return None
+
+def get_media():
+    medias = []
+    post_types = {
+        'posts': 'post',
+        'stories': 'story',
+        'profile': 'profile',
+    }
+
+    for post_type in os.listdir(directory):
+        if post_type not in post_types: # skip folders that aren't known post types
+            continue
+        users_dir = os.path.join(directory, post_type)
+        if not os.path.isdir(users_dir):
+            continue
+        users = os.listdir(users_dir)
+
+        for username in users:
+            user_path = os.path.join(directory, post_type, username)
+            if not os.path.isdir(user_path):
+                continue
+            for filename in os.listdir(user_path):
+                if filename.startswith('.'):
+                    continue
+
+                data = {}
+                filepath = os.path.join(user_path, filename)
+
+                if 'com.instagram.android__' in filename:
+                    timestamp_str = filename.split('__')[-1].split('.')[0]
+                    data['timestamp'] = datetime.strptime(timestamp_str, '%Y%m%d%H%M%S%f')
+                else:
+                    data['timestamp'] = datetime.now()
+
+                data['post_type'] = post_types[post_type]
+                data['username'] = username
+                data['filepath'] = filepath
+                data['media_id'] = None
+                data['user_id'] = get_user_id(data['username'])
+                data['highlight_id'] = None
+                medias.append(data)
+
+    return medias
+
+def dump_instagram():
+    medias = get_media()
+
+    for media in medias:
+        UploadMedia(media)
+        if media['media_id']:
+            existing_files.append(int(media['media_id']))
+
+if __name__ == '__main__':
+    print('Starting processing...')
+
+    if not os.listdir(directory):
+        print('No files to process. Exiting...')
+        exit()
+
+    newDB, newCursor = config.gen_connection()
+
+    obj_storage = config.get_storage()
+
+    newCursor.execute("SELECT media_id FROM media WHERE media_id IS NOT NULL")
+    existing_files = [image[0] for image in newCursor.fetchall()]
+
+    newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL")
+    existing_users = {user[0].lower(): user[1] for user in newCursor.fetchall()} # usernames lowercased; ids left as-is
+
+    dump_instagram()
+
+    print("Processing completed.")
\ No newline at end of file
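
Note: the dedupe scripts above all reduce to the same perceptual-hash check — two files count as duplicates when the Hamming distance between their pHashes is at or below a threshold. A minimal standalone sketch of that check, using only Pillow and imagehash, with hypothetical file names and no database or directory layout assumed:

    from PIL import Image
    import imagehash

    def is_duplicate(path_a, path_b, hash_size=8, threshold=5):
        # imagehash.phash returns an ImageHash; subtracting two of them
        # gives the Hamming distance between their bit strings.
        hash_a = imagehash.phash(Image.open(path_a), hash_size=hash_size)
        hash_b = imagehash.phash(Image.open(path_b), hash_size=hash_size)
        return (hash_a - hash_b) <= threshold

    # Hypothetical files: a re-encoded copy should fall within the threshold,
    # an unrelated image should not.
    print(is_duplicate('original.jpg', 'reencoded_copy.jpg'))  # expected: True
    print(is_duplicate('original.jpg', 'unrelated.png'))       # expected: False

With hash_size=8 the hash is 64 bits, so threshold=5 tolerates at most 5 differing bits; hashes stored as hex strings can be rehydrated with imagehash.hex_to_hash(), as deduper_new.py does before comparing.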