"""Move images in 'sorted' that duplicate an image in 'ready_to_upload'.

Every non-``.mp4`` file under both directories is perceptually hashed. Any
file under 'sorted' whose hash is within ``threshold`` (Hamming distance) of
a hash from 'ready_to_upload' is moved into 'already_processed', keeping its
path relative to 'sorted'.
"""

import os

import imagehash

from funcs import generate_phash  # Assumed to compute the pHash and return it as a string

READY_DIR = 'ready_to_upload'
SORTED_DIR = 'sorted'
PROCESSED_DIR = 'already_processed'


def get_files(directory):
    """Return the paths of all files under *directory*, recursively.

    Each path is the directory reported by ``os.walk`` joined with the file
    name, so every returned path starts with *directory* as it was passed in.
    """
    return [
        os.path.join(root, filename)
        for root, _dirs, files in os.walk(directory)
        for filename in files
    ]


def compute_phashes(image_paths):
    """Map each path in *image_paths* to its perceptual hash object.

    Paths whose hash cannot be computed or parsed are reported on stdout and
    left out of the result, so one unreadable file does not abort the run.
    """
    phash_dict = {}
    for image_path in image_paths:
        try:
            # generate_phash is assumed to return a hex string; turn it into
            # an ImageHash so that hashes can be subtracted from each other.
            phash_str = generate_phash(image_path)
            phash_dict[image_path] = imagehash.hex_to_hash(phash_str)
        except Exception as e:  # deliberate best-effort: report and skip
            print(f"Error processing {image_path}: {e}")
    return phash_dict


def _without_videos(paths):
    """Return *paths* minus those ending in '.mp4' (case-insensitive)."""
    return [path for path in paths if not path.lower().endswith('.mp4')]


def _processed_path(sorted_image):
    """Return where *sorted_image* should live under PROCESSED_DIR.

    Only the leading SORTED_DIR component is swapped. A plain
    ``str.replace('sorted', ...)`` would also rewrite any sub-folder or file
    name that happens to contain the text "sorted".
    """
    relative = os.path.relpath(sorted_image, SORTED_DIR)
    return os.path.join(PROCESSED_DIR, relative)


# Get all image files from 'ready_to_upload' and 'sorted' directories
ready_images = _without_videos(get_files(READY_DIR))
sorted_images = _without_videos(get_files(SORTED_DIR))

# Compute pHashes for images in 'ready_to_upload'
print("Computing pHashes for 'ready_to_upload' images...")
ready_image_phashes = compute_phashes(ready_images)

# Compute pHashes for images in 'sorted'
print("Computing pHashes for 'sorted' images...")
sorted_image_phashes = compute_phashes(sorted_images)

# Prepare the 'already_processed' directory
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Hamming distance at or below which two images count as duplicates
threshold = 5  # Adjust this value as needed

# Find and move duplicates
for sorted_image, sorted_phash in sorted_image_phashes.items():
    duplicate_found = False
    for ready_image, ready_phash in ready_image_phashes.items():
        # Subtracting two ImageHash objects gives their Hamming distance
        try:
            distance = sorted_phash - ready_phash
        except TypeError as e:
            print(f"Error comparing hashes for {sorted_image} and {ready_image}: {e}")
            continue

        if distance <= threshold:
            # Duplicate found: move it, preserving its sub-directory layout
            newpath = _processed_path(sorted_image)
            os.makedirs(os.path.dirname(newpath), exist_ok=True)
            print(f"Moving {sorted_image} (duplicate of {ready_image}) to 'already_processed'")
            os.rename(sorted_image, newpath)
            duplicate_found = True
            break  # One match is enough; the file has already been moved

    if not duplicate_found:
        print(f"No duplicate found for {sorted_image}")