You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
69 lines
2.7 KiB
Python
69 lines
2.7 KiB
Python
import os
|
|
from funcs import generate_phash # Assuming this function computes the pHash and returns a string
|
|
import imagehash
|
|
|
|
def get_files(directory):
    """Return the path of every file found anywhere under ``directory``.

    The tree is traversed with ``os.walk``. Each entry in the result is the
    containing folder joined with the file name, so every path begins with
    ``directory`` exactly as it was passed in. Sub-directories themselves are
    not listed, only the files inside them.
    """
    collected = []
    for folder, _subdirs, names in os.walk(directory):
        # One batch per folder: join the folder onto each of its file names.
        collected.extend(os.path.join(folder, name) for name in names)
    return collected
|
|
|
|
# Build a mapping of image path -> perceptual hash for a batch of images
def compute_phashes(image_paths):
    """Hash every path in ``image_paths`` and return ``{path: ImageHash}``.

    ``generate_phash`` is expected to return the pHash as a hex string, which
    ``imagehash.hex_to_hash`` converts into an ImageHash object. Any path
    whose processing raises is reported on stdout and left out of the result,
    so one unreadable file does not abort the batch.
    """
    hashes = {}
    for path in image_paths:
        try:
            # Hex string from generate_phash -> ImageHash object, stored by path.
            hashes[path] = imagehash.hex_to_hash(generate_phash(path))
        except Exception as exc:
            print(f"Error processing {path}: {exc}")
    return hashes
|
|
|
|
# Directory names used by the duplicate sweep.
READY_DIR = 'ready_to_upload'
SORTED_DIR = 'sorted'
PROCESSED_DIR = 'already_processed'

# Get all files from 'ready_to_upload' and 'sorted' directories, skipping
# .mp4 files (extension compared case-insensitively)
ready_images = get_files(READY_DIR)
ready_images = [image for image in ready_images if not image.lower().endswith('.mp4')]

sorted_images = get_files(SORTED_DIR)
sorted_images = [image for image in sorted_images if not image.lower().endswith('.mp4')]

# Compute pHashes for images in 'ready_to_upload'
print("Computing pHashes for 'ready_to_upload' images...")
ready_image_phashes = compute_phashes(ready_images)

# Compute pHashes for images in 'sorted'
print("Computing pHashes for 'sorted' images...")
sorted_image_phashes = compute_phashes(sorted_images)

# Prepare the 'already_processed' directory
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Set a Hamming distance threshold for considering images as duplicates
threshold = 5  # Adjust this value as needed

# Find and move duplicates: each 'sorted' image is compared against every
# 'ready_to_upload' image and moved on the first match within the threshold.
for sorted_image, sorted_phash in sorted_image_phashes.items():
    duplicate_found = False
    for ready_image, ready_phash in ready_image_phashes.items():
        # Compute Hamming distance between the two pHashes
        try:
            distance = sorted_phash - ready_phash
        except TypeError as e:
            # The hashes could not be subtracted (e.g. they differ in size);
            # report it and try the next candidate.
            print(f"Error comparing hashes for {sorted_image} and {ready_image}: {e}")
            continue

        if distance <= threshold:
            # Duplicate found. Mirror the file's location relative to
            # SORTED_DIR under PROCESSED_DIR. A plain
            # str.replace('sorted', ...) would also rewrite any "sorted"
            # substring in sub-folder or file names (e.g. 'img_sorted.jpg').
            relative_path = os.path.relpath(sorted_image, SORTED_DIR)
            newpath = os.path.join(PROCESSED_DIR, relative_path)
            os.makedirs(os.path.dirname(newpath), exist_ok=True)
            print(f"Moving {sorted_image} (duplicate of {ready_image}) to 'already_processed'")
            os.rename(sorted_image, newpath)
            duplicate_found = True
            break  # Exit the loop since a duplicate is found
    if not duplicate_found:
        print(f"No duplicate found for {sorted_image}")
|