parent 89c8e35e3b
commit 1d8bb3c85f
@ -0,0 +1,95 @@
import os
import json
import config
import imagehash
from PIL import Image
from funcs import get_files, calculate_file_hash


def generate_image_phash(filepath, hash_size=8):
    try:
        # Open the image using PIL
        pil_image = Image.open(filepath)

        # Compute pHash using the imagehash library
        phash = imagehash.phash(pil_image, hash_size=hash_size)
        return phash
    except Exception as e:
        print(f"Error processing image {filepath}: {e}")
        return None

def are_phashes_duplicates(phash1, phash2, threshold=5):
    try:
        # Compute the Hamming distance between the pHashes
        distance = phash1 - phash2
        return distance <= threshold
    except TypeError as e:
        print(f"Error comparing pHashes: {e}")
        return False

def get_media_by_phash(phash, username, existing_medias, threshold=5):
    for media in existing_medias:
        existing_phash_str = media[1]
        existing_username = media[2]

        # Convert stored pHash string to ImageHash object
        existing_phash = imagehash.hex_to_hash(existing_phash_str)

        # Check if the current pHash is a duplicate
        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
            return media
    return None

def get_media_by_hash(hash, existing_medias):
    for media in existing_medias:
        existing_hash = media[1]
        if hash == existing_hash:
            return media
    return None

def get_media_by_id(media_id, existing_medias):
    for media in existing_medias:
        existing_media_id = media[1]
        if media_id == existing_media_id:
            return media
    return None

def get_data_by_filename(filename, data):
    for item in data:
        if filename in item['filepath']:
            return item
    return None

directory = 'check_if_exists'  # Directory containing user images

# Database connection
db, cursor = config.gen_connection()

# Fetch existing media with pHashes (assuming media are images, adjust media_type if needed)
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL", ['image'])
existing_medias = cursor.fetchall()

usernames = os.listdir(directory)

for username in usernames:
    files = get_files(os.path.join(directory, username))
    for filepath in files:
        image_filename = os.path.basename(filepath)
        print(f'Processing {image_filename}...')

        # Generate pHash for the image
        phash = generate_image_phash(filepath, hash_size=8)
        if phash is None:
            continue  # Skip this image if there's an issue

        phash_str = str(phash)

        # Check if the image is a duplicate of any in the database
        duplicate_media = get_media_by_phash(phash, username, existing_medias, threshold=5)
        if duplicate_media:
            print(f'Duplicate found: https://altpins.com/pin/{duplicate_media[0]}')
            print(f'Duplicate image path: {filepath}')
            newpath = os.path.join('duplicates', duplicate_media[2], image_filename)
            os.makedirs(os.path.dirname(newpath), exist_ok=True)
            os.rename(filepath, newpath)
            print(f'Moved {image_filename} to duplicates/')
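A minimal sketch (the image paths are placeholders) of the imagehash behaviour the helpers above rely on: str() serializes a pHash to hex, imagehash.hex_to_hash() restores it, and subtracting two hashes gives the bit-level Hamming distance that are_phashes_duplicates() compares against the threshold.

import imagehash
from PIL import Image

# Placeholder files, for illustration only.
a = imagehash.phash(Image.open('a.jpg'), hash_size=8)
b = imagehash.phash(Image.open('b.jpg'), hash_size=8)

a_hex = str(a)                             # hex string, as stored in the database
assert imagehash.hex_to_hash(a_hex) == a   # round-trip is lossless

distance = a - b                           # Hamming distance in bits (0..64 for hash_size=8)
print(distance, distance <= 5)             # 5 is the threshold used throughout these scripts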
@ -0,0 +1,68 @@
import os
from funcs import generate_phash  # Assuming this function computes the pHash and returns a string
import imagehash

def get_files(directory):
    # Recursively get all files in the directory
    file_list = []
    for root, dirs, files in os.walk(directory):
        for filename in files:
            file_list.append(os.path.join(root, filename))
    return file_list

# Function to compute pHashes for all images in a directory
def compute_phashes(image_paths):
    phash_dict = {}
    for image_path in image_paths:
        try:
            # Compute pHash and get it as a string
            phash_str = generate_phash(image_path)
            # Convert the hash string to an ImageHash object
            phash = imagehash.hex_to_hash(phash_str)
            phash_dict[image_path] = phash
        except Exception as e:
            print(f"Error processing {image_path}: {e}")
    return phash_dict

# Get all image files from 'ready_to_upload' and 'sorted' directories
ready_images = get_files('ready_to_upload')
ready_images = [image for image in ready_images if not image.lower().endswith('.mp4')]

sorted_images = get_files('sorted')
sorted_images = [image for image in sorted_images if not image.lower().endswith('.mp4')]

# Compute pHashes for images in 'ready_to_upload'
print("Computing pHashes for 'ready_to_upload' images...")
ready_image_phashes = compute_phashes(ready_images)

# Compute pHashes for images in 'sorted'
print("Computing pHashes for 'sorted' images...")
sorted_image_phashes = compute_phashes(sorted_images)

# Prepare the 'already_processed' directory
os.makedirs('already_processed', exist_ok=True)

# Set a Hamming distance threshold for considering images as duplicates
threshold = 5  # Adjust this value as needed

# Find and move duplicates
for sorted_image, sorted_phash in sorted_image_phashes.items():
    duplicate_found = False
    for ready_image, ready_phash in ready_image_phashes.items():
        # Compute Hamming distance between the two pHashes
        try:
            distance = sorted_phash - ready_phash
        except TypeError as e:
            print(f"Error comparing hashes for {sorted_image} and {ready_image}: {e}")
            continue

        if distance <= threshold:
            # Duplicate found
            newpath = sorted_image.replace('sorted', 'already_processed')
            os.makedirs(os.path.dirname(newpath), exist_ok=True)
            print(f"Moving {sorted_image} (duplicate of {ready_image}) to 'already_processed'")
            os.rename(sorted_image, newpath)
            duplicate_found = True
            break  # Exit the loop since a duplicate is found
    if not duplicate_found:
        print(f"No duplicate found for {sorted_image}")
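The nested loop above compares every 'sorted' image against every 'ready_to_upload' image. A possible shortcut, sketched below with the same path-to-ImageHash dicts (the helper name is illustrative, not part of the script), is to try an exact hex match through a lookup table before falling back to the threshold scan.

def find_duplicate(sorted_phash, ready_image_phashes, threshold=5):
    """Return the path of a matching 'ready' image, or None."""
    # Exact pHash match first (distance 0), via a hex lookup table.
    by_hex = {str(h): p for p, h in ready_image_phashes.items()}
    exact = by_hex.get(str(sorted_phash))
    if exact is not None:
        return exact
    # Otherwise scan for a near-duplicate within the Hamming threshold.
    for ready_image, ready_phash in ready_image_phashes.items():
        if sorted_phash - ready_phash <= threshold:
            return ready_image
    return None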
@ -0,0 +1,61 @@
import os
import funcs
import config

# Function to find the closest perceptual hash (phash) match
def find_almost_identical_phash(phash, usernames, max_distance=1):
    """
    Find a username whose phash is nearly identical to the given phash.
    :param phash: The phash to compare (e.g., from the 'unknown' image).
    :param usernames: List of tuples containing (username, phash).
    :param max_distance: Maximum Hamming distance to consider as "identical".
    :return: The matching username and phash, or None if no match is found.
    """
    for username in usernames:
        dist = hamming_distance(phash, username[1])
        if dist <= max_distance:
            return username
    return None

def hamming_distance(phash1, phash2):
    """
    Calculate the distance between two equal-length hash strings,
    compared character by character.
    """
    if len(phash1) != len(phash2):
        raise ValueError("Hashes must be of the same length")
    return sum(c1 != c2 for c1, c2 in zip(phash1, phash2))


# Establish database connection
db, cursor = config.gen_connection()

# Fetch all images with an 'unknown' username
cursor.execute("SELECT id, username, phash FROM media WHERE username = 'unknown'")
rows = cursor.fetchall()

# Fetch all non-unknown usernames and their associated phash
cursor.execute("SELECT username, phash FROM media WHERE username != 'unknown' AND phash IS NOT NULL AND status = 'public'")
usernames = cursor.fetchall()

# Ensure there are valid usernames to compare against
if not usernames:
    print("No known usernames found in the database.")
    exit()

# Match each 'unknown' image against the known usernames
for row in rows:
    id = row[0]
    phash = row[2]

    # Find a nearly identical phash match
    closest = find_almost_identical_phash(phash, usernames, max_distance=2)

    if closest:
        print(f"Found match for image {id}: {closest[0]} with phash {closest[1]}")
        cursor.execute(
            "UPDATE media SET username = %s WHERE id = %s",
            (closest[0], id),
        )
        db.commit()
    else:
        print(f"No nearly identical match found for image {id}.")
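Note that hamming_distance() above compares the stored hash strings character by character, so one differing hex digit counts as 1 even if all four of its bits changed. If bit-level distance is wanted, matching the scripts that subtract ImageHash objects, a small sketch using the same imagehash library:

import imagehash

def bit_hamming_distance(phash_hex1, phash_hex2):
    # hex_to_hash() rebuilds the ImageHash objects; subtraction counts differing bits.
    return imagehash.hex_to_hash(phash_hex1) - imagehash.hex_to_hash(phash_hex2)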
@ -0,0 +1,87 @@
import os
import config
import cv2
import imagehash
from PIL import Image

def generate_thumbnail_phash(filepath, hash_size=8):  # Set hash_size to 8
    cap = cv2.VideoCapture(filepath)
    ret, frame = cap.read()
    cap.release()

    if not ret:
        print(f"Error reading frame from {filepath}")
        return None

    # Resize frame to a standard size
    standard_size = (320, 240)
    resized_frame = cv2.resize(frame, standard_size, interpolation=cv2.INTER_AREA)

    # Convert OpenCV image (BGR) to PIL Image (RGB)
    image_rgb = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(image_rgb)

    # Compute pHash
    phash = imagehash.phash(pil_image, hash_size=hash_size)

    return phash

def are_phashes_duplicates(phash1, phash2, threshold=5):
    # Compute Hamming distance between the pHashes
    try:
        distance = phash1 - phash2
    except TypeError as e:
        print(f"Error comparing pHashes: {e}")
        return False

    return distance <= threshold

def get_media_by_phash(phash, username, existing_medias, threshold=5):
    for media in existing_medias:
        existing_phash_str = media[1]
        existing_username = media[2]
        if existing_username != username:
            continue

        # Convert stored phash string to ImageHash object
        existing_phash = imagehash.hex_to_hash(existing_phash_str)

        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
            return media
    return None

# Database connection
db, cursor = config.gen_connection()

# Directory containing user videos
directory = 'check_if_exists'

# Fetch existing videos with pHashes
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL", ['video'])
existing_medias = cursor.fetchall()

users = os.listdir(directory)  # Assuming 'check_if_exists' contains user videos
for username in users:
    user_videos_path = os.path.join(directory, username)
    if not os.path.isdir(user_videos_path):
        continue

    videos = [video for video in os.listdir(user_videos_path) if video.endswith(('.mp4', '.avi', '.mov'))]
    for video in videos:
        print(f'Processing {video}...')
        filepath = os.path.join(user_videos_path, video)

        phash = generate_thumbnail_phash(filepath, hash_size=8)  # Use hash_size=8
        if phash is None:
            continue

        phash_str = str(phash)

        duplicate_media = get_media_by_phash(phash, username, existing_medias, threshold=5)
        if duplicate_media:
            print(f'Duplicate url found: https://altpins.com/pin/{duplicate_media[0]}')
            print(f'Duplicate video path: {filepath}')
            newpath = filepath.replace(directory, 'duplicates')
            os.makedirs(os.path.dirname(newpath), exist_ok=True)
            os.rename(filepath, newpath)
            print(f'Moved {video} to duplicates/')
@ -0,0 +1,58 @@
import os
from funcs import generate_phash

def find_duplicates(source_dir, target_dir, extensions, max_distance):
    """Remove duplicates in target_dir that are found in source_dir based on Hamming distance."""
    source_files = {}
    target_files = {}

    # Helper function to filter files by extension
    def filter_files(files):
        return [f for f in files if os.path.splitext(f)[1].lower() in extensions]

    # Build hash map of source directory
    for dirpath, _, filenames in os.walk(source_dir):
        for filename in filter_files(filenames):
            filepath = os.path.join(dirpath, filename)
            filehash = generate_phash(filepath, str=False)
            if filehash:
                source_files[filehash] = filepath

    # Build hash map of target directory and compare
    for dirpath, _, filenames in os.walk(target_dir):
        for filename in filter_files(filenames):
            filepath = os.path.join(dirpath, filename)
            filehash = generate_phash(filepath, str=False)
            if not filehash:
                continue

            # Check if this file is similar to any of the source files
            is_duplicate = False
            for source_hash in source_files.keys():
                distance = filehash - source_hash  # Hamming distance
                if distance <= max_distance:
                    is_duplicate = True
                    break  # Found a duplicate

            if is_duplicate:
                newpath = os.path.join('duplicates', filename)
                os.makedirs(os.path.dirname(newpath), exist_ok=True)
                os.rename(filepath, newpath)
                print(f"Moved duplicate: {filepath} to duplicates/ (distance: {distance})")
            else:
                target_files[filehash] = filepath

if __name__ == '__main__':
    # Paths to the directories
    source_dir = 'D:/Crawlers/media/Coomer/sadierayxo'
    target_dir = 'sorted/sadierayxo'

    # List of accepted extensions
    extensions = {'.png', '.jpg', '.jpeg', '.webp', '.gif'}

    # Maximum Hamming distance to consider as duplicates
    MAX_DISTANCE = 5  # Adjust this threshold as needed

    find_duplicates(source_dir, target_dir, extensions, MAX_DISTANCE)

    print("Duplicate removal process completed.")
@ -0,0 +1,34 @@
from concurrent.futures import ThreadPoolExecutor
from BunnyCDN.Storage import Storage
import config, os

def DownloadFile(serverPath, cacheDir):
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if os.path.exists(localFilePath):
        print(f"File already exists: {localFilePath}")
        return localFilePath

    obj_storage.DownloadFile(storage_path=serverPath, download_path=cacheDir)
    print(f"Downloaded {serverPath} to {localFilePath}")
    return localFilePath

def ImportMedias(results):
    with ThreadPoolExecutor(max_workers=10) as executor:
        for video in results:
            serverPath = video[2].replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
            executor.submit(DownloadFile, serverPath, cacheDir)


obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

db, cursor = config.gen_connection()

cursor.execute("SELECT id, media_id, media_url FROM media WHERE file_size = 0;")
results = cursor.fetchall()

cacheDir = 'cache'

print(f"Found {len(results)} files to process.")

ImportMedias(results)
@ -0,0 +1,32 @@
import os, funcs
from funcs import generate_phash

def get_username(image, ready_images):
    for ready_image in ready_images:
        if os.path.basename(image) in ready_image:
            ready_image = ready_image.replace('\\', '/')
            return ready_image.split('/')[1]
    return None

ready_images = funcs.get_files('ready_to_upload')
ready_images = [image for image in ready_images if not image.endswith('.mp4')]

sorted_images = funcs.get_files('sorted')
sorted_images = [image for image in sorted_images if not image.endswith('.mp4')]

os.makedirs('already_processed', exist_ok=True)

for image in sorted_images:
    image = image.replace('\\', '/')
    username = image.split('/')[1]
    filename = os.path.basename(image)

    for ready_image in ready_images:
        if filename in ready_image:
            username = get_username(image, ready_images)
            newpath = ready_image.replace('ready_to_upload', 'already_processed')
            os.makedirs(os.path.dirname(newpath), exist_ok=True)
            print(f'Moving {image} which is a match for {ready_image} to already_processed')
            os.rename(image, newpath)
            print(f'Moved {image} to already_processed')
            break
@ -0,0 +1,40 @@
import config, os, json
from PIL import Image
import imagehash

def find_file(filename, directory):
    filename = filename.lower().split('.')[0]
    for root, dirs, files in os.walk(directory):
        for file in files:
            if filename in file:
                return os.path.join(root, file)
    return None

def generate_phash(image_path):
    image = Image.open(image_path)
    return str(imagehash.phash(image))

count = 0

cacheDir = 'sorted'
dataPath = 'pins.json'

os.makedirs(cacheDir, exist_ok=True)

medias = json.load(open(dataPath))

for item in medias:
    count += 1

    filepath = item['filepath']
    if os.path.exists(filepath):
        continue

    newfilepath = find_file(os.path.basename(filepath), cacheDir)
    if newfilepath:
        print(f"Found file {newfilepath} for {filepath}")
        item['filepath'] = newfilepath


with open(dataPath, 'w') as f:
    json.dump(medias, f)
@ -0,0 +1,28 @@
import os, json
from funcs import generate_phash

count = 0
cacheDir = '_sort'
dataPath = 'pins.json'

os.makedirs(cacheDir, exist_ok=True)

medias = json.load(open(dataPath))

for item in medias:
    count += 1
    if item['type'] == 'image':
        filepath = item['filepath']
        if 'phash' in item:
            print(f"Skipping {count}/{len(medias)}: already processed.")
            continue

        if not os.path.exists(filepath):
            print(f"File {filepath} does not exist, skipping.")
            continue
        phash = generate_phash(filepath)
        item['phash'] = phash
        print(f"Processed {count}/{len(medias)}: with pHash {phash}")

with open(dataPath, 'w') as f:
    json.dump(medias, f)
@ -0,0 +1,36 @@
import config
from funcs import generate_phash

count = 0

storage = config.get_storage()

db, cursor = config.gen_connection()

generate_for = 'media_url'
media_type = 'image'

cursor.execute(f"SELECT id, {generate_for} FROM media WHERE media_type = %s AND phash IS NULL;", [media_type])
medias = cursor.fetchall()

for item in medias:
    count += 1

    itemID = item[0]
    media_url = item[1]

    server_path = media_url.replace('https://storysave.b-cdn.net/', '').replace('\\', '/')
    filepath = storage.DownloadFile(server_path, 'temp')
    if not filepath:
        print(f"Error downloading {server_path}")
        continue

    phash = generate_phash(filepath)
    if not phash:
        print(f"Error generating pHash for {filepath}")
        continue

    cursor.execute("UPDATE media SET phash = %s WHERE id = %s", [phash, itemID])
    db.commit()

    print(f"[{cursor.rowcount}] Processed {count}/{len(medias)}: with pHash {phash}")
@ -0,0 +1,39 @@
import config, os
from funcs import generate_phash

db, cursor = config.gen_connection()

cursor.execute("SELECT id, media_id, media_url FROM media WHERE media_type = 'image' AND phash = 0;")
results = cursor.fetchall()

count = 0
cacheDir = 'cache'
os.makedirs(cacheDir, exist_ok=True)
print(f"Found {len(results)} files to process.")


for result in results:
    count += 1
    itemID = result[0]
    mediaID = result[1]
    if not mediaID:
        print("Media ID is null, skipping.")
        continue
    mediaURL = result[2]

    serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if not os.path.exists(localFilePath):
        print(f"File {localFilePath} does not exist, skipping.")
        continue

    phash = generate_phash(localFilePath)
    if not phash:
        print(f"Error generating pHash for {localFilePath}, skipping.")
        continue

    cursor.execute("UPDATE media SET phash = %s WHERE id = %s", (phash, itemID))
    db.commit()

    print(f"Processed {count}/{len(results)}: {mediaID} with pHash {phash}")
@ -0,0 +1,74 @@
import config, os, threading, queue
from funcs import generate_phash

# Initialize database connection
db, cursor = config.gen_connection()

# Query the media table for unprocessed images
cursor.execute("SELECT id, media_id, media_url FROM media WHERE media_type = 'image' AND phash = '0';")
results = cursor.fetchall()

# Setup cache directory
cacheDir = 'cache'
os.makedirs(cacheDir, exist_ok=True)

print(f"Found {len(results)} files to process.")

# Thread-safe queue for processed media
processed_media_queue = queue.Queue()

def process_media():
    """Thread function to update database with processed pHash values."""
    while True:
        try:
            item = processed_media_queue.get(timeout=10)  # Timeout prevents infinite blocking
            if item is None:  # Sentinel value to exit the loop
                break

            itemID, phash = item
            cursor.execute("UPDATE media SET phash = %s WHERE id = %s", (phash, itemID))
            db.commit()
            print(f"Updated database for ID {itemID} with pHash {phash}.")
        except queue.Empty:
            continue

# Start the database update thread
update_thread = threading.Thread(target=process_media, daemon=True)
update_thread.start()

# Main processing loop for generating pHash
count = 0

for result in results:
    count += 1
    itemID = result[0]
    mediaID = result[1]

    if not mediaID:
        print("Media ID is null, skipping.")
        continue

    mediaURL = result[2]
    serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if not os.path.exists(localFilePath):
        print(f"File {localFilePath} does not exist, skipping.")
        continue

    phash = generate_phash(localFilePath)
    if not phash:
        print(f"Error generating pHash for {localFilePath}, skipping.")
        continue

    # Add the processed media to the queue
    processed_media_queue.put((itemID, phash))
    print(f"Processed {count}/{len(results)}: {mediaID} with pHash {phash}")

# Signal the update thread to stop
processed_media_queue.put(None)

# Wait for the update thread to finish
update_thread.join()

print("Processing completed.")
@ -0,0 +1,51 @@
import os
import config
import cv2
from funcs import generate_phash
from BunnyCDN.Storage import Storage

db, cursor = config.gen_connection()
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

cursor.execute("SELECT id, media_id, media_url FROM media WHERE media_type = 'video' AND phash = '0';")
results = cursor.fetchall()

count = 0
cacheDir = 'cache'
os.makedirs(cacheDir, exist_ok=True)
print(f"Found {len(results)} files to process.")

for result in results:
    count += 1
    itemID = result[0]
    media_id = result[1]

    if not media_id:
        print("Media ID is null, skipping.")
        continue

    mediaURL = result[2]

    serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if not os.path.exists(localFilePath):
        print(f"File {localFilePath} does not exist, skipping.")
        continue

    # Grab the first frame as a thumbnail and hash it
    thumbPath = f'temp/{media_id}.jpg'
    cap = cv2.VideoCapture(localFilePath)
    ret, frame = cap.read()
    cap.release()
    if not ret:
        print(f"Error reading frame from {localFilePath}, skipping.")
        continue
    cv2.imwrite(thumbPath, frame)
    phash = generate_phash(thumbPath)
    os.remove(thumbPath)

    if not phash:
        print(f"Error generating pHash for {localFilePath}, skipping.")
        continue

    cursor.execute("UPDATE media SET phash = %s WHERE id = %s", (phash, itemID))
    db.commit()

    print(f"Processed {count}/{len(results)}: {media_id} with pHash {phash}")
@ -0,0 +1,43 @@
import os
import json
import config

# Establish database connection
db, cursor = config.gen_connection()

# Fetch rows with file_size = 0
cursor.execute("SELECT id, media_id, media_url FROM media WHERE file_size = 0;")
results = cursor.fetchall()

cacheDir = 'cache'
os.makedirs(cacheDir, exist_ok=True)
print(f"Found {len(results)} files to process.")

update_data = []
for result in results:
    itemID = result[0]
    media_id = result[1]

    if not media_id:
        print(f"Media ID is null for ID {itemID}, skipping.")
        continue

    mediaURL = result[2]
    serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if not os.path.exists(localFilePath):
        print(f"File {localFilePath} does not exist for ID {itemID}, skipping.")
        continue

    file_size = os.path.getsize(localFilePath)
    update_data.append({"id": itemID, "file_size": file_size})

# Save the results to a JSON file
output_file = "update_data.json"
with open(output_file, 'w') as f:
    json.dump(update_data, f, indent=4)

print(f"Saved {len(update_data)} updates to {output_file}.")
cursor.close()
db.close()
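For reference, the records this script writes to update_data.json (and which the follow-up update script consumes) form a flat list of id/file_size pairs; the values below are invented for illustration.

# Illustrative contents of update_data.json (ids and byte sizes are made up):
# [
#     {"id": 101, "file_size": 2048576},
#     {"id": 102, "file_size": 734003}
# ]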
File diff suppressed because it is too large
@ -0,0 +1,29 @@
import json
import config

# Establish database connection
db, cursor = config.gen_connection()

# Load update data from the JSON file
input_file = "update_data.json"
with open(input_file, 'r') as f:
    update_data = json.load(f)

print(f"Loaded {len(update_data)} records to update.")

# Process each record one by one
for count, item in enumerate(update_data, start=1):
    item_id = item["id"]
    file_size = item["file_size"]

    try:
        cursor.execute("UPDATE media SET file_size = %s WHERE id = %s", (file_size, item_id))
        db.commit()
        print(f"Processed {count}/{len(update_data)}: ID {item_id} updated with file size {file_size}.")
    except Exception as e:
        print(f"Error updating ID {item_id}: {e}")
        db.rollback()

print("All updates completed.")
cursor.close()
db.close()
@ -0,0 +1,31 @@
from BunnyCDN.Storage import Storage
import config, os

db, cursor = config.gen_connection()
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

cursor.execute("SELECT id, media_id, media_url FROM media WHERE file_size = 0;")
results = cursor.fetchall()
print(f"Found {len(results)} files to process.")

cacheDir = 'cache'

for result in results:
    itemID = result[0]

    mediaURL = result[2]
    serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if not os.path.exists(localFilePath):
        continue

    file_size = os.path.getsize(localFilePath)

    cursor.execute("UPDATE media SET file_size = %s WHERE id = %s;", (file_size, itemID))
    db.commit()

    print(f"Processed ID {itemID}: updated with file size {file_size}.")

cursor.close()
db.close()
@ -0,0 +1,181 @@
import os
import json
import config
import imagehash
from PIL import Image
from funcs import get_files, calculate_file_hash, remove_empty_folders  # Assuming this is defined elsewhere

def generate_image_phash(filepath, hash_size=8):
    try:
        # Open the image using PIL
        pil_image = Image.open(filepath)

        # Compute pHash using the imagehash library
        phash = imagehash.phash(pil_image, hash_size=hash_size)
        return phash
    except Exception as e:
        print(f"Error processing image {filepath}: {e}")
        return None

def are_phashes_duplicates(phash1, phash2, threshold=5):
    try:
        # Compute the Hamming distance between the pHashes
        distance = phash1 - phash2
        return distance <= threshold
    except TypeError as e:
        print(f"Error comparing pHashes: {e}")
        return False

def get_media_by_phash(phash, username, existing_medias, threshold=5):
    for media in existing_medias:
        existing_phash_str = media[1]
        existing_username = media[2]

        if existing_username != username:
            continue

        # Convert stored pHash string to ImageHash object
        existing_phash = imagehash.hex_to_hash(existing_phash_str)

        # Check if the current pHash is a duplicate
        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
            return media
    return None

def get_media_by_hash(hash, existing_medias):
    for media in existing_medias:
        existing_hash = media[1]
        if hash == existing_hash:
            return media
    return None

def get_media_by_id(media_id, existing_medias):
    for media in existing_medias:
        existing_media_id = media[1]
        if media_id == existing_media_id:
            return media
    return None

def get_data_by_filename(filename, data):
    for item in data:
        if filename in item['filepath']:
            return item
    return None

directory = 'images'
data = json.load(open('pins.json'))

files = get_files(directory)
knownExtensions = ['jpg', 'png', 'jpeg', 'gif', 'webp']
for file in files:
    fileExt = file.split('.')[-1].lower()
    if fileExt not in knownExtensions:
        print(f"Unknown extension for {file}, renaming by file hash")
        filehash = calculate_file_hash(file)
        newfilename = f"{filehash}.jpg"
        currentDir = os.path.dirname(file)
        newfilepath = os.path.join(currentDir, newfilename)
        os.rename(file, newfilepath)

files = get_files(directory)

# Sort files by username and move them into the directory folder where each subfolder is a username
for file in files:
    item_data = get_data_by_filename(os.path.basename(file).split('.')[0], data)

    if not item_data:
        print(f"Data not found for {file}")
        continue

    username = item_data['username']
    newpath = os.path.join(directory, username, os.path.basename(file))

    if newpath == file:
        continue

    os.makedirs(os.path.dirname(newpath), exist_ok=True)
    os.rename(file, newpath)

# Database connection
db, cursor = config.gen_connection()

# now find dupes by media_id
cursor.execute("SELECT id, media_id, username FROM media WHERE media_type = %s AND media_id IS NOT NULL", ['image'])
items = cursor.fetchall()

media_ids = [item[1] for item in items]

files = get_files(directory)

for file in files:
    try:
        media_id = os.path.basename(file).split('.')[0]
        media_id = int(media_id)
    except:
        print(f"Error parsing media_id from {file}")
        continue

    if media_id in media_ids:
        media_item = get_media_by_id(media_id, items)
        print(f"Duplicate found: https://altpins.com/pin/{media_item[0]}")
        print(f"Duplicate file: {file}")
        newpath = os.path.join('duplicates', media_item[2], os.path.basename(file))
        os.makedirs(os.path.dirname(newpath), exist_ok=True)
        os.rename(file, newpath)
    else:
        print(f"Unique file: {file}")


cursor.execute("SELECT id, hash, username FROM media WHERE media_type = %s AND hash IS NOT NULL", ['image'])
items = cursor.fetchall()

hashes = [item[1] for item in items]

files = get_files(directory)

for file in files:
    hash = calculate_file_hash(file)
    if hash in hashes:
        media_item = get_media_by_hash(hash, items)
        print(f"Duplicate found: https://altpins.com/pin/{media_item[0]}")
        print(f"Duplicate file: {file}")
        newpath = os.path.join('duplicates', media_item[2], os.path.basename(file))
        os.makedirs(os.path.dirname(newpath), exist_ok=True)
        os.rename(file, newpath)
    else:
        print(f"Unique file: {file}")


# Fetch existing media with pHashes (assuming media are images, adjust media_type if needed)
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL", ['image'])
existing_medias = cursor.fetchall()

# Go through the directory folder where each subfolder is a username
files = get_files(directory)

for filepath in files:
    image_filename = os.path.basename(filepath)
    print(f'Processing {image_filename}...')

    # Generate pHash for the image
    phash = generate_image_phash(filepath, hash_size=8)
    if phash is None:
        continue  # Skip this image if there's an issue

    phash_str = str(phash)

    item_data = get_data_by_filename(image_filename, data)
    if not item_data:
        print(f"Data not found for {image_filename}")
        continue
    username = item_data['username']

    # Check if the image is a duplicate of any in the database
    duplicate_media = get_media_by_phash(phash, username, existing_medias, threshold=5)
    if duplicate_media:
        print(f'Duplicate found: https://altpins.com/pin/{duplicate_media[0]}')
        print(f'Duplicate image path: {filepath}')
        newpath = os.path.join('duplicates', duplicate_media[2], image_filename)
        os.makedirs(os.path.dirname(newpath), exist_ok=True)
        os.rename(filepath, newpath)
        print(f'Moved {image_filename} to duplicates/')
@ -0,0 +1,17 @@
import os, config, funcs

db, cursor = config.gen_connection()

cursor.execute("SELECT phash FROM media WHERE phash IS NOT NULL")
phashes = set([x[0] for x in cursor.fetchall()])

files = funcs.get_files("check_if_exists")

for file in files:
    image_phash = funcs.generate_phash(file)

    if image_phash in phashes:
        print(f"File {file} exists in the database")
        os.remove(file)

funcs.cleanEmptyFolders("check_if_exists")
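The set lookup above only removes files whose pHash string is byte-for-byte identical to a stored one. If near-duplicates should be caught as well (the other scripts use a Hamming threshold of 5), a sketch of the same check with imagehash; phashes and funcs.generate_phash are the objects defined above.

import imagehash

def is_near_duplicate(phash_str, phashes, threshold=5):
    """True if phash_str is within `threshold` bits of any stored pHash."""
    candidate = imagehash.hex_to_hash(phash_str)
    return any(candidate - imagehash.hex_to_hash(existing) <= threshold
               for existing in phashes)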
@ -0,0 +1,57 @@
import os
import hashlib

# Directories
fucked_dir = 'tiktoks/fucked/aleksandra'
source_dir = 'tiktoks/waiting_for_process/aleksandraverse'

def hash_file(filepath):
    """Generate MD5 hash of a file."""
    hash_md5 = hashlib.md5()
    with open(filepath, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

def get_file_hashes(directory):
    """Generate a dictionary of file hashes for all files in a directory."""
    file_hashes = {}
    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            file_hashes[file_path] = hash_file(file_path)
    return file_hashes

def files_are_identical(file1, file2):
    """Compare two files byte-by-byte."""
    with open(file1, "rb") as f1, open(file2, "rb") as f2:
        while True:
            chunk1 = f1.read(4096)
            chunk2 = f2.read(4096)
            if chunk1 != chunk2:
                return False
            if not chunk1:  # End of file
                return True

def remove_duplicates(fucked_dir, source_files):
    """Remove files in 'fucked' that are identical to those in 'source_files'."""
    for root, _, files in os.walk(fucked_dir):
        for file in files:
            file_path = os.path.join(root, file)
            for source_file in source_files:
                if files_are_identical(file_path, source_file):
                    print(f"Duplicate found. Removing: {file_path}")
                    os.remove(file_path)
                    break

def main():
    print("Scanning source directory for hashes...")
    source_hashes = get_file_hashes(source_dir)

    print("Scanning 'fucked' directory for duplicates...")
    remove_duplicates(fucked_dir, source_hashes)

    print("Cleanup complete.")

if __name__ == "__main__":
    main()
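remove_duplicates() above walks the dictionary's keys and still compares files byte by byte, so the MD5 values computed by get_file_hashes() are never actually consulted. A hash-first variant, sketched under the same helpers defined in this script:

def remove_duplicates_by_hash(fucked_dir, source_hashes):
    """Delete files in fucked_dir whose MD5 matches any hash in source_hashes."""
    known = set(source_hashes.values())
    for root, _, files in os.walk(fucked_dir):
        for file in files:
            file_path = os.path.join(root, file)
            if hash_file(file_path) in known:
                print(f"Duplicate found. Removing: {file_path}")
                os.remove(file_path)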
File diff suppressed because it is too large
@ -0,0 +1,70 @@
import os, requests, json
from bs4 import BeautifulSoup
from funcs import download_file

def get_data(username):
    url = f"https://www.snapchat.com/add/{username}"
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    data = soup.find("script", id="__NEXT_DATA__")
    data = json.loads(data.string)
    return data

def parse_stories(stories):
    parsed_stories = []
    for story in stories:
        snap_id = story['snapId']['value']
        snap_url = story['snapUrls']['mediaUrl']
        timestamp = story['timestampInSec']['value']
        parsed_stories.append({"media_id": snap_id, "url": snap_url, "timestamp": timestamp})

    return parsed_stories

def get_stories(data):
    stories = data['props']['pageProps']['story']['snapList']

    stories = parse_stories(stories)

    return stories

def get_highlights(data):
    highlights = data['props']['pageProps']['curatedHighlights']
    return highlights

def get_highlight_stories(data):
    highlights = get_highlights(data)
    stories = []
    for highlight in highlights:
        stories.extend(parse_stories(highlight['snapList']))
    return stories

def main():
    directory = "snapchat_stories"
    usernames = ['little.warren1', 'neiima22', 'awesome.nads', 'noordabash', 'aleximarianna']

    for username in usernames:
        print(f"Getting stories for {username}...")

        data = get_data(username)

        print("Getting stories...")
        stories = get_stories(data)

        print("Getting highlights...")
        stories.extend(get_highlight_stories(data))

        for story in stories:
            media_id = story['media_id']
            url = story['url']
            timestamp = story['timestamp']

            filename = f"{media_id}.jpg"
            filepath = os.path.join(directory, filename)

            download_file(url, filepath)

            print(f"Downloaded {filename} at {timestamp}")

if __name__ == "__main__":
    main()
@ -0,0 +1,164 @@
import os
import requests
import json
from bs4 import BeautifulSoup

def get_data(username):
    url = f"https://www.snapchat.com/add/{username}"
    headers = {
        "user-agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/129.0.0.0 Safari/537.36")
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    data_script = soup.find("script", id="__NEXT_DATA__")
    if not data_script:
        print(f"No data found for {username}.")
        return None
    data = json.loads(data_script.string)
    return data

def parse_stories(stories):
    parsed_stories = []
    for story in stories:
        snap_id = story.get('snapId', {}).get('value', '')
        snap_url = story.get('snapUrls', {}).get('mediaUrl', '')
        timestamp = story.get('timestampInSec', {}).get('value', '')
        if snap_url and timestamp and snap_id:
            parsed_stories.append({
                "media_id": snap_id,
                "url": snap_url,
                "timestamp": timestamp
            })
    return parsed_stories

def get_stories(data):
    try:
        stories = data['props']['pageProps']['story']['snapList']
        return parse_stories(stories)
    except KeyError:
        return []

def get_highlights(data):
    highlights = []
    page_props = data.get('props', {}).get('pageProps', {})
    # Possible keys that might contain highlights
    possible_highlight_keys = ['curatedHighlights', 'savedHighlights', 'highlights']
    for key in possible_highlight_keys:
        highlight_data = page_props.get(key, [])
        if highlight_data:
            highlights.extend(highlight_data)
    return highlights

def get_highlight_stories(data):
    stories = []
    highlights = get_highlights(data)
    for highlight in highlights:
        snap_list = highlight.get('snapList', [])

        for snap in snap_list:
            timestamp = snap.get('timestampInSec', {}).get('value', '')
            snap_url = snap.get('snapUrls', {}).get('mediaUrl', '')
            stories.append({
                "media_id": snap.get('snapId', {}).get('value', ''),
                "url": snap_url,
                "timestamp": timestamp
            })

    return stories

def get_existing_media_ids(directory):
    # Get each file's base filename without extension, split it by '~' and take the third field (the media id)
    existing_media_ids = set()
    for root, _, files in os.walk(directory):
        for file in files:
            if '~' not in file:
                continue

            filename, _ = os.path.splitext(file)
            media_id = filename.split('~')[2]
            existing_media_ids.add(media_id)
    return existing_media_ids

def main():
    directory = "snapchat"
    if not os.path.exists(directory):
        os.makedirs(directory)

    usernames = [
        'aleximarianna', 'little.warren1', 'neiima22', 'awesome.nads', 'noordabash',
        'jaynagirl', 'sierracannon', 'stefaniedra6',
        'ciaoxxw', 'nadia-stone', 'c.aitknight', 'aimeejaiii',
        'leonanaomii', 'ratskelet0n',
    ]

    existing_media_ids = get_existing_media_ids(directory)

    for username in usernames:
        print(f"Getting stories for {username}...")
        data = get_data(username)
        if not data:
            continue

        print("Getting stories...")
        stories = get_stories(data)

        print("Getting highlights...")
        stories.extend(get_highlight_stories(data))

        for story in stories:
            media_id = story['media_id']
            url = story['url']
            timestamp = story['timestamp']

            # Check if media already exists
            if media_id in existing_media_ids:
                print(f"Media {media_id} already exists. Skipping download.")
                continue

            # Determine file extension using HEAD request
            response = requests.head(url)
            if response.status_code != 200:
                print(f"Failed to access media {media_id}")
                continue

            content_type = response.headers.get('Content-Type', '')
            if 'image' in content_type:
                extension = '.jpg'
            elif 'video' in content_type:
                extension = '.mp4'
            else:
                print(f"Unknown content type for media {media_id}")
                continue

            if media_id:
                filename = f"{username}~{timestamp}~{media_id}{extension}"
                filepath = os.path.join(directory, filename)
            else:
                media_url_filename = url.split('/')[-1].split('?')[0]
                etag = response.headers.get('ETag', '').replace('"', '')
                filename = f"{username}~{timestamp}-{media_url_filename}~{etag}{extension}"
                filepath = os.path.join(directory, 'highlights', filename)

            # Check if file already exists
            if os.path.exists(filepath):
                print(f"File {filename} already exists. Skipping download.")
                continue

            # Download the media
            response = requests.get(url, stream=True)
            if response.status_code != 200:
                print(f"Failed to download media {media_id}")
                continue

            # Save the file
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)

            print(f"Downloaded {filename} at {timestamp}")

if __name__ == "__main__":
    main()
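A small illustration of the filename convention main() writes and get_existing_media_ids() parses: the media id is the third '~'-separated field of the stem. The values are invented.

import os

filename = "little.warren1~1714000000~abc123.jpg"   # invented example
stem, _ = os.path.splitext(filename)
print(stem.split('~')[2])                           # -> 'abc123'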
@ -0,0 +1,120 @@
from BunnyCDN.Storage import Storage
from datetime import datetime
import os, config, funcs, cv2
from PIL import Image


def UploadMedia(media):
    username = media['username']
    timestamp = media['timestamp']
    filepath = media['filepath']
    thumbnail_url = None
    phash = None

    filename = os.path.basename(filepath)
    file_extension = os.path.splitext(filename)[1].lower()

    if filename in existing_files:
        print('Duplicate file detected. Removing...')
        os.remove(filepath)
        return True


    media_type = funcs.get_media_type(filename)

    file_hash = funcs.calculate_file_hash(filepath)

    if '-' in timestamp:
        timestamp = timestamp.split('-')[0]
    post_date = datetime.fromtimestamp(int(timestamp)) if timestamp else datetime.now()

    width, height = funcs.get_video_dimensions(filepath) if media_type == 'video' else Image.open(filepath).size

    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0  # slower

    if media_type == 'video':
        try:
            thumbPath = f'temp/{file_hash}.jpg'
            cap = cv2.VideoCapture(filepath)
            ret, frame = cap.read()
            cv2.imwrite(thumbPath, frame)
            cap.release()
            obj_storage.PutFile(thumbPath, f'thumbnails/{file_hash}.jpg')  # slower
            thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{file_hash}.jpg"
            phash = funcs.generate_phash(thumbPath)
            os.remove(thumbPath)
        except:
            print('Error generating thumbnail. Skipping...')
            return False
    elif media_type == 'image':
        phash = funcs.generate_phash(filepath)

    newFilename = f'{file_hash}{file_extension}'
    server_path = f'media/snaps/{username}/{newFilename}'

    file_url = f"https://storysave.b-cdn.net/{server_path}"

    obj_storage.PutFile(filepath, server_path)  # slow as fuck

    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, hash, filename, duration, thumbnail, phash, platform) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    values = (username, media_type, file_url, width, height, 'story', post_date, file_hash, filename, duration, thumbnail_url, phash, 'snapchat')

    newCursor.execute(query, values)  # slower
    newDB.commit()
    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')

    os.remove(filepath)

    return True

def get_media_data(filepath):
    filename = os.path.basename(filepath)
    parts = filename.split('~')
    if len(parts) < 3:
        return False

    username = parts[0]
    timestamp = parts[1]

    data = {'username': username, 'timestamp': timestamp, 'filepath': filepath}

    return data

def get_media(folder_path):
    medias = []

    for root, dirs, files in os.walk(folder_path):
        for filename in files:
            filepath = os.path.join(root, filename)

            data = get_media_data(filepath)
            if data:
                medias.append(data)

    return medias

def dump(folder_path):
    medias = get_media(folder_path)

    for media in medias:
        UploadMedia(media)

if __name__ == '__main__':
    print('Starting processing...')

    directory = 'snapchat/'

    if not os.listdir(directory):
        print('No files to process. Exiting...')
        exit()

    newDB, newCursor = config.gen_connection()

    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

    newCursor.execute("SELECT filename FROM media WHERE filename IS NOT NULL AND platform = 'snapchat'")
    existing_files = [image[0] for image in newCursor.fetchall()]

    dump(directory)

    print("Processing completed.")
@ -0,0 +1,141 @@
from BunnyCDN.Storage import Storage
from datetime import datetime
import os, config, funcs, cv2
from PIL import Image


def UploadMedia(media):
    media_id = media['media_id']
    username = media['username']
    post_date = media['timestamp']
    user_id = media['user_id']
    filepath = media['filepath']
    highlight_id = media['highlight_id']
    post_type = media['post_type']
    thumbnail_url = None
    phash = None

    if media_id and int(media_id) in existing_files:
        print('Duplicate file detected. Removing...')
        os.remove(filepath)
        return True

    filename = os.path.basename(filepath)
    file_extension = os.path.splitext(filename)[1].lower()

    media_type = funcs.get_media_type(filename)

    file_hash = funcs.calculate_file_hash(filepath)

    width, height = funcs.get_video_dimensions(filepath) if media_type == 'video' else Image.open(filepath).size

    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0  # slower

    if media_type == 'video':
        try:
            thumbPath = f'temp/{media_id}.jpg'
            cap = cv2.VideoCapture(filepath)
            ret, frame = cap.read()
            cv2.imwrite(thumbPath, frame)
            cap.release()
            obj_storage.PutFile(thumbPath, f'thumbnails/{media_id}.jpg')  # slower
            thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{media_id}.jpg"
            phash = funcs.generate_phash(thumbPath)
            os.remove(thumbPath)
        except:
            print('Error generating thumbnail. Skipping...')
            return False
    elif media_type == 'image':
        phash = funcs.generate_phash(filepath)

    if media_id:
        newFilename = f'{media_id}{file_extension}'
    else:
        newFilename = f'{file_hash}{file_extension}'

    server_path = f'media/{post_type}/{username}/{newFilename}'

    file_url = f"https://storysave.b-cdn.net/{server_path}"

    obj_storage.PutFile(filepath, server_path)  # slow as fuck

    if highlight_id:
        newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)", (highlight_id, user_id, media_id))
        newDB.commit()
        print(f'[{newCursor.rowcount}] added highlight {highlight_id} to user {user_id}')

    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, hash, filename, duration, thumbnail, phash, platform) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    values = (username, media_type, file_url, width, height, media_id, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash, 'instagram')

    newCursor.execute(query, values)  # slower
    newDB.commit()
    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')

    os.remove(filepath)

    return True

def get_user_id(username):
    username = username.lower()
    if username in existing_users:
        return existing_users[username]

    return None

def get_media():
    medias = []
    post_types = {
        'posts': 'post',
        'stories': 'story',
        'profile': 'profile',
    }

    for post_type in os.listdir('media'):
        users = os.listdir(f'media/{post_type}')
        for user in users:
            user_path = f'media/{post_type}/{user}'
            for filename in os.listdir(user_path):
                data = {}
                filepath = os.path.join(user_path, filename)

                data['post_type'] = post_types[post_type]
                data['username'] = user
                data['timestamp'] = filename.split('__')[-1].split('.')[0] if 'com.instagram.android__' in filename else datetime.now()
                if 'com.instagram.android__' in filename:
                    data['timestamp'] = datetime.strptime(data['timestamp'], '%Y%m%d%H%M%S%f')
                data['filepath'] = filepath
                data['media_id'] = None
                data['user_id'] = get_user_id(data['username'])
                data['highlight_id'] = None
                medias.append(data)

    return medias

def dump_instagram():
    medias = get_media()

    for media in medias:
        UploadMedia(media)
        existing_files.append(media['media_id'])


if __name__ == '__main__':
    print('Starting processing...')

    if not os.listdir('storysaver/'):
        print('No files to process. Exiting...')
        exit()

    newDB, newCursor = config.gen_connection()

    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

    newCursor.execute("SELECT media_id FROM media WHERE media_id IS NOT NULL")
    existing_files = [image[0] for image in newCursor.fetchall()]

    newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL")
    existing_users = {user[0].lower(): user[1].lower() for user in newCursor.fetchall()}

    dump_instagram()

    print("Processing completed.")
@ -0,0 +1,38 @@
import os


# file name : masstik_caammmyyy_1310_655_going blonde wednesdayyyy.mp4
# file name : masstiktok_aleksandraverse__#fyp #trending #viral #foryou.mp4
# where the first item is prefix, second is username and after those is the tiktok title

processed_dir = 'processed_tiktoks'
os.makedirs(processed_dir, exist_ok=True)

users = os.listdir('tiktoks')

for user in users:
    files = os.path.join('tiktoks', user)
    for file in os.listdir(files):
        if 'masstik' not in file and 'masstiktok' not in file:
            print(f"Skipping {file}")
            continue

        filepath = os.path.join(files, file)
        file_ext = os.path.splitext(file)[1]
        data = file.split('_')
        prefix = data[0]
        username = data[1]
        username = username.replace('@', '')
        title = ' '.join(data[2:])
        title = os.path.splitext(title)[0]

        print("="*100)
        title = title.encode('utf-8', 'ignore').decode('utf-8')
        print(f"Prefix: {prefix}\nUsername: {username}\nTitle: {title}")
        print("="*100)

        # file_ext already includes the leading dot, so don't add another one
        new_filename = f"{username}~{title}{file_ext}"
        new_filepath = os.path.join(processed_dir, new_filename)

        os.rename(filepath, new_filepath)
        print(f"Renamed {file} to {new_filename}")
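Tracing the first example filename from the comments above through the split logic (and the corrected rename format):

import os

file = "masstik_caammmyyy_1310_655_going blonde wednesdayyyy.mp4"
data = file.split('_')
prefix = data[0]                                    # 'masstik'
username = data[1].replace('@', '')                 # 'caammmyyy'
title = os.path.splitext(' '.join(data[2:]))[0]     # '1310 655 going blonde wednesdayyyy'
file_ext = os.path.splitext(file)[1]                # '.mp4'
print(f"{username}~{title}{file_ext}")              # caammmyyy~1310 655 going blonde wednesdayyyy.mp4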
@ -0,0 +1,109 @@
from BunnyCDN.Storage import Storage
from datetime import datetime
import os, config, funcs, cv2
from PIL import Image

directory = 'ready_to_upload/'

def UploadMedia(username, user_id, filepath):
    thumbnail_url = None
    phash = None

    filename = os.path.basename(filepath)
    file_extension = os.path.splitext(filename)[1].lower()

    media_type = funcs.get_media_type(filename)

    post_type = funcs.determine_post_type(filepath, media_type)
    if not post_type:
        print(f'Error determining post type for {filename}. Skipping...')
        return False

    file_hash = funcs.calculate_file_hash(filepath)

    post_date = datetime.now()

    width, height = funcs.get_video_dimensions(filepath) if media_type == 'video' else Image.open(filepath).size

    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0  # slower

    if media_type == 'video':
        try:
            thumbPath = f'temp/{file_hash}.jpg'
            cap = cv2.VideoCapture(filepath)
            ret, frame = cap.read()
            cv2.imwrite(thumbPath, frame)
            cap.release()
            obj_storage.PutFile(thumbPath, f'thumbnails/{file_hash}.jpg')  # slower
            thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{file_hash}.jpg"
            phash = funcs.generate_phash(thumbPath)
            os.remove(thumbPath)
        except:
            print('Error generating thumbnail. Skipping...')
            return False
    elif media_type == 'image':
        phash = funcs.generate_phash(filepath)

    newFilename = f'{file_hash}{file_extension}'
    server_path = f'media/{post_type}/{username}/{newFilename}'

    file_url = f"https://storysave.b-cdn.net/{server_path}"

    obj_storage.PutFile(filepath, server_path)  # slow as fuck

    post_type = 'story' if post_type == 'stories' else 'post'
    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, user_id, hash, filename, duration, thumbnail, phash, platform) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    values = (username, media_type, file_url, width, height, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash, 'instagram')

    newCursor.execute(query, values)  # slower
    newDB.commit()
    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')

    os.remove(filepath)

    return True

def get_user_id(username):
    username = username.lower()
    if username in existing_users:
        return existing_users[username]

    return None

def get_media(folder_path):
    medias = []

    for user_folder in os.listdir(folder_path):
        files = os.listdir(os.path.join(folder_path, user_folder))
        for filename in files:
            filepath = os.path.join(folder_path, user_folder, filename)
            media = {
                'username': user_folder,
                'filepath': filepath,
                'user_id': get_user_id(user_folder)
            }

            medias.append(media)

    return medias

def dump_instagram(folder_path):
    medias = get_media(folder_path)

    for media in medias:
        UploadMedia(media['username'], media['user_id'], media['filepath'])


if __name__ == '__main__':
    print('Starting processing...')

    newDB, newCursor = config.gen_connection()

    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

    newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL")
    existing_users = {user[0].lower(): user[1].lower() for user in newCursor.fetchall()}

    dump_instagram(directory)

    print("Processing completed.")