main
oscar 11 months ago
parent 93c9d660f9
commit 322e39b51f

BIN
.DS_Store vendored

Binary file not shown.

1
.gitignore vendored

@ -1,6 +1,7 @@
# python files
*.pyc
__pycache__
*.DS_Store
# Content
storysaver

@ -0,0 +1,109 @@
import os, config, funcs, cv2, imagehash
from PIL import Image
directory = "old_snapchats"
duplicate_dir = 'dupelicate_snaps'
def generate_video_phash(filepath):
    """Return the perceptual hash (as a string) of a video's first frame.

    Opens *filepath* with OpenCV, grabs the first frame, converts it from
    BGR to RGB, and computes an imagehash pHash on it.

    Returns None when the video cannot be opened, has no readable frame,
    or hashing fails for any other reason.
    """
    cap = None
    try:
        cap = cv2.VideoCapture(filepath)
        ret, frame = cap.read()
        if not ret:
            return None
        # OpenCV decodes frames as BGR; PIL expects RGB.
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        return str(imagehash.phash(Image.fromarray(rgb)))
    except Exception:
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit;
        # catch Exception instead and treat any failure as "no hash".
        return None
    finally:
        # Release the capture handle even when read/convert/hash raises
        # (the original leaked it on any exception after VideoCapture()).
        if cap is not None:
            cap.release()
def get_snapchat_files():
    """Collect media metadata dicts for every file under the stories directory."""
    entries = (get_media_data(path) for path in funcs.get_files(directory))
    # get_media_data returns False for names it cannot parse; drop those.
    return [entry for entry in entries if entry]
def get_media_data(filepath):
    """Parse a snapchat media filename into a metadata dict.

    Filenames follow ``<username>~<timestamp>~<snap_id>.<ext>``.

    Returns a dict with keys ``username``, ``timestamp``, ``filepath``,
    ``snap_id`` (always None here — this variant stores the filename id as
    the *original* id) and ``original_snap_id`` (the third filename part
    with its extension stripped). Returns False when the filename does not
    contain all three ``~``-separated parts, so callers can filter it out.
    """
    filename = os.path.basename(filepath)
    parts = filename.split('~')
    if len(parts) < 3:
        # Not a snapchat-style filename; caller filters falsy entries.
        return False
    username = parts[0]
    timestamp = parts[1]
    # The third part carries the snap id plus the file extension.
    original_snap_id = os.path.splitext(parts[2])[0]
    return {
        'username': username,
        'timestamp': timestamp,
        'filepath': filepath,
        'snap_id': None,
        'original_snap_id': original_snap_id,
    }
def process_snap_ids(filenames):
    """Extract unique snap ids from filenames, preserving first-seen order.

    Each filename is expected to look like ``user~timestamp~snapid.ext``;
    the third ``~``-separated part (extension stripped) is the snap id.
    Raises IndexError for filenames with fewer than three parts, matching
    the original behavior.
    """
    snap_ids = []
    seen = set()  # O(1) membership test instead of rescanning the list
    for filename in filenames:
        snap_id = os.path.splitext(filename.split('~')[2])[0]
        if snap_id not in seen:
            seen.add(snap_id)
            snap_ids.append(snap_id)
    return snap_ids
def find_duplicate_snap(existing_snaps, current_snap):
    """Return the first DB row that duplicates *current_snap*, else False.

    ``existing_snaps`` rows are tuples laid out as
    ``(id, filename, username, hash, phash, original_snap_id)`` — see the
    SELECT in the entry point. For rows with the same username, a snap is
    considered a duplicate when its original id appears in the stored
    filename, equals the stored original_snap_id, or its file hash or
    perceptual hash matches the stored one.
    """
    filepath = current_snap['filepath']
    original_snap_id = current_snap['original_snap_id']
    username = current_snap['username']
    snap_hash = funcs.calculate_file_hash(filepath)
    # Perceptual hash: first video frame for .mp4, the image itself for .jpg.
    # Default to None for any other extension — the original left `phash`
    # unbound in that case and crashed with NameError in the loop below.
    phash = None
    if filepath.endswith('.mp4'):
        phash = generate_video_phash(filepath)
    elif filepath.endswith('.jpg'):
        phash = funcs.generate_phash(filepath)
    for snap in existing_snaps:
        if username != snap[2]:
            continue
        if original_snap_id in snap[1]:
            return snap
        if original_snap_id == snap[5]:
            return snap
        if snap_hash == snap[3]:
            return snap
        # Guard against a None == None false match when neither side has a
        # perceptual hash available.
        if phash is not None and phash == snap[4]:
            return snap
    return False
if __name__ == '__main__':
    # Deduplication pass: check every file in the old_snapchats folder
    # against the media table (platform = 'snapchat') and move any
    # duplicates into `duplicate_dir` so only new snaps remain.
    print('Starting snappy...')
    db, cursor = config.gen_connection()
    obj_storage = config.get_storage()
    # Column order matters: find_duplicate_snap indexes rows as
    # (id, filename, username, hash, phash, original_snap_id).
    cursor.execute("SELECT id, filename, username, hash, phash, original_snap_id FROM media WHERE filename IS NOT NULL AND platform = 'snapchat'")
    existing_medias = cursor.fetchall()
    # NOTE: the original called get_snapchat_files() twice and discarded
    # the first result; one call is enough.
    snap_files = get_snapchat_files()
    os.makedirs(duplicate_dir, exist_ok=True)
    for story in snap_files:
        print(f"Processing {story['username']}...")
        original_snap_id = story['original_snap_id']
        # Check whether this snap is already recorded in the database.
        existing_snap = find_duplicate_snap(existing_medias, story)
        if existing_snap:
            print(f"Snap {original_snap_id} already exists in the database.")
            # Move the duplicate file out of the working directory.
            new_filename = os.path.basename(story['filepath'])
            new_filepath = os.path.join(duplicate_dir, new_filename)
            os.rename(story['filepath'], new_filepath)
    print("Processing completed.")

@ -29,9 +29,6 @@ def get_files(directory):
files.append(os.path.join(root, filename))
return files
import cv2
import numpy as np
def compare_images(image_path1, image_path2):
# Load the images in grayscale
img1 = cv2.imread(image_path1, cv2.IMREAD_GRAYSCALE)

@ -52,6 +52,29 @@ def get_file_extension(url):
else:
print(f"Unknown content type for media {url}")
return None
def extract_file_type(url):
    """Map the numeric type code embedded in a snap CDN url to an extension.

    The last path segment of the url looks like ``<name>.<type_number>``;
    known type numbers map to '.jpg' or '.mp4'. Returns None for unknown
    numbers; prints a warning and returns None when the segment carries no
    ``.<number>`` suffix at all.
    """
    file_types = {
        '400': '.jpg',
        '1322': '.mp4',
        '1325': '.mp4',
        '1034': '.mp4',
        '1023': '.jpg',
    }
    base_url = url.split("?")[0]  # Remove query string
    data_parts = base_url.split('/')[-1].split('.')
    if len(data_parts) < 2:
        print(f"Unexpected URL format: {base_url}")
        return None
    # Unknown type numbers fall through to None, same as the original.
    return file_types.get(data_parts[1])
def download_media(url, filepath):
if os.path.exists(filepath):
@ -112,9 +135,10 @@ def main():
# Determine file extension using HEAD request.
# TODO: find a better way to determine file extension without downloading the file.
extension = get_file_extension(url)
extension = extract_file_type(url)
if not extension:
continue
filename = f"{username}~{timestamp}~{snap_id}{extension}"
filepath = os.path.join(directory, filename)

@ -92,4 +92,4 @@ def get_highlight_stories(data):
story = parse_story(snap)
stories.append(story)
return stories
return stories

@ -21,7 +21,6 @@ def archive_data(data, username):
data_filepath = os.path.join(data_directory, data_filename)
with open(data_filepath, 'w') as f:
f.write(json.dumps(data))
print(f"Archived data for {username} at {data_filepath}")
def get_file_extension(url):
response = requests.head(url)
@ -64,7 +63,7 @@ def extract_file_type(url):
def download_media(url, filepath):
if os.path.exists(filepath):
print(f"File {filepath} already exists. Skipping download.")
# File already exists, skip download and return the filepath as if it was downloaded.
return filepath
response = requests.get(url)
@ -76,55 +75,6 @@ def download_media(url, filepath):
f.write(response.content)
return filepath
def get_all_stories(usernames):
    """Gather story/highlight media dicts (ready for download) for each username.

    Fetches all users' data in one batch, archives each user's raw data,
    then builds one media dict per story with the local filepath derived
    from ``<username>~<timestamp>~<snap_id><ext>``.
    """
    snapchat_users_data = get_all_users_data(usernames)
    all_stories = []
    for username in usernames:
        print(f"Getting stories for {username}...")
        data = snapchat_users_data.get(username)
        if not data:
            # No data returned for this user; move on to the next one.
            print(f"Failed to get data for {username}. Skipping.")
            continue
        archive_data(data, username)
        print("Getting stories...")
        stories = get_stories(data)
        print("Getting highlights...")
        stories.extend(get_highlight_stories(data))
        for story in stories:
            snap_id = story['snap_id']
            url = story['url']
            timestamp = story['timestamp']
            # Determine file extension using HEAD request.
            extension = extract_file_type(url)
            if not extension:
                print(f"Failed to determine file extension for {url}. Skipping.")
                continue
            filename = f"{username}~{timestamp}~{snap_id}{extension}"
            filepath = os.path.join(directory, filename)
            media = {
                'username': username,
                'timestamp': timestamp,
                'filepath': filepath,
                'snap_id': snap_id,
                'original_snap_id': story['original_snap_id'],
                'media_url': url,
            }
            all_stories.append(media)
            print(f"Media {snap_id} ready for download.")
        # NOTE(review): this also extends with the RAW story dicts after the
        # per-story media dicts were already appended above — looks like it
        # duplicates every story in a different shape; confirm intent.
        all_stories.extend(stories)
    return all_stories
def get_snapchat_stories():
os.makedirs(directory, exist_ok=True)
os.makedirs(data_directory, exist_ok=True)
@ -149,10 +99,8 @@ def get_snapchat_stories():
archive_data(data, username)
print("Getting stories...")
stories = get_stories(data)
print("Getting highlights...")
stories.extend(get_highlight_stories(data))
for story in stories:
@ -162,7 +110,7 @@ def get_snapchat_stories():
duplicate_snap = find_duplicate_snap(existing_medias, snap_id, username)
if duplicate_snap:
print(f"Media {snap_id} already exists. Skipping download.")
# Snap already exists in the database
continue
# Determine file extension using HEAD request.
@ -191,11 +139,25 @@ def get_snapchat_stories():
return ready_stories
def get_snapchat_files():
    """Build a metadata dict for every parseable file in the stories directory."""
    parsed = []
    for filepath in funcs.get_files(directory):
        data = get_media_data(filepath)
        # get_media_data returns False for unparseable names; skip those.
        if data:
            parsed.append(data)
    return parsed
def main():
    """Fetch fresh stories, merge in files already on disk, then download them."""
    stories = get_snapchat_stories() + get_snapchat_files()
    download_stories(stories)
def download_stories(stories):
for story in stories:
# Download the media
filepath = story['filepath']
url = story['media_url'] if 'media_url' in story else None
url = story['media_url']
filename = os.path.basename(filepath)
timestamp = story['timestamp']
@ -209,17 +171,6 @@ def download_stories(stories):
UploadMedia(story)
def main():
    """Collect stories from the API and from local files, then download them."""
    ready_stories = get_snapchat_stories()
    # Parse local files into media dicts, dropping names that don't parse
    # (get_media_data returns False for those).
    local_stories = [s for s in map(get_media_data, funcs.get_files(directory)) if s]
    ready_stories.extend(local_stories)
    download_stories(ready_stories)
def UploadMedia(media):
username = media['username']
timestamp = media['timestamp']
@ -288,7 +239,8 @@ def get_media_data(filepath):
snap_id = parts[2]
snap_id = os.path.splitext(snap_id)[0]
data = {'username': username, 'timestamp': timestamp, 'filepath': filepath, 'snap_id': snap_id, 'original_snap_id': None}
data = {'username': username, 'timestamp': timestamp, 'filepath': filepath, 'snap_id': snap_id, 'original_snap_id': None, 'media_url': None}
# data = {'username': username, 'timestamp': timestamp, 'filepath': filepath, 'snap_id': None, 'original_snap_id': snap_id, 'media_url': None}
return data

@ -120,7 +120,7 @@ def dump_instagram(folder_path):
if __name__ == '__main__':
print('Starting processing...')
if not os.listdir(directory):
print('No files to process. Exiting...')
exit()

@ -10,10 +10,6 @@ def is_valid_uuid(uuid_to_test, version=4):
return str(uuid_obj) == uuid_to_test
# file name : masstik_caammmyyy_1310_655_going blonde wednesdayyyy.mp4
# file name : masstiktok_aleksandraverse__#fyp #trending #viral #foryou.mp4
# where the first item is prefix, second is username and after those is the tiktok title
source_dir = 'tiktoks/'
processed_dir = 'processed_tiktoks'

@ -2,7 +2,7 @@ from datetime import datetime
import os, config, funcs, cv2
from uuid import uuid4
directory = 'ready_to_upload'
directory = 'ready_for_upload/instagram'
def UploadMedia(username, user_id, filepath):
thumbnail_url = None
@ -80,8 +80,12 @@ def get_user_id(username):
def get_media(folder_path):
medias = []
for user_folder in os.listdir(folder_path):
files = os.listdir(os.path.join(folder_path, user_folder))
user_folders = os.listdir(folder_path)
for user_folder in user_folders:
user_folder_path = os.path.join(folder_path, user_folder)
if not os.path.isdir(user_folder_path):
continue
files = os.listdir(user_folder_path)
for filename in files:
filepath = os.path.join(folder_path, user_folder, filename)
media = {

Loading…
Cancel
Save