"""Upload processed TikTok media files to object storage and record them in the DB.

Run as a script: every file found under ``processed_tiktoks/<user>/`` whose name
matches the ``username~title[~tiktok_id]`` pattern is hashed, uploaded and
inserted into the ``media`` table, then removed from the local disk.

The module-level names ``newDB``, ``newCursor``, ``obj_storage`` and
``existing_hashes`` are created in the ``__main__`` section below and are read
as globals by ``UploadMedia``.
"""
from datetime import datetime
from uuid import uuid4
import os
import sys

import cv2

import config
import funcs

directory = 'processed_tiktoks'

# Scratch directory for thumbnails; files in it are deleted right after upload.
_TEMP_DIR = 'temp'


def UploadMedia(media):
    """Upload one media file and insert its row into the ``media`` table.

    Args:
        media: dict with at least the keys ``'username'`` and ``'filepath'``
            (the shape produced by ``get_media_data``).

    Returns:
        True when the file was uploaded, recorded and removed locally;
        False when it was skipped (unknown media/post type, hash already
        known, or thumbnail generation failed).

    Raises:
        OSError: if ``filepath`` does not exist (from ``os.path.getsize``).
        Any exception raised by the storage upload or the DB insert propagates.
    """
    platform = 'TikTok'
    username = media['username']
    filepath = media['filepath']

    file_size = os.path.getsize(filepath)
    thumbnail_url = None
    phash = None

    filename = os.path.basename(filepath)
    file_extension = os.path.splitext(filename)[1].lower()

    media_type = funcs.get_media_type(filename)
    if not media_type:
        print(f'Error determining media type for {filename}. Skipping...')
        return False

    post_type = funcs.determine_post_type(filepath)
    if not post_type:
        print(f'Error determining post type for {filename}. Skipping...')
        return False

    file_hash = funcs.calculate_file_hash(filepath)
    if file_hash in existing_hashes:
        print(f'File {filename} already exists. Skipping...')
        return False

    post_date = datetime.now()
    width, height = funcs.get_media_dimensions(filepath)
    duration = funcs.get_video_duration(filepath)

    if media_type == 'image':
        phash = funcs.generate_phash(filepath)
    elif media_type == 'video':
        try:
            thumbnail_url, phash = _upload_thumbnail(filepath, file_hash)
        except Exception as exc:
            # Catch Exception rather than everything so Ctrl-C / SystemExit
            # still stop the run, and report why the thumbnail failed.
            print(f'Error generating thumbnail. Skipping... ({exc})')
            return False

    newFilename = f'{file_hash}{file_extension}'
    server_path = f'media/tiktoks/{username}/{newFilename}'
    file_url = f"https://storysave.b-cdn.net/{server_path}"

    # NOTE: the storage upload is the slowest step of the pipeline.
    obj_storage.PutFile(filepath, server_path)

    post_type = 'story' if post_type == 'stories' else 'post'
    query = (
        "INSERT IGNORE INTO media (username, media_type, media_url, width, height, "
        "post_type, date, hash, filename, duration, thumbnail, phash, platform, file_size) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    )
    values = (
        username, media_type, file_url, width, height, post_type, post_date,
        file_hash, filename, duration, thumbnail_url, phash, platform, file_size,
    )

    newCursor.execute(query, values)
    newDB.commit()
    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')

    # Remember the hash so an identical file later in this same run is skipped
    # like any other known duplicate instead of being uploaded a second time.
    existing_hashes.add(file_hash)

    os.remove(filepath)

    return True


def _upload_thumbnail(filepath, file_hash):
    """Create, upload and hash a thumbnail for a video; return ``(url, phash)``.

    The temporary thumbnail file is always deleted, even when the upload or
    the perceptual-hash step raises.
    """
    thumb_path = generate_thumbnail(filepath)
    try:
        # NOTE: the remote name is derived from the file hash, so two files
        # with the same hash would share (and overwrite) one thumbnail.
        obj_storage.PutFile(thumb_path, f'thumbnails/{file_hash}.jpg')
        phash = funcs.generate_phash(thumb_path)
    finally:
        os.remove(thumb_path)
    return f"https://storysave.b-cdn.net/thumbnails/{file_hash}.jpg", phash


def generate_thumbnail(filepath):
    """Write the first frame of the video at ``filepath`` to a temp JPEG.

    Returns:
        Path of the written thumbnail (``temp/<uuid4>.jpg``).

    Raises:
        ValueError: if no frame could be read from the video.
        OSError: if the thumbnail could not be written.
    """
    # cv2.imwrite does not create missing directories; it just reports failure.
    os.makedirs(_TEMP_DIR, exist_ok=True)
    thumb_path = f'{_TEMP_DIR}/{uuid4()}.jpg'

    cap = cv2.VideoCapture(filepath)
    try:
        ret, frame = cap.read()
    finally:
        cap.release()

    if not ret or frame is None:
        raise ValueError(f'Could not read a frame from {filepath}')
    if not cv2.imwrite(thumb_path, frame):
        raise OSError(f'Could not write thumbnail to {thumb_path}')

    return thumb_path


def get_media_data(filepath):
    """Parse ``username~title[~tiktok_id]`` out of the file name.

    Returns:
        A dict with keys ``'username'``, ``'filepath'``, ``'tiktok_id'`` and
        ``'title'`` (``tiktok_id`` is None for two-part names), or False when
        the name does not split into two or three ``~``-separated parts.

    NOTE(review): the split is done on the full base name, so the last part
    still carries the file extension — confirm whether callers expect that.
    """
    filename = os.path.basename(filepath)
    parts = filename.split('~')

    if len(parts) == 3:
        username, title, tiktok_id = parts
    elif len(parts) == 2:
        username, title = parts
        tiktok_id = None
    else:
        return False

    data = {'username': username, 'filepath': filepath, 'tiktok_id': tiktok_id, 'title': title}

    return data


def get_media(folder_path):
    """Collect media dicts for every parsable file in each user sub-folder.

    Entries of ``folder_path`` that are not directories are reported and
    skipped; files whose names ``get_media_data`` rejects are ignored.
    """
    medias = []

    for user in os.listdir(folder_path):
        user_folder = os.path.join(folder_path, user)

        if not os.path.isdir(user_folder):
            print(f"Skipping {user}")
            continue

        for filename in os.listdir(user_folder):
            filepath = os.path.join(user_folder, filename)

            data = get_media_data(filepath)
            if data:
                medias.append(data)

    return medias


def dump_instagram(folder_path):
    """Upload every media file found under ``folder_path``.

    Despite the name, this drives the TikTok upload via ``UploadMedia``.
    """
    for media in get_media(folder_path):
        UploadMedia(media)


if __name__ == '__main__':
    print('Starting processing...')

    # A missing directory is treated the same as an empty one.
    if not os.path.isdir(directory) or not os.listdir(directory):
        print('No files to process. Exiting...')
        sys.exit()

    newDB, newCursor = config.gen_connection()

    obj_storage = config.get_storage()

    newCursor.execute("SELECT hash FROM media WHERE hash IS NOT NULL AND platform = 'TikTok'")
    # A set gives O(1) duplicate checks and lets UploadMedia add new hashes.
    existing_hashes = {row[0] for row in newCursor.fetchall()}

    dump_instagram(directory)

    print("Processing completed.")