From 73889be10ea1935ed00a50f22bfe142720c01a15 Mon Sep 17 00:00:00 2001
From: oscar <>
Date: Fri, 29 Nov 2024 14:22:40 -0800
Subject: [PATCH] update

---
Review notes (this section is ignored when the patch is applied):
- dump_instagram.py: duplicates that were deleted from disk are no longer
  handed to UploadMedia(); thumbnail generation checks the decoded frame,
  creates temp/ if needed and always releases the capture.
- fixresolution.py / fixthumbnails.py: rows that cannot be measured or
  decoded are skipped instead of crashing or reusing stale values.
- Only added ('+') lines were changed; context and removed lines still
  describe the same pre-image.

 .gitignore        |  5 +++-
 dump_instagram.py | 80 +++++++++++++++++++++++++++++++++++++-------
 fixresolution.py  | 54 +++++++++++++++++++++++++++++
 fixthumbnails.py  | 81 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 207 insertions(+), 13 deletions(-)
 create mode 100644 fixresolution.py
 create mode 100644 fixthumbnails.py

diff --git a/.gitignore b/.gitignore
index 4180796..9c5186c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,7 @@
 # Content
 storysaver/
 facebook/
-media/
\ No newline at end of file
+media/
+cache/
+temp/
+*.pyc
\ No newline at end of file
diff --git a/dump_instagram.py b/dump_instagram.py
index c504526..4376d4a 100644
--- a/dump_instagram.py
+++ b/dump_instagram.py
@@ -1,15 +1,21 @@
 from BunnyCDN.Storage import Storage
 from datetime import datetime
-import os, config, funcs
+import os, config, funcs, cv2
 from PIL import Image
 
 
-def UploadMedia(filepath, username, media_id, timestamp = None, user_id = None):
-    if media_id and int(media_id) in existing_files:
-        print('Duplicate file detected. Removing...')
-        os.remove(filepath)
-        return True
-
+def UploadMedia(media):
+    """Upload one media file described by a getMedias() dict.
+
+    media must provide 'media_id', 'username', 'timestamp', 'user_id' and
+    'filepath'. Returns False when a video thumbnail cannot be produced.
+    """
+    media_id = media['media_id']
+    username = media['username']
+    timestamp = media['timestamp']
+    user_id = media['user_id']
+    filepath = media['filepath']
+
     filename = os.path.basename(filepath)
     file_extension = os.path.splitext(filename)[1].lower()
 
@@ -25,6 +31,26 @@ def UploadMedia(filepath, username, media_id, timestamp = None, user_id = None):
 
     width, height = funcs.get_video_dimensions(filepath) if media_type == 'video' else Image.open(filepath).size
 
+    thumbnail_url = None
+    if media_type == 'video':
+        thumbPath = f'temp/{media_id}.jpg'
+        cap = cv2.VideoCapture(filepath)
+        try:
+            # cv2.imwrite() does not create missing directories.
+            os.makedirs('temp', exist_ok=True)
+            ret, frame = cap.read()
+            # read() returns (False, None) when no frame could be decoded.
+            if not ret or not cv2.imwrite(thumbPath, frame):
+                raise ValueError(f'could not extract a frame from {filepath}')
+            obj_storage.PutFile(thumbPath, f'thumbnails/{media_id}.jpg')
+            thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{media_id}.jpg"
+        except Exception as e:
+            # Best-effort boundary: report the cause instead of hiding it.
+            print(f'Error generating thumbnail. Skipping... ({e})')
+            return False
+        finally:
+            cap.release()
+
     server_path = f'media/{post_type}/{username}/{media_id}{file_extension}'
     file_url = f"https://storysave.b-cdn.net/{server_path}"
 
@@ -47,8 +73,8 @@ def UploadMedia(filepath, username, media_id, timestamp = None, user_id = None):
 
     obj_storage.PutFile(filepath, server_path)
 
-    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, hash, filename, duration) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
-    values = (username, media_type, file_url, width, height, media_id, post_type, post_date, user_id, file_hash, filename, duration)
+    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, hash, filename, duration, thumbnail) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
+    values = (username, media_type, file_url, width, height, media_id, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url)
 
     newCursor.execute(query, values)
     newDB.commit()
@@ -58,8 +84,12 @@ def UploadMedia(filepath, username, media_id, timestamp = None, user_id = None):
 
     return True
 
-
-def dump_instagram(folder_path):
+def getMedias(folder_path):
+    """Parse the '~'-separated file names in folder_path into media dicts.
+
+    Files whose media_id is missing or not an integer are skipped.
+    """
+    medias = []
     for filename in os.listdir(folder_path):
         parts = filename.split('~')
         if len(parts) < 4:
@@ -71,7 +101,33 @@ def dump_instagram(folder_path):
         user_id = parts[3].split('_')[-1].split('.')[0]
 
         filepath = os.path.join(folder_path, filename)
-        UploadMedia(username=username, filepath=filepath, media_id=media_id, timestamp=timestamp, user_id=user_id)
+
+        # An empty or non-numeric media_id cannot be matched against existing_files.
+        try:
+            media_id = int(media_id)
+        except (TypeError, ValueError):
+            print(f'Invalid media_id for file {filename}. Skipping...')
+            continue
+
+        medias.append({
+            'username': username,
+            'timestamp': timestamp,
+            'media_id': media_id,
+            'user_id': user_id,
+            'filepath': filepath,
+        })
+    return medias
+
+
+def dump_instagram(folder_path):
+    """Upload every new media file in folder_path and delete known duplicates."""
+    for media in getMedias(folder_path):
+        if media['media_id'] in existing_files:
+            print('Duplicate file detected. Removing...')
+            os.remove(media['filepath'])
+            # The file is gone, so it must not be handed to UploadMedia().
+            continue
+        UploadMedia(media)
 
 
 if __name__ == '__main__':
diff --git a/fixresolution.py b/fixresolution.py
new file mode 100644
index 0000000..4fa15e9
--- /dev/null
+++ b/fixresolution.py
@@ -0,0 +1,54 @@
+"""Backfill width/height for media rows that were stored with width = 0."""
+from BunnyCDN.Storage import Storage
+import config, os, funcs
+from PIL import Image
+
+# The hashes of the images are different due to the optimizer.
+
+# NOTE(review): storage API keys are hard-coded below; consider loading them
+# from config or the environment, and rotating the keys committed here.
+#obj_storage = Storage('577cb82d-8176-4ccf-935ce0a574bf-fe4c-4012', 'altpins')
+obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
+
+db, cursor = config.gen_connection()
+
+cursor.execute("SELECT id, media_id, media_url FROM media WHERE width = 0;")
+results = cursor.fetchall()
+
+total = len(results)
+print(f"Found {total} files to process.")
+
+cacheDir = 'cache'
+# Make sure the download directory exists before the first download.
+os.makedirs(cacheDir, exist_ok=True)
+
+for count, result in enumerate(results, start=1):
+    rowID = result[0]
+    mediaURL = result[2]
+
+    # Strip the CDN host to get the storage-relative path.
+    serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
+    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))
+
+    if os.path.exists(localFilePath):
+        print(f"File already exists: {localFilePath}")
+    else:
+        obj_storage.DownloadFile(storage_path=serverPath, download_path=cacheDir)
+
+    mediaType = funcs.get_media_type(localFilePath)
+
+    if mediaType == 'image':
+        with Image.open(localFilePath) as img:
+            width, height = img.size
+    elif mediaType == 'video':
+        width, height = funcs.get_video_dimensions(localFilePath)
+    else:
+        # Without this branch width/height would be undefined on the first
+        # row, or silently reused from the previous row on later ones.
+        print(f"[{count}/{total}] Unknown media type for {localFilePath}. Skipping...")
+        continue
+
+    cursor.execute("UPDATE media SET width = %s, height=%s WHERE id = %s;", (width, height, rowID))
+    db.commit()
+
+    print(f"[{count}/{total}] width: {width}, height: {height} {cursor.rowcount}")
\ No newline at end of file
diff --git a/fixthumbnails.py b/fixthumbnails.py
new file mode 100644
index 0000000..94861ca
--- /dev/null
+++ b/fixthumbnails.py
@@ -0,0 +1,81 @@
+"""Generate missing thumbnails for public videos.
+
+Takes the first frame of each video, uploads it to BunnyCDN as
+thumbnails/{id}.jpg and stores the resulting URL in media.thumbnail.
+"""
+from BunnyCDN.Storage import Storage
+import config, os, cv2
+from concurrent.futures import ThreadPoolExecutor
+
+# NOTE(review): the storage API key is hard-coded; consider loading it from
+# config or the environment, and rotating the key committed here.
+obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
+
+db, cursor = config.gen_connection()
+
+cursor.execute("SELECT id, media_id, media_url FROM media WHERE media_type = 'video' AND thumbnail IS NULL and status = 'public';")
+results = cursor.fetchall()
+
+total = len(results)
+print(f"Found {total} files to process.")
+
+cacheDir = 'cache'
+# Make sure the download directory exists before the first download.
+os.makedirs(cacheDir, exist_ok=True)
+
+
+def GetServerPath(mediaURL):
+    """Return the storage-relative path for a storysave CDN URL."""
+    return mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
+
+
+def DownloadFile(serverPath, cacheDir):
+    """Download serverPath into cacheDir unless it is already cached.
+
+    Returns the local path of the cached file.
+    """
+    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))
+
+    if os.path.exists(localFilePath):
+        print(f"File already exists: {localFilePath}")
+        return localFilePath
+
+    obj_storage.DownloadFile(storage_path=serverPath, download_path=cacheDir)
+    print(f"Downloaded {serverPath} to {localFilePath}")
+    return localFilePath
+
+
+def ImportMedias():
+    """Pre-download every pending video using 10 worker threads.
+
+    Optional cache warm-up; the main loop below does not call it.
+    """
+    with ThreadPoolExecutor(max_workers=10) as executor:
+        for video in results:
+            executor.submit(DownloadFile, GetServerPath(video[2]), cacheDir)
+
+
+for count, result in enumerate(results, start=1):
+    itemID = result[0]
+    filePath = DownloadFile(GetServerPath(result[2]), cacheDir)
+
+    cap = cv2.VideoCapture(filePath)
+    try:
+        ret, frame = cap.read()
+    finally:
+        # Release the capture handle even if decoding raises.
+        cap.release()
+
+    # read() returns (False, None) when no frame could be decoded, and
+    # cv2.imwrite() raises on an empty frame, so skip such videos.
+    if not ret or not cv2.imwrite('thumbnail.jpg', frame):
+        print(f"[{count}/{total}] Could not extract a frame from {filePath}. Skipping...")
+        continue
+
+    thumbnailURL = f"https://storysave.b-cdn.net/thumbnails/{itemID}.jpg"
+    obj_storage.PutFile('thumbnail.jpg', f'thumbnails/{itemID}.jpg')
+
+    cursor.execute("UPDATE media SET thumbnail = %s WHERE id = %s;", (thumbnailURL, itemID))
+    db.commit()
+
+    print(f"[{count}/{total}] thumbnail: {thumbnailURL} {cursor.rowcount}")
\ No newline at end of file