You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

140 lines
4.2 KiB
Python

from datetime import datetime
from uuid import uuid4
import funcs
import config
import cv2
import os
# Root folder scanned by the script entry point; get_media() expects it to
# contain one sub-folder per user, each holding that user's media files.
directory = 'processed_tiktoks'
def UploadMedia(media):
    """Upload one local media file to object storage and record it in the DB.

    Relies on module-level globals created by the script entry point:
    ``existing_hashes``, ``obj_storage``, ``newCursor`` and ``newDB``.

    Args:
        media: Mapping with at least the ``'username'`` and ``'filepath'``
            keys (the shape produced by ``get_media_data``).

    Returns:
        True when the file was uploaded, inserted into ``media`` and removed
        from disk. False when it was skipped: media type or post type could
        not be determined, the file hash is already known, or the video
        thumbnail could not be generated/uploaded.
    """
    platform = 'TikTok'
    username = media['username']
    filepath = media['filepath']
    file_size = os.path.getsize(filepath)
    thumbnail_url = None
    phash = None

    filename = os.path.basename(filepath)
    file_extension = os.path.splitext(filename)[1].lower()

    media_type = funcs.get_media_type(filename)
    if not media_type:
        print(f'Error determining media type for {filename}. Skipping...')
        return False

    post_type = funcs.determine_post_type(filepath)
    if not post_type:
        print(f'Error determining post type for {filename}. Skipping...')
        return False

    file_hash = funcs.calculate_file_hash(filepath)
    if file_hash in existing_hashes:
        print(f'File {filename} already exists. Skipping...')
        return False

    post_date = datetime.now()
    width, height = funcs.get_media_dimensions(filepath)
    duration = funcs.get_video_duration(filepath)

    if media_type == 'image':
        phash = funcs.generate_phash(filepath)
    elif media_type == 'video':
        thumb_path = None
        try:
            thumb_path = generate_thumbnail(filepath)
            # this might be a problem in case of duplicate hashes
            obj_storage.PutFile(thumb_path, f'thumbnails/{file_hash}.jpg')
            thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{file_hash}.jpg"
            phash = funcs.generate_phash(thumb_path)
        except Exception as exc:
            # Exception (not a bare except) so Ctrl+C / SystemExit still
            # propagate; the cause is reported instead of being hidden.
            print(f'Error generating thumbnail. Skipping... ({exc})')
            return False
        finally:
            # Always drop the temporary thumbnail, including on failure, so
            # aborted uploads do not pile up files in the temp folder.
            if thumb_path is not None:
                try:
                    os.remove(thumb_path)
                except OSError:
                    # Best-effort cleanup: the file may never have been written.
                    pass

    newFilename = f'{file_hash}{file_extension}'
    server_path = f'media/tiktoks/{username}/{newFilename}'
    file_url = f"https://storysave.b-cdn.net/{server_path}"

    # NOTE: slow - this uploads the full media file.
    obj_storage.PutFile(filepath, server_path)

    post_type = 'story' if post_type == 'stories' else 'post'

    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, hash, filename, duration, thumbnail, phash, platform, file_size) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    values = (username, media_type, file_url, width, height, post_type, post_date, file_hash, filename, duration, thumbnail_url, phash, platform, file_size)

    # NOTE: flagged by the original author as even slower than the upload.
    newCursor.execute(query, values)
    newDB.commit()
    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')

    # The local copy is only deleted once upload and commit have succeeded.
    os.remove(filepath)
    return True
def generate_thumbnail(filepath):
    """Save the first frame of a video as a temporary JPEG and return its path.

    Args:
        filepath: Path of the video file to read.

    Returns:
        Path of the written thumbnail, ``temp/<uuid4>.jpg``. The file is not
        deleted here; the caller removes it when done.

    Raises:
        RuntimeError: If no frame could be read from the video or the
            thumbnail image could not be written.
    """
    thumb_path = f'temp/{uuid4()}.jpg'
    # cv2.imwrite signals failure through its return value, so make sure the
    # target folder exists instead of silently producing no file.
    os.makedirs(os.path.dirname(thumb_path), exist_ok=True)

    cap = cv2.VideoCapture(filepath)
    try:
        ret, frame = cap.read()
    finally:
        # Release the capture handle even when reading fails.
        cap.release()

    if not ret or frame is None:
        raise RuntimeError(f'Could not read a frame from {filepath}')
    if not cv2.imwrite(thumb_path, frame):
        raise RuntimeError(f'Could not write thumbnail to {thumb_path}')
    return thumb_path
def get_media_data(filepath):
    """Parse a ``username~title[~tiktok_id]`` file name into a media record.

    Only the base name of ``filepath`` is inspected; it is split on ``'~'``
    without stripping the file extension, so the extension stays attached to
    the last field.

    Args:
        filepath: Path to the media file.

    Returns:
        A dict with the keys ``'username'``, ``'filepath'``, ``'tiktok_id'``
        and ``'title'`` (``'tiktok_id'`` is None for two-field names), or
        False when the name does not have exactly two or three fields.
    """
    fields = os.path.basename(filepath).split('~')
    field_count = len(fields)
    if field_count not in (2, 3):
        return False

    # Third field is optional; two-field names carry no TikTok id.
    tiktok_id = fields[2] if field_count == 3 else None
    return {
        'username': fields[0],
        'filepath': filepath,
        'tiktok_id': tiktok_id,
        'title': fields[1],
    }
def get_media(folder_path):
    """Collect media records for every file found one level below a folder.

    Each directory directly inside ``folder_path`` is listed and every entry
    in it is handed to ``get_media_data``; entries whose name cannot be
    parsed are dropped. Non-directory entries of ``folder_path`` are reported
    on stdout and skipped.

    Args:
        folder_path: Folder containing one sub-folder per user.

    Returns:
        List of the dicts returned by ``get_media_data``.
    """
    collected = []
    for user in os.listdir(folder_path):
        user_dir = os.path.join(folder_path, user)
        if os.path.isdir(user_dir):
            paths = (os.path.join(user_dir, name) for name in os.listdir(user_dir))
            parsed = (get_media_data(path) for path in paths)
            # Falsy results mark file names that did not match the pattern.
            collected.extend(record for record in parsed if record)
        else:
            print(f"Skipping {user}")
    return collected
def dump_instagram(folder_path):
    """Upload every media file discovered under ``folder_path``.

    Despite the name, each record goes through ``UploadMedia``, which stores
    it with the platform set to TikTok. The per-file result of
    ``UploadMedia`` is ignored.

    Args:
        folder_path: Folder passed on to ``get_media``.
    """
    for record in get_media(folder_path):
        UploadMedia(record)
if __name__ == '__main__':
    # Script entry point. The names bound here (newDB, newCursor, obj_storage,
    # existing_hashes) are module globals that UploadMedia reads directly.
    print('Starting processing...')

    # A missing folder is treated like an empty one instead of crashing in
    # os.listdir with FileNotFoundError.
    if not os.path.isdir(directory) or not os.listdir(directory):
        print('No files to process. Exiting...')
        # Same effect as exit(), without relying on the site-provided helper.
        raise SystemExit

    newDB, newCursor = config.gen_connection()
    obj_storage = config.get_storage()

    newCursor.execute("SELECT hash FROM media WHERE hash IS NOT NULL AND platform = 'TikTok'")
    # A set keeps the per-file duplicate check in UploadMedia O(1) instead of
    # scanning a list of every stored hash for each file.
    existing_hashes = {row[0] for row in newCursor.fetchall()}

    dump_instagram(directory)

    print("Processing completed.")