updated and optimized and cleaned up

main
oscar 3 weeks ago
parent d440a25e1d
commit 42afcdc539

BIN
.DS_Store vendored

Binary file not shown.

@@ -10,7 +10,7 @@ def gen_connection():
print("Connecting to database")
newDB = mysql.connector.connect(host=host, user=username, password=password, database=database, port=port)
print("Connected to database")
return newDB, newDB.cursor()
return newDB, newDB.cursor(dictionary=True)
def get_storage():
from BunnyCDN.Storage import Storage
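
Review note: cursor(dictionary=True) is a stock mysql-connector-python option — fetched rows come back as dicts keyed by column name instead of positional tuples, which is what lets the reworked cache queries later in this commit read row['media_id'] and row['id']. A minimal sketch of the difference (query and values illustrative):

    cur = newDB.cursor()
    cur.execute("SELECT id, media_id FROM media LIMIT 1")
    cur.fetchone()   # tuple, e.g. (42, '123')  -> indexed as row[1]

    cur = newDB.cursor(dictionary=True)
    cur.execute("SELECT id, media_id FROM media LIMIT 1")
    cur.fetchone()   # dict,  e.g. {'id': 42, 'media_id': '123'}  -> row['media_id']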

@@ -1,7 +1,6 @@
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer
import shutil
import time
import os
from funcs import get_media_dimensions
@@ -13,20 +12,6 @@ os.makedirs(stories_dir, exist_ok=True)
os.makedirs(posts_dir, exist_ok=True)
def wait_for_complete(file_path, timeout=10):
prev_size = -1
for _ in range(timeout * 2): # check every 0.5 sec
try:
size = os.path.getsize(file_path)
except FileNotFoundError:
return False
if size == prev_size:
return True
prev_size = size
time.sleep(0.5)
return False
def is_story(width, height, tolerance=0.02):
if width == 0 or height == 0:
return False
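
Review note: the rest of is_story falls outside this hunk; judging by the name and the zero guard, it presumably classifies portrait (9:16) media as stories by aspect ratio. A hypothetical sketch of that kind of check — not the file's actual body:

    def is_story_sketch(width, height, tolerance=0.02):
        # Hypothetical: treat media as a story when its aspect ratio is
        # within `tolerance` of 9:16 portrait. The real function may differ.
        if width == 0 or height == 0:
            return False
        return abs((width / height) - (9 / 16)) <= tolerance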
@@ -50,16 +35,13 @@ class DownloadHandler(FileSystemEventHandler):
def process_file(self, file_path):
file = os.path.basename(file_path)
# Ignore incomplete or weird temp names
if "crdownload" in file or file.count("~") != 3:
return
if not os.path.exists(file_path):
return
if not wait_for_complete(file_path):
print(f"File {file_path} did not stabilize. Skipping.")
return
post_type = determine_post_type(file_path)
if post_type == "posts":
dest_dir = posts_dir
@@ -91,6 +73,13 @@ class DownloadHandler(FileSystemEventHandler):
if __name__ == "__main__":
download_path = os.path.join(os.path.expanduser("~"), "Downloads")
event_handler = DownloadHandler()
# Initial scan for files already in Downloads
for f in os.listdir(download_path):
full_path = os.path.join(download_path, f)
if os.path.isfile(full_path):
event_handler.process_file(full_path)
observer = Observer()
observer.schedule(event_handler, download_path, recursive=False)
observer.start()
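
Review note: the hunk ends at observer.start(), so the shutdown half of the script is outside the diff. For reference, the conventional watchdog run loop looks like this (assuming time is still imported in this file):

    try:
        while True:
            time.sleep(1)      # keep the main thread alive
    except KeyboardInterrupt:
        observer.stop()        # ask the watcher thread to exit
    observer.join()            # wait for it to shut down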

@@ -12,20 +12,20 @@ directory = 'media'
os.makedirs(temp_directory, exist_ok=True)
media_types = {
'stories' : 'story',
'posts' : 'post',
'profile' : 'profile'
'stories': 'story',
'posts': 'post',
'profile': 'profile'
}
for media_type, _ in media_types.items():
os.makedirs(os.path.join(directory, media_type), exist_ok=True)
existing_media_ids = {}
existing_media_ids = set()
UPLOAD_CUSTOM = False
CACHE_FILE = os.path.join(temp_directory, 'existing_media_ids.json')
CACHE_TTL = timedelta(hours=48)
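
Review note: the {} -> set() change above is a real fix, not cosmetics — {} is an empty dict, so the later existing_media_ids.add(media_id) call would raise on first use:

    ids = {}        # empty dict, not a set
    ids.add('x')    # AttributeError: 'dict' object has no attribute 'add'
    ids = set()
    ids.add('x')    # OK — and `in` membership checks stay O(1)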
def UploadMedia(media):
username = media['username']
user_id = media['user_id']
@@ -37,12 +37,12 @@ def UploadMedia(media):
post_type = media['post_type']
thumbnail_url = None
phash = None
if media_id and media_id in existing_media_ids:
print('Duplicate file detected. Removing...')
os.remove(filepath)
return True
file_size = os.path.getsize(filepath)
filename = os.path.basename(filepath)
file_extension = os.path.splitext(filename)[1].lower()
@@ -56,14 +56,16 @@ def UploadMedia(media):
print(f'Error determining media type for {filename}. Skipping...')
return False
try:post_date = datetime.fromtimestamp(int(timestamp))
except:post_date = datetime.fromtimestamp(os.path.getctime(filepath))
try:
post_date = datetime.fromtimestamp(int(timestamp))
except:
post_date = datetime.fromtimestamp(os.path.getctime(filepath))
width, height = funcs.get_media_dimensions(filepath)
if 0 in (width, height):
print(f'Error getting dimensions for {filename}. Skipping...')
return False
duration = funcs.get_video_duration(filepath)
if media_type == 'image':
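
Review note: the reflowed fallback above still uses a bare except:, which also swallows KeyboardInterrupt and SystemExit. A narrower variant covering the failures a bad timestamp can actually raise (same names, behavior otherwise unchanged):

    try:
        post_date = datetime.fromtimestamp(int(timestamp))
    except (TypeError, ValueError, OverflowError, OSError):
        # int() fails on junk input; fromtimestamp() on out-of-range values.
        post_date = datetime.fromtimestamp(os.path.getctime(filepath))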
@@ -71,7 +73,7 @@ def UploadMedia(media):
elif media_type == 'video':
try:
thumb_path = generate_thumbnail(filepath)
obj_storage.PutFile(thumb_path, f'thumbnails/{file_hash}.jpg') # this might be a problem in case of duplicate hashes
obj_storage.PutFile(thumb_path, f'thumbnails/{file_hash}.jpg')
thumbnail_url = f"https://cdn.altpins.com/thumbnails/{file_hash}.jpg"
phash = funcs.generate_phash(thumb_path)
os.remove(thumb_path)
@@ -81,18 +83,17 @@ def UploadMedia(media):
custom_filename = media_id if media_id else file_hash
newFilename = f'{custom_filename}{file_extension}'
server_path = f'media/{post_type}/{username}/{newFilename}'
file_url = f"https://cdn.altpins.com/{server_path}"
obj_storage.PutFile(filepath, server_path)
if highlight_id:
newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)", (highlight_id, user_id, media_id))
newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)",
(highlight_id, user_id, media_id))
newDB.commit()
print(f'[{newCursor.rowcount}] added highlight {highlight_id} to user {user_id}')
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, hash, filename, duration, thumbnail, phash, platform, file_size) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
values = (username, media_type, file_url, width, height, media_id, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash, platform, file_size)
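
Review note: INSERT IGNORE is what makes both inserts idempotent — on a duplicate key MySQL skips the row instead of raising, and cursor.rowcount comes back 0, which is why the log line above prints [0] for already-known highlights. Illustration with hypothetical values:

    newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)",
                      (hl_id, uid, mid))
    newCursor.rowcount   # 1 on first insert, 0 when the row already existed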
@@ -104,14 +105,13 @@ def UploadMedia(media):
print(f'File: {filename}')
print(f'URL: {file_url}')
print(f'Pin URL: https://altpins.com/pin/{newCursor.lastrowid}')
print("="*100)
print("=" * 100)
os.remove(filepath)
existing_media_ids.add(media_id)
return newCursor.lastrowid
def generate_thumbnail(filepath):
thumb_path = os.path.join(temp_directory, f'{uuid4()}.jpg')
cap = cv2.VideoCapture(filepath)
@@ -120,16 +120,16 @@ def generate_thumbnail(filepath):
cap.release()
return thumb_path
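
Review note: the middle of generate_thumbnail sits between hunks, so only the setup and teardown are visible. The usual OpenCV first-frame grab it presumably performs — a hypothetical sketch, not the file's actual code:

    ok, frame = cap.read()              # decode the first frame
    if ok:
        cv2.imwrite(thumb_path, frame)  # write it out as the .jpg thumbnail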
def get_user_id(username):
username = username.lower()
if username in existing_users:
return existing_users[username]
return None
def get_media_data(filepath):
filename = os.path.basename(filepath)
parts = filename.split('~')
if len(parts) != 4:
return False
@@ -141,7 +141,7 @@ def get_media_data(filepath):
platform = 'instagram'
highlight_id = user_id.replace('highlight', '') if 'highlight' in user_id else None
if user_id.isdigit():
user_id = int(user_id)
else:
@@ -152,17 +152,17 @@ def get_media_data(filepath):
else:
media_id = None
data = {'username': username, 'timestamp': timestamp, 'media_id': media_id, 'user_id': user_id, 'filepath': filepath, 'highlight_id': highlight_id, 'platform': platform}
data = {'username': username, 'timestamp': timestamp, 'media_id': media_id, 'user_id': user_id,
'filepath': filepath, 'highlight_id': highlight_id, 'platform': platform}
return data
def get_media():
medias = []
failed_medias = []
for media_type, post_type in media_types.items():
media_folder_path = os.path.join(directory, media_type)
if not os.path.exists(media_folder_path):
continue
@@ -172,26 +172,23 @@ def get_media():
if not data:
failed_medias.append(filepath)
continue
data['post_type'] = post_type
medias.append(data)
return medias, failed_medias
def get_custom_media(failed_medias):
medias = []
for media_type, post_type in media_types.items():
folder_path = os.path.join(directory, media_type)
user_dirs = [d for d in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, d))]
for username in user_dirs:
user_folder_path = os.path.join(folder_path, username)
for filename in os.listdir(user_folder_path):
if filename.startswith('.'):
continue
filepath = os.path.join(user_folder_path, filename)
if filepath not in failed_medias:
continue
@@ -199,7 +196,7 @@ def get_custom_media(failed_medias):
user_id = get_user_id(username)
timestamp = int(os.path.getctime(filepath))
media_id = os.path.splitext(filename)[0]
if media_id.isdigit():
media_id = int(media_id)
if media_id < 10000000:
@@ -217,40 +214,35 @@ def get_custom_media(failed_medias):
"highlight_id": None,
"post_type": post_type
}
medias.append(data)
return medias
def save_highlight_data(highlights):
filename = f'{uuid4()}.json'
filepath = os.path.join('highlight_data', filename)
with open(filepath, 'w') as f:
json.dump(highlights, f)
def dump_instagram():
medias, failed_medias = get_media()
medias = clean_dupes(medias)
failed_medias = get_custom_media(failed_medias)
medias.sort(key=lambda x: (x['username'].lower(), x['timestamp']))
# Update new user ids and existing user ids
new_user_ids = {}
for media in medias:
user_id = media['user_id']
username = media['username']
if not media['user_id']:
continue
if username in existing_users:
continue
existing_users[username] = user_id
new_user_ids[username] = user_id
# Assign user ids
for media in medias:
if media['user_id']:
continue
@@ -262,13 +254,12 @@ def dump_instagram():
if not media['highlight_id']:
continue
highlights.append({
"media_id": media["media_id"],
"user_id": media["user_id"],
"highlight_id": media['highlight_id'],
"username": media['username'],
})
"media_id": media["media_id"],
"user_id": media["user_id"],
"highlight_id": media['highlight_id'],
"username": media['username'],
})
# save highlight data into the highlight_data folder
if highlights:
save_highlight_data(highlights)
@@ -280,85 +271,97 @@ def dump_instagram():
for media in failed_medias:
pinid = UploadMedia(media)
def clean_dupes(medias):
removed_count = 0
new_medias = []
for media in medias:
media_id = media['media_id']
filepath = media['filepath']
if not media_id:
print(f'Invalid media_id for file {filepath}. Skipping...')
continue
if media_id in existing_media_ids:
removed_count += 1
print(f'Found duplicate file {filepath}. Removing...')
os.remove(filepath)
continue
if re.search(r'\(\d+\)', filepath):
removed_count += 1
print(f'Found duplicate file {filepath}. Removing...')
os.remove(filepath)
continue
new_medias.append(media)
print(f'Removed {removed_count} duplicate files.')
return new_medias
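
Review note: the \(\d+\) pattern targets the " (1)", " (2)" suffixes browsers append to re-downloaded files. Quick check with hypothetical names:

    import re
    dupe = re.compile(r'\(\d+\)')
    bool(dupe.search('photo (1).jpg'))   # True  — browser duplicate, removed
    bool(dupe.search('photo.jpg'))       # False — original, kept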
# -------------------- CACHE SYSTEM --------------------
def get_cached_data():
if not os.path.exists(CACHE_FILE):
print('No cache file found. Generating new cache…')
return None, None
return None, None, None
try:
with open(CACHE_FILE, 'r') as f:
cache_data = json.load(f)
timestamp = datetime.fromisoformat(cache_data.get('timestamp', ''))
if datetime.now() - timestamp < CACHE_TTL:
print('Using cached data…')
return set(tuple(x) for x in cache_data.get('existing_media_ids', [])), cache_data.get('existing_users', {})
cache = json.load(f)
media_ids = set(cache.get('media_ids', []))
users = {k.lower(): v for k, v in cache.get('existing_users', {}).items()}
last_id = cache.get('last_id', 0)
return media_ids, users, last_id
except Exception as e:
print(f"Cache read error: {e}")
return None, None, None
return None, None
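
Review note: unlike the old reader it replaces, the new get_cached_data never compares the stored timestamp against CACHE_TTL, so the 48-hour expiry defined at the top of the file is now dead code. If expiry is still wanted, a sketch of restoring it inside the try block:

    written = datetime.fromisoformat(cache.get('timestamp', ''))
    if datetime.now() - written >= CACHE_TTL:
        return None, None, None   # stale cache -> force a full refresh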
def save_cached_data(existing_media_ids, existing_users):
def save_cached_data(media_ids, existing_users, last_id):
with open(CACHE_FILE, 'w') as f:
json.dump({'timestamp': datetime.now().isoformat(), 'existing_media_ids': list(existing_media_ids), 'existing_users': existing_users}, f)
json.dump({
'timestamp': datetime.now().isoformat(),
'media_ids': list(media_ids),
'existing_users': existing_users,
'last_id': last_id
}, f)
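
Review note: with this change the file at CACHE_FILE carries the incremental-sync watermark (last_id) alongside the id set and user map. Illustrative payload (all values made up):

    {
      "timestamp": "2024-06-01T12:00:00",
      "media_ids": ["3141592653589", "2718281828459"],
      "existing_users": {"someuser": 123456789},
      "last_id": 48213
    }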
def get_user_ids(cur):
cur.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL AND platform='instagram'")
rows = cur.fetchall()
return {user['username'].lower(): user['user_id'] for user in rows}
def get_existing_medias(newCursor):
existing_media_ids, existing_users = get_cached_data()
def get_existing_media_ids(cur):
cur.execute("SELECT id, media_id FROM media WHERE media_id IS NOT NULL AND platform='instagram' AND status='public'")
rows = cur.fetchall()
media_ids = {row['media_id'] for row in rows}
last_id = max((row['id'] for row in rows), default=0)
return media_ids, last_id
if existing_media_ids and existing_users:
newest_id = max(existing_media_ids, key=lambda x: x[0])[0]
existing_media_ids = {image[1] for image in existing_media_ids}
def get_existing_medias(cur):
media_ids, users, last_id = get_cached_data()
newCursor.execute("SELECT id, media_id FROM media WHERE media_id IS NOT NULL AND platform = 'instagram' AND status = 'public' AND id > %s ORDER BY id DESC", (newest_id,))
new_media_ids = {image[1] for image in newCursor.fetchall()}
if not media_ids or not users:
print('Cold cache → pulling full data...')
media_ids, last_id = get_existing_media_ids(cur)
users = get_user_ids(cur)
save_cached_data(media_ids, users, last_id)
return media_ids, users
for media_id in new_media_ids:
existing_media_ids.add(media_id)
return existing_media_ids, existing_users
print('Getting existing files and users...')
newCursor.execute("SELECT id, media_id FROM media WHERE media_id IS NOT NULL AND platform = 'instagram' AND status = 'public';")
existing_media_ids = {image for image in newCursor.fetchall()}
cur.execute("SELECT id, media_id FROM media WHERE media_id IS NOT NULL AND platform='instagram' AND status='public' AND id > %s ORDER BY id ASC", (last_id,))
rows = cur.fetchall()
print('Getting existing users...')
newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL AND platform = 'instagram'")
existing_users = {user[0].lower(): user[1].lower() for user in newCursor.fetchall()}
for r in rows:
media_ids.add(r['media_id'])
last_id = max(last_id, r['id'])
cache_file = os.path.join(temp_directory, 'existing_media_ids.json')
with open(cache_file, 'w') as f:
json.dump({'timestamp': datetime.now().isoformat(), 'existing_media_ids': list(existing_media_ids), 'existing_users': existing_users}, f)
return existing_media_ids, existing_users
if rows:
save_cached_data(media_ids, users, last_id)
return media_ids, users
# -------------------- MAIN --------------------
if __name__ == '__main__':
print('Starting processing...')
@@ -366,16 +369,11 @@ if __name__ == '__main__':
if not funcs.get_files(directory):
print('No files to process. Exiting...')
exit()
newDB, newCursor = config.gen_connection()
newDB, newCursor = config.gen_connection()
obj_storage = config.get_storage()
existing_media_ids, existing_users = get_existing_medias(newCursor)
dump_instagram()
print("Processing completed.")
# for mediatype, _ in media_types.items():
# funcs.clean_empty_folders(os.path.join(directory, mediatype))
print("Processing completed.")