main
oscar 11 months ago
parent 93c9d660f9
commit 322e39b51f

BIN
.DS_Store vendored

Binary file not shown.

1
.gitignore vendored

@ -1,6 +1,7 @@
# python files
*.pyc
__pycache__
*.DS_Store
# Content
storysaver

@ -0,0 +1,109 @@
import os, config, funcs, cv2, imagehash
from PIL import Image
directory = "old_snapchats"
duplicate_dir = 'dupelicate_snaps'
def generate_video_phash(filepath):
    """Return the perceptual hash (as a string) of a video's first frame.

    Opens *filepath* with OpenCV, grabs the first frame, converts it from
    BGR to RGB, and computes an imagehash pHash on it.

    Returns None when the video cannot be opened, has no readable frame,
    or hashing fails for any other reason.
    """
    cap = None
    try:
        cap = cv2.VideoCapture(filepath)
        ret, frame = cap.read()
        if not ret:
            return None
        # OpenCV decodes frames as BGR; PIL expects RGB.
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        return str(imagehash.phash(Image.fromarray(rgb)))
    except Exception:
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit;
        # catch Exception instead and treat any failure as "no hash".
        return None
    finally:
        # Release the capture handle even when read/convert/hash raises
        # (the original leaked it on any exception after VideoCapture()).
        if cap is not None:
            cap.release()
def get_snapchat_files():
    """Collect media metadata dicts for every file under the stories directory."""
    entries = (get_media_data(path) for path in funcs.get_files(directory))
    # get_media_data returns False for names it cannot parse; drop those.
    return [entry for entry in entries if entry]
def get_media_data(filepath):
    """Parse a snapchat media filename into a metadata dict.

    Filenames follow ``<username>~<timestamp>~<snap_id>.<ext>``.

    Returns a dict with keys ``username``, ``timestamp``, ``filepath``,
    ``snap_id`` (always None here — this variant stores the filename id as
    the *original* id) and ``original_snap_id`` (the third filename part
    with its extension stripped). Returns False when the filename does not
    contain all three ``~``-separated parts, so callers can filter it out.
    """
    filename = os.path.basename(filepath)
    parts = filename.split('~')
    if len(parts) < 3:
        # Not a snapchat-style filename; caller filters falsy entries.
        return False
    username = parts[0]
    timestamp = parts[1]
    # The third part carries the snap id plus the file extension.
    original_snap_id = os.path.splitext(parts[2])[0]
    return {
        'username': username,
        'timestamp': timestamp,
        'filepath': filepath,
        'snap_id': None,
        'original_snap_id': original_snap_id,
    }
def process_snap_ids(filenames):
    """Extract unique snap ids from filenames, preserving first-seen order.

    Each filename is expected to look like ``user~timestamp~snapid.ext``;
    the third ``~``-separated part (extension stripped) is the snap id.
    Raises IndexError for filenames with fewer than three parts, matching
    the original behavior.
    """
    snap_ids = []
    seen = set()  # O(1) membership test instead of rescanning the list
    for filename in filenames:
        snap_id = os.path.splitext(filename.split('~')[2])[0]
        if snap_id not in seen:
            seen.add(snap_id)
            snap_ids.append(snap_id)
    return snap_ids
def find_duplicate_snap(existing_snaps, current_snap):
    """Return the first DB row that duplicates *current_snap*, else False.

    ``existing_snaps`` rows are tuples laid out as
    ``(id, filename, username, hash, phash, original_snap_id)`` — see the
    SELECT in the entry point. For rows with the same username, a snap is
    considered a duplicate when its original id appears in the stored
    filename, equals the stored original_snap_id, or its file hash or
    perceptual hash matches the stored one.
    """
    filepath = current_snap['filepath']
    original_snap_id = current_snap['original_snap_id']
    username = current_snap['username']
    snap_hash = funcs.calculate_file_hash(filepath)
    # Perceptual hash: first video frame for .mp4, the image itself for .jpg.
    # Default to None for any other extension — the original left `phash`
    # unbound in that case and crashed with NameError in the loop below.
    phash = None
    if filepath.endswith('.mp4'):
        phash = generate_video_phash(filepath)
    elif filepath.endswith('.jpg'):
        phash = funcs.generate_phash(filepath)
    for snap in existing_snaps:
        if username != snap[2]:
            continue
        if original_snap_id in snap[1]:
            return snap
        if original_snap_id == snap[5]:
            return snap
        if snap_hash == snap[3]:
            return snap
        # Guard against a None == None false match when neither side has a
        # perceptual hash available.
        if phash is not None and phash == snap[4]:
            return snap
    return False
if __name__ == '__main__':
    # Deduplication pass: check every file in the old_snapchats folder
    # against the media table (platform = 'snapchat') and move any
    # duplicates into `duplicate_dir` so only new snaps remain.
    print('Starting snappy...')
    db, cursor = config.gen_connection()
    obj_storage = config.get_storage()
    # Column order matters: find_duplicate_snap indexes rows as
    # (id, filename, username, hash, phash, original_snap_id).
    cursor.execute("SELECT id, filename, username, hash, phash, original_snap_id FROM media WHERE filename IS NOT NULL AND platform = 'snapchat'")
    existing_medias = cursor.fetchall()
    # NOTE: the original called get_snapchat_files() twice and discarded
    # the first result; one call is enough.
    snap_files = get_snapchat_files()
    os.makedirs(duplicate_dir, exist_ok=True)
    for story in snap_files:
        print(f"Processing {story['username']}...")
        original_snap_id = story['original_snap_id']
        # Check whether this snap is already recorded in the database.
        existing_snap = find_duplicate_snap(existing_medias, story)
        if existing_snap:
            print(f"Snap {original_snap_id} already exists in the database.")
            # Move the duplicate file out of the working directory.
            new_filename = os.path.basename(story['filepath'])
            new_filepath = os.path.join(duplicate_dir, new_filename)
            os.rename(story['filepath'], new_filepath)
    print("Processing completed.")

@ -29,9 +29,6 @@ def get_files(directory):
files.append(os.path.join(root, filename))
return files
import cv2
import numpy as np
def compare_images(image_path1, image_path2):
# Load the images in grayscale
img1 = cv2.imread(image_path1, cv2.IMREAD_GRAYSCALE)

@ -52,6 +52,29 @@ def get_file_extension(url):
else:
print(f"Unknown content type for media {url}")
return None
def extract_file_type(url):
    """Map the numeric type code embedded in a snap CDN url to an extension.

    The last path segment of the url looks like ``<name>.<type_number>``;
    known type numbers map to '.jpg' or '.mp4'. Returns None for unknown
    numbers; prints a warning and returns None when the segment carries no
    ``.<number>`` suffix at all.
    """
    file_types = {
        '400': '.jpg',
        '1322': '.mp4',
        '1325': '.mp4',
        '1034': '.mp4',
        '1023': '.jpg',
    }
    base_url = url.split("?")[0]  # Remove query string
    data_parts = base_url.split('/')[-1].split('.')
    if len(data_parts) < 2:
        print(f"Unexpected URL format: {base_url}")
        return None
    # Unknown type numbers fall through to None, same as the original.
    return file_types.get(data_parts[1])
def download_media(url, filepath):
if os.path.exists(filepath):
@ -112,9 +135,10 @@ def main():
# Determine file extension using HEAD request.
# TODO: find a better way to determine file extension without downloading the file.
extension = get_file_extension(url)
extension = extract_file_type(url)
if not extension:
continue
filename = f"{username}~{timestamp}~{snap_id}{extension}"
filepath = os.path.join(directory, filename)

@ -92,4 +92,4 @@ def get_highlight_stories(data):
story = parse_story(snap)
stories.append(story)
return stories
return stories

@ -21,7 +21,6 @@ def archive_data(data, username):
data_filepath = os.path.join(data_directory, data_filename)
with open(data_filepath, 'w') as f:
f.write(json.dumps(data))
print(f"Archived data for {username} at {data_filepath}")
def get_file_extension(url):
response = requests.head(url)
@ -64,7 +63,7 @@ def extract_file_type(url):
def download_media(url, filepath):
if os.path.exists(filepath):
print(f"File {filepath} already exists. Skipping download.")
# File already exists, skip download and return the filepath as if it was downloaded.
return filepath
response = requests.get(url)
@ -76,55 +75,6 @@ def download_media(url, filepath):
f.write(response.content)
return filepath
def get_all_stories(usernames):
    """Gather story/highlight media dicts (ready for download) for each username.

    Fetches all users' data in one batch, archives each user's raw data,
    then builds one media dict per story with the local filepath derived
    from ``<username>~<timestamp>~<snap_id><ext>``.
    """
    snapchat_users_data = get_all_users_data(usernames)
    all_stories = []
    for username in usernames:
        print(f"Getting stories for {username}...")
        data = snapchat_users_data.get(username)
        if not data:
            # No data returned for this user; move on to the next one.
            print(f"Failed to get data for {username}. Skipping.")
            continue
        archive_data(data, username)
        print("Getting stories...")
        stories = get_stories(data)
        print("Getting highlights...")
        stories.extend(get_highlight_stories(data))
        for story in stories:
            snap_id = story['snap_id']
            url = story['url']
            timestamp = story['timestamp']
            # Determine file extension using HEAD request.
            extension = extract_file_type(url)
            if not extension:
                print(f"Failed to determine file extension for {url}. Skipping.")
                continue
            filename = f"{username}~{timestamp}~{snap_id}{extension}"
            filepath = os.path.join(directory, filename)
            media = {
                'username': username,
                'timestamp': timestamp,
                'filepath': filepath,
                'snap_id': snap_id,
                'original_snap_id': story['original_snap_id'],
                'media_url': url,
            }
            all_stories.append(media)
            print(f"Media {snap_id} ready for download.")
        # NOTE(review): this also extends with the RAW story dicts after the
        # per-story media dicts were already appended above — looks like it
        # duplicates every story in a different shape; confirm intent.
        all_stories.extend(stories)
    return all_stories
def get_snapchat_stories():
os.makedirs(directory, exist_ok=True)
os.makedirs(data_directory, exist_ok=True)
@ -149,10 +99,8 @@ def get_snapchat_stories():
archive_data(data, username)
print("Getting stories...")
stories = get_stories(data)
print("Getting highlights...")
stories.extend(get_highlight_stories(data))
for story in stories:
@ -162,7 +110,7 @@ def get_snapchat_stories():
duplicate_snap = find_duplicate_snap(existing_medias, snap_id, username)
if duplicate_snap:
print(f"Media {snap_id} already exists. Skipping download.")
# Snap already exists in the database
continue
# Determine file extension using HEAD request.
@ -191,11 +139,25 @@ def get_snapchat_stories():
return ready_stories
def get_snapchat_files():
    """Build a metadata dict for every parseable file in the stories directory."""
    parsed = []
    for filepath in funcs.get_files(directory):
        data = get_media_data(filepath)
        # get_media_data returns False for unparseable names; skip those.
        if data:
            parsed.append(data)
    return parsed
def main():
    """Fetch fresh stories, merge in files already on disk, then download them."""
    stories = get_snapchat_stories() + get_snapchat_files()
    download_stories(stories)
def download_stories(stories):
for story in stories:
# Download the media
filepath = story['filepath']
url = story['media_url'] if 'media_url' in story else None
url = story['media_url']
filename = os.path.basename(filepath)
timestamp = story['timestamp']
@ -209,17 +171,6 @@ def download_stories(stories):
UploadMedia(story)
def main():
    """Collect stories from the API and from local files, then download them."""
    ready_stories = get_snapchat_stories()
    # Parse local files into media dicts, dropping names that don't parse
    # (get_media_data returns False for those).
    local_stories = [s for s in map(get_media_data, funcs.get_files(directory)) if s]
    ready_stories.extend(local_stories)
    download_stories(ready_stories)
def UploadMedia(media):
username = media['username']
timestamp = media['timestamp']
@ -288,7 +239,8 @@ def get_media_data(filepath):
snap_id = parts[2]
snap_id = os.path.splitext(snap_id)[0]
data = {'username': username, 'timestamp': timestamp, 'filepath': filepath, 'snap_id': snap_id, 'original_snap_id': None}
data = {'username': username, 'timestamp': timestamp, 'filepath': filepath, 'snap_id': snap_id, 'original_snap_id': None, 'media_url': None}
# data = {'username': username, 'timestamp': timestamp, 'filepath': filepath, 'snap_id': None, 'original_snap_id': snap_id, 'media_url': None}
return data

@ -120,7 +120,7 @@ def dump_instagram(folder_path):
if __name__ == '__main__':
print('Starting processing...')
if not os.listdir(directory):
print('No files to process. Exiting...')
exit()

@ -10,10 +10,6 @@ def is_valid_uuid(uuid_to_test, version=4):
return str(uuid_obj) == uuid_to_test
# file name : masstik_caammmyyy_1310_655_going blonde wednesdayyyy.mp4
# file name : masstiktok_aleksandraverse__#fyp #trending #viral #foryou.mp4
# where the first item is prefix, second is username and after those is the tiktok title
source_dir = 'tiktoks/'
processed_dir = 'processed_tiktoks'

@ -2,7 +2,7 @@ from datetime import datetime
import os, config, funcs, cv2
from uuid import uuid4
directory = 'ready_to_upload'
directory = 'ready_for_upload/instagram'
def UploadMedia(username, user_id, filepath):
thumbnail_url = None
@ -80,8 +80,12 @@ def get_user_id(username):
def get_media(folder_path):
medias = []
for user_folder in os.listdir(folder_path):
files = os.listdir(os.path.join(folder_path, user_folder))
user_folders = os.listdir(folder_path)
for user_folder in user_folders:
user_folder_path = os.path.join(folder_path, user_folder)
if not os.path.isdir(user_folder_path):
continue
files = os.listdir(user_folder_path)
for filename in files:
filepath = os.path.join(folder_path, user_folder, filename)
media = {

Loading…
Cancel
Save