massive update

main
oscar 3 months ago
parent 55484ebf11
commit a65cc43999

BIN
.DS_Store vendored

Binary file not shown.

1
.gitignore vendored

@@ -32,3 +32,4 @@ uploadlater
snapchat.json
/add_to_liked
/.profiles
/.vscode

@@ -1,17 +1,3 @@
import os
MEDIA_DIRECTORY = "media"
SNAPCHAT_DIRECTORY = "snapchat"
INSTAGRAM_DIRECTORY = "instagram"
@property
def get_instagram_directory():
return os.path.join(MEDIA_DIRECTORY, INSTAGRAM_DIRECTORY)
@property
def snapchat_output_dir():
return os.path.join(MEDIA_DIRECTORY, SNAPCHAT_DIRECTORY)
username = "doadmin"
password = "AVNS_2qeFJuiGRpBQXkJjlA6"
host = "storysave-do-user-13308724-0.c.db.ondigitalocean.com"

@@ -1,5 +1,5 @@
import os
from funcs import calculate_file_hash, get_media_dimensions, get_media_type, generate_phash
from funcs import calculate_file_hash, get_media_dimensions, generate_phash
import config
# --- Configuration & Constants ---
@@ -54,8 +54,11 @@ def update_dimensions(cursor, db, obj_storage):
obj_storage.DownloadFile(storage_path=server_path, download_path=CACHE_DIR)
# Optionally, you could get the media type if needed:
media_type = get_media_type(local_file)
width, height = get_media_dimensions(local_file)
if width == 0 or height == 0:
print(f"Error getting dimensions for {media_url}")
continue
cursor.execute("UPDATE media SET width = %s, height = %s WHERE id = %s;", (width, height, record_id))
db.commit()
@@ -103,6 +106,31 @@ def update_phash(cursor, db, obj_storage):
db.commit()
print(f"[{idx}/{total}] Processed record {record_id} with pHash: {phash}")
def update_user_ids(cursor, db):
cursor.execute("SELECT DISTINCT username FROM media WHERE user_id IS NULL AND platform = 'instagram';")
usernames = [username[0] for username in cursor.fetchall()]
total = len(usernames)
print(f"Found {total} usernames to process for user_id updating.")
for idx, username in enumerate(usernames, start=1):
print(f"[{idx}/{total}] Username: {username}")
cursor.execute("SELECT DISTINCT user_id FROM media WHERE username = %s AND user_id IS NOT NULL;", [username])
possible_user_ids = [user_id for user_id, in cursor.fetchall()]
if len(possible_user_ids) == 0:
print(f"No user_id found for {username}")
continue
if len(possible_user_ids) > 1:
print(f"Multiple user_ids found for {username}: {possible_user_ids}")
continue
user_id = possible_user_ids[0]
cursor.execute("UPDATE media SET user_id = %s WHERE username = %s AND user_id IS NULL;", [user_id, username])
db.commit()
print(f"[{idx}/{total}] Updated user_id for {username}, Rows affected: {cursor.rowcount}")
def main():
obj_storage = config.get_storage()
db, cursor = config.gen_connection()
@@ -111,6 +139,7 @@ def main():
update_dimensions(cursor, db, obj_storage)
update_file_size(cursor, db, obj_storage)
update_phash(cursor, db, obj_storage)
update_user_ids(cursor, db)
if __name__ == '__main__':
main()

@@ -107,6 +107,7 @@ def compare_images(image_path1, image_path2):
def download_file(url, filePath):
try:
if os.path.exists(filePath):
print(f"File already exists: {filePath}")
return filePath
if not url:
@@ -198,4 +199,15 @@ def calculate_file_hash(file_path, hash_func='sha256'):
while chunk:
h.update(chunk)
chunk = file.read(8192)
return h.hexdigest()
return h.hexdigest()
def files_are_identical(file1, file2):
"""Compare two files byte-by-byte."""
with open(file1, "rb") as f1, open(file2, "rb") as f2:
while True:
chunk1 = f1.read(4096)
chunk2 = f2.read(4096)
if chunk1 != chunk2:
return False
if not chunk1: # End of file
return True
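# Usage sketch (hypothetical paths): files_are_identical('a.jpg', 'a_copy.jpg')
# is True only when both byte streams match exactly, read 4 KB at a time.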

@@ -1 +0,0 @@
DH3ucOuYLbJ2Va3lfJPEYQq_6mk_v3R9dnrAYSQHr-Q=

File diff suppressed because one or more lines are too long

@@ -1 +0,0 @@
gAAAAABmRUff7c9t9gngWj_2cwvaTBrUDJ_JUyYVUfG-p3SvDV7qOSHddJ4eHADiJeRtJNtY9UxkohSB5I1MmLahAb_hxxwIVA==

@@ -1,20 +1,41 @@
from storysave_api import get_hd_profile_picture
import config, funcs, os
import config, funcs, os, time
known_phashes = {'e7c51a904b69d366': 'default empty profile picture',
'cb3ce46194c335dc': 'default empty profile picture',
}
known_hashes = {
'09c3cf34d4f117d99fa6285f4bfd3a0d888d7ab2cbca665b16097f6b93ca0de6' : 'default empty profile picture',
'2b9c0914d8f3f0aa6cf86705df70b7b21e9ca2f9013a346463788e7cebd0158f' : 'default empty profile picture',
}
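# Downloads whose perceptual hash (or file hash) matches an entry above are the
# stock blank avatar and are deleted instead of kept (see the phash check below).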
db, cursor = config.gen_connection()
cursor.execute(f"SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL AND username IN (SELECT username FROM following WHERE platform = 'instagram');")
cursor.execute("SELECT DISTINCT username, user_id, favorite FROM following WHERE user_id IS NOT NULL AND platform = 'instagram' ORDER BY favorite DESC;")
usernames = cursor.fetchall()
for username, user_id in usernames:
for username, user_id, favorite in usernames:
profilepicurl = get_hd_profile_picture(user_id=user_id)
if not profilepicurl:
print(f'Failed for {username}')
continue
filename = os.path.basename(profilepicurl).split('?')[0]
user_dir = os.path.join('media', 'instagram', 'profile', username)
filepath = os.path.join(user_dir, filename)
funcs.download_file(profilepicurl, filepath)
print(f"Downloaded profile picture for {username}.")
filepath = funcs.download_file(profilepicurl, filepath)
if not filepath:
continue
phash = funcs.generate_phash(filepath)
if phash in known_phashes:
print(f"Profile picture for {username} is the default empty profile picture.")
os.remove(filepath)
continue
print(f"Downloaded profile picture for {username}.")
time.sleep(1)

@@ -18,4 +18,5 @@ tqdm
webdriver-manager
moviepy==1.0.3
instagrapi
ImageHash
ImageHash
watchdog

@@ -1,153 +0,0 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
import requests
import json
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"}
snap_types = {
27 : ['spotlight', 'video'],
256 : ['thumbnail', 'image'],
400 : ['idk', 'image'],
1023 : ['idk', 'image'],
1034 : ['downscaled_video', 'video'],
1322 : ['idk', 'video'],
1325 : ['idk', 'video'],
}
def get_data(username):
url = f"https://www.snapchat.com/add/{username}"
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
data_script = soup.find("script", id="__NEXT_DATA__")
if not data_script:
print(f"No data found for {username}.")
return None
data = json.loads(data_script.string)
return data
def get_social_medias(data):
website_url = None
try:
website_url = data['props']['pageProps']['userProfile']['publicProfileInfo']['websiteUrl']
except KeyError:
pass
return website_url
def get_related_profiles(data):
related_profiles = []
try:
related_profiles_data = data['props']['pageProps']['userProfile']['relatedProfiles']
for profile in related_profiles_data:
related_profiles.append(profile['username'])
except KeyError:
pass
return related_profiles
def get_all_users_data(usernames):
all_data = {}
# Define a helper function for threading
def fetch_data(username):
return username, get_data(username)
# Use ThreadPoolExecutor for concurrent fetching
with ThreadPoolExecutor() as executor:
futures = {executor.submit(fetch_data, username): username for username in usernames}
for future in as_completed(futures):
username = futures[future]
try:
username, data = future.result()
all_data[username] = data
except Exception as e:
print(f"Error fetching data for {username}: {e}")
all_data[username] = None
return all_data
def parse_stories(stories):
parsed_stories = []
for story in stories:
parsed_story = parse_story(story)
parsed_stories.append(parsed_story)
return parsed_stories
def get_stories(data):
"""Extract story list from the JSON data."""
try:
stories = data['props']['pageProps']['story']['snapList']
if not type(stories) == list:
return []
stories.sort(key=lambda x: x.get('snapIndex'), reverse=True)
return stories
except:
return []
def get_highlights(data):
"""Extract highlights from possible highlight keys in JSON data."""
highlights = []
page_props = data.get('props', {}).get('pageProps', {})
possible_highlight_keys = ['curatedHighlights', 'savedHighlights', 'highlights']
for key in possible_highlight_keys:
highlight_data = page_props.get(key, [])
if highlight_data:
highlights.extend(highlight_data)
return highlights
def parse_story(story):
original_snap_id = story.get('snapId', {}).get('value', '')
snap_url = story.get('snapUrls', {}).get('mediaUrl', '')
timestamp = story.get('timestampInSec', {}).get('value', '')
media_type = story.get('snapMediaType')
media_type = 'image' if media_type == 0 else 'video'
return {
"original_snap_id": original_snap_id,
"snap_id": get_snap_id(snap_url),
"url": snap_url,
"timestamp": timestamp,
"platform": "snapchat",
"type": "story",
"username": story.get('username', ''),
"media_type": media_type,
}
def get_snap_id(url):
return url.split('?')[0].split('/')[-1].split('.')[0]
def get_highlight_stories(data):
stories = []
highlights = get_highlights(data)
for highlight in highlights:
snap_list = highlight.get('snapList', [])
for snap in snap_list:
story = parse_story(snap)
stories.append(story)
return stories
def get_spotlight_metadata(data):
"""Extract spotlight metadata from JSON data."""
try:
return data['props']['pageProps']['spotlightStoryMetadata']
except KeyError:
return []
def get_username(data):
"""Extract username from JSON data."""
try:
return data['props']['pageProps']['userProfile']['publicProfileInfo']['username']
except KeyError:
return None

@@ -1,126 +0,0 @@
import os
import json
from tqdm import tqdm
from funcs import get_files
from snapchat import get_stories, get_highlights, get_spotlight_metadata, get_username
# import config as altpinsConfig
import altpinsConfig
def get_data(filepath):
try:
with open(filepath, 'r', encoding='utf-8') as f:
return json.load(f)
except:
print(f"Error reading {filepath}")
return None
def process_story(story, username, story_type, db, cursor):
snap_urls = story.get('snapUrls', {})
media_url = snap_urls.get('mediaUrl', '').split('?')[0]
media_id = media_url.split('/')[-1].split('.')[0].split('?')[-1]
if media_id in existing_media_ids:
return False
media_url = f"https://cf-st.sc-cdn.net/d/{media_url.split('/')[-1]}"
media_preview_url = snap_urls.get('mediaPreviewUrl', '').get('value', '').split('?')[0]
media_preview_url = f"https://cf-st.sc-cdn.net/d/{media_preview_url.split('/')[-1]}"
timestamp = story.get('timestampInSec', {}).get('value', '')
media_type = story.get('snapMediaType')
snap_id = story.get('snapId', {}).get('value', '')
query = "INSERT IGNORE INTO snapchat_stories (snapId, mediaUrl, mediaPreviewUrl, timestampInSec, snapMediaType, storyType, username, media_id) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
cursor.execute(query, (snap_id, media_url, media_preview_url, timestamp, media_type, story_type, username, media_id))
db.commit()
existing_media_ids.add(media_id)
print_emoji = '✅' if cursor.rowcount else '🔁'
print(f"{print_emoji} Inserted story {media_id}")
def process_json(json_path, db, cursor):
"""
Given a path to a JSON file, parse it and insert relevant data
into the database.
"""
# Load JSON data
data = get_data(json_path)
username = get_username(data)
ready_stories = []
# Insert stories (regular)
stories = get_stories(data)
for story in stories:
story['storyType'] = 'story'
ready_stories.append(story)
# Insert stories (highlights)
highlights = get_highlights(data)
highlight_stories = [story for highlight in highlights for story in highlight.get('snapList', [])]
highlight_stories.sort(key=lambda x: x.get('snapIndex'), reverse=True)
for story in highlight_stories:
story['storyType'] = 'highlight'
ready_stories.append(story)
for story in ready_stories:
story_type = story.get('storyType')
process_story(story, username, story_type, db, cursor)
# Insert spotlight metadata
spotlight_metadata = get_spotlight_metadata(data)
for story in spotlight_metadata:
try:
media_id = story['videoMetadata']['contentUrl'].split('/')[-1].split('.')[0].split('?')[-1]
deepLinkUrl = story['oneLinkParams']['deepLinkUrl'].split('?')[0]
except:
continue
if not all((media_id, deepLinkUrl)):
continue
if deepLinkUrl in existing_spotlights:
continue
deepLinkId = deepLinkUrl.split('/')[-1]
description = story['description']
insert_query = "INSERT IGNORE INTO snapchat_metadata (media_id, deepLinkUrl, description, username, deepLinkId) VALUES (%s, %s, %s, %s, %s)"
cursor.execute(insert_query, (media_id, deepLinkUrl, description, username, deepLinkId))
db.commit()
existing_spotlights.add(deepLinkUrl)
print_emoji = '✅' if cursor.rowcount else '🔁'
print(f"{print_emoji} Inserted spotlight {media_id}")
os.remove(json_path)
db, cursor = altpinsConfig.gen_connection()
existing_media_ids = []
cursor.execute("SELECT media_id FROM snapchat_stories WHERE media_id != '';")
existing_media_ids = {row[0] for row in cursor.fetchall()}
existing_spotlights = []
cursor.execute("SELECT deepLinkUrl FROM snapchat_metadata;")
existing_spotlights = {row[0] for row in cursor.fetchall()}
data_dir = 'data'
files = [f for f in get_files(data_dir) if f.endswith('.json')]
# Wrap the file list with tqdm to show a progress bar
for filepath in tqdm(files, desc="Processing files", unit="file"):
process_json(filepath, db, cursor)
db.close()

@@ -1,66 +0,0 @@
from snapchat import get_all_users_data, get_stories, get_highlight_stories, get_social_medias, get_related_profiles
import os, config
snapchat_directory = "snapchat"
media_directory = "media"
temp_directory = ".temp"
data_directory = "data"
directory = os.path.join(media_directory, snapchat_directory)
def get_snapchat_stories(usernames):
usernames = usernames[:5]
snapchat_users_data = get_all_users_data(usernames)
snapchat_users_data = dict(sorted(snapchat_users_data.items()))
ready_stories = []
for username, data in snapchat_users_data.items():
print(f"Getting stories for {username}...")
data = snapchat_users_data.get(username)
if not data:
print(f"Failed to get data for {username}. Skipping.")
continue
website_url = get_social_medias(data)
related_profiles = get_related_profiles(data)
stories = get_stories(data)
stories.extend(get_highlight_stories(data))
for story in stories:
snap_id = story['snap_id']
url = story['url']
timestamp = story['timestamp']
# Determine file extension
extension = '.jpg' if story['media_type'] == 'image' else '.mp4'
filename = f"{username}~{timestamp}~{snap_id}{extension}"
filepath = os.path.join(directory, filename)
story['media_url'] = url
story['snap_id'] = snap_id
story['filepath'] = filepath
story['username'] = username
story['timestamp'] = timestamp
story['original_snap_id'] = story['original_snap_id']
ready_stories.append(story)
# sort ready_stories by timestamp from oldest to newest
ready_stories.sort(key=lambda x: x['timestamp'])
return ready_stories
db, cursor = config.gen_connection()
cursor.execute("SELECT username FROM following WHERE platform = 'snapchat' ORDER BY id DESC")
usernames = [row[0] for row in cursor.fetchall()]
stories = get_snapchat_stories(usernames)

@@ -1,243 +0,0 @@
from snapchat import get_stories, get_highlight_stories, get_all_users_data, parse_stories
from datetime import datetime
from uuid import uuid4
import config
import funcs
import cv2
import os
import json
UPLOAD_MODE = True
media_directory = "media"
snapchat_directory = "snapchat"
temp_directory = ".temp"
data_directory = "data"
directory = os.path.join(media_directory, snapchat_directory)
os.makedirs(media_directory, exist_ok=True)
os.makedirs(directory, exist_ok=True)
os.makedirs(temp_directory, exist_ok=True)
os.makedirs(data_directory, exist_ok=True)
def find_duplicate_snap(existing_snap_ids, snap_id):
return snap_id in existing_snap_ids
def archive_data(data, username):
try:
current_timestamp = int(datetime.now().timestamp())
data_filename = f"{username}~{current_timestamp}.json"
data_filepath = os.path.join(data_directory, data_filename)
with open(data_filepath, 'w') as f:
f.write(json.dumps(data, indent=4))
except:
print(f"Failed to archive data for {username}.")
return False
def get_snapchat_stories(usernames):
snapchat_users_data = get_all_users_data(usernames)
snapchat_users_data = dict(sorted(snapchat_users_data.items()))
ready_stories = []
for username, data in snapchat_users_data.items():
print(f"Getting stories for {username}...")
if not data:
print(f"Failed to get data for {username}. Skipping.")
continue
archive_data(data, username)
stories = get_stories(data)
stories = parse_stories(stories)
stories.extend(get_highlight_stories(data))
for story in stories:
snap_id = story['snap_id']
url = story['url']
timestamp = story['timestamp']
# Determine file extension
file_exts = {'image': '.jpg', 'video': '.mp4'}
extension = file_exts.get(story['media_type'])
if not extension:
print(f"Failed to determine file extension for {url}. Skipping.")
continue
filename = f"{username}~{timestamp}~{snap_id}{extension}"
filepath = os.path.join(directory, filename)
story['media_url'] = url
story['snap_id'] = snap_id
story['filepath'] = filepath
story['username'] = username
story['timestamp'] = timestamp
story['original_snap_id'] = story['original_snap_id']
ready_stories.append(story)
ready_stories.sort(key=lambda x: x['timestamp'])
return ready_stories
def get_snapchat_files():
stories = funcs.get_files(directory)
stories = [get_media_data(filepath) for filepath in stories]
stories = [story for story in stories if story]
return stories
def main():
print('Initializing snappy...')
ready_stories = []
stories_from_files = get_snapchat_files()
cursor.execute("SELECT username FROM following WHERE platform = 'snapchat' ORDER BY id DESC")
usernames = [row[0] for row in cursor.fetchall()]
print(f"Getting stories for {len(usernames)} users...")
new_stories = get_snapchat_stories(usernames)
cleaned_stories = []
print("Checking for duplicates...")
for story in new_stories:
duplicate_snap = find_duplicate_snap(existing_snap_ids, story['snap_id'])
if duplicate_snap:
print(f"Snap {story['filepath']} already exists in the database. Removing...")
continue
cleaned_stories.append(story)
cleaned_stories = download_stories(cleaned_stories)
ready_stories.extend(cleaned_stories)
ready_stories.extend(stories_from_files)
for story in ready_stories:
UploadMedia(story)
def download_stories(stories):
downloaded_stories = []
for story in stories:
filepath = story['filepath']
url = story['media_url']
filepath = funcs.download_file(url, filepath)
print(f"Downloaded {os.path.basename(filepath)}")
if not filepath:
continue
story['hash'] = funcs.calculate_file_hash(filepath)
story['size'] = os.path.getsize(filepath)
downloaded_stories.append(story)
return downloaded_stories
def UploadMedia(media):
file_size = media['size']
file_hash = media['hash']
filepath = media['filepath']
filename = os.path.basename(filepath)
username = media['username']
timestamp = media['timestamp']
media_type = media['media_type']
snap_id = media['snap_id']
original_snap_id = media['original_snap_id']
thumbnail_url = None
phash = None
duplicate_snap = find_duplicate_snap(existing_snap_ids, media['snap_id'])
if duplicate_snap:
print(f"Snap {filename} already exists in the database. Removing...")
os.remove(filepath)
return False
post_date = datetime.fromtimestamp(int(timestamp))
width, height = funcs.get_media_dimensions(filepath)
duration = funcs.get_video_duration(filepath)
if media_type == 'image':
phash = funcs.generate_phash(filepath)
elif media_type == 'video':
try:
thumb_path = generate_thumbnail(filepath)
obj_storage.PutFile(thumb_path, f'thumbnails/{filename}')
thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{filename}"
phash = funcs.generate_phash(thumb_path)
os.remove(thumb_path)
except:
print('Error generating thumbnail. Skipping...')
return False
server_path = f'media/snaps/{username}/{filename}'
file_url = f"https://storysave.b-cdn.net/{server_path}"
obj_storage.PutFile(filepath, server_path)
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, hash, filename, duration, thumbnail, phash, platform, snap_id, original_snap_id, file_size) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
values = (username, media_type, file_url, width, height, 'story', post_date, file_hash, filename, duration, thumbnail_url, phash, 'snapchat', snap_id, original_snap_id, file_size)
cursor.execute(query, values)
db.commit()
print(f'[{cursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
os.remove(filepath)
return True
def generate_thumbnail(filepath):
thumb_path = os.path.join(temp_directory, f'{uuid4()}.jpg')
cap = cv2.VideoCapture(filepath)
ret, frame = cap.read()
cv2.imwrite(thumb_path, frame)
cap.release()
return thumb_path
def get_media_data(filepath):
filename = os.path.basename(filepath)
parts = filename.split('~')
if len(parts) < 3:
return False
username = parts[0]
timestamp = parts[1]
snap_id = parts[2]
snap_id = os.path.splitext(snap_id)[0]
file_size = os.path.getsize(filepath)
file_hash = funcs.calculate_file_hash(filepath)
data = {
"username": username,
"timestamp": timestamp,
"filepath": filepath,
"snap_id": snap_id,
"original_snap_id": None,
"media_url": None,
"size": file_size,
"hash": file_hash
}
return data
if __name__ == '__main__':
print('Starting snappy...')
db, cursor = config.gen_connection()
obj_storage = config.get_storage()
cursor.execute("SELECT snap_id FROM media WHERE filename IS NOT NULL AND platform = 'snapchat' ORDER BY id DESC")
existing_snap_ids = cursor.fetchall()
existing_snap_ids = {row[0] for row in existing_snap_ids}
main()
print("Processing completed.")

@@ -2,20 +2,50 @@ from bs4 import BeautifulSoup
import requests
import json
doc_ids = [7663723823674585, 9539110062771438]
doc_ids = [7663723823674585, 9539110062771438, 8964418863643891, 9066276850131169]
active_doc_id = doc_ids[3]
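# doc_ids are Instagram GraphQL persisted-query identifiers; active_doc_id picks
# which query document the requests below are sent against.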
def get_posts():
data = {
"variables": '{"id":"57771591453","render_surface":"PROFILE"}',
"doc_id": "7663723823674585",
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
}
def get_posts(username):
url = 'https://www.instagram.com/graphql/query/'
variables = {
"data": {
"count": 12,
"include_reel_media_seen_timestamp": True,
"include_relationship_info": True,
"latest_besties_reel_media": True,
"latest_reel_media": True
},
"username": username,
"__relay_internal__pv__PolarisIsLoggedInrelayprovider": True,
"__relay_internal__pv__PolarisShareSheetV3relayprovider": False
}
params = {
'variables': json.dumps(variables),
'doc_id': active_doc_id
}
data = requests.get('https://www.instagram.com/graphql/query', params=data).json()
posts = data['data']
posts = [post['node'] for post in posts]
response = requests.get(url, headers=headers, params=params)
return max(posts, key=lambda post: max(c['width'] * c['height'] for c in post['image_versions2']['candidates']))
if response.status_code == 200:
try:
data = response.json()
posts = data['data']['xdt_api__v1__feed__user_timeline_graphql_connection']['edges']
end_cursor = data['data']['xdt_api__v1__feed__user_timeline_graphql_connection']['page_info']['end_cursor']
return posts
except (KeyError, TypeError) as e:
print(f"Error parsing JSON response: {e}")
return None
else:
print(f"Failed to fetch data. Status code: {response.status_code}")
return None
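# Note: end_cursor is read above but not returned yet, so paging past the first
# batch of posts would require exposing it to the caller.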
def get_username_by_user_id(user_id):
url = 'https://www.instagram.com/graphql/query/'
@@ -35,10 +65,6 @@ def get_username_by_user_id(user_id):
'variables': json.dumps(variables)
}
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
}
response = requests.get(url, headers=headers, params=params)
if response.status_code == 200:
@@ -57,7 +83,7 @@ def extract_script_tags(username):
url = f"https://www.instagram.com/{username}/"
try:
# Fetch the HTML content of the page
response = requests.get(url)
response = requests.get(url, headers=headers)
response.raise_for_status()
# Parse the HTML content with BeautifulSoup
@@ -122,13 +148,14 @@ def get_profile_data(username):
user_id = get_user_id(username)
data = {
'variables': '{"id":"' + user_id + '","render_surface":"PROFILE"}',
'doc_id': 9539110062771438
variables = {
"id": user_id,
"render_surface": "PROFILE"
}
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
data = {
'variables': json.dumps(variables),
'doc_id': active_doc_id
}
response = requests.post(url, headers=headers, data=data)
@@ -148,9 +175,45 @@ def get_hd_profile_picture(username = None, user_id = None):
if not user_id:
return None
variables = {
"id": user_id,
"render_surface": "PROFILE"
}
data = {
'variables': json.dumps(variables),
'doc_id': '9539110062771438'
}
data = {
'variables': '{"id":"' + user_id +' ","render_surface":"PROFILE"}',
'doc_id': 9539110062771438
'av': '17841401225494803',
'__d': 'www',
'__user': 0,
'__a': 1,
'__req': 4,
'__hs': '20231.HYP%3Ainstagram_web_pkg.2.1...1',
'dpr': 2,
'__ccg': 'GOOD',
'__rev': 1023131892,
'__s': 'g7nwhv%3Ad6c29x%3Aaag0uk',
'__hsi': 7507576467274562470,
'__dyn': '7xe5WwlEnwn8K2Wmm1twpUnwgU7S6EdF8aUco38w5ux609vCwjE1EE2Cw8G11wBw5Zx62G3i1ywOwa90Fw4Hw9O0Lbwae4UaEW2G0AEco5G0zEnwhE3Mw51wLyES1Twoob82ZwrUdUbGwmk0KU6O1FwlE6PhA6bwg8rAwHxW1oxe6UaU3cyUrw4rxO2C',
'__csr': 'gg84YIJgSyn2Ob7oDs-h7qhmToSsDl_8uAAaBigC8yQiaKJuumUkyybh4i9qBFaiayqBAVKczV4cBjhHUbqxeq3q9Suuum9zkEjAy9Ua8ymi45DUG7EgzoeUfKm2ym6UblG00kXK0jUE3Ug3dwh24DgAi1mo0AyaDw4WwiU1Y80bCm12g2Jwww5OCkE18Wc0mmqA4pU22wCw1Ucw06TW0csw7Gw',
'__hsdp': 'l2DMCyPBdbclSEgBiHWhqWiRV5kKKyoFtoYABrqafK699onQtK1fg96qiK5EZcIk0A5bwau0xVEhwAyQElwik0qi1cwam0m20ou06L82Ew56w4-w8O1Xw75wnoc85i',
'__hblp': '08K19xO0V89815oaEtwUCwhoOq4opxG5o8oS4Vk4U9o9o7C0zof82Nwg8uG0jV0Hweu1OwsE13o1ZU11UlwVwko2wwfy0G89E17U11EdU2cwuU5C0Yp8660Eo5idz8vxucw',
'__comet_req': 7,
'fb_dtsg': 'NAfvHXND-ELXKZFgyrogJIig1C4j6gRiNUaBBBomMZ1mNa-FvpKl6bw%3A17854231342124680%3A1731941013',
'jazoest': 26187,
'lsd': 'NFD0t4uLm10VsaniLLl9nv',
'__spin_r': 1023131892,
'__spin_b': 'trunk',
'__spin_t': 1747993861,
'__crn': 'comet.igweb.PolarisProfilePostsTabRoute',
'fb_api_caller_class': 'RelayModern',
'fb_api_req_friendly_name': 'PolarisProfileNoteBubbleQuery',
'variables': '%7B%22user_id%22%3A%228309584937%22%7D',
'server_timestamps': True,
'doc_id': 8698637896906070
}
try:
@@ -166,4 +229,83 @@ def get_hd_profile_picture(username = None, user_id = None):
except:
hd_profile_pic = None
return hd_profile_pic
return hd_profile_pic
def get_user_id_by_username(username):
url = 'https://www.instagram.com/graphql/query'
variables = {
"data": {
"context": "blended",
"include_reel": True,
"query": username,
"rank_token": "",
"search_surface": "web_top_search"
},
"hasQuery": True
}
data = {
'variables': json.dumps(variables),
'doc_id': active_doc_id
}
response = requests.post(url, headers=headers, data=data)
if response.status_code == 200:
json_data = response.json()
users = json_data['data']['xdt_api__v1__fbsearch__topsearch_connection']['users']
for user in users:
user_data = user['user']
if user_data['username'] == username:
return user_data['pk']
else:
print(f"Failed to fetch data. Status code: {response.status_code}")
return None
def get_user_id_api(username):
url = f"https://www.instagram.com/api/v1/users/web_profile_info/?username={username}"
headers['referer'] = f"https://www.instagram.com/{username}/"
headers['x-ig-app-id'] = '936619743392459'
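# Reuses the module-level headers, adding the referer and the Instagram web
# app id that this private web_profile_info endpoint expects.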
response = requests.get(url, headers=headers)
if response.status_code == 200:
try:
data = response.json()
user_id = data['data']['user']['id']
return user_id
except (KeyError, TypeError) as e:
print(f"Error parsing JSON response: {e}")
return None
else:
print(f"Failed to fetch data. Status code: {response.status_code}")
return None
def get_highest_quality_image(image_versions):
max_res = 0
max_res_url = None
for image in image_versions:
if image['width'] > max_res:
max_res = image['width']
max_res_url = image['url']
return max_res_url
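# parse_post flattens a carousel post into one item per slide, keeping only the
# highest-resolution image candidate; posts without 'carousel_media' are not
# handled here.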
def parse_post(post):
medias = post['node']['carousel_media']
media_items = []
for media in medias:
media_item = {}
image_versions = media['image_versions2']['candidates']
media_item['image_url'] = get_highest_quality_image(image_versions)
media_item['pk'] = media['pk']
media_item['media_type'] = media['media_type']
media_items.append(media_item)
return media_items

@@ -1,13 +1,15 @@
from datetime import datetime
from datetime import datetime, timedelta
from uuid import uuid4
import funcs
import config
import funcs
import json
import cv2
import os
import re
temp_directory = ".temp"
directory = 'media/instagram/'
directory = 'media'
os.makedirs(temp_directory, exist_ok=True)
media_types = {
'stories' : 'story',
@@ -15,39 +17,42 @@ media_types = {
'profile' : 'profile'
}
os.makedirs(temp_directory, exist_ok=True)
UPLOAD_CUSTOM = False
CACHE_FILE = os.path.join(temp_directory, 'existing_media_ids.json')
CACHE_TTL = timedelta(hours=48)
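# Known media ids and username -> user_id pairs are cached on disk and reused for
# up to CACHE_TTL (48h) before the full tables are re-queried; see get_cached_data
# and get_existing_medias below.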
def UploadMedia(media):
username = media['username']
user_id = media['user_id']
filepath = media['filepath']
platform = media['platform']
media_id = media['media_id']
timestamp = media['timestamp']
highlight_id = media['highlight_id']
post_type = media['post_type']
file_size = os.path.getsize(filepath)
thumbnail_url = None
phash = None
if media_id and media_id in existing_files:
if media_id and media_id in existing_media_ids:
print('Duplicate file detected. Removing...')
os.remove(filepath)
return True
file_size = os.path.getsize(filepath)
filename = os.path.basename(filepath)
file_extension = os.path.splitext(filename)[1].lower()
file_hash = funcs.calculate_file_hash(filepath)
if not user_id:
user_id = get_user_id(username)
media_type = funcs.get_media_type(filename)
if not media_type:
print(f'Error determining media type for {filename}. Skipping...')
return False
file_hash = funcs.calculate_file_hash(filepath)
post_date = datetime.fromtimestamp(int(timestamp)) if timestamp else datetime.now()
try:post_date = datetime.fromtimestamp(int(timestamp))
except:post_date = datetime.fromtimestamp(os.path.getctime(filepath))
width, height = funcs.get_media_dimensions(filepath)
if 0 in (width, height):
@@ -62,21 +67,19 @@ def UploadMedia(media):
try:
thumb_path = generate_thumbnail(filepath)
obj_storage.PutFile(thumb_path, f'thumbnails/{file_hash}.jpg') # this might be a problem in case of duplicate hashes
thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{file_hash}.jpg"
phash = funcs.generate_phash(thumb_path)
thumbnail_url = f"https://cdn.altpins.com/thumbnails/{file_hash}.jpg"
phash = funcs.generate_phash(thumb_path)
os.remove(thumb_path)
except Exception as e:
print(f'Error generating thumbnail: {e}. Skipping...')
return False
if media_id:
newFilename = f'{media_id}{file_extension}'
else:
newFilename = f'{file_hash}{file_extension}'
custom_filename = media_id if media_id else file_hash
newFilename = f'{custom_filename}{file_extension}'
server_path = f'media/{post_type}/{username}/{newFilename}'
file_url = f"https://storysave.b-cdn.net/{server_path}"
file_url = f"https://cdn.altpins.com/{server_path}"
obj_storage.PutFile(filepath, server_path)
if highlight_id:
@@ -84,18 +87,25 @@ def UploadMedia(media):
newDB.commit()
print(f'[{newCursor.rowcount}] added highlight {highlight_id} to user {user_id}')
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, hash, filename, duration, thumbnail, phash, platform, file_size) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
values = (username, media_type, file_url, width, height, media_id, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash, platform, file_size)
newCursor.execute(query, values)
newDB.commit()
print(f'[{newCursor.rowcount}] records updated.\nFile: {filename}\nURL: {file_url}')
correct_emoji = '✅' if newCursor.rowcount > 0 else '❌'
print(f'{correct_emoji} added {filename} to database')
print(f'File: {filename}')
print(f'URL: {file_url}')
print(f'Pin URL: https://altpins.com/pin/{newCursor.lastrowid}')
print("="*100)
os.remove(filepath)
return True
existing_media_ids.add(media_id)
return newCursor.lastrowid
def generate_thumbnail(filepath):
thumb_path = os.path.join(temp_directory, f'{uuid4()}.jpg')
@@ -114,8 +124,9 @@ def get_user_id(username):
def get_media_data(filepath):
filename = os.path.basename(filepath)
parts = filename.split('~')
if len(parts) < 4:
if len(parts) != 4:
return False
username = parts[0]
@@ -126,7 +137,9 @@ def get_media_data(filepath):
highlight_id = user_id.replace('highlight', '') if 'highlight' in user_id else None
if not user_id.isdigit():
if user_id.isdigit():
user_id = int(user_id)
else:
user_id = get_user_id(username)
if media_id.isdigit():
@@ -138,52 +151,46 @@ def get_media_data(filepath):
return data
def get_media(folder_path):
def get_media():
medias = []
failed_medias = []
for media_type, post_type in media_types.items():
folder_path = os.path.join(directory, media_type)
media_folder_path = os.path.join(directory, media_type)
if not os.path.exists(folder_path):
if not os.path.exists(media_folder_path):
continue
all_files = funcs.get_files(folder_path)
all_files = funcs.get_files(media_folder_path)
for filepath in all_files:
data = get_media_data(filepath)
if not data:
failed_medias.append(filepath)
continue
data['post_type'] = post_type
medias.append(data)
return medias
return medias, failed_medias
def get_custom_media():
def get_custom_media(failed_medias):
medias = []
folder_path = 'media/instagram'
platform = 'instagram'
for media_type, post_type in media_types.items():
folder_path = os.path.join(directory, media_type)
user_dirs = [d for d in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, d))]
for user_dir in user_dirs:
user_folder_path = os.path.join(folder_path, user_dir)
for username in user_dirs:
user_folder_path = os.path.join(folder_path, username)
if not os.path.exists(user_folder_path):
continue
username = user_dir
for filename in os.listdir(user_folder_path):
if filename.startswith('.'):
continue
files = os.listdir(user_folder_path)
for filename in files:
filepath = os.path.join(user_folder_path, filename)
if filename.startswith('.'):
if not filepath in failed_medias:
continue
user_id = get_user_id(username)
timestamp = int(os.path.getctime(filepath))
media_id = os.path.splitext(filename)[0]
@@ -201,7 +208,7 @@ def get_custom_media():
"media_id": media_id,
"user_id": user_id,
"filepath": filepath,
"platform": platform,
"platform": 'instagram',
"highlight_id": None,
"post_type": post_type
}
@@ -209,20 +216,40 @@ def get_custom_media():
medias.append(data)
return medias
def dump_instagram():
medias, failed_medias = get_media()
medias = clean_dupes(medias)
failed_medias = get_custom_media(failed_medias)
def dump_instagram(folder_path):
medias = get_media(folder_path)
# medias.extend(get_custom_media())
if cleanup_dupe_stories(medias):
medias = get_media(folder_path)
medias.sort(key=lambda x: (x['username'].lower(), x['timestamp']))
new_user_ids = {}
for media in medias:
if media['user_id']:
user_id = media['user_id']
username = media['username']
if username not in existing_users:
existing_users[username] = user_id
new_user_ids[username] = user_id
for media in medias:
UploadMedia(media)
existing_files.append(media['media_id'])
user_id = media['user_id']
username = media['username']
if user_id is None and username in new_user_ids:
media['user_id'] = new_user_ids[username]
def cleanup_dupe_stories(medias):
for media in medias:
pinid = UploadMedia(media)
existing_media_ids.add(media['media_id'])
if UPLOAD_CUSTOM:
for media in failed_medias:
pinid = UploadMedia(media)
def clean_dupes(medias):
removed_count = 0
new_medias = []
for media in medias:
media_id = media['media_id']
filepath = media['filepath']
@@ -231,16 +258,70 @@ def cleanup_dupe_stories(medias):
print(f'Invalid media_id for file {filepath}. Skipping...')
continue
# Check if media_id is in existing_files OR if filepath contains any '(number)'
if media_id in existing_files or re.search(r'\(\d+\)', filepath):
# Check if media_id is in existing_media_ids OR if filepath contains any '(number)'
if media_id in existing_media_ids or re.search(r'\(\d+\)', filepath):
removed_count += 1
print(f'Found duplicate file {filepath}. Removing...')
os.remove(filepath)
continue
new_medias.append(media)
print(f'Removed {removed_count} duplicate files.')
return removed_count
return new_medias
def get_cached_data():
if not os.path.exists(CACHE_FILE):
print('No cache file found. Generating new cache…')
return None, None
try:
with open(CACHE_FILE, 'r') as f:
cache_data = json.load(f)
timestamp = datetime.fromisoformat(cache_data.get('timestamp', ''))
if datetime.now() - timestamp < CACHE_TTL:
print('Using cached data…')
return set(tuple(x) for x in cache_data.get('existing_media_ids', [])), cache_data.get('existing_users', {})
except Exception as e:
print(f"Cache read error: {e}")
return None, None
def save_cached_data(existing_media_ids, existing_users):
with open(CACHE_FILE, 'w') as f:
json.dump({'timestamp': datetime.now().isoformat(), 'existing_media_ids': list(existing_media_ids), 'existing_users': existing_users}, f)
def get_existing_medias(newCursor):
existing_media_ids, existing_users = get_cached_data()
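# Warm-cache path: only media rows newer than the highest cached id are fetched
# from the database and merged into the cached set.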
if existing_media_ids and existing_users:
newest_id = max(existing_media_ids, key=lambda x: x[0])[0]
existing_media_ids = {image[1] for image in existing_media_ids}
newCursor.execute("SELECT id, media_id FROM media WHERE media_id IS NOT NULL AND platform = 'instagram' AND status = 'public' AND id > %s ORDER BY id DESC", (newest_id,))
new_media_ids = {image[1] for image in newCursor.fetchall()}
for media_id in new_media_ids:
existing_media_ids.add(media_id)
return existing_media_ids, existing_users
print('Getting existing files and users...')
newCursor.execute("SELECT id, media_id FROM media WHERE media_id IS NOT NULL AND platform = 'instagram' AND status = 'public';")
existing_media_ids = {image for image in newCursor.fetchall()}
print('Getting existing users...')
newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL AND platform = 'instagram'")
existing_users = {user[0].lower(): user[1].lower() for user in newCursor.fetchall()}
cache_file = os.path.join(temp_directory, 'existing_media_ids.json')
with open(cache_file, 'w') as f:
json.dump({'timestamp': datetime.now().isoformat(), 'existing_media_ids': list(existing_media_ids), 'existing_users': existing_users}, f)
return existing_media_ids, existing_users
if __name__ == '__main__':
print('Starting processing...')
@@ -252,19 +333,11 @@ if __name__ == '__main__':
obj_storage = config.get_storage()
print('Getting existing files and users...')
newCursor.execute("SELECT media_id FROM media WHERE media_id IS NOT NULL AND platform = 'instagram'")
existing_files = [image[0] for image in newCursor.fetchall()]
print('Getting existing users...')
newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL AND platform = 'instagram'")
existing_users = {user[0].lower(): user[1].lower() for user in newCursor.fetchall()}
existing_media_ids, existing_users = get_existing_medias(newCursor)
dump_instagram(directory)
dump_instagram()
print("Processing completed.")
newDB.close()
for mediatype, _ in media_types.items():
funcs.clean_empty_folders(os.path.join(directory, mediatype))

@@ -1,147 +0,0 @@
from datetime import datetime
from uuid import uuid4
import funcs
import config
import cv2
import os
media_directory = "media/ready_for_upload"
platform = "instagram"
working_directory = os.path.join(media_directory, platform)
def UploadMedia(media):
username = media['username']
user_id = media['user_id']
filepath = media['filepath']
platform = media['platform']
media_id = media['media_id']
thumbnail_url = None
phash = None
filename = os.path.basename(filepath)
file_extension = os.path.splitext(filename)[1].lower()
media_type = funcs.get_media_type(filename)
if not media_type:
print(f'Error determining media type for {filename}. Skipping...')
return False
post_type = funcs.determine_post_type(filepath)
if not post_type:
print(f'Error determining post type for {filename}. Skipping...')
return False
file_hash = funcs.calculate_file_hash(filepath)
post_date = datetime.now()
width, height = funcs.get_media_dimensions(filepath)
duration = funcs.get_video_duration(filepath)
if media_type == 'image':
phash = funcs.generate_phash(filepath)
elif media_type == 'video':
try:
thumb_path = generate_thumbnail(filepath)
obj_storage.PutFile(thumb_path, f'thumbnails/{file_hash}.jpg') # this might be a problem in case of duplicate hashes
thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{file_hash}.jpg"
phash = funcs.generate_phash(thumb_path)
os.remove(thumb_path)
except Exception as e:
print(f'Error generating thumbnail. Skipping... {e}')
return False
newFilename = f'{file_hash}{file_extension}'
server_path = f'media/{post_type}/{username}/{newFilename}'
file_url = f"https://storysave.b-cdn.net/{server_path}"
obj_storage.PutFile(filepath, server_path) # slow as fuck
post_type = 'story' if post_type == 'stories' else 'post'
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, user_id, hash, filename, duration, thumbnail, phash, platform, media_id) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
values = (username, media_type, file_url, width, height, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash, platform, media_id)
newCursor.execute(query, values) # slower
newDB.commit()
print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
os.remove(filepath)
return True
def generate_thumbnail(filepath):
thumb_path = f'.temp/{uuid4()}.jpg'
cap = cv2.VideoCapture(filepath)
ret, frame = cap.read()
cv2.imwrite(thumb_path, frame)
cap.release()
return thumb_path
def get_user_id(username):
username = username.lower()
if username in existing_users:
return existing_users[username]
return None
def get_media(folder_path):
medias = []
user_folders = os.listdir(folder_path)
for user_folder in user_folders:
user_folder_path = os.path.join(folder_path, user_folder)
if not os.path.isdir(user_folder_path):
continue
files = os.listdir(user_folder_path)
for filename in files:
filepath = os.path.join(folder_path, user_folder, filename)
# skip file if its hidden
if filename.startswith('.'):
continue
try:
media_id = filename.split('.')[0]
media_id = int(media_id)
except:
media_id = None
media = {
'username': user_folder,
'filepath': filepath,
'user_id': get_user_id(user_folder),
'media_id': media_id,
'platform': platform
}
medias.append(media)
return medias
def dump_instagram(folder_path):
medias = get_media(folder_path)
for media in medias:
UploadMedia(media)
if __name__ == '__main__':
print('Starting processing...')
newDB, newCursor = config.gen_connection()
obj_storage = config.get_storage()
newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL")
existing_users = {user[0].lower(): user[1].lower() for user in newCursor.fetchall()}
dump_instagram(working_directory)
print("Processing completed.")

@@ -6,7 +6,6 @@ import os
from funcs import get_media_dimensions
media_dir = 'media'
output_dir = 'instagram'
stories_dir = 'stories'
posts_dir = 'posts'
@@ -75,8 +74,6 @@ class DownloadHandler(FileSystemEventHandler):
if not os.path.exists(file_path):
return
print(f'Moving {file}...')
post_type = determine_post_type(file_path)
if post_type == 'posts':
media_type_dir = posts_dir
@@ -86,9 +83,15 @@ class DownloadHandler(FileSystemEventHandler):
print(f"Could not determine post type for {file}. Skipping...")
return
outputPath = os.path.join(media_dir, output_dir, media_type_dir, file)
outputPath = os.path.join(media_dir, media_type_dir, file)
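# The destination no longer nests under the old 'instagram' output folder; if the
# target file already exists, the freshly downloaded copy is discarded instead of
# overwriting it.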
if os.path.exists(outputPath):
print(f"File already exists {outputPath}. Removing...")
os.remove(file_path)
return
shutil.move(file_path, outputPath)
print(f"Moved {file_path} to {outputPath}")
def on_created(self, event):
if not event.is_directory and 'crdownload' not in event.src_path:
@@ -110,4 +113,4 @@ if __name__ == "__main__":
time.sleep(1) # Add a 1-second sleep to reduce CPU usage
except KeyboardInterrupt:
observer.stop()
observer.join()
observer.join()

@@ -1,140 +0,0 @@
from datetime import datetime
from uuid import uuid4
import funcs
import config
import cv2
import os
directory = 'processed_tiktoks'
def UploadMedia(media):
platform = 'TikTok'
username = media['username']
filepath = media['filepath']
file_size = os.path.getsize(filepath)
thumbnail_url = None
phash = None
filename = os.path.basename(filepath)
file_extension = os.path.splitext(filename)[1].lower()
media_type = funcs.get_media_type(filename)
if not media_type:
print(f'Error determining media type for {filename}. Skipping...')
return False
post_type = funcs.determine_post_type(filepath)
if not post_type:
print(f'Error determining post type for {filename}. Skipping...')
return False
file_hash = funcs.calculate_file_hash(filepath)
if file_hash in existing_hashes:
print(f'File {filename} already exists. Skipping...')
return False
post_date = datetime.now()
width, height = funcs.get_media_dimensions(filepath)
duration = funcs.get_video_duration(filepath)
if media_type == 'image':
phash = funcs.generate_phash(filepath)
elif media_type == 'video':
try:
thumb_path = generate_thumbnail(filepath)
obj_storage.PutFile(thumb_path, f'thumbnails/{file_hash}.jpg') # this might be a problem in case of duplicate hashes
thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{file_hash}.jpg"
phash = funcs.generate_phash(thumb_path)
os.remove(thumb_path)
except:
print('Error generating thumbnail. Skipping...')
return False
newFilename = f'{file_hash}{file_extension}'
server_path = f'media/tiktoks/{username}/{newFilename}'
file_url = f"https://storysave.b-cdn.net/{server_path}"
obj_storage.PutFile(filepath, server_path) # slow as fuck
post_type = 'story' if post_type == 'stories' else 'post'
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, hash, filename, duration, thumbnail, phash, platform, file_size) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
values = (username, media_type, file_url, width, height, post_type, post_date, file_hash, filename, duration, thumbnail_url, phash, platform, file_size)
newCursor.execute(query, values) # slower
newDB.commit()
print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
os.remove(filepath)
return True
def generate_thumbnail(filepath):
thumb_path = f'temp/{uuid4()}.jpg'
cap = cv2.VideoCapture(filepath)
ret, frame = cap.read()
cv2.imwrite(thumb_path, frame)
cap.release()
return thumb_path
def get_media_data(filepath):
filename = os.path.basename(filepath)
parts = filename.split('~')
if len(parts) == 3:
username, title, tiktok_id = parts
elif len(parts) == 2:
username, title = parts
tiktok_id = None
else:
return False
data = {'username': username, 'filepath': filepath, 'tiktok_id': tiktok_id, 'title': title}
return data
def get_media(folder_path):
medias = []
users = os.listdir(folder_path)
for user in users:
user_folder = os.path.join(folder_path, user)
if not os.path.isdir(user_folder):
print(f"Skipping {user}")
continue
files = os.listdir(user_folder)
for filename in files:
filepath = os.path.join(user_folder, filename)
data = get_media_data(filepath)
if data:
medias.append(data)
return medias
def dump_instagram(folder_path):
medias = get_media(folder_path)
for media in medias:
UploadMedia(media)
if __name__ == '__main__':
print('Starting processing...')
if not os.listdir(directory):
print('No files to process. Exiting...')
exit()
newDB, newCursor = config.gen_connection()
obj_storage = config.get_storage()
newCursor.execute("SELECT hash FROM media WHERE hash IS NOT NULL AND platform = 'TikTok'")
existing_hashes = [row[0] for row in newCursor.fetchall()]
dump_instagram(directory)
print("Processing completed.")

@@ -1,123 +0,0 @@
from selenium.webdriver.common.by import By
import undetected_chromedriver as uc
import requests
import base64
import re
import os
def format_url(url):
clean_url = re.sub(r'%[0-9A-F]{2}', '', url)
return clean_url
def encode_offset(offset_num):
offset_base64 = str(offset_num).encode('utf-8')
offset_base64 = base64.b64encode(offset_base64).decode('utf-8')
return offset_base64
def get_clips(username):
url = 'https://gql.twitch.tv/gql'
offset_num = 20
offset_base64 = encode_offset(offset_num)
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
headers = {
'client-id': 'kimne78kx3ncx6brgo4mv6wki5h1ko',
'Content-Type': 'text/plain;charset=UTF-8',
'User-Agent': user_agent
}
data = {
"operationName":"ClipsCards__User",
"variables":{"login":username,"limit":100,},
"extensions":{"persistedQuery":{"version":1,"sha256Hash":"4eb8f85fc41a36c481d809e8e99b2a32127fdb7647c336d27743ec4a88c4ea44"}}
}
response = requests.post(url, headers=headers, json=data)
clips = response.json()
clips = clips['data']['user']['clips']['edges']
cleaned_clips = parse_clips(clips)
return cleaned_clips
def parse_clips(clips):
"""
clips is a list of dictionaries
"""
cleaned_clips = []
for clip in clips:
clip = clip['node']
clip_id = clip['id']
clip_url = clip['url']
clip_title = clip['title']
clip_view_count = clip['viewCount']
clip_duration = clip['durationSeconds']
cleaned_clip = {
'id': clip_id,
'url': clip_url,
'title': clip_title,
'views': clip_view_count,
'duration': clip_duration
}
cleaned_clips.append(cleaned_clip)
return cleaned_clips
def get_video_url(video_url, driver):
driver.get(video_url)
# Get the video element
video = driver.find_element(By.TAG_NAME, 'video')
# Get the video source
video_src = video.get_attribute('src')
return video_src
def download_video(video_url, filepath):
if os.path.exists(filepath):
return filepath
video = requests.get(video_url)
# Download in chunks
with open(filepath, 'wb') as f:
for chunk in video.iter_content(chunk_size=1024):
f.write(chunk)
return filepath
# Set up an undetected Chrome driver in headless mode
opts = uc.ChromeOptions()
opts.add_argument("--headless")
opts.add_argument("--window-size=1920,1080")
driver = uc.Chrome(use_subprocess=True, options=opts)
username = 'didicandy666'
clips = get_clips(username)
for clip in clips:
clip_url = clip['clip_url']
filename = f"{clip['id']}.mp4"
filepath = os.path.join('clips', filename)
if os.path.exists(filepath):
print(f"Already downloaded {filename}")
continue
video_url = get_video_url(clip_url, driver)
download_video(video_url, filepath)
print(f"Downloaded {filename}")

@@ -0,0 +1,143 @@
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
# --- Configuration ---
USERNAME = "maorshabakov" # your Instagram username
PASSWORD = "PeyxCU%MD*Zq9p" # your Instagram password
TARGET_USER = "cata.leyah" # the username of the profile to scrape
DOWNLOAD_DIR = "downloads" # directory to save media
SCROLL_PAUSE_TIME = 2 # seconds to wait after each scroll
# --- Helper functions ---
def login_instagram(driver, username, password):
driver.get("https://www.instagram.com/accounts/login/")
time.sleep(3) # wait for the login page to load
# Accept cookies if prompted (may need to adjust for your region)
try:
accept_button = driver.find_element(By.XPATH, "//button[text()='Allow all cookies']")
accept_button.click()
time.sleep(2)
except Exception:
pass
# check if already logged in by checking if the current url has been redirected to the home page
if driver.current_url == "https://www.instagram.com/":
print("Already logged in.")
return
# Enter username and password
username_input = driver.find_element(By.NAME, "username")
password_input = driver.find_element(By.NAME, "password")
username_input.send_keys(username)
password_input.send_keys(password)
password_input.send_keys(Keys.RETURN)
time.sleep(5) # wait for login to complete
def scroll_to_load_posts(driver, post_count=12):
post_links = dict()
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(SCROLL_PAUSE_TIME)
new_height = driver.execute_script("return document.body.scrollHeight")
new_posts = get_post_links(driver)
for link in new_posts:
if link not in post_links:
post_links[link] = True
if len(post_links) >= post_count:
break
if new_height == last_height:
break
last_height = new_height
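# Scrolling stops once post_count links have been collected or the page height
# stops growing; main() gathers the final link list via get_post_links() afterwards.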
def get_post_links(driver):
# Find all post links on the profile page.
# Instagram posts are links with hrefs that contain '/p/'
post_elements = driver.find_elements(By.XPATH, "//a[contains(@href, '/p/')]")
links = [elem.get_attribute("href") for elem in post_elements]
# Remove duplicates
return list(set(links))
def download_media(url, download_folder, filename):
response = requests.get(url, stream=True)
if response.status_code == 200:
filepath = os.path.join(download_folder, filename)
with open(filepath, 'wb') as f:
for chunk in response.iter_content(1024):
f.write(chunk)
print(f"Downloaded: {filename}")
else:
print(f"Failed to download: {url}")
def extract_media_url(driver):
# Try to get video first
try:
video = driver.find_element(By.TAG_NAME, "video")
media_url = video.get_attribute("src")
if media_url:
return media_url, "mp4"
except Exception:
pass
# Fallback to image extraction
try:
# Sometimes the post image is inside a div with role="button"
image = driver.find_element(By.XPATH, "//img[contains(@src, 'scontent')]")
media_url = image.get_attribute("src")
if media_url:
return media_url, "jpg"
except Exception:
pass
return None, None
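# extract_media_url prefers a <video> source and falls back to the main post
# image; main() below currently relies on clicking the page's download button
# element instead of calling this helper.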
# --- Main script ---
def main():
os.makedirs(DOWNLOAD_DIR, exist_ok=True)
chrome_options = Options()
chrome_options.add_argument("--user-data-dir=.profiles/thenigga")
driver = webdriver.Chrome(options=chrome_options)
driver.maximize_window()
try:
# Log in to Instagram
login_instagram(driver, USERNAME, PASSWORD)
# Navigate to the target user's profile
driver.get(f"https://www.instagram.com/{TARGET_USER}/")
time.sleep(5) # let the page load
# Scroll down to load all posts
scroll_to_load_posts(driver)
# Gather all post links from the profile page
post_links = get_post_links(driver)
print(f"Found {len(post_links)} posts.")
# Process each post
for idx, post_link in enumerate(post_links):
driver.get(post_link)
time.sleep(3) # wait for post to load
# click download button where div class post-download-all-button
download_button = driver.find_element(By.XPATH, "//div[@class='post-download-all-button']")
driver.execute_script("arguments[0].click();", download_button)
time.sleep(1)
finally:
driver.quit()
if __name__ == "__main__":
main()