main
oscar 9 months ago
parent 4d23278033
commit 48d2330193

@@ -0,0 +1,35 @@
import os
import json
import gzip
data_dir = 'data'
data_compressed_dir = 'data_compressed'
os.makedirs(data_compressed_dir, exist_ok=True)
def compress_file(filepath, output_file):
    # Load the JSON file and write it back out gzip-compressed
    with open(filepath, 'r') as f:
        data = json.load(f)
    compress_data(data, output_file)
    return output_file

def compress_data(data, output_file):
    with gzip.open(output_file, 'wb') as f:
        f.write(json.dumps(data).encode('utf-8'))
    return output_file

data_files = os.listdir(data_dir)

for file in data_files:
    if not file.endswith('.json'):
        continue

    filepath = f'{data_dir}/{file}'
    output_file = f'{data_compressed_dir}/{file}.gz'

    output_file = compress_file(filepath, output_file)
    if output_file:
        print(f'Compressed {file} to {output_file}')
        os.remove(filepath)
    else:
        print(f'Failed to compress {file}')
print('Data compression completed')
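For reference, a minimal sketch of how one of these archives could be read back, assuming the same JSON-then-gzip layout used above (the function name is illustrative, not part of the commit):

import gzip
import json

def read_compressed(path):
    # Open the gzip archive in text mode and decode the JSON payload written by compress_data()
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        return json.load(f)

# Example: data = read_compressed('data_compressed/example.json.gz')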

@@ -0,0 +1,87 @@
from funcs import get_files
from PIL import Image
import imagehash
import config
import os
def generate_image_phash(filepath, hash_size=8):
    try:
        # Open the image using PIL
        pil_image = Image.open(filepath)

        # Compute pHash using the imagehash library
        phash = imagehash.phash(pil_image, hash_size=hash_size)
        return phash
    except Exception as e:
        print(f"Error processing image {filepath}: {e}")
        return None

def are_phashes_duplicates(phash1, phash2, threshold=5):
    try:
        # Compute the Hamming distance between the pHashes
        distance = phash1 - phash2
        return distance <= threshold
    except TypeError as e:
        print(f"Error comparing pHashes: {e}")
        return False

def find_duplicate_phash(phash, existing_medias, threshold=5):
    for media in existing_medias:
        existing_phash_str = media[1]
        existing_username = media[2]

        # Convert the stored pHash string to an ImageHash object
        existing_phash = imagehash.hex_to_hash(existing_phash_str)

        # Check if the current pHash is a duplicate
        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
            return media
    return None

def get_media_by_hash(hash, existing_medias):
    for media in existing_medias:
        existing_hash = media[1]
        if hash == existing_hash:
            return media
    return None

def get_media_by_id(media_id, existing_medias):
    for media in existing_medias:
        existing_media_id = media[1]
        if media_id == existing_media_id:
            return media
    return None

def get_data_by_filename(filename, data):
    for item in data:
        if filename in item['filepath']:
            return item
    return None

# Database connection
db, cursor = config.gen_connection()

# Fetch existing media with pHashes (assuming media are images, adjust media_type if needed)
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL AND media_id IS NULL;", ['image'])
existing_medias = cursor.fetchall()

directory = 'check_if_exists/'  # Directory containing user images

files = [file for file in get_files(directory) if file.endswith(('.jpg', '.jpeg', '.png'))]

for filepath in files:
    image_filename = os.path.basename(filepath)

    # Generate pHash for the image
    phash = generate_image_phash(filepath, hash_size=8)
    if phash is None:
        continue

    # Check if the image is a duplicate of any in the database
    duplicate_media = find_duplicate_phash(phash, existing_medias)
    if duplicate_media:
        print(f'Duplicate found: https://altpins.com/pin/{duplicate_media[0]}')
        print(f'Duplicate image path: {filepath}')

        newpath = os.path.join('duplicates', duplicate_media[2], image_filename)
        os.makedirs(os.path.dirname(newpath), exist_ok=True)
        os.rename(filepath, newpath)
        print(f'Moved {image_filename} to {newpath}')
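As a quick illustration of the comparison these helpers rely on: an ImageHash can be rebuilt from its stored hex string and subtracted from another to get a Hamming distance (the hash values below are made up):

import imagehash

h1 = imagehash.hex_to_hash('fa5c1c3a0e78d2b1')
h2 = imagehash.hex_to_hash('fa5c1c3a0e78d2b3')
print(h1 - h2)  # Hamming distance between the two 64-bit pHashes; <= 5 counts as a duplicate here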

@@ -0,0 +1,79 @@
from funcs import get_files
from PIL import Image
import imagehash
import config
import cv2
import os
def get_video_phash(filepath, hash_size=8):
    cap = cv2.VideoCapture(filepath)
    ret, frame = cap.read()
    cap.release()

    if not ret:
        print(f"Error reading frame from {filepath}")
        return None

    # Resize the frame to a standard size
    standard_size = (320, 240)
    resized_frame = cv2.resize(frame, standard_size, interpolation=cv2.INTER_AREA)

    # Convert OpenCV image (BGR) to PIL Image (RGB)
    image_rgb = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(image_rgb)

    # Compute pHash
    phash = imagehash.phash(pil_image, hash_size=hash_size)
    return phash

def are_phashes_duplicates(phash1, phash2, threshold=5):
    # Compute the Hamming distance between the pHashes
    try:
        distance = phash1 - phash2
    except TypeError as e:
        print(f"Error comparing pHashes: {e}")
        return False
    return distance <= threshold

def get_media_by_phash(phash, existing_medias, threshold=5):
    for media in existing_medias:
        existing_phash_str = media[1]
        existing_username = media[2]
        existing_phash = imagehash.hex_to_hash(existing_phash_str)
        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
            return media
    return None

# Database connection
db, cursor = config.gen_connection()

# Directory containing user videos
directory = 'check_if_exists/'

# Fetch existing videos with pHashes
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL;", ['video'])
existing_medias = cursor.fetchall()

# Make a list of all video files
files = [file for file in get_files(directory) if file.endswith(('.mp4', '.avi', '.mov'))]

for filepath in files:
    video_filename = os.path.basename(filepath)

    phash = get_video_phash(filepath, hash_size=8)
    if phash is None:
        continue

    duplicate_media = get_media_by_phash(phash, existing_medias, threshold=5)
    if duplicate_media:
        print(f'Duplicate url found: https://altpins.com/pin/{duplicate_media[0]}')
        print(f'Duplicate video path: {filepath}')

        newpath = os.path.join('duplicates', duplicate_media[2], video_filename)
        os.makedirs(os.path.dirname(newpath), exist_ok=True)
        os.rename(filepath, newpath)
        print(f'Moved {video_filename} to {newpath}')
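Note that get_video_phash hashes only the first frame. If that ever proves too fragile (for example, clips that open on a black frame), a variant that seeks to the middle of the video before grabbing a frame might look like this; an untested sketch using the same dependencies, not part of the commit:

import cv2
import imagehash
from PIL import Image

def get_video_phash_midframe(filepath, hash_size=8):
    cap = cv2.VideoCapture(filepath)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if frame_count > 0:
        # Jump to the middle frame before decoding
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_count // 2)
    ret, frame = cap.read()
    cap.release()
    if not ret:
        return None
    image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    return imagehash.phash(Image.fromarray(image_rgb), hash_size=hash_size)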

@@ -0,0 +1,19 @@
import config, storysave_api
db, cursor = config.gen_connection()
usernames = []
with open('usernames.txt', 'r') as f:
    for line in f:
        username = line.strip()
        if username:
            usernames.append(username)

for username in usernames:
    print(f"Username: {username}")
    user_id = storysave_api.get_user_id(username)
    if not user_id:
        print(f"Could not resolve a user_id for {username}, skipping.")
        continue

    # Update the user_id in the database
    cursor.execute("UPDATE media SET user_id = %s WHERE username = %s AND user_id IS NULL;", [user_id, username])
    db.commit()
    print(f"[{cursor.rowcount}] Updated user_id for {username}")
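The script expects usernames.txt to hold one username per line; the handles below are placeholders:

exampleuser1
exampleuser2
exampleuser3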

@@ -0,0 +1,32 @@
import config
import os
temp_directory = "cache"
os.makedirs(temp_directory, exist_ok=True)
obj_storage = config.get_storage()
db, cursor = config.gen_connection()
cursor.execute("SELECT id, media_url FROM media WHERE file_size = 0;")
results = cursor.fetchall()
count = 0
print(f"Found {len(results)} files to process.")
for result in results:
    count += 1
    id, media_url = result

    # Map the CDN URL back to the locally cached copy of the file
    serverPath = media_url.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(os.getcwd(), temp_directory, os.path.basename(serverPath))
    if not os.path.exists(localFilePath):
        continue

    file_size = os.path.getsize(localFilePath)

    cursor.execute("UPDATE media SET file_size = %s WHERE id = %s;", (file_size, id))
    db.commit()
    print(f"[{count}/{len(results)}] {media_url}: {file_size}, {cursor.rowcount}")

@@ -0,0 +1,154 @@
from datetime import datetime
import config
import funcs
import cv2
import os
directory = 'media/instagram/'
def UploadMedia(media):
    media_id = media['media_id']
    username = media['username']
    post_date = media['timestamp']
    user_id = media['user_id']
    filepath = media['filepath']
    highlight_id = media['highlight_id']
    post_type = media['post_type']

    thumbnail_url = None
    phash = None

    if media_id and int(media_id) in existing_files:
        print('Duplicate file detected. Removing...')
        os.remove(filepath)
        return True

    filename = os.path.basename(filepath)
    file_extension = os.path.splitext(filename)[1].lower()
    media_type = funcs.get_media_type(filename)

    file_hash = funcs.calculate_file_hash(filepath)
    width, height = funcs.get_media_dimensions(filepath)
    duration = funcs.get_video_duration(filepath)

    if media_type == 'video':
        try:
            # Grab the first frame as a thumbnail
            os.makedirs('temp', exist_ok=True)
            thumbPath = f'temp/{media_id}.jpg'
            cap = cv2.VideoCapture(filepath)
            ret, frame = cap.read()
            cv2.imwrite(thumbPath, frame)
            cap.release()

            obj_storage.PutFile(thumbPath, f'thumbnails/{media_id}.jpg')  # slower
            thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{media_id}.jpg"

            phash = funcs.generate_phash(thumbPath)
            os.remove(thumbPath)
        except Exception as e:
            print(f'Error generating thumbnail: {e}. Skipping...')
            return False
    elif media_type == 'image':
        phash = funcs.generate_phash(filepath)

    if media_id:
        newFilename = f'{media_id}{file_extension}'
    else:
        newFilename = f'{file_hash}{file_extension}'

    server_path = f'media/{post_type}/{username}/{newFilename}'
    file_url = f"https://storysave.b-cdn.net/{server_path}"

    obj_storage.PutFile(filepath, server_path)  # slow

    if highlight_id:
        newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)", (highlight_id, user_id, media_id))
        newDB.commit()
        print(f'[{newCursor.rowcount}] added highlight {highlight_id} to user {user_id}')

    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, hash, filename, duration, thumbnail, phash, platform) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    values = (username, media_type, file_url, width, height, media_id, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash, 'instagram')

    newCursor.execute(query, values)
    newDB.commit()
    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')

    os.remove(filepath)
    return True
def get_user_id(username):
    username = username.lower()
    if username in existing_users:
        return existing_users[username]
    return None
def get_media():
    medias = []

    post_types = {
        'posts': 'post',
        'stories': 'story',
        'profile': 'profile',
    }

    for post_type in os.listdir(directory):
        # Skip anything that is not one of the known post-type folders
        if post_type not in post_types:
            continue

        users_dir = os.path.join(directory, post_type)
        if not os.path.isdir(users_dir):
            continue

        users = os.listdir(users_dir)
        for username in users:
            user_path = os.path.join(directory, post_type, username)
            if not os.path.isdir(user_path):
                continue

            for filename in os.listdir(user_path):
                if filename.startswith('.'):
                    continue

                data = {}
                filepath = os.path.join(user_path, filename)

                if 'com.instagram.android__' in filename:
                    timestamp_str = filename.split('__')[-1].split('.')[0]
                    data['timestamp'] = datetime.strptime(timestamp_str, '%Y%m%d%H%M%S%f')
                else:
                    data['timestamp'] = datetime.now()

                data['post_type'] = post_types[post_type]
                data['username'] = username
                data['filepath'] = filepath
                data['media_id'] = None
                data['user_id'] = get_user_id(data['username'])
                data['highlight_id'] = None

                medias.append(data)

    return medias
def dump_instagram():
    medias = get_media()
    for media in medias:
        UploadMedia(media)
        existing_files.append(media['media_id'])
if __name__ == '__main__':
    print('Starting processing...')

    if not os.listdir(directory):
        print('No files to process. Exiting...')
        exit()

    newDB, newCursor = config.gen_connection()
    obj_storage = config.get_storage()

    newCursor.execute("SELECT media_id FROM media WHERE media_id IS NOT NULL")
    existing_files = [image[0] for image in newCursor.fetchall()]

    # Map lowercased usernames to their user_id
    newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL")
    existing_users = {user[0].lower(): user[1] for user in newCursor.fetchall()}

    dump_instagram()

    print("Processing completed.")
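funcs.generate_phash and funcs.calculate_file_hash are not included in this commit; assuming they follow the same imagehash conventions as the duplicate-check scripts above, they presumably look roughly like this (illustrative sketch only; the exact hash algorithm and chunk size are assumptions):

import hashlib
import imagehash
from PIL import Image

def generate_phash(filepath, hash_size=8):
    # Perceptual hash of an image file, returned as a hex string
    return str(imagehash.phash(Image.open(filepath), hash_size=hash_size))

def calculate_file_hash(filepath, chunk_size=65536):
    # Hash of the raw file bytes, streamed in chunks
    digest = hashlib.md5()
    with open(filepath, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()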