refactor(mkbsd.py): replace async image download with synchronous requests for simplicity; add duplicate removal and zipping functionality

feat(mkbsd.py): add argparse for command-line options to zip downloads and remove duplicates

chore: add requirements.txt for dependency management
Andre Saddler 2024-09-26 10:44:32 -04:00
parent 82e50c64f0
commit ebd7a47ebe
2 changed files with 165 additions and 65 deletions
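With this change mkbsd.py is driven entirely from the command line. Based on the argparse options added in the diff below, a typical invocation looks roughly like the following (illustrative only; the --zip-name value gets a .zip suffix appended if it is missing):

python mkbsd.py
python mkbsd.py --remove-duplicates
python mkbsd.py --zip --zip-name wallpapers --remove-duplicates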

mkbsd.py

@@ -1,76 +1,175 @@
# Licensed under the WTFPL License
import argparse
import multiprocessing as mp
import os
import re
import zipfile
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import unquote

import imagehash
import requests
from PIL import Image

# python mkbsd.py [--zip] [--zip-name CUSTOM_NAME] [--remove-duplicates]


def fetch_json_data(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        raise Exception(
            f"Failed to fetch JSON data. Status code: {response.status_code}"
        )


def extract_urls(element):
    urls = []
    if isinstance(element, dict):
        for key, value in element.items():
            if key == "url":
                urls.append(value)
            else:
                urls.extend(extract_urls(value))
    elif isinstance(element, list):
        for item in element:
            urls.extend(extract_urls(item))
    return urls


def download_file(url):
    file_name = os.path.basename(unquote(url.split("?")[0]))
    file_name = clean_filename(file_name)
    file_path = os.path.join("downloads", file_name)
    if not os.path.exists(file_path):
        print(f"Downloading {url}")
        response = requests.get(url, stream=True)
        with open(file_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    else:
        print(f"Skipping {url}")
    return file_path


def clean_filename(filename):
    sanitized_name = filename.replace("~", " ")
    sanitized_name = re.sub(r'[<>:"/\\|?*]', "_", sanitized_name)
    sanitized_name = re.sub(r"[\s_]+", " ", sanitized_name).strip()
    return sanitized_name


def zip_directory(path, zip_name):
    with zipfile.ZipFile(zip_name, "w", zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(path):
            for file in files:
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, path)
                zipf.write(file_path, arcname)
    print(f"Created zip file: {zip_name}")


def compute_hash(filepath):
    try:
        with Image.open(filepath) as img:
            return imagehash.phash(img, hash_size=8), filepath
    except Exception as e:
        print(f"Error processing {filepath}: {e}")
        return None


def find_duplicate_images(directory, threshold=2):
    image_files = [
        os.path.join(directory, f)
        for f in os.listdir(directory)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ]
    image_files.sort(key=os.path.getsize)
    with mp.Pool(mp.cpu_count()) as pool:
        results = pool.map(compute_hash, image_files)
    hash_groups = defaultdict(list)
    for result in filter(None, results):
        hash_value, filepath = result
        hash_groups[hash_value].append(filepath)
    duplicates = []
    for hash_value, filepaths in hash_groups.items():
        if len(filepaths) > 1:
            for i in range(len(filepaths)):
                for j in range(i + 1, len(filepaths)):
                    duplicates.append((filepaths[i], filepaths[j]))
    return duplicates


def remove_duplicates(duplicates):
    for image1, image2 in duplicates:
        try:
            if os.path.getsize(image1) < os.path.getsize(image2):
                os.remove(image1)
                print(f"Removed duplicate: {image1}")
            else:
                os.remove(image2)
                print(f"Removed duplicate: {image2}")
        except Exception as e:
            print(f"Error removing duplicate: {e}")


def main():
    parser = argparse.ArgumentParser(
        description="Download images from JSON data and remove duplicates."
    )
    parser.add_argument(
        "--zip", action="store_true", help="Create a zip file of the downloaded images"
    )
    parser.add_argument(
        "--zip-name",
        type=str,
        help="Custom name for the zip file (default: downloads.zip)",
    )
    parser.add_argument(
        "--remove-duplicates",
        action="store_true",
        help="Remove duplicate images after download",
    )
    args = parser.parse_args()
    json_url = "https://storage.googleapis.com/panels-cdn/data/20240730/all.json"
    try:
        json_data = fetch_json_data(json_url)
    except Exception as e:
        print(f"Error: {e}")
        return
    urls = extract_urls(json_data)
    print(f"Found {len(urls)} URLs")
    if not os.path.exists("downloads"):
        os.makedirs("downloads")
    with ThreadPoolExecutor(max_workers=10) as executor:
        executor.map(download_file, urls)
    if args.remove_duplicates:
        print("Searching for duplicate images...")
        duplicates = find_duplicate_images("downloads")
        if duplicates:
            print(f"Found {len(duplicates)} pairs of duplicate images.")
            remove_duplicates(duplicates)
        else:
            print("No duplicate images found.")
    if args.zip:
        zip_name = args.zip_name if args.zip_name else "downloads.zip"
        if not zip_name.endswith(".zip"):
            zip_name += ".zip"
        zip_directory("downloads", zip_name)


def ascii_art():
    print("""
 /$$      /$$ /$$   /$$ /$$$$$$$   /$$$$$$  /$$$$$$$
| $$$    /$$$| $$  /$$/| $$__  $$ /$$__  $$| $$__  $$
| $$$$  /$$$$| $$ /$$/ | $$  \\ $$| $$  \\__/| $$  \\ $$
| $$ $$/$$ $$| $$$$$/  | $$$$$$$ |  $$$$$$ | $$  | $$
| $$  $$$| $$| $$  $$  | $$__  $$ \\____  $$| $$  | $$
| $$\\  $ | $$| $$\\  $$ | $$  \\ $$ /$$  \\ $$| $$  | $$
| $$ \\/  | $$| $$ \\  $$| $$$$$$$/|  $$$$$$/| $$$$$$$/
|__/     |__/|__/  \\__/|_______/  \\______/ |_______/""")
    print("")
    print("🤑 Starting downloads from your favorite sellout grifter's wallpaper app...")


if __name__ == "__main__":
    main()
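For context, find_duplicate_images above groups files whose perceptual hashes are exactly equal; the threshold parameter it accepts is not used in that comparison. With the imagehash library, near-duplicates are usually detected by comparing hashes via their Hamming distance (subtracting two ImageHash objects yields that distance). A minimal sketch of such a check, not part of this commit, using hypothetical paths path_a and path_b:

import imagehash
from PIL import Image

def are_near_duplicates(path_a, path_b, threshold=2):
    # Subtracting two ImageHash objects yields their Hamming distance
    with Image.open(path_a) as a, Image.open(path_b) as b:
        distance = imagehash.phash(a, hash_size=8) - imagehash.phash(b, hash_size=8)
    return distance <= threshold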

requirements.txt (new file)

@@ -0,0 +1 @@
imagehash
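Note that the script also imports requests and Pillow directly; imagehash pulls in Pillow as a dependency, but requests is a separate package, so an environment built only from this file will typically need it installed as well, e.g. pip install -r requirements.txt requests.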