refactor(mkbsd.py): replace async image download with synchronous requests for simplicity and add duplicate removal and zipping functionality

feat(mkbsd.py): add argparse for command-line options to zip downloads and remove duplicates chore: add requirements.txt for dependency management
2024-12-23 02:05:16 +00:00 · 2024-09-26 10:44:32 -04:00 · 2024-09-26 10:44:32 -04:00 · ebd7a47ebe
commit ebd7a47ebe
parent 82e50c64f0
2 changed files with 165 additions and 65 deletions
--- a/mkbsd.py
+++ b/mkbsd.py
@ -1,76 +1,175 @@
-# Licensed under the WTFPL License
+import argparse
-
+import multiprocessing as mp
 import os
-import time
+import re
-import aiohttp
+import zipfile
-import asyncio
+from collections import defaultdict
-from urllib.parse import urlparse
+from concurrent.futures import ThreadPoolExecutor
-url = 'https://storage.googleapis.com/panels-api/data/20240916/media-1a-i-p~s'
+from urllib.parse import unquote
-async def delay(ms):
+import imagehash
-    await asyncio.sleep(ms / 1000)
+import requests
 from PIL import Image
-async def download_image(session, image_url, file_path):
+
 # python mkbsd.py [--zip] [--zip-name CUSTOM_NAME] [--remove-duplicates]
 def fetch_json_data(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        raise Exception(
            f"Failed to fetch JSON data. Status code: {response.status_code}"
        )
 def extract_urls(element):
    urls = []
    if isinstance(element, dict):
        for key, value in element.items():
            if key == "url":
                urls.append(value)
            else:
                urls.extend(extract_urls(value))
    elif isinstance(element, list):
        for item in element:
            urls.extend(extract_urls(item))
    return urls
 def download_file(url):
    file_name = os.path.basename(unquote(url.split("?")[0]))
    file_name = clean_filename(file_name)
    file_path = os.path.join("downloads", file_name)
    if not os.path.exists(file_path):
        print(f"Downloading {url}")
        response = requests.get(url, stream=True)
        with open(file_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    else:
        print(f"Skipping {url}")
    return file_path
 def clean_filename(filename):
    sanitized_name = filename.replace("~", " ")
    sanitized_name = re.sub(r'[<>:"/\\|?*]', "_", sanitized_name)
    sanitized_name = re.sub(r"[\s_]+", " ", sanitized_name).strip()
    return sanitized_name
 def zip_directory(path, zip_name):
    with zipfile.ZipFile(zip_name, "w", zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(path):
            for file in files:
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, path)
                zipf.write(file_path, arcname)
    print(f"Created zip file: {zip_name}")
 def compute_hash(filepath):
    try:
-        async with session.get(image_url) as response:
+        with Image.open(filepath) as img:
-            if response.status != 200:
+            return imagehash.phash(img, hash_size=8), filepath
                raise Exception(f"Failed to download image: {response.status}")
            content = await response.read()
            with open(file_path, 'wb') as f:
                f.write(content)
    except Exception as e:
-        print(f"Error downloading image: {str(e)}")
+        print(f"Error processing {filepath}: {e}")
        return None
-async def main():
+
 def find_duplicate_images(directory, threshold=2):
    image_files = [
        os.path.join(directory, f)
        for f in os.listdir(directory)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ]
    image_files.sort(key=os.path.getsize)
    with mp.Pool(mp.cpu_count()) as pool:
        results = pool.map(compute_hash, image_files)
    hash_groups = defaultdict(list)
    for result in filter(None, results):
        hash_value, filepath = result
        hash_groups[hash_value].append(filepath)
    duplicates = []
    for hash_value, filepaths in hash_groups.items():
        if len(filepaths) > 1:
            for i in range(len(filepaths)):
                for j in range(i + 1, len(filepaths)):
                    duplicates.append((filepaths[i], filepaths[j]))
    return duplicates
 def remove_duplicates(duplicates):
    for image1, image2 in duplicates:
        try:
-        async with aiohttp.ClientSession() as session:
+            if os.path.getsize(image1) < os.path.getsize(image2):
-            async with session.get(url) as response:
+                os.remove(image1)
-                if response.status != 200:
+                print(f"Removed duplicate: {image1}")
-                    raise Exception(f"⛔ Failed to fetch JSON file: {response.status}")
+            else:
-                json_data = await response.json()
+                os.remove(image2)
-                data = json_data.get('data')
+                print(f"Removed duplicate: {image2}")
                if not data:
                    raise Exception('⛔ JSON does not have a "data" property at its root.')
                download_dir = os.path.join(os.getcwd(), 'downloads')
                if not os.path.exists(download_dir):
                    os.makedirs(download_dir)
                    print(f"📁 Created directory: {download_dir}")
                file_index = 1
                for key, subproperty in data.items():
                    if subproperty and subproperty.get('dhd'):
                        image_url = subproperty['dhd']
                        print(f"🔍 Found image URL!")
                        parsed_url = urlparse(image_url)
                        ext = os.path.splitext(parsed_url.path)[-1] or '.jpg'
                        filename = f"{file_index}{ext}"
                        file_path = os.path.join(download_dir, filename)
                        await download_image(session, image_url, file_path)
                        print(f"🖼️ Saved image to {file_path}")
                        file_index += 1
                        await delay(250)
        except Exception as e:
-        print(f"Error: {str(e)}")
+            print(f"Error removing duplicate: {e}")
 def main():
    parser = argparse.ArgumentParser(
        description="Download images from JSON data and remove duplicates."
    )
    parser.add_argument(
        "--zip", action="store_true", help="Create a zip file of the downloaded images"
    )
    parser.add_argument(
        "--zip-name",
        type=str,
        help="Custom name for the zip file (default: downloads.zip)",
    )
    parser.add_argument(
        "--remove-duplicates",
        action="store_true",
        help="Remove duplicate images after download",
    )
    args = parser.parse_args()
    json_url = "https://storage.googleapis.com/panels-cdn/data/20240730/all.json"
    try:
        json_data = fetch_json_data(json_url)
    except Exception as e:
        print(f"Error: {e}")
        return
    urls = extract_urls(json_data)
    print(f"Found {len(urls)} URLs")
    if not os.path.exists("downloads"):
        os.makedirs("downloads")
    with ThreadPoolExecutor(max_workers=10) as executor:
        executor.map(download_file, urls)
    if args.remove_duplicates:
        print("Searching for duplicate images...")
        duplicates = find_duplicate_images("downloads")
        if duplicates:
            print(f"Found {len(duplicates)} pairs of duplicate images.")
            remove_duplicates(duplicates)
        else:
            print("No duplicate images found.")
    if args.zip:
        zip_name = args.zip_name if args.zip_name else "downloads.zip"
        if not zip_name.endswith(".zip"):
            zip_name += ".zip"
        zip_directory("downloads", zip_name)
 def ascii_art():
    print("""
 /$$      /$$ /$$   /$$ /$$$$$$$   /$$$$$$  /$$$$$$$
 | $$$    /$$$| $$  /$$/| $$__  $$ /$$__  $$| $$__  $$
 | $$$$  /$$$$| $$ /$$/ | $$  \\ $$| $$  \\__/| $$  \\ $$
 | $$ $$/$$ $$| $$$$$/  | $$$$$$$ |  $$$$$$ | $$  | $$
 | $$  $$$| $$| $$  $$  | $$__  $$ \\____  $$| $$  | $$
 | $$\\  $ | $$| $$\\  $$ | $$  \\ $$ /$$  \\ $$| $$  | $$
 | $$ \\/  | $$| $$ \\  $$| $$$$$$$/|  $$$$$$/| $$$$$$$/
 |__/     |__/|__/  \\__/|_______/  \\______/ |_______/""")
    print("")
    print("🤑 Starting downloads from your favorite sellout grifter's wallpaper app...")
 if __name__ == "__main__":
-    ascii_art()
+    main()
    time.sleep(5)
    asyncio.run(main())
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1 @@
 imagehash