refactor(mkbsd.py): switch from requests to aiohttp for non-blocking I/O and better performance

feat(mkbsd.py): use asyncio for concurrent downloads to improve download speed
chore(requirements.txt): add aiohttp dependency for async HTTP requests
2024-12-22 20:15:06 +00:00 · 2024-09-27 01:35:09 -04:00 · 2024-09-27 01:35:09 -04:00 · 1cd2c133eb
commit 1cd2c133eb
parent ebd7a47ebe
2 changed files with 46 additions and 27 deletions
--- a/mkbsd.py
+++ b/mkbsd.py
@ -1,28 +1,30 @@
 import argparse
 import asyncio
 import json
 import multiprocessing as mp
 import os
 import re
 import time
 import zipfile
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
 from urllib.parse import unquote
 import aiohttp
 import imagehash
 import requests
 from PIL import Image
-# python mkbsd.py [--zip] [--zip-name CUSTOM_NAME] [--remove-duplicates]
+async def fetch_json_data(url):
-
+    async with aiohttp.ClientSession() as session:
-
+        async with session.get(url) as response:
-def fetch_json_data(url):
+            if response.status == 200:
-    response = requests.get(url)
+                text = await response.text()
-    if response.status_code == 200:
+                try:
-        return response.json()
+                    return json.loads(text)
-    else:
+                except json.JSONDecodeError:
-        raise Exception(
+                    raise Exception(f"Failed to parse JSON data from {url}")
-            f"Failed to fetch JSON data. Status code: {response.status_code}"
+            else:
-        )
+                raise Exception(f"Failed to fetch data. Status code: {response.status}")
 def extract_urls(element):
@ -39,19 +41,27 @@ def extract_urls(element):
    return urls
-def download_file(url):
+async def download_file(session, url):
    file_name = os.path.basename(unquote(url.split("?")[0]))
    file_name = clean_filename(file_name)
    file_path = os.path.join("downloads", file_name)
    if not os.path.exists(file_path):
-        print(f"Downloading {url}")
+        try:
-        response = requests.get(url, stream=True)
+            async with session.get(url) as response:
-        with open(file_path, "wb") as f:
+                if response.status == 200:
-            for chunk in response.iter_content(chunk_size=8192):
+                    with open(file_path, "wb") as f:
-                f.write(chunk)
+                        while True:
                            chunk = await response.content.read(8192)
                            if not chunk:
                                break
                            f.write(chunk)
                    return f"Downloaded: {file_name}"
                else:
                    return f"Failed to download {file_name}: HTTP {response.status}"
        except Exception as e:
            return f"Error downloading {file_name}: {str(e)}"
    else:
-        print(f"Skipping {url}")
+        return f"Skipped (already exists): {file_name}"
    return file_path
 def clean_filename(filename):
@ -120,7 +130,7 @@ def remove_duplicates(duplicates):
            print(f"Error removing duplicate: {e}")
-def main():
+async def main():
    parser = argparse.ArgumentParser(
        description="Download images from JSON data and remove duplicates."
    )
@ -141,7 +151,7 @@ def main():
    json_url = "https://storage.googleapis.com/panels-cdn/data/20240730/all.json"
    try:
-        json_data = fetch_json_data(json_url)
+        json_data = await fetch_json_data(json_url)
    except Exception as e:
        print(f"Error: {e}")
        return
@ -152,8 +162,16 @@ def main():
    if not os.path.exists("downloads"):
        os.makedirs("downloads")
-    with ThreadPoolExecutor(max_workers=10) as executor:
+    start_time = time.time()
-        executor.map(download_file, urls)
+    async with aiohttp.ClientSession() as session:
        tasks = [download_file(session, url) for url in urls]
        for batch in [tasks[i : i + 50] for i in range(0, len(tasks), 50)]:
            results = await asyncio.gather(*batch)
            for result in results:
                print(result)
    end_time = time.time()
    print(f"Download completed in {end_time - start_time:.2f} seconds")
    if args.remove_duplicates:
        print("Searching for duplicate images...")
@ -172,4 +190,4 @@ def main():
 if __name__ == "__main__":
-    main()
+    asyncio.run(main())
--- a/requirements.txt
+++ b/requirements.txt
@ -1 +1,2 @@
 imagehash
 aiohttp