"""Download and extract versioned documentation tarballs from GitHub."""

import json
import os
import tarfile
import urllib.error
import urllib.request

repo_url = "https://api.github.com/repos/datahub-project/static-assets"


def download_file(url, destination):
    # Stream the response to disk in 8 KB chunks to avoid loading
    # an entire tarball into memory.
    with urllib.request.urlopen(url) as response:
        with open(destination, "wb") as f:
            while True:
                chunk = response.read(8192)
                if not chunk:
                    break
                f.write(chunk)


def fetch_tar_urls(repo_url, folder_path):
    # List the folder via the GitHub contents API and collect the
    # download URL of every .tar.gz file in it.
    api_url = f"{repo_url}/contents/{folder_path}"
    with urllib.request.urlopen(api_url) as response:
        data = response.read().decode("utf-8")
    tar_urls = [
        file["download_url"]
        for file in json.loads(data)
        if file["name"].endswith(".tar.gz")
    ]
    print(tar_urls)
    return tar_urls


def main():
    folder_path = "versioned_docs"
    destination_dir = "versioned_docs"
    if not os.path.exists(destination_dir):
        os.makedirs(destination_dir)

    tar_urls = fetch_tar_urls(repo_url, folder_path)

    for url in tar_urls:
        filename = os.path.basename(url)
        destination_path = os.path.join(destination_dir, filename)

        # A filename such as "v0.10.0.tar.gz" yields the version "v0.10.0".
        version = ".".join(filename.split(".")[:3])
        extracted_path = os.path.join(destination_dir, version)
        print("extracted_path", extracted_path)
        if os.path.exists(extracted_path):
            print(f"{extracted_path} already exists, skipping download")
            continue

        try:
            download_file(url, destination_path)
            print(f"Downloaded {filename} to {destination_dir}")
            # The tarballs are expected to contain a top-level
            # "versioned_docs/<version>/" directory, so extracting into the
            # current working directory produces extracted_path. The archive
            # is deleted once its contents are unpacked.
            with tarfile.open(destination_path, "r:gz") as tar:
                tar.extractall()
            os.remove(destination_path)
        except urllib.error.URLError as e:
            print(f"Error while downloading {filename}: {e}")
            continue


if __name__ == "__main__":
    main()