mirror of
				https://github.com/datahub-project/datahub.git
				synced 2025-10-25 07:54:37 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			104 lines
		
	
	
		
			3.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			104 lines
		
	
	
		
			3.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import json
 | |
| import os
 | |
| import tarfile
 | |
| import time
 | |
| import urllib.request
 | |
| import shutil
 | |
| 
 | |
| repo_url = "https://api.github.com/repos/datahub-project/static-assets"
 | |
| 
 | |
| 
 | |
| def download_file(url, destination):
 | |
|     with urllib.request.urlopen(url) as response:
 | |
|         with open(destination, "wb") as f:
 | |
|             while True:
 | |
|                 chunk = response.read(8192)
 | |
|                 if not chunk:
 | |
|                     break
 | |
|                 f.write(chunk)
 | |
| 
 | |
| 
 | |
| def fetch_urls(
 | |
|     repo_url: str, folder_path: str, file_format: str, active_versions: list, max_retries=3, retry_delay=5
 | |
| ):
 | |
|     api_url = f"{repo_url}/contents/{folder_path}"
 | |
|     for attempt in range(max_retries + 1):
 | |
|         try:
 | |
|             response = urllib.request.urlopen(api_url)
 | |
|             if response.status == 403 or (500 <= response.status < 600):
 | |
|                 raise Exception(f"HTTP Error {response.status}: {response.reason}")
 | |
|             data = response.read().decode("utf-8")
 | |
|             urls = [
 | |
|                 file["download_url"]
 | |
|                 for file in json.loads(data)
 | |
|                 if file["name"].endswith(file_format) and any(version in file["name"] for version in active_versions)
 | |
|             ]
 | |
|             print(urls)
 | |
|             return urls
 | |
|         except Exception as e:
 | |
|             if attempt < max_retries:
 | |
|                 print(f"Attempt {attempt + 1}/{max_retries}: {e}")
 | |
|                 time.sleep(retry_delay * 2**attempt)
 | |
|             else:
 | |
|                 print("Max retries reached. Unable to fetch data.")
 | |
|                 raise
 | |
| 
 | |
| 
 | |
| def extract_tar_file(destination_path):
 | |
|     with tarfile.open(destination_path, "r:gz") as tar:
 | |
|         tar.extractall()
 | |
|     os.remove(destination_path)
 | |
| 
 | |
| def get_active_versions():
 | |
|     # read versions.json
 | |
|     with open("versions.json") as f:
 | |
|         versions = json.load(f)
 | |
|     return versions
 | |
| 
 | |
| def clear_directory(directory):
 | |
|     if os.path.exists(directory):
 | |
|         shutil.rmtree(directory)
 | |
|     os.makedirs(directory)
 | |
| 
 | |
| def download_versioned_docs(folder_path: str, destination_dir: str, file_format: str):
 | |
|     clear_directory(destination_dir)  # Clear the directory before downloading
 | |
| 
 | |
|     active_versions = get_active_versions()
 | |
|     urls = fetch_urls(repo_url, folder_path, file_format, active_versions)
 | |
| 
 | |
|     for url in urls:
 | |
|         filename = os.path.basename(url)
 | |
|         destination_path = os.path.join(destination_dir, filename)
 | |
| 
 | |
|         version = ".".join(filename.split(".")[:3])
 | |
|         extracted_path = os.path.join(destination_dir, version)
 | |
|         print("extracted_path", extracted_path)
 | |
|         if os.path.exists(extracted_path):
 | |
|             print(f"{extracted_path} already exists, skipping downloads")
 | |
|             continue
 | |
|         try:
 | |
|             download_file(url, destination_path)
 | |
|             print(f"Downloaded {filename} to {destination_dir}")
 | |
|             if file_format == ".tar.gz":
 | |
|                 extract_tar_file(destination_path)
 | |
|         except urllib.error.URLError as e:
 | |
|             print(f"Error while downloading {filename}: {e}")
 | |
|             continue
 | |
| 
 | |
| 
 | |
| def main():
 | |
|     download_versioned_docs(
 | |
|         folder_path="versioned_docs",
 | |
|         destination_dir="versioned_docs",
 | |
|         file_format=".tar.gz",
 | |
|     )
 | |
|     download_versioned_docs(
 | |
|         folder_path="versioned_sidebars",
 | |
|         destination_dir="versioned_sidebars",
 | |
|         file_format=".json",
 | |
|     )
 | |
| 
 | |
| 
 | |
| if __name__ == "__main__":
 | |
|     main()
 | 
