| 
									
										
										
										
											2023-08-30 09:27:38 +09:00
										 |  |  | import json | 
					
						
							| 
									
										
										
										
											2023-08-26 06:10:13 +09:00
										 |  |  | import os | 
					
						
							|  |  |  | import tarfile | 
					
						
							| 
									
										
										
										
											2023-10-06 14:36:32 +09:00
										 |  |  | import time | 
					
						
							| 
									
										
										
										
											2025-08-08 13:22:18 +05:30
										 |  |  | import requests | 
					
						
							| 
									
										
										
										
											2024-08-23 12:28:07 +09:00
										 |  |  | import shutil | 
					
						
							| 
									
										
										
										
											2025-08-08 13:22:18 +05:30
										 |  |  | from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type | 
					
						
							| 
									
										
										
										
											2023-08-26 06:10:13 +09:00
										 |  |  | 
 | 
					
						
# Base GitHub REST API URL of the repository that fetch_urls() queries; the
# "/contents/<folder>" path is appended to it to list a folder's files.
repo_url = "https://api.github.com/repos/datahub-project/static-assets"
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def download_file(url, destination): | 
					
						
							| 
									
										
										
										
											2025-08-08 13:22:18 +05:30
										 |  |  |     response = requests.get(url, stream=True) | 
					
						
							|  |  |  |     response.raise_for_status() | 
					
						
							|  |  |  |     with open(destination, "wb") as f: | 
					
						
							|  |  |  |         for chunk in response.iter_content(chunk_size=8192): | 
					
						
							|  |  |  |             f.write(chunk) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @retry( | 
					
						
							|  |  |  |     stop=stop_after_attempt(10), | 
					
						
							|  |  |  |     wait=wait_exponential(multiplier=1, min=1, max=30), | 
					
						
							|  |  |  |     retry=retry_if_exception_type(Exception) | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2023-10-06 14:36:32 +09:00
										 |  |  | def fetch_urls( | 
					
						
							| 
									
										
										
										
											2025-08-08 13:22:18 +05:30
										 |  |  |     repo_url: str, folder_path: str, file_format: str, active_versions: list | 
					
						
							| 
									
										
										
										
											2023-10-06 14:36:32 +09:00
										 |  |  | ): | 
					
						
							| 
									
										
										
										
											2023-08-26 06:10:13 +09:00
										 |  |  |     api_url = f"{repo_url}/contents/{folder_path}" | 
					
						
							| 
									
										
										
										
											2025-08-08 13:22:18 +05:30
										 |  |  |     response = requests.get(api_url) | 
					
						
							|  |  |  |     if response.status_code == 403 or (500 <= response.status_code < 600): | 
					
						
							|  |  |  |         raise Exception(f"HTTP Error {response.status_code}: {response.reason}") | 
					
						
							|  |  |  |     response.raise_for_status() | 
					
						
							|  |  |  |     data = response.json() | 
					
						
							|  |  |  |     urls = [ | 
					
						
							|  |  |  |         file["download_url"] | 
					
						
							|  |  |  |         for file in data | 
					
						
							|  |  |  |         if file["name"].endswith(file_format) and any(version in file["name"] for version in active_versions) | 
					
						
							|  |  |  |     ] | 
					
						
							|  |  |  |     print(urls) | 
					
						
							|  |  |  |     return urls | 
					
						
							| 
									
										
										
										
											2023-08-26 06:10:13 +09:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-30 09:27:38 +09:00
										 |  |  | def extract_tar_file(destination_path): | 
					
						
							|  |  |  |     with tarfile.open(destination_path, "r:gz") as tar: | 
					
						
							|  |  |  |         tar.extractall() | 
					
						
							|  |  |  |     os.remove(destination_path) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-23 12:28:07 +09:00
										 |  |  | def get_active_versions(): | 
					
						
							|  |  |  |     # read versions.json | 
					
						
							|  |  |  |     with open("versions.json") as f: | 
					
						
							|  |  |  |         versions = json.load(f) | 
					
						
							|  |  |  |     return versions | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def clear_directory(directory): | 
					
						
							|  |  |  |     if os.path.exists(directory): | 
					
						
							|  |  |  |         shutil.rmtree(directory) | 
					
						
							|  |  |  |     os.makedirs(directory) | 
					
						
							| 
									
										
										
										
											2023-08-30 09:27:38 +09:00
										 |  |  | 
 | 
					
						
							|  |  |  | def download_versioned_docs(folder_path: str, destination_dir: str, file_format: str): | 
					
						
							| 
									
										
										
										
											2024-08-23 12:28:07 +09:00
										 |  |  |     clear_directory(destination_dir)  # Clear the directory before downloading | 
					
						
							| 
									
										
										
										
											2023-08-26 06:10:13 +09:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-23 12:28:07 +09:00
										 |  |  |     active_versions = get_active_versions() | 
					
						
							|  |  |  |     urls = fetch_urls(repo_url, folder_path, file_format, active_versions) | 
					
						
							| 
									
										
										
										
											2023-08-26 06:10:13 +09:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-30 09:27:38 +09:00
										 |  |  |     for url in urls: | 
					
						
							| 
									
										
										
										
											2023-08-26 06:10:13 +09:00
										 |  |  |         filename = os.path.basename(url) | 
					
						
							|  |  |  |         destination_path = os.path.join(destination_dir, filename) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-30 09:27:38 +09:00
										 |  |  |         version = ".".join(filename.split(".")[:3]) | 
					
						
							| 
									
										
										
										
											2023-08-26 06:10:13 +09:00
										 |  |  |         extracted_path = os.path.join(destination_dir, version) | 
					
						
							|  |  |  |         print("extracted_path", extracted_path) | 
					
						
							|  |  |  |         if os.path.exists(extracted_path): | 
					
						
							|  |  |  |             print(f"{extracted_path} already exists, skipping downloads") | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  |             download_file(url, destination_path) | 
					
						
							|  |  |  |             print(f"Downloaded {filename} to {destination_dir}") | 
					
						
							| 
									
										
										
										
											2023-08-30 09:27:38 +09:00
										 |  |  |             if file_format == ".tar.gz": | 
					
						
							|  |  |  |                 extract_tar_file(destination_path) | 
					
						
							| 
									
										
										
										
											2025-08-08 13:22:18 +05:30
										 |  |  |         except Exception as e: | 
					
						
							| 
									
										
										
										
											2023-08-26 06:10:13 +09:00
										 |  |  |             print(f"Error while downloading {filename}: {e}") | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-30 09:27:38 +09:00
										 |  |  | def main(): | 
					
						
							|  |  |  |     download_versioned_docs( | 
					
						
							|  |  |  |         folder_path="versioned_docs", | 
					
						
							|  |  |  |         destination_dir="versioned_docs", | 
					
						
							|  |  |  |         file_format=".tar.gz", | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     download_versioned_docs( | 
					
						
							|  |  |  |         folder_path="versioned_sidebars", | 
					
						
							|  |  |  |         destination_dir="versioned_sidebars", | 
					
						
							|  |  |  |         file_format=".json", | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-26 06:10:13 +09:00
# Run the downloads only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()