datahub/docs-website/download_historical_versions.py


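"""Fetch the historical docs tarballs from the
datahub-project/static-assets repo and unpack them under
versioned_docs/.
"""
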
import json
import os
import tarfile
import urllib.error  # needed for the URLError handler in main()
import urllib.request

repo_url = "https://api.github.com/repos/datahub-project/static-assets"


def download_file(url, destination):
    # Stream the response to disk in 8 KiB chunks so large tarballs
    # are never held fully in memory.
    with urllib.request.urlopen(url) as response:
        with open(destination, "wb") as f:
            while True:
                chunk = response.read(8192)
                if not chunk:
                    break
                f.write(chunk)


def fetch_tar_urls(repo_url, folder_path):
    # List the folder through the GitHub contents API and collect the
    # download URLs of every .tar.gz archive it contains.
    api_url = f"{repo_url}/contents/{folder_path}"
    with urllib.request.urlopen(api_url) as response:
        data = response.read().decode("utf-8")
    tar_urls = [
        file["download_url"]
        for file in json.loads(data)
        if file["name"].endswith(".tar.gz")
    ]
    print(tar_urls)
    return tar_urls


def main():
    folder_path = "versioned_docs"
    destination_dir = "versioned_docs"
    if not os.path.exists(destination_dir):
        os.makedirs(destination_dir)

    tar_urls = fetch_tar_urls(repo_url, folder_path)

    for url in tar_urls:
        filename = os.path.basename(url)
        destination_path = os.path.join(destination_dir, filename)

        # The version directory name is the first three dot-separated
        # parts of the archive name (e.g. "v0.10.5" from "v0.10.5.tar.gz").
        version = ".".join(filename.split(".")[:3])
        extracted_path = os.path.join(destination_dir, version)
        print("extracted_path", extracted_path)
        if os.path.exists(extracted_path):
            print(f"{extracted_path} already exists, skipping downloads")
            continue

        try:
            download_file(url, destination_path)
            print(f"Downloaded {filename} to {destination_dir}")
            # Unpack in the current working directory (the paths inside
            # the archive decide where files land), then drop the tarball.
            with tarfile.open(destination_path, "r:gz") as tar:
                tar.extractall()
            os.remove(destination_path)
        except urllib.error.URLError as e:
            print(f"Error while downloading {filename}: {e}")
            continue


if __name__ == "__main__":
    main()
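

# Usage sketch (assumptions: run from the docs-website directory with
# network access to api.github.com, and archive names following the
# "vX.Y.Z.tar.gz" pattern the version parsing above expects):
#
#   python download_historical_versions.py
#
# Re-running is idempotent in the sense that versions whose extracted
# directory already exists are skipped rather than re-downloaded.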