autogen/website/process_notebooks.py

import sys
from pathlib import Path
import subprocess
import argparse
import shutil
import json
import typing
import concurrent.futures

try:
    import yaml
except ImportError:
    print("pyyaml not found.\n\nPlease install pyyaml:\n\tpip install pyyaml\n")
    sys.exit(1)

# termcolor is an optional dependency; when it is missing the script still
# works, it just prints its status messages without colors.
try:
    from termcolor import colored
except ImportError:

    def colored(x, *args, **kwargs):
        """Fallback for termcolor.colored: return ``x`` unchanged, ignoring all other arguments."""
        return x


class Result:
    """Plain holder for the outcome of an external command.

    The three constructor arguments are stored unchanged as the attributes
    ``returncode``, ``stdout`` and ``stderr``.
    """

    def __init__(self, returncode: int, stdout: str, stderr: str):
        # Bind all fields in one step; no validation or conversion is applied.
        self.returncode, self.stdout, self.stderr = returncode, stdout, stderr


def check_quarto_bin(quarto_bin: str = "quarto"):
    """Check that quarto can be executed, exiting the process if it cannot.

    Runs ``<quarto_bin> --version`` and discards the output.

    Args:
        quarto_bin: Name or path of the quarto executable to probe.

    Raises:
        SystemExit: With exit code 1 when the executable cannot be found, or
            when it is found but ``--version`` finishes with a non-zero status.
    """
    try:
        subprocess.check_output([quarto_bin, "--version"])
    except FileNotFoundError:
        print("Quarto is not installed. Please install it from https://quarto.org")
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        # The binary exists but is not usable; report it the same friendly way
        # instead of letting a traceback escape.
        print(
            f"Quarto binary '{quarto_bin}' failed to run ('--version' exited with status {e.returncode}). "
            "Please reinstall it from https://quarto.org"
        )
        sys.exit(1)


def notebooks_target_dir(website_directory: Path) -> Path:
    """Build the path ``<website_directory>/docs/notebooks``.

    The path is only computed; nothing is created on disk.
    """
    return website_directory.joinpath("docs", "notebooks")


def extract_yaml_from_notebook(notebook: Path) -> typing.Optional[typing.Dict]:
    """Parse the YAML front matter stored in the first cell of a notebook.

    The front matter must live in the first cell, which must be a markdown
    cell whose first line is exactly ``<!--`` and which contains a later line
    that is exactly ``-->`` (trailing whitespace is ignored). Everything
    between those two lines is parsed as YAML.

    Args:
        notebook: Path to the ``.ipynb`` file (JSON, read as UTF-8).

    Returns:
        The value produced by ``yaml.safe_load`` for the front matter, or
        ``None`` when the notebook has no cells, the first cell is not
        markdown, the cell is empty, or the ``<!--`` / ``-->`` markers are not
        found. NOTE(review): ``yaml.safe_load`` itself returns ``None`` for
        empty front matter, so callers cannot distinguish that case.

    Raises:
        yaml.YAMLError: If the text between the markers is not valid YAML.
    """
    with open(notebook, "r", encoding="utf-8") as f:
        json_content = json.load(f)

    cells = json_content["cells"]
    if not cells:
        # A notebook without cells cannot carry front matter.
        return None
    first_cell = cells[0]

    # <!-- and --> must exist on lines of their own
    if first_cell["cell_type"] != "markdown":
        return None

    lines = first_cell["source"]
    if isinstance(lines, str):
        # The notebook format allows "source" to be a single multi-line string
        # instead of a list of lines; normalise it to a list.
        lines = lines.splitlines()

    if not lines or "<!--" != lines[0].strip():
        return None

    # remove trailing whitespace (including the newline each line ends with)
    lines = [line.rstrip() for line in lines]

    try:
        closing_arrow_idx = lines.index("-->")
    except ValueError:
        return None

    return yaml.safe_load("\n".join(lines[1:closing_arrow_idx]))


def skip_reason_or_none_if_ok(notebook: Path) -> typing.Optional[str]:
    """Return a reason to skip the notebook, or None if it should not be skipped.

    A notebook is processed only when its first cell is a markdown cell that
    starts with a ``<!--`` line, contains a ``-->`` line, and the YAML between
    them is a mapping with a list-of-strings ``tags`` entry and a string
    ``description`` entry, without ``skip: true``.

    Args:
        notebook: Path to the ``.ipynb`` file (JSON, read as UTF-8).

    Returns:
        A human-readable reason when the notebook must be skipped, else None.
        Malformed front matter is reported as a reason rather than raised.
    """
    with open(notebook, "r", encoding="utf-8") as f:
        json_content = json.load(f)

    # Get the first cell, if there is one
    cells = json_content["cells"]
    if not cells:
        return "notebook has no cells"
    first_cell = cells[0]

    # <!-- and --> must exist on lines of their own
    if first_cell["cell_type"] != "markdown":
        return "first cell is not markdown"

    lines = first_cell["source"]
    if isinstance(lines, str):
        # The notebook format allows "source" to be one multi-line string.
        lines = lines.splitlines()

    if not lines or "<!--" != lines[0].strip():
        return "first line does not contain only '<!--'"

    # remove trailing whitespace
    lines = [line.rstrip() for line in lines]

    if "-->" not in lines:
        return "no closing --> found, or it is not on a line on its own"

    try:
        front_matter = extract_yaml_from_notebook(notebook)
    except yaml.YAMLError as e:
        return colored(f"Failed to parse front matter in {notebook.name}: {e}", "red")

    # Empty front matter parses to None and a bare scalar/list is not a
    # mapping; report both instead of asserting, so one bad notebook does not
    # abort the run (and the check survives `python -O`).
    if not isinstance(front_matter, dict):
        return "front matter is missing, empty, or not a YAML mapping"

    if front_matter.get("skip") is True:
        return "skip is set to true"

    if "tags" not in front_matter:
        return "tags is not in front matter"

    if "description" not in front_matter:
        return "description is not in front matter"

    # Make sure tags is a list of strings. The explicit list check rejects
    # `tags: null` (not iterable) and a plain string (iterates as characters).
    tags = front_matter["tags"]
    if not isinstance(tags, list) or not all(isinstance(tag, str) for tag in tags):
        return "tags must be a list of strings"

    # Make sure description is a string
    if not isinstance(front_matter["description"], str):
        return "description must be a string"

    return None


def process_notebook(src_notebook: Path, dest_dir: Path, quarto_bin: str, dry_run: bool) -> str:
    """Render one notebook to an mdx file in ``dest_dir`` and report what happened.

    Args:
        src_notebook: Notebook to render.
        dest_dir: Directory that receives ``<stem>.mdx``.
        quarto_bin: Quarto executable used for rendering.
        dry_run: When true, stop before copying or rendering anything.

    Returns:
        A (possibly colored) status line: skipped, would process, failed to
        render (followed by quarto's stderr and stdout), or processed.
    """
    skip_reason = skip_reason_or_none_if_ok(src_notebook)
    if skip_reason:
        return colored(f"Skipping {src_notebook.name}, reason: {skip_reason}", "yellow")

    stem = src_notebook.stem
    target_mdx_file = dest_dir / f"{stem}.mdx"
    intermediate_notebook = dest_dir / f"{stem}.ipynb"

    # Nothing to do when a rendered mdx already exists and is newer than the source notebook.
    already_rendered = (
        target_mdx_file.exists() and target_mdx_file.stat().st_mtime > src_notebook.stat().st_mtime
    )
    if already_rendered:
        return colored(f"Skipping {src_notebook.name}, as target file is newer", "blue")

    if dry_run:
        return colored(f"Would process {src_notebook.name}", "green")

    # quarto does not support rendering from a different directory, so the
    # notebook is copied next to where the output should land.
    shutil.copy(src_notebook, intermediate_notebook)

    # The front matter may name additional files that must sit beside the
    # notebook while it renders (added solely for agent_library_example.json).
    front_matter = extract_yaml_from_notebook(src_notebook)
    # The skip check above performed the same marker checks, so front matter is expected here.
    assert front_matter is not None, f"Front matter is None for {src_notebook.name}"
    extra_files = front_matter.get("extra_files_to_copy", [])
    for extra in extra_files:
        shutil.copy(src_notebook.parent / extra, dest_dir / extra)

    # Run quarto with both output streams captured as text.
    render_cmd = [quarto_bin, "render", intermediate_notebook]
    result = subprocess.run(render_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
        # The copied files are deliberately left in place on this path, as before.
        failure = colored(f"Failed to render {intermediate_notebook}", "red")
        return failure + f"\n{result.stderr}" + f"\n{result.stdout}"

    # Rendering succeeded: remove the copies made above.
    intermediate_notebook.unlink()
    for extra in extra_files:
        (dest_dir / extra).unlink()

    # Add front matter and badges to the rendered file.
    post_process_mdx(target_mdx_file)

    return colored(f"Processed {src_notebook.name}", "green")


def post_process_mdx(rendered_mdx: Path) -> None:
    """Rewrite a rendered mdx file in place, adding front matter and badges.

    The text of the first ``<!-- ... -->`` comment is reused as the YAML front
    matter. ``source_notebook``, ``custom_edit_url`` and ``title`` entries are
    appended to it; the title is the first markdown heading after that
    comment. An "Open on GitHub" badge (and an "Open In Colab" badge, unless
    the file already references ``colab-badge.svg``) is inserted on the lines
    directly below the title.

    Args:
        rendered_mdx: The final mdx file; ``<stem>.ipynb`` is assumed to be
            the name of the notebook it was rendered from.

    Raises:
        ValueError: If the file lacks ``export const quartoRawHtml``, the
            ``<!-- ... -->`` comment, or a heading after the comment.
    """
    notebook_name = f"{rendered_mdx.stem}.ipynb"
    with open(rendered_mdx, "r", encoding="utf-8") as f:
        content = f.read()

    # Check for existence of "export const quartoRawHtml"; the original author
    # notes this indicates there was a front matter line in the file.
    if "export const quartoRawHtml" not in content:
        raise ValueError(f"File {rendered_mdx} does not contain 'export const quartoRawHtml'")

    # Extract the text between the first <!-- and the --> that follows it
    opener = "<!--"
    comment_start = content.find(opener)
    comment_end = content.find("-->", comment_start + len(opener)) if comment_start != -1 else -1
    if comment_end == -1:
        raise ValueError(f"File {rendered_mdx} does not contain a '<!-- ... -->' front matter comment")
    front_matter = content[comment_start + len(opener) : comment_end]
    # Drop empty lines
    front_matter = "\n".join(line for line in front_matter.split("\n") if line.strip() != "")

    # add file path
    front_matter += f"\nsource_notebook: /notebook/{notebook_name}"
    # Custom edit url
    front_matter += f"\ncustom_edit_url: https://github.com/microsoft/autogen/edit/main/notebook/{notebook_name}"

    # Find the title heading. The search starts after the front matter comment
    # so that a '#' inside the front matter (a tag such as "c#", a URL
    # fragment, a YAML comment) is not mistaken for the heading.
    title_start = content.find("#", comment_end)
    if title_start == -1:
        raise ValueError(f"File {rendered_mdx} does not contain a markdown title")
    title_end = content.find("\n", title_start)
    if title_end == -1:
        # The title is the last line and has no trailing newline.
        title_end = len(content)

    # Extract page title
    title = content[title_start:title_end].lstrip("#").strip()

    # Emit the title as a double-quoted scalar: a plain `title: A: B` is not
    # valid YAML, and a JSON string is a valid YAML double-quoted string.
    front_matter += f"\ntitle: {json.dumps(title, ensure_ascii=False)}"

    github_link = f"https://github.com/microsoft/autogen/blob/main/notebook/{notebook_name}"
    badges = ""
    # If no colab link is present, insert one (it goes above the GitHub badge)
    if "colab-badge.svg" not in content:
        badges += (
            "\n[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/microsoft/autogen/blob/main/notebook/"
            + notebook_name
            + ")"
        )
    badges += (
        "\n[![Open on GitHub](https://img.shields.io/badge/Open%20on%20GitHub-grey?logo=github)]("
        + github_link
        + ")"
    )
    content = content[:title_end] + badges + content[title_end:]

    # Rewrite the content as
    # ---
    # front_matter
    # ---
    # content
    new_content = f"---\n{front_matter}\n---\n{content}"
    with open(rendered_mdx, "w", encoding="utf-8") as f:
        f.write(new_content)


def path(path_str: str) -> Path:
    """Convert a command-line string into a ``Path`` (used as an argparse ``type``)."""
    as_path = Path(path_str)
    return as_path


def main():
    """Command-line entry point: render every ``*.ipynb`` in the notebook directory.

    Parses the arguments, verifies that quarto can be run, makes sure the
    target directory exists, then submits one ``process_notebook`` call per
    notebook to a process pool and prints each status line as it completes.
    """
    here = Path(__file__).parent.absolute()

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--notebook-directory",
        type=path,
        help="Directory containing notebooks to process",
        default=here / "../notebook",
    )
    parser.add_argument(
        "--website-directory",
        type=path,
        help="Root directory of docusarus website",
        default=here,
    )
    parser.add_argument("--quarto-bin", help="Path to quarto binary", default="quarto")
    parser.add_argument("--dry-run", help="Don't render", action="store_true")
    parser.add_argument("--workers", help="Number of workers to use", type=int, default=-1)
    args = parser.parse_args()

    # -1 means "let the executor choose", which it does when given None.
    worker_count = None if args.workers == -1 else args.workers

    check_quarto_bin(args.quarto_bin)

    target_dir = notebooks_target_dir(args.website_directory)
    if not target_dir.exists():
        target_dir.mkdir(parents=True)

    with concurrent.futures.ProcessPoolExecutor(max_workers=worker_count) as executor:
        pending = []
        for notebook in args.notebook_directory.glob("*.ipynb"):
            pending.append(
                executor.submit(process_notebook, notebook, target_dir, args.quarto_bin, args.dry_run)
            )
        # Print results in completion order, not submission order.
        for finished in concurrent.futures.as_completed(pending):
            print(finished.result())


if __name__ == "__main__":
    main()
Add notebooks section on website (#1495) * Initial infrasctructure for notebooks page * migrate two notebooks * add readme notification for notebook dir * override 'text' prism language to add basic syntactical structure to autogens output * Rework to retain existing directory and not expose front matter to consumers of the notebook * improve error handling of process notebooks * format, ruff and type fixes * undo changes to navbar * update readme, CI * whitespace * spelling mistakes * spelling * Add contributing guide for notebooks * update notebook * formatting 2024-02-03 12:01:00 -05:00			`import sys`
			`from pathlib import Path`
			`import subprocess`
			`import argparse`
			`import shutil`
			`import json`
			`import typing`
			`import concurrent.futures`

			`try:`
			`import yaml`
			`except ImportError:`
			`print("pyyaml not found.\n\nPlease install pyyaml:\n\tpip install pyyaml\n")`
			`sys.exit(1)`

			`try:`
			`from termcolor import colored`
			`except ImportError:`

			`def colored(x, *args, **kwargs):`
			`return x`


			`class Result:`
			`def __init__(self, returncode: int, stdout: str, stderr: str):`
			`self.returncode = returncode`
			`self.stdout = stdout`
			`self.stderr = stderr`


			`def check_quarto_bin(quarto_bin: str = "quarto"):`
			`"""Check if quarto is installed."""`
			`try:`
			`subprocess.check_output([quarto_bin, "--version"])`
			`except FileNotFoundError:`
			`print("Quarto is not installed. Please install it from https://quarto.org")`
			`sys.exit(1)`


			`def notebooks_target_dir(website_directory: Path) -> Path:`
			`"""Return the target directory for notebooks."""`
			`return website_directory / "docs" / "notebooks"`


			`def extract_yaml_from_notebook(notebook: Path) -> typing.Optional[typing.Dict]:`
add other language drop down link to AutoGen website (#1573) * add other language drop down * fix format 2024-02-08 09:28:56 -08:00			`with open(notebook, "r", encoding="utf-8") as f:`
Add notebooks section on website (#1495) * Initial infrasctructure for notebooks page * migrate two notebooks * add readme notification for notebook dir * override 'text' prism language to add basic syntactical structure to autogens output * Rework to retain existing directory and not expose front matter to consumers of the notebook * improve error handling of process notebooks * format, ruff and type fixes * undo changes to navbar * update readme, CI * whitespace * spelling mistakes * spelling * Add contributing guide for notebooks * update notebook * formatting 2024-02-03 12:01:00 -05:00			`content = f.read()`

			`json_content = json.loads(content)`
			`first_cell = json_content["cells"][0]`

			`# <!-- and --> must exists on lines on their own`
			`if first_cell["cell_type"] != "markdown":`
			`return None`

			`lines = first_cell["source"]`
			`if "<!--" != lines[0].strip():`
			`return None`

			`# remove trailing whitespace`
			`lines = [line.rstrip() for line in lines]`

			`if "-->" not in lines:`
			`return None`

			`closing_arrow_idx = lines.index("-->")`

			`front_matter_lines = lines[1:closing_arrow_idx]`
			`front_matter = yaml.safe_load("\n".join(front_matter_lines))`
			`return front_matter`


			`def skip_reason_or_none_if_ok(notebook: Path) -> typing.Optional[str]:`
			`"""Return a reason to skip the notebook, or None if it should not be skipped."""`
add other language drop down link to AutoGen website (#1573) * add other language drop down * fix format 2024-02-08 09:28:56 -08:00			`with open(notebook, "r", encoding="utf-8") as f:`
Add notebooks section on website (#1495) * Initial infrasctructure for notebooks page * migrate two notebooks * add readme notification for notebook dir * override 'text' prism language to add basic syntactical structure to autogens output * Rework to retain existing directory and not expose front matter to consumers of the notebook * improve error handling of process notebooks * format, ruff and type fixes * undo changes to navbar * update readme, CI * whitespace * spelling mistakes * spelling * Add contributing guide for notebooks * update notebook * formatting 2024-02-03 12:01:00 -05:00			`content = f.read()`

			`# Load the json and get the first cell`
			`json_content = json.loads(content)`
			`first_cell = json_content["cells"][0]`

			`# <!-- and --> must exists on lines on their own`
			`if first_cell["cell_type"] != "markdown":`
			`return "first cell is not markdown"`

			`lines = first_cell["source"]`
			`if "<!--" != lines[0].strip():`
			`return "first line does not contain only '<!--'"`

			`# remove trailing whitespace`
			`lines = [line.rstrip() for line in lines]`

			`if "-->" not in lines:`
			`return "no closing --> found, or it is not on a line on its own"`

			`try:`
			`front_matter = extract_yaml_from_notebook(notebook)`
			`except yaml.YAMLError as e:`
			`return colored(f"Failed to parse front matter in {notebook.name}: {e}", "red")`

			`# Should not be none at this point as we have already done the same checks as in extract_yaml_from_notebook`
			`assert front_matter is not None, f"Front matter is None for {notebook.name}"`

			`if "skip" in front_matter and front_matter["skip"] is True:`
			`return "skip is set to true"`

			`if "tags" not in front_matter:`
			`return "tags is not in front matter"`

			`if "description" not in front_matter:`
			`return "description is not in front matter"`

			`# Make sure tags is a list of strings`
			`if not all([isinstance(tag, str) for tag in front_matter["tags"]]):`
			`return "tags must be a list of strings"`

			`# Make sure description is a string`
			`if not isinstance(front_matter["description"], str):`
			`return "description must be a string"`

			`return None`


			`def process_notebook(src_notebook: Path, dest_dir: Path, quarto_bin: str, dry_run: bool) -> str:`
			`"""Process a single notebook."""`
			`reason_or_none = skip_reason_or_none_if_ok(src_notebook)`
			`if reason_or_none:`
			`return colored(f"Skipping {src_notebook.name}, reason: {reason_or_none}", "yellow")`

			`target_mdx_file = dest_dir / f"{src_notebook.stem}.mdx"`
			`intermediate_notebook = dest_dir / f"{src_notebook.stem}.ipynb"`

			`# If the intermediate_notebook already exists, check if it is newer than the source file`
			`if target_mdx_file.exists():`
			`if target_mdx_file.stat().st_mtime > src_notebook.stat().st_mtime:`
			`return colored(f"Skipping {src_notebook.name}, as target file is newer", "blue")`

			`if dry_run:`
			`return colored(f"Would process {src_notebook.name}", "green")`

			`# Copy notebook to target dir`
			`# The reason we copy the notebook is that quarto does not support rendering from a different directory`
			`shutil.copy(src_notebook, intermediate_notebook)`

			`# Check if another file has to be copied too`
			`# Solely added for the purpose of agent_library_example.json`
			`front_matter = extract_yaml_from_notebook(src_notebook)`
			`# Should not be none at this point as we have already done the same checks as in extract_yaml_from_notebook`
			`assert front_matter is not None, f"Front matter is None for {src_notebook.name}"`
			`if "extra_files_to_copy" in front_matter:`
			`for file in front_matter["extra_files_to_copy"]:`
			`shutil.copy(src_notebook.parent / file, dest_dir / file)`

			`# Capture output`
			`result = subprocess.run(`
			`[quarto_bin, "render", intermediate_notebook], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True`
			`)`
			`if result.returncode != 0:`
			`return colored(f"Failed to render {intermediate_notebook}", "red") + f"\n{result.stderr}" + f"\n{result.stdout}"`

			`# Unlink intermediate files`
			`intermediate_notebook.unlink()`

			`if "extra_files_to_copy" in front_matter:`
			`for file in front_matter["extra_files_to_copy"]:`
			`(dest_dir / file).unlink()`

			`# Post process the file`
			`post_process_mdx(target_mdx_file)`

			`return colored(f"Processed {src_notebook.name}", "green")`


			`# rendered_notebook is the final mdx file`
			`def post_process_mdx(rendered_mdx: Path) -> None:`
			`notebook_name = f"{rendered_mdx.stem}.ipynb"`
add other language drop down link to AutoGen website (#1573) * add other language drop down * fix format 2024-02-08 09:28:56 -08:00			`with open(rendered_mdx, "r", encoding="utf-8") as f:`
Add notebooks section on website (#1495) * Initial infrasctructure for notebooks page * migrate two notebooks * add readme notification for notebook dir * override 'text' prism language to add basic syntactical structure to autogens output * Rework to retain existing directory and not expose front matter to consumers of the notebook * improve error handling of process notebooks * format, ruff and type fixes * undo changes to navbar * update readme, CI * whitespace * spelling mistakes * spelling * Add contributing guide for notebooks * update notebook * formatting 2024-02-03 12:01:00 -05:00			`content = f.read()`

			`# Check for existence of "export const quartoRawHtml", this indicates there was a front matter line in the file`
			`if "export const quartoRawHtml" not in content:`
			`raise ValueError(f"File {rendered_mdx} does not contain 'export const quartoRawHtml'")`

			`# Extract the text between <!-- and -->`
			`front_matter = content.split("<!--")[1].split("-->")[0]`
			`# Strip empty lines before and after`
			`front_matter = "\n".join([line for line in front_matter.split("\n") if line.strip() != ""])`

			`# add file path`
			`front_matter += f"\nsource_notebook: /notebook/{notebook_name}"`
			`# Custom edit url`
			`front_matter += f"\ncustom_edit_url: https://github.com/microsoft/autogen/edit/main/notebook/{notebook_name}"`

			`# inject in content directly after the markdown title the word done`
			`# Find the end of the line with the title`
			`title_end = content.find("\n", content.find("#"))`

			`# Extract page title`
			`title = content[content.find("#") + 1 : content.find("\n", content.find("#"))].strip()`

			`front_matter += f"\ntitle: {title}"`

			`github_link = f"https://github.com/microsoft/autogen/blob/main/notebook/{notebook_name}"`
			`content = (`
			`content[:title_end]`
			`+ "\n[![Open on GitHub](https://img.shields.io/badge/Open%20on%20GitHub-grey?logo=github)]("`
			`+ github_link`
			`+ ")"`
			`+ content[title_end:]`
			`)`

			`# If no colab link is present, insert one`
			`if "colab-badge.svg" not in content:`
			`content = (`
			`content[:title_end]`
			`+ "\n[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/microsoft/autogen/blob/main/notebook/"`
			`+ notebook_name`
			`+ ")"`
			`+ content[title_end:]`
			`)`

			`# Rewrite the content as`
			`# ---`
			`# front_matter`
			`# ---`
			`# content`
			`new_content = f"---\n{front_matter}\n---\n{content}"`
add other language drop down link to AutoGen website (#1573) * add other language drop down * fix format 2024-02-08 09:28:56 -08:00			`with open(rendered_mdx, "w", encoding="utf-8") as f:`
Add notebooks section on website (#1495) * Initial infrasctructure for notebooks page * migrate two notebooks * add readme notification for notebook dir * override 'text' prism language to add basic syntactical structure to autogens output * Rework to retain existing directory and not expose front matter to consumers of the notebook * improve error handling of process notebooks * format, ruff and type fixes * undo changes to navbar * update readme, CI * whitespace * spelling mistakes * spelling * Add contributing guide for notebooks * update notebook * formatting 2024-02-03 12:01:00 -05:00			`f.write(new_content)`


			`def path(path_str: str) -> Path:`
			`"""Return a Path object."""`
			`return Path(path_str)`


			`def main():`
			`script_dir = Path(__file__).parent.absolute()`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument(`
			`"--notebook-directory",`
			`type=path,`
			`help="Directory containing notebooks to process",`
			`default=script_dir / "../notebook",`
			`)`
			`parser.add_argument(`
			`"--website-directory", type=path, help="Root directory of docusarus website", default=script_dir`
			`)`
			`parser.add_argument("--quarto-bin", help="Path to quarto binary", default="quarto")`
			`parser.add_argument("--dry-run", help="Don't render", action="store_true")`
			`parser.add_argument("--workers", help="Number of workers to use", type=int, default=-1)`

			`args = parser.parse_args()`

			`if args.workers == -1:`
			`args.workers = None`

			`check_quarto_bin(args.quarto_bin)`

			`if not notebooks_target_dir(args.website_directory).exists():`
			`notebooks_target_dir(args.website_directory).mkdir(parents=True)`

			`with concurrent.futures.ProcessPoolExecutor(max_workers=args.workers) as executor:`
			`futures = [`
			`executor.submit(`
			`process_notebook, f, notebooks_target_dir(args.website_directory), args.quarto_bin, args.dry_run`
			`)`
			`for f in args.notebook_directory.glob("*.ipynb")`
			`]`
			`for future in concurrent.futures.as_completed(futures):`
			`print(future.result())`


			`if __name__ == "__main__":`
			`main()`