2024-02-29 15:47:30 -05:00
#!/usr/bin/env python
from __future__ import annotations
import signal
2024-02-03 12:01:00 -05:00
import sys
from pathlib import Path
import subprocess
import argparse
import shutil
import json
2024-02-29 15:47:30 -05:00
import tempfile
import threading
import time
2024-02-03 12:01:00 -05:00
import typing
import concurrent . futures
2024-02-29 15:47:30 -05:00
import os
2024-03-02 09:27:11 -05:00
from typing import Any , Dict , Optional , Tuple , Union
2024-02-29 15:47:30 -05:00
from dataclasses import dataclass
2024-03-02 09:27:11 -05:00
2024-02-29 15:47:30 -05:00
from multiprocessing import current_process
2024-02-03 12:01:00 -05:00
try :
import yaml
except ImportError :
print ( " pyyaml not found. \n \n Please install pyyaml: \n \t pip install pyyaml \n " )
sys . exit ( 1 )
2024-03-02 09:27:11 -05:00
2024-02-29 15:47:30 -05:00
try :
import nbclient
from nbclient . client import (
CellExecutionError ,
CellTimeoutError ,
NotebookClient ,
)
except ImportError :
if current_process ( ) . name == " MainProcess " :
print ( " nbclient not found. \n \n Please install nbclient: \n \t pip install nbclient \n " )
print ( " test won ' t work without nbclient " )
# nbformat is optional at import time: it is only used when notebooks are
# executed, so a missing install is reported instead of aborting the script.
try:
    import nbformat
    from nbformat import NotebookNode
except ImportError:
    # Only the process named "MainProcess" prints the hint.
    if current_process().name == "MainProcess":
        print("nbformat not found.\n\nPlease install nbformat:\n\tpip install nbformat\n")
        print("test won't work without nbformat")
2024-02-03 12:01:00 -05:00
# Prefer termcolor when it is installed; otherwise degrade to plain text.
try:
    from termcolor import colored
except ImportError:

    def colored(x, *args, **kwargs):
        """Fallback for a missing termcolor: return ``x`` unchanged, ignoring styling arguments."""
        return x
class Result:
    """Simple container for a return code together with stdout and stderr text."""

    def __init__(self, returncode: int, stdout: str, stderr: str):
        # Store the three values unchanged under attributes of the same name.
        self.returncode, self.stdout, self.stderr = returncode, stdout, stderr
2024-02-29 15:47:30 -05:00
def check_quarto_bin(quarto_bin: str = "quarto") -> None:
    """Check if quarto is installed.

    Runs ``<quarto_bin> --version`` and discards the output; no minimum
    version is enforced.

    Args:
        quarto_bin: Name or path of the quarto executable.

    Raises:
        SystemExit: With status 1 when the executable cannot be found or when
            it exits with a non-zero status.
    """
    try:
        subprocess.check_output([quarto_bin, "--version"], text=True)
    except FileNotFoundError:
        print("Quarto is not installed. Please install it from https://quarto.org")
        sys.exit(1)
    except subprocess.CalledProcessError as err:
        # The executable exists but is unusable; report it instead of crashing
        # with a traceback.
        print(f"Failed to run '{quarto_bin} --version' (exit code {err.returncode}).")
        sys.exit(1)
def notebooks_target_dir(website_directory: Path) -> Path:
    """Return ``<website_directory>/docs/notebooks``, the target directory for notebooks."""
    return website_directory.joinpath("docs", "notebooks")
2024-03-02 09:27:11 -05:00
def load_metadata(notebook: Path) -> typing.Dict:
    """Return the top-level ``metadata`` mapping of a notebook file.

    Args:
        notebook: Path to a notebook stored as JSON.

    Returns:
        The value stored under the document's ``"metadata"`` key.

    Raises:
        KeyError: If the document has no ``"metadata"`` key.
    """
    # Close the handle deterministically and decode as UTF-8, matching how
    # notebooks are read elsewhere in this file.
    with notebook.open("r", encoding="utf-8") as f:
        content = json.load(f)
    return content["metadata"]
2024-02-03 12:01:00 -05:00
def skip_reason_or_none_if_ok(notebook: Path) -> typing.Optional[str]:
    """Return a reason to skip the notebook, or None if it should not be skipped.

    Files that are not ``.ipynb`` or do not exist are always skipped. Notebooks
    whose path has a ``notebook`` component additionally need a
    ``front_matter`` entry in their metadata with string ``tags`` and a string
    ``description``.

    Args:
        notebook: Path of the candidate notebook.

    Returns:
        A human-readable skip reason, or None when the notebook is acceptable.

    Raises:
        ValueError: If the first cell is a markdown cell whose first line is
            ``<!--``.
    """
    if notebook.suffix != ".ipynb":
        return "not a notebook"

    if not notebook.exists():
        return "file does not exist"

    # Extra checks for notebooks in the notebook directory
    if "notebook" not in notebook.parts:
        return None

    # Parse the file once; cells and metadata are both taken from this document.
    with open(notebook, "r", encoding="utf-8") as f:
        json_content = json.load(f)

    # A notebook without cells (or with an empty first cell) has nothing to reject.
    cells = json_content.get("cells") or []
    if cells and cells[0].get("cell_type") == "markdown":
        source = cells[0].get("source") or ""
        # The notebook format allows source as one string or as a list of lines.
        first_line = source[0] if isinstance(source, list) else source.split("\n", 1)[0]
        # <!-- and --> must exist on lines of their own
        if first_line.strip() == "<!--":
            raise ValueError(
                f"Error in {str(notebook.resolve())} - Front matter should be defined in the notebook metadata now."
            )

    metadata = json_content.get("metadata") or {}

    if "front_matter" not in metadata:
        return "front matter missing from notebook metadata ⚠️"

    front_matter = metadata["front_matter"]

    if "tags" not in front_matter:
        return "tags is not in front matter"

    if "description" not in front_matter:
        return "description is not in front matter"

    # Make sure tags is a list of strings (a bare string must not pass as a
    # sequence of one-character tags).
    tags = front_matter["tags"]
    if not isinstance(tags, list) or not all(isinstance(tag, str) for tag in tags):
        return "tags must be a list of strings"

    # Make sure description is a string
    if not isinstance(front_matter["description"], str):
        return "description must be a string"

    return None
2024-02-29 15:47:30 -05:00
def _target_is_newer(target_file: Path, src_notebook: Path) -> bool:
    """Return True when target_file exists and was modified after src_notebook."""
    return target_file.exists() and target_file.stat().st_mtime > src_notebook.stat().st_mtime


def _quarto_render(quarto_bin: str, notebook: Path) -> subprocess.CompletedProcess:
    """Run ``quarto render`` on notebook, capturing stdout and stderr as text."""
    return subprocess.run(
        [quarto_bin, "render", notebook], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
    )


def process_notebook(src_notebook: Path, website_dir: Path, notebook_dir: Path, quarto_bin: str, dry_run: bool) -> str:
    """Process a single notebook.

    Notebooks whose path has a ``notebook`` component are copied into the
    website's notebook target directory, rendered there, and the copy is
    removed again. All other notebooks are rendered in place. The resulting
    ``.mdx`` file is then post-processed.

    Args:
        src_notebook: Notebook to render.
        website_dir: Root directory of the website.
        notebook_dir: Directory that notebook-directory sources are relative to.
        quarto_bin: Name or path of the quarto executable.
        dry_run: When True, report what would be rendered without rendering.

    Returns:
        A formatted status line (skip, dry-run notice, error or OK).
    """
    in_notebook_dir = "notebook" in src_notebook.parts

    metadata = load_metadata(src_notebook)
    if "skip_render" in metadata:
        return fmt_skip(src_notebook, "skip_render is in notebook metadata")

    if in_notebook_dir:
        relative_notebook = src_notebook.resolve().relative_to(notebook_dir.resolve())
        dest_dir = notebooks_target_dir(website_directory=website_dir)
        target_file = dest_dir / relative_notebook.with_suffix(".mdx")
    else:
        target_file = src_notebook.with_suffix(".mdx")

    # If the rendered target already exists and is newer than the source, there is nothing to do
    if _target_is_newer(target_file, src_notebook):
        return fmt_skip(src_notebook, f"target file ({target_file.name}) is newer ☑️")

    if dry_run:
        return colored(f"Would process {src_notebook.name}", "green")

    if in_notebook_dir:
        # Copy notebook to target dir
        # The reason we copy the notebook is that quarto does not support rendering from a different directory
        intermediate_notebook = dest_dir / relative_notebook
        shutil.copy(src_notebook, intermediate_notebook)
        try:
            # Check if another file has to be copied too
            # Solely added for the purpose of agent_library_example.json
            for file in metadata.get("extra_files_to_copy", []):
                shutil.copy(src_notebook.parent / file, dest_dir / file)

            result = _quarto_render(quarto_bin, intermediate_notebook)
        finally:
            # Always remove the intermediate copy, also when rendering fails:
            # a leftover .ipynb under docs/ would be collected as a website
            # notebook by the docs/**/*.ipynb glob on the next run.
            intermediate_notebook.unlink()
    else:
        result = _quarto_render(quarto_bin, src_notebook)

    if result.returncode != 0:
        # fmt_error formats its argument through .error_name/.error_value, so
        # the render failure is wrapped in a NotebookError rather than passed
        # as a bare string.
        return fmt_error(
            src_notebook,
            NotebookError(
                error_name="quarto render failed",
                error_value=(
                    f"Failed to render {src_notebook}\n\nstderr:\n{result.stderr}\nstdout:\n{result.stdout}"
                ),
                traceback="",
                cell_source="",
            ),
        )

    post_process_mdx(target_file, metadata.get("front_matter", {}))

    return fmt_ok(src_notebook)
2024-02-03 12:01:00 -05:00
2024-02-29 15:47:30 -05:00
# Notebook execution based on nbmake: https://github.com/treebeardtech/nbmake
@dataclass
class NotebookError:
    """Details of a failure found while executing a notebook."""

    # Name of the error; "timeout" is used for cells that did not finish.
    error_name: str
    # Error message; may be empty or None.
    error_value: Optional[str]
    # Traceback text; empty when no traceback is available.
    traceback: str
    # Source text of the cell the error is attributed to.
    cell_source: str
@dataclass
class NotebookSkip:
    """Marker result for a notebook that was not executed."""

    # Why the notebook was skipped.
    reason: str
# Notebook format version passed to nbformat.read().
NB_VERSION = 4


def test_notebook(
    notebook_path: Path, timeout: int = 300
) -> Tuple[Path, Optional[Union[NotebookError, NotebookSkip]]]:
    """Execute a notebook and report how it went.

    Args:
        notebook_path: Notebook file to execute.
        timeout: Timeout value handed to the notebook client.

    Returns:
        ``(notebook_path, outcome)`` where outcome is None on success, a
        ``NotebookSkip`` when the notebook metadata contains ``skip_test``, or
        a ``NotebookError`` describing the failing or timed-out cell.
    """
    nb = nbformat.read(str(notebook_path), NB_VERSION)

    if "skip_test" in nb.metadata:
        return notebook_path, NotebookSkip(reason=nb.metadata.skip_test)

    # Which helper extracts the failure details; stays None when the run succeeds.
    describe_failure = None
    try:
        client = NotebookClient(
            nb,
            timeout=timeout,
            allow_errors=False,
            record_timing=True,
        )
        os.environ.update(
            {
                "PYDEVD_DISABLE_FILE_VALIDATION": "1",
                "TOKENIZERS_PARALLELISM": "false",
            }
        )
        # Run with a throwaway working directory.
        with tempfile.TemporaryDirectory() as tempdir:
            client.execute(cwd=tempdir)
    except CellExecutionError:
        describe_failure = get_error_info
    except CellTimeoutError:
        describe_failure = get_timeout_info

    if describe_failure is None:
        return notebook_path, None

    error = describe_failure(nb)
    assert error is not None
    return notebook_path, error
# Find the first code cell which did not complete.
def get_timeout_info(
    nb: NotebookNode,
) -> Optional[NotebookError]:
    """Describe the first code cell that has no ``shell.execute_reply`` record.

    Args:
        nb: Notebook whose cells carry execution metadata.

    Returns:
        A ``NotebookError`` named ``"timeout"`` holding that cell's source, or
        None when every code cell has a ``shell.execute_reply`` entry.
    """
    for cell in nb.cells:
        if cell.cell_type != "code":
            continue
        # A cell without any execution record is treated as not completed
        # instead of raising on the missing attribute.
        # NOTE(review): presumably this record is written by the notebook
        # client when timing is recorded — confirm.
        execution = cell.metadata.get("execution", {})
        if "shell.execute_reply" not in execution:
            return NotebookError(
                error_name="timeout",
                error_value="",
                traceback="",
                cell_source="".join(cell["source"]),
            )

    return None
def get_error_info(nb: NotebookNode) -> Optional[NotebookError]:
    """Describe the first error output found in the notebook's code cells.

    Cells are scanned in order. Within the first code cell that has an error
    output, only the first such output is reported.

    Args:
        nb: Notebook to inspect.

    Returns:
        A ``NotebookError`` built from that output, or None when no code cell
        contains an error output.
    """
    for cell in nb["cells"]:
        if cell["cell_type"] != "code":
            continue

        # An output counts as an error when it is typed as one or carries an "ename".
        error_outputs = [out for out in cell["outputs"] if out["output_type"] == "error" or "ename" in out]
        if not error_outputs:
            continue

        first = error_outputs[0]
        return NotebookError(
            error_name=first.get("ename", ""),
            error_value=first.get("evalue", ""),
            traceback="\n".join(first.get("traceback", "")),
            cell_source="".join(cell["source"]),
        )

    return None
2024-02-03 12:01:00 -05:00
def _locate_title(content: str) -> Optional[Tuple[int, int]]:
    """Return (start, end) offsets of the first line starting with ``#``, or None."""
    start = 0
    while start <= len(content):
        end = content.find("\n", start)
        if end == -1:
            end = len(content)
        if content.startswith("#", start):
            return start, end
        start = end + 1
    return None


# rendered_mdx is the final mdx file
def post_process_mdx(rendered_mdx: Path, front_matter: Dict) -> None:
    """Rewrite a rendered mdx file with fresh front matter and source links.

    The file's own leading ``---`` front matter block, if any, is removed; when
    it parses to a mapping it replaces the ``front_matter`` argument. Source
    and edit URLs plus the page title are added to the front matter, links to
    the notebook are inserted after the title line, and the file is written
    back.

    Args:
        rendered_mdx: Path of the rendered ``.mdx`` file; rewritten in place.
        front_matter: Front matter to use when the file does not supply its
            own. The mapping passed in is not modified.
    """
    notebook_name = f"{rendered_mdx.stem}.ipynb"
    with open(rendered_mdx, "r", encoding="utf-8") as f:
        content = f.read()

    # Work on a copy so the caller's mapping is never modified.
    front_matter = dict(front_matter)

    # If there is front matter in the mdx file, we need to remove it
    if content.startswith("---"):
        # The closing delimiter must start a line; an unterminated block is left untouched.
        closing = content.find("\n---", 3)
        if closing != -1:
            parsed = yaml.safe_load(content[3:closing])
            # An empty block parses to None; keep the supplied front matter then.
            if isinstance(parsed, dict):
                front_matter = parsed
            content = content[closing + 4 :]

    front_matter["source_notebook"] = f"/notebook/{notebook_name}"
    front_matter["custom_edit_url"] = f"https://github.com/microsoft/autogen/edit/main/notebook/{notebook_name}"

    # Find the title line; the links are injected directly after it.
    title_span = _locate_title(content)
    if title_span is None:
        # No heading: insert the links at the top and leave any existing title alone.
        title_end = 0
    else:
        title_start, title_end = title_span
        # Extract page title
        title = content[title_start:title_end].lstrip("#").strip()
        # If there is a { in the title we trim off the { and everything after it
        if "{" in title:
            title = title[: title.find("{")].strip()
        front_matter["title"] = title

    # NOTE(review): both inserted links have an empty label as written, so
    # nothing visible is rendered for them; presumably badge images were
    # intended (see the "colab-badge.svg" check below) — confirm.
    github_link = f"https://github.com/microsoft/autogen/blob/main/notebook/{notebook_name}"
    content = (
        content[:title_end]
        + "\n[]("
        + github_link
        + ")"
        + content[title_end:]
    )

    # If no colab link is present, insert one
    if "colab-badge.svg" not in content:
        content = (
            content[:title_end]
            + "\n[](https://colab.research.google.com/github/microsoft/autogen/blob/main/notebook/"
            + notebook_name
            + ")"
            + content[title_end:]
        )

    # Dump front_matter to yaml
    front_matter_yaml = yaml.dump(front_matter, default_flow_style=False)

    # Rewrite the content as
    # ---
    # front_matter
    # ---
    # content
    new_content = f"---\n{front_matter_yaml}---\n{content}"
    with open(rendered_mdx, "w", encoding="utf-8") as f:
        f.write(new_content)
def path(path_str: str) -> Path:
    """Convert a string into a ``Path`` object."""
    converted = Path(path_str)
    return converted
2024-02-29 15:47:30 -05:00
def collect_notebooks(notebook_directory: Path, website_directory: Path) -> typing.List[Path]:
    """Gather the notebooks to process.

    Returns the ``*.ipynb`` files directly inside ``notebook_directory``
    followed by every ``*.ipynb`` file found at any depth under
    ``website_directory/docs``.
    """
    top_level = notebook_directory.glob("*.ipynb")
    under_docs = website_directory.glob("docs/**/*.ipynb")
    return [*top_level, *under_docs]
2024-03-02 09:27:11 -05:00
def fmt_skip(notebook: Path, reason: str) -> str:
    """Return a one-line status message: a yellow [Skip] tag, the notebook file name in blue, and the reason."""
    tag = colored("[Skip]", "yellow")
    name = colored(notebook.name, "blue")
    return f"{tag} {name}: {reason}"
def fmt_ok(notebook: Path) -> str:
    """Return a one-line status message: a green [OK] tag, the notebook file name in blue, and a check mark."""
    tag = colored("[OK]", "green")
    name = colored(notebook.name, "blue")
    return f"{tag} {name} ✅"
def fmt_error(notebook: Path, error: Union[NotebookError, str]) -> str:
    """Return a one-line error status message for a notebook.

    Args:
        notebook: Notebook the error belongs to; only its file name is shown.
        error: Either a ready-made message, or an object exposing
            ``error_name`` and ``error_value`` (shown as ``name - value``).

    Returns:
        The message prefixed with a red [Error] tag and the blue file name.
    """
    # Callers pass plain strings as well as NotebookError instances; a string
    # has no error_name/error_value attributes and is used as the message itself.
    if isinstance(error, str):
        detail = error
    else:
        detail = f"{error.error_name} - {error.error_value}"
    return f"{colored('[Error]', 'red')} {colored(notebook.name, 'blue')}: {detail}"
2024-02-29 15:47:30 -05:00
def start_thread_to_terminate_when_parent_process_dies(ppid: int):
    """Start a daemon thread that sends SIGTERM to this process once ``ppid`` cannot be signalled.

    Args:
        ppid: Process id to watch.
    """
    own_pid = os.getpid()

    def watch_parent() -> None:
        # Probe once per second. An OSError from the probe is taken to mean
        # the watched process is gone, and this process is asked to terminate.
        # NOTE(review): signal 0 is only a harmless probe on POSIX; the os.kill
        # documentation describes different behaviour on Windows — confirm
        # before relying on this there.
        while True:
            try:
                os.kill(ppid, 0)
            except OSError:
                os.kill(own_pid, signal.SIGTERM)
            time.sleep(1)

    watcher = threading.Thread(target=watch_parent, daemon=True)
    watcher.start()
def main() -> None:
    """Command-line entry point with two subcommands, ``render`` and ``test``.

    Notebooks given on the command line (or, if none, those collected from the
    notebook and website directories) are filtered through
    ``skip_reason_or_none_if_ok``. ``test`` executes the remaining notebooks in
    a process pool; ``render`` runs them through ``process_notebook`` one at a
    time.

    Raises:
        SystemExit: With status 1 when no or an unknown subcommand is given,
            or when at least one notebook fails under ``test``.
    """
    script_dir = Path(__file__).parent.absolute()

    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest="subcommand")

    parser.add_argument(
        "--notebook-directory",
        type=path,
        help="Directory containing notebooks to process",
        default=script_dir / "../notebook",
    )
    parser.add_argument(
        "--website-directory", type=path, help="Root directory of docusarus website", default=script_dir
    )

    render_parser = subparsers.add_parser("render")
    render_parser.add_argument("--quarto-bin", help="Path to quarto binary", default="quarto")
    render_parser.add_argument("--dry-run", help="Don't render", action="store_true")
    render_parser.add_argument("notebooks", type=path, nargs="*", default=None)

    test_parser = subparsers.add_parser("test")
    test_parser.add_argument("--timeout", help="Timeout for each notebook", type=int, default=60)
    test_parser.add_argument("--exit-on-first-fail", "-e", help="Exit after first test fail", action="store_true")
    test_parser.add_argument("notebooks", type=path, nargs="*", default=None)
    test_parser.add_argument("--workers", help="Number of workers to use", type=int, default=-1)

    args = parser.parse_args()

    if args.subcommand is None:
        print("No subcommand specified")
        sys.exit(1)

    if args.notebooks:
        collected_notebooks = args.notebooks
    else:
        collected_notebooks = collect_notebooks(args.notebook_directory, args.website_directory)

    # Report every skipped notebook and keep the rest.
    filtered_notebooks = []
    for notebook in collected_notebooks:
        reason = skip_reason_or_none_if_ok(notebook)
        if reason:
            print(fmt_skip(notebook, reason))
        else:
            filtered_notebooks.append(notebook)

    if args.subcommand == "test":
        # -1 means "let the executor choose the number of workers".
        if args.workers == -1:
            args.workers = None
        failure = False
        with concurrent.futures.ProcessPoolExecutor(
            max_workers=args.workers,
            initializer=start_thread_to_terminate_when_parent_process_dies,
            initargs=(os.getpid(),),
        ) as executor:
            futures = [executor.submit(test_notebook, f, args.timeout) for f in filtered_notebooks]
            for future in concurrent.futures.as_completed(futures):
                notebook, optional_error_or_skip = future.result()
                if isinstance(optional_error_or_skip, NotebookError):
                    # fmt_error formats the error object itself, so it is
                    # passed as-is rather than as a pre-built string.
                    if optional_error_or_skip.error_name == "timeout":
                        print(fmt_error(notebook, optional_error_or_skip))
                    else:
                        print("-" * 80)
                        print(fmt_error(notebook, optional_error_or_skip))
                        print(optional_error_or_skip.traceback)
                        print("-" * 80)
                    if args.exit_on_first_fail:
                        # Leaving the with-block waits for outstanding work;
                        # cancel the notebooks that have not started yet so the
                        # exit is not delayed by the rest of the queue.
                        for pending in futures:
                            pending.cancel()
                        sys.exit(1)
                    failure = True
                elif isinstance(optional_error_or_skip, NotebookSkip):
                    print(fmt_skip(notebook, optional_error_or_skip.reason))
                else:
                    print(fmt_ok(notebook))

        if failure:
            sys.exit(1)

    elif args.subcommand == "render":
        check_quarto_bin(args.quarto_bin)

        notebooks_target_dir(args.website_directory).mkdir(parents=True, exist_ok=True)

        for notebook in filtered_notebooks:
            print(
                process_notebook(
                    notebook, args.website_directory, args.notebook_directory, args.quarto_bin, args.dry_run
                )
            )
    else:
        print("Unknown subcommand")
        sys.exit(1)


if __name__ == "__main__":
    main()