From 878aa4c3fc24fe59c79d7a8d0951d7dad08aee9b Mon Sep 17 00:00:00 2001 From: gagb Date: Thu, 20 Mar 2025 12:05:42 -0700 Subject: [PATCH] Add linter to AGBench (#6022) This pull request introduces a new linting feature to the benchmark configuration in the `agbench` package. The main changes include adding a new command to the CLI, implementing the linter functionality, and integrating it with the existing codebase. ### New Linting Feature: * [`python/packages/agbench/src/agbench/cli.py`](diffhunk://#diff-0eafed70ad5e99e6f7319927bf92ee3ce4787d156dd2775b10a61baad7ec1799R10): Added `lint_cli` import and integrated the new "lint" command into the `main` function. [[1]](diffhunk://#diff-0eafed70ad5e99e6f7319927bf92ee3ce4787d156dd2775b10a61baad7ec1799R10) [[2]](diffhunk://#diff-0eafed70ad5e99e6f7319927bf92ee3ce4787d156dd2775b10a61baad7ec1799R37-R41) ### Linter Implementation: * [`python/packages/agbench/src/agbench/linter/__init__.py`](diffhunk://#diff-45842e728e3daad063b3cf84d5857a4fdfe14e6d977fb2054f284eb9f5bb5272R1-R4): Added necessary imports to initialize the linter module. * [`python/packages/agbench/src/agbench/linter/_base.py`](diffhunk://#diff-f7ea2f6706232406b6c727fda6d71f09c568b4573f070af79bb7f3da3514e364R1-R81): Defined core classes such as `Document`, `Code`, `CodeExample`, `CodedDocument`, and the `BaseQualitativeCoder` protocol. * [`python/packages/agbench/src/agbench/linter/cli.py`](diffhunk://#diff-e6ad1e14dc0df2c10fe62fede5a06d83865ad1961f99ec2d78f9052feb4d663bR1-R86): Implemented the `lint_cli` function, which includes loading log files, coding them, and printing the results. * [`python/packages/agbench/src/agbench/linter/coders/oai_coder.py`](diffhunk://#diff-5059129410822c8a214f797a6167cbfcfbe31bd6a3b1efcb65a2dd703ef9b331R1-R212): Implemented the `OAIQualitativeCoder` class to interact with OpenAI for coding documents and caching results. Example usage: image image > If you are in VSCode Terminal, you can click on the links in the terminal output to jump to the exact error. --------- Co-authored-by: afourney --- python/packages/agbench/src/agbench/cli.py | 6 + .../agbench/src/agbench/linter/__init__.py | 4 + .../agbench/src/agbench/linter/_base.py | 82 +++++++ .../agbench/src/agbench/linter/cli.py | 105 +++++++++ .../src/agbench/linter/coders/__init__.py | 0 .../src/agbench/linter/coders/oai_coder.py | 210 ++++++++++++++++++ 6 files changed, 407 insertions(+) create mode 100644 python/packages/agbench/src/agbench/linter/__init__.py create mode 100644 python/packages/agbench/src/agbench/linter/_base.py create mode 100644 python/packages/agbench/src/agbench/linter/cli.py create mode 100644 python/packages/agbench/src/agbench/linter/coders/__init__.py create mode 100644 python/packages/agbench/src/agbench/linter/coders/oai_coder.py diff --git a/python/packages/agbench/src/agbench/cli.py b/python/packages/agbench/src/agbench/cli.py index bb11c9f93..059c02783 100644 --- a/python/packages/agbench/src/agbench/cli.py +++ b/python/packages/agbench/src/agbench/cli.py @@ -7,6 +7,7 @@ from .remove_missing_cmd import remove_missing_cli from .run_cmd import run_cli from .tabulate_cmd import tabulate_cli from .version import __version__ +from .linter.cli import lint_cli class CommandSpec(TypedDict): @@ -33,6 +34,11 @@ def main(args: Optional[List[str]] = None) -> None: "description": "tabulate the results of a previous run", "function": tabulate_cli, }, + { + "command": "lint", + "description": "lint the benchmark configuration", + "function": lint_cli, + }, { "command": "remove_missing", "description": "remove folders with missing results", diff --git a/python/packages/agbench/src/agbench/linter/__init__.py b/python/packages/agbench/src/agbench/linter/__init__.py new file mode 100644 index 000000000..797b7f272 --- /dev/null +++ b/python/packages/agbench/src/agbench/linter/__init__.py @@ -0,0 +1,4 @@ +# __init__.py +from ._base import Code, Document, CodedDocument, BaseQualitativeCoder + +__all__ = ["Code", "Document", "CodedDocument", "BaseQualitativeCoder"] diff --git a/python/packages/agbench/src/agbench/linter/_base.py b/python/packages/agbench/src/agbench/linter/_base.py new file mode 100644 index 000000000..4f6209b78 --- /dev/null +++ b/python/packages/agbench/src/agbench/linter/_base.py @@ -0,0 +1,82 @@ +import json +import hashlib +import re +from typing import Protocol, List, Set, Optional +from pydantic import BaseModel, Field + + +class Document(BaseModel): + text: str = Field(..., description="Text content of the document.") + name: Optional[str] = Field(None, description="Optional name of the document.") + + def __hash__(self) -> int: + return int(hashlib.md5(self.text.encode("utf-8")).hexdigest(), 16) + + +class CodeExample(BaseModel): + """ + Represents an example associated with a code. + """ + + reason: str = Field( + ..., description="A two sentence, human-readable explanation why this example and lines relate to the code." + ) + line_content: str = Field( + ..., description="The exact content of the line where the error is found. This should be a single line." + ) + line: int = Field(..., description="The most important line number where a human would say the error is.") + line_end: int = Field(..., description="Line number where the issue ends.") + + +class Code(BaseModel): + name: str = Field(..., description="Normalized unique name for the code (lowercase, hyphen separated).") + definition: str = Field(..., description="Definition of the code.") + examples: List[CodeExample] = Field( + ..., description="List of code examples associated with the code. Cannot be empty." + ) + severity: int = Field( + ..., description="Severity rating of the error identified using the code. Valid values: 0, 1, 2." + ) + id: Optional[int] = Field(None, description="Identifier computed using MD5 of name and definition.") + merged_from: Optional[List[int]] = Field(None, description="List of code ids from which this code is merged.") + + def __init__( + self, + name: str, + definition: str, + examples: List[CodeExample], + severity: int, + id: Optional[int] = None, + merged_from: Optional[List[int]] = None, + ): + super().__init__(name=name, definition=definition, examples=examples, severity=severity) + self.name = re.sub(r"[^a-z-]", "", self.name.lower().replace(" ", "-")) + self.id = int(hashlib.md5((self.name + self.definition).encode("utf-8")).hexdigest(), 16) + self.merged_from = None + + def __hash__(self) -> int: + if self.id is None: + raise ValueError("Code ID is not set.") + return self.id + + def add_merged_from(self, code_id: int) -> None: + if self.merged_from is None: + self.merged_from = [] + if code_id not in self.merged_from: + self.merged_from.append(code_id) + + +class CodedDocument(BaseModel): + doc: Document + codes: Set[Code] + + @classmethod + def from_json(cls, json_str: str) -> "CodedDocument": + data = json.loads(json_str) + doc = Document(**data["doc"]) + codes = {Code(**code) for code in data["codes"]} + return cls(doc=doc, codes=codes) + + +class BaseQualitativeCoder(Protocol): + def code_document(self, doc: Document, code_set: Optional[Set[Code]]) -> Optional[CodedDocument]: ... diff --git a/python/packages/agbench/src/agbench/linter/cli.py b/python/packages/agbench/src/agbench/linter/cli.py new file mode 100644 index 000000000..426890258 --- /dev/null +++ b/python/packages/agbench/src/agbench/linter/cli.py @@ -0,0 +1,105 @@ +import os +import argparse +from typing import List, Sequence, Optional +from openai import OpenAI +from ._base import Document, CodedDocument +from .coders.oai_coder import OAIQualitativeCoder + + +def prepend_line_numbers(lines: List[str]) -> List[str]: + """ + Returns a list of strings with each line prefixed by its right-justified + line number. + """ + width = len(str(len(lines))) + new_lines = [f"{i+1:>{width}}: {line}" for i, line in enumerate(lines)] + return new_lines + + +def load_log_file(path: str, prepend_numbers: bool = False) -> Document: + with open(path, "r") as f: + lines = f.readlines() + if prepend_numbers: + lines = prepend_line_numbers(lines) + + text = "".join(lines) + return Document(text=text, name=os.path.abspath(path)) + + +def code_log(path: str) -> Optional[CodedDocument]: + coder = OAIQualitativeCoder() + + if os.path.isfile(path): + doc = load_log_file(path, prepend_numbers=True) + coded_doc = coder.code_document(doc) + return coded_doc + else: + raise FileNotFoundError(f"File {path} does not exist.") + + +def print_coded_results(input_path: str, coded_doc: CodedDocument) -> None: + num_errors: int = 0 + # define map from severity to ANSI color + severity_color_map = {2: "\033[31m", 1: "\033[33m", 0: "\033[32m"} + + # sort the codes by severity with the most severe first + sorted_codes = sorted(coded_doc.codes, key=lambda x: x.severity, reverse=True) + + for code in sorted_codes: + # select color based on severity, default to white if missing + color = severity_color_map.get(code.severity, "\033[37m") + print(f"{color}[{code.severity}]: {code.name}\033[0m: {code.definition}") + for example in code.examples: + print(f"\033[1m{input_path}\033[0m:{example.line}" f":{example.line_end}\t{example.reason}") + num_errors += 1 + print("\n") + print(f"Found {num_errors} errors in {input_path}.") + print("\n") + + +def get_log_summary(input_path: str) -> str: + """ + Generate a single sentence of summary for the given log file. + """ + client = OpenAI() + + text = load_log_file(input_path, prepend_numbers=False).text + + response = client.responses.create( + model="gpt-4o", + input=f"Summarize the following log file in one sentence.\n{text}", + ) + return response.output_text + + +def code_command(input_path: str) -> None: + """ + Process the given input path by coding log files. + """ + if os.path.isfile(input_path): + print(f"Processing file: {input_path}") + print(get_log_summary(input_path)) + coded_doc = code_log(input_path) + if coded_doc is None: + raise ValueError("Failed to code the document.") + print_coded_results(input_path, coded_doc) + else: + print("Invalid input path.") + + +def lint_cli(args: Sequence[str]) -> None: + invocation_cmd = args[0] + + args = args[1:] + + parser = argparse.ArgumentParser( + prog=invocation_cmd, + description=f"{invocation_cmd} will analyze a console log." + " And detect errors/inefficiencies in the log files.", + ) + + parser.add_argument("logfile", type=str, help="Path to a log file.") + + parsed_args = parser.parse_args(args) + + code_command(parsed_args.logfile) diff --git a/python/packages/agbench/src/agbench/linter/coders/__init__.py b/python/packages/agbench/src/agbench/linter/coders/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/packages/agbench/src/agbench/linter/coders/oai_coder.py b/python/packages/agbench/src/agbench/linter/coders/oai_coder.py new file mode 100644 index 000000000..374093d3d --- /dev/null +++ b/python/packages/agbench/src/agbench/linter/coders/oai_coder.py @@ -0,0 +1,210 @@ +import os +import re + +from typing import List, Set, Optional +from pydantic import BaseModel + +from openai import OpenAI + +from .._base import CodedDocument, Document, Code +from .._base import BaseQualitativeCoder + + +class CodeList(BaseModel): + code_list: List[Code] + + +def remove_control_characters(text: str) -> str: + """ + Remove control characters from the text. + """ + return re.sub(r"[\x00-\x1F\x7F]", "", text) + + +class OAIQualitativeCoder(BaseQualitativeCoder): + DEFAULT_MODEL = "gpt-4o" + + def __init__(self, cache_dir: str = ".cache", model: str = DEFAULT_MODEL, cache_enabled: bool = False) -> None: + self.client = OpenAI() + self.cache_dir = cache_dir + self.model = model + self.cache_enabled = cache_enabled + + def code_document( + self, + doc: Document, + code_set: Optional[Set[Code]] = None, + ) -> Optional[CodedDocument]: + # get hash of the document + doc_hash = hash(doc) + cache_file = os.path.join(self.cache_dir, f"{doc_hash}.json") if self.cache_enabled else None + + if self.cache_enabled: + if not os.path.exists(self.cache_dir): + os.makedirs(self.cache_dir) + if cache_file and os.path.exists(cache_file): + with open(cache_file, "r") as f: + cached_coded_doc_json = f.read() + return CodedDocument.from_json(cached_coded_doc_json) + + # sanitize the doc before passing it to openai + doc.text = remove_control_characters(doc.text) + + coded_document: Optional[CodedDocument] = None + + if code_set is None: + completion = self.client.beta.chat.completions.parse( + model=self.model, + messages=[ + { + "role": "system", + "content": """You are an expert qualitative researcher. + +Given a list of dcocuments containing errors below, generate a list of (error) codes. +Each code should contains: +- at least 3 words, max 4 word, hyphenated. + +For example, the name could be of the format "lack-of-word2", +"failed-to-bar", "excessive-use-of-magenta". Name should adhere to +Joseph M. Williams' writing principles of clarity, conciseness, and coherence. + +Ensure each code name is lower-case, hyphenated, and directly reflects the +concept it represents. Avoid ambiguous or overly complex terms, and prioritize +simplicity, precision, and readability in the naming. + +The code names should pass the 'clarity and grace' test by being easy to +understand, descriptive, and reflective of the content they categorize. +- suggest codes that are similar to good code names. avoid code names that are +similar to bad code names. +- The definition should be simple worded and practical. At least 2 sentences, + max 3. It should be written in past tense. + +It should convey how a labeller could apply this code to future logs, without +mentioning the word "labeller". The definition should be specific enough to be +useful in debugging. It should be very concrete. And should be well thought and +make sense. Bull shitting will not earn you any points. + +- The examples should be a list. Each example should be descriptive between +2-3 sentences. Examples should be concrete, informative and not vague. Provide +at max 20 salient examples. Examples should contain a lot of detail about what +happened and should refer to incidents in the log. + +- The list of codes must mutually exclusive. + +# GOOD EXAMPLES OF FINAL CODE NAMES/CLUSTERS +* looped-without-progress +* repeated-unsuccessful-actions +* repeated-syntax-errors +* exceeded-context-window-limits +* encountered-security-risks +* failure-to-switch-strategy +* exceeded-resource-limits +* attempted-to-handle-excessive-data +* no-errors-detected +These names are high-level but also concrete. They exactly mention the type of +error, issue, gap that has been identified. + +## BAD EXAMPLES OF FINAL CODE NAMES/CLUSTERS +* mismanaged-data-utilization -- too high level +* incomplete-or-misguided-execution -- too high level +* misaligned-agent-interactions -- too high level +* mismanaged-task-strategies -- too high level +* resource-inefficiencies -- vague +* communication-issues -- vague +* coordination-issues -- too high level and vague +* operational-failures +* execution-errors -- too high level +* navigation-issues -- too concise +* adaptive-failures -- too concise +* successful-processes -- I dont like the word processes +* system-constraints +* configuration-issues +* information-inaccuracies -- too high level +* process-improvements -- vague, not an error +* inadequate-error-response -- too high-level, unclear what kind of errors +* specific-access-issues -- makes no sense +* strategy-inefficiency -- strategy is too high level +* error-management-gaps -- unclear what error management means +* error-handling-deficiency -- unclear what kind of errors +* coordination-breakdown -- unclear what coordination means +* muddled-task-execution -- unclear what kind of tasks were muddled +* task-completion-gaps -- too high level +The above names are too high level and unclear. Please DO NOT use such names. + """, + }, + { + "role": "user", + "content": doc.text, + }, + ], + response_format=CodeList, + ) + + message = completion.choices[0].message + if message.parsed and len(message.parsed.code_list) > 0: + coded_document = CodedDocument(doc=doc, codes=set(message.parsed.code_list)) + else: + print(message.refusal) + raise ValueError("Error in coding document with OpenAI") + else: + code_to_str = "\n".join( + [ + ( + f"\n---\nCode Name: {code.name}\n" + f"Definition: {code.definition}\n" + f"Examples: {code.examples}\n---\n" + ) + for code in code_set + ] + ) + + completion = self.client.beta.chat.completions.parse( + model=self.model, + messages=[ + { + "role": "system", + "content": """You are an expert qualitative researcher. + You can answer any questions about coding logs.""", + }, + { + "role": "user", + "content": f""" +## Context +The text below shows a log containing errors. Your task is to code the log with +the following codes. Generate a list of codes for the log below. + +Only use the codes from the list below. Do not create new codes. +Modify the examples of the codes to fit the context of the log. + +Your example should be informative to narrow down the details of the error in +the context of the example. + +## Codes + +{code_to_str} + +## Log + +{doc.text} +""", + }, + ], + response_format=CodeList, + ) + + message = completion.choices[0].message + if message.parsed and len(message.parsed.code_list) > 0: + code_list = message.parsed.code_list + # filter out codes whose names are not in the code_set + code_set_names = {code.name for code in code_set} + code_list = [code for code in code_list if code.name in code_set_names] + + coded_document = CodedDocument(doc=doc, codes=set(code_list)) + + if coded_document is None: + raise ValueError("Error in coding document with OpenAI") + + if self.cache_enabled and cache_file: + with open(cache_file, "w") as f: + f.write(coded_document.model_dump_json(indent=4)) + return coded_document