From 878aa4c3fc24fe59c79d7a8d0951d7dad08aee9b Mon Sep 17 00:00:00 2001
From: gagb <gagb@users.noreply.github.com>
Date: Thu, 20 Mar 2025 12:05:42 -0700
Subject: [PATCH] Add linter to AGBench (#6022)

This pull request adds a `lint` command to the `agbench` package that
analyzes a console log file and reports the errors and inefficiencies it
finds. The main changes include adding a new command to the CLI,
implementing the linter functionality, and integrating it with the
existing codebase.

### New Linting Feature:

*
[`python/packages/agbench/src/agbench/cli.py`](diffhunk://#diff-0eafed70ad5e99e6f7319927bf92ee3ce4787d156dd2775b10a61baad7ec1799R10):
Added `lint_cli` import and integrated the new "lint" command into the
`main` function.
[[1]](diffhunk://#diff-0eafed70ad5e99e6f7319927bf92ee3ce4787d156dd2775b10a61baad7ec1799R10)
[[2]](diffhunk://#diff-0eafed70ad5e99e6f7319927bf92ee3ce4787d156dd2775b10a61baad7ec1799R37-R41)

### Linter Implementation:

*
[`python/packages/agbench/src/agbench/linter/__init__.py`](diffhunk://#diff-45842e728e3daad063b3cf84d5857a4fdfe14e6d977fb2054f284eb9f5bb5272R1-R4):
Added necessary imports to initialize the linter module.
*
[`python/packages/agbench/src/agbench/linter/_base.py`](diffhunk://#diff-f7ea2f6706232406b6c727fda6d71f09c568b4573f070af79bb7f3da3514e364R1-R81):
Defined core classes such as `Document`, `Code`, `CodeExample`,
`CodedDocument`, and the `BaseQualitativeCoder` protocol.
*
[`python/packages/agbench/src/agbench/linter/cli.py`](diffhunk://#diff-e6ad1e14dc0df2c10fe62fede5a06d83865ad1961f99ec2d78f9052feb4d663bR1-R86):
Implemented the `lint_cli` function, which includes loading log files,
coding them, and printing the results.
*
[`python/packages/agbench/src/agbench/linter/coders/oai_coder.py`](diffhunk://#diff-5059129410822c8a214f797a6167cbfcfbe31bd6a3b1efcb65a2dd703ef9b331R1-R212):
Implemented the `OAIQualitativeCoder` class to interact with OpenAI for
coding documents and caching results.

Example usage:

<img width="997" alt="image"
src="https://github.com/user-attachments/assets/6718688e-9917-4a43-a2f1-1105b030528d"
/>


<img width="999" alt="image"
src="https://github.com/user-attachments/assets/7fcb9c43-70f2-4fe7-ae29-5ad6a4ef2a16"
/>

> If you are using the VS Code terminal, you can click the links in the
> terminal output to jump to the exact line of each reported error.

---------

Co-authored-by: afourney <adamfo@microsoft.com>
---
 python/packages/agbench/src/agbench/cli.py    |   6 +
 .../agbench/src/agbench/linter/__init__.py    |   4 +
 .../agbench/src/agbench/linter/_base.py       |  82 +++++++
 .../agbench/src/agbench/linter/cli.py         | 105 +++++++++
 .../src/agbench/linter/coders/__init__.py     |   0
 .../src/agbench/linter/coders/oai_coder.py    | 210 ++++++++++++++++++
 6 files changed, 407 insertions(+)
 create mode 100644 python/packages/agbench/src/agbench/linter/__init__.py
 create mode 100644 python/packages/agbench/src/agbench/linter/_base.py
 create mode 100644 python/packages/agbench/src/agbench/linter/cli.py
 create mode 100644 python/packages/agbench/src/agbench/linter/coders/__init__.py
 create mode 100644 python/packages/agbench/src/agbench/linter/coders/oai_coder.py
diff --git a/python/packages/agbench/src/agbench/cli.py b/python/packages/agbench/src/agbench/cli.py
index bb11c9f93..059c02783 100644
--- a/python/packages/agbench/src/agbench/cli.py
+++ b/python/packages/agbench/src/agbench/cli.py
@@ -7,6 +7,7 @@ from .remove_missing_cmd import remove_missing_cli
 from .run_cmd import run_cli
 from .tabulate_cmd import tabulate_cli
 from .version import __version__
+from .linter.cli import lint_cli
 
 
 class CommandSpec(TypedDict):
@@ -33,6 +34,11 @@ def main(args: Optional[List[str]] = None) -> None:
             "description": "tabulate the results of a previous run",
             "function": tabulate_cli,
         },
+        {
+            "command": "lint",
+            "description": "lint the benchmark configuration",
+            "function": lint_cli,
+        },
         {
             "command": "remove_missing",
             "description": "remove folders with missing results",
diff --git a/python/packages/agbench/src/agbench/linter/__init__.py b/python/packages/agbench/src/agbench/linter/__init__.py
new file mode 100644
index 000000000..797b7f272
--- /dev/null
+++ b/python/packages/agbench/src/agbench/linter/__init__.py
@@ -0,0 +1,4 @@
+# __init__.py
+from ._base import Code, Document, CodedDocument, BaseQualitativeCoder
+
+__all__ = ["Code", "Document", "CodedDocument", "BaseQualitativeCoder"]
diff --git a/python/packages/agbench/src/agbench/linter/_base.py b/python/packages/agbench/src/agbench/linter/_base.py
new file mode 100644
index 000000000..4f6209b78
--- /dev/null
+++ b/python/packages/agbench/src/agbench/linter/_base.py
@@ -0,0 +1,82 @@
+import json
+import hashlib
+import re
+from typing import Protocol, List, Set, Optional
+from pydantic import BaseModel, Field
+
+
+class Document(BaseModel):
+    text: str = Field(..., description="Text content of the document.")
+    name: Optional[str] = Field(None, description="Optional name of the document.")
+
+    def __hash__(self) -> int:  # content hash: the MD5 digest of the UTF-8 text read as a big-endian int; `name` is not part of it
+        return int.from_bytes(hashlib.md5(self.text.encode("utf-8")).digest(), "big")
+
+
+class CodeExample(BaseModel):  # one concrete occurrence of a code, tied to a line range of the coded document
+    """
+    Represents an example associated with a code.
+    """
+
+    reason: str = Field(  # every field below uses `...` as its default, i.e. it is required
+        ..., description="A two sentence, human-readable explanation why this example and lines relate to the code."
+    )
+    line_content: str = Field(
+        ..., description="The exact content of the line where the error is found. This should be a single line."
+    )
+    line: int = Field(..., description="The most important line number where a human would say the error is.")
+    line_end: int = Field(..., description="Line number where the issue ends.")  # NOTE(review): numbering base and inclusiveness are not stated here — confirm against the producer
+
+
+class Code(BaseModel):  # a named error category ("code") with its definition, severity and supporting examples
+    name: str = Field(..., description="Normalized unique name for the code (lowercase, hyphen separated).")
+    definition: str = Field(..., description="Definition of the code.")
+    examples: List[CodeExample] = Field(
+        ..., description="List of code examples associated with the code. Cannot be empty."
+    )
+    severity: int = Field(
+        ..., description="Severity rating of the error identified using the code. Valid values: 0, 1, 2."
+    )
+    id: Optional[int] = Field(None, description="Identifier computed using MD5 of name and definition.")
+    merged_from: Optional[List[int]] = Field(None, description="List of code ids from which this code is merged.")
+
+    def __init__(
+        self,
+        name: str,
+        definition: str,
+        examples: List[CodeExample],
+        severity: int,
+        id: Optional[int] = None,  # accepted but not used: the id is always recomputed below
+        merged_from: Optional[List[int]] = None,  # accepted but not used: see the reset at the end of this method
+    ) -> None:
+        super().__init__(name=name, definition=definition, examples=examples, severity=severity)
+        self.name = re.sub(r"[^a-z-]", "", self.name.lower().replace(" ", "-"))  # lowercase, spaces -> hyphens, then drop everything that is not a-z or "-" (digits included)
+        self.id = int(hashlib.md5((self.name + self.definition).encode("utf-8")).hexdigest(), 16)  # MD5 of normalized name + definition, as an int
+        self.merged_from = None  # NOTE(review): discards any merged_from passed in (e.g. via CodedDocument.from_json) — confirm this is intended
+
+    def __hash__(self) -> int:  # the hash is the id, so it depends only on the normalized name and the definition
+        if self.id is None:
+            raise ValueError("Code ID is not set.")
+        return self.id
+
+    def add_merged_from(self, code_id: int) -> None:  # record code_id as a merge source; the list is created lazily and duplicates are skipped
+        if self.merged_from is None:
+            self.merged_from = []
+        if code_id not in self.merged_from:
+            self.merged_from.append(code_id)
+
+
+class CodedDocument(BaseModel):
+    doc: Document  # the document that was coded
+    codes: Set[Code]  # the codes assigned to it
+
+    @classmethod
+    def from_json(cls, json_str: str) -> "CodedDocument":
+        """Rebuild an instance from a JSON object holding a "doc" mapping and a "codes" list."""
+        payload = json.loads(json_str)
+        document = Document(**payload["doc"])
+        return cls(doc=document, codes=set(Code(**fields) for fields in payload["codes"]))
+
+
+class BaseQualitativeCoder(Protocol):  # structural interface for objects that assign codes to a document
+    def code_document(self, doc: Document, code_set: Optional[Set[Code]]) -> Optional[CodedDocument]: ...  # code_set has no default here
diff --git a/python/packages/agbench/src/agbench/linter/cli.py b/python/packages/agbench/src/agbench/linter/cli.py
new file mode 100644
index 000000000..426890258
--- /dev/null
+++ b/python/packages/agbench/src/agbench/linter/cli.py
@@ -0,0 +1,105 @@
+import os
+import argparse
+from typing import List, Sequence, Optional
+from openai import OpenAI
+from ._base import Document, CodedDocument
+from .coders.oai_coder import OAIQualitativeCoder
+
+
+def prepend_line_numbers(lines: List[str]) -> List[str]:
+    """
+    Prefix every line with its 1-based number, right-justified to the width
+    of the largest number and followed by ": ".
+    """
+    pad = len(str(len(lines)))
+    numbered = enumerate(lines, start=1)
+    return [f"{number:>{pad}}: {content}" for number, content in numbered]
+
+
+def load_log_file(path: str, prepend_numbers: bool = False) -> Document:
+    """Read the log at `path` into a Document named by its absolute path, optionally numbering each line."""
+    # Decode as UTF-8 regardless of the locale; undecodable bytes become U+FFFD instead of aborting the lint.
+    with open(path, "r", encoding="utf-8", errors="replace") as f:
+        lines = f.readlines()
+    if prepend_numbers:
+        lines = prepend_line_numbers(lines)
+    return Document(text="".join(lines), name=os.path.abspath(path))
+
+
+def code_log(path: str) -> Optional[CodedDocument]:
+    coder = OAIQualitativeCoder()
+
+    if not os.path.isfile(path):
+        raise FileNotFoundError(f"File {path} does not exist.")
+
+    # the coder receives the log with line-number prefixes added
+    numbered_doc = load_log_file(path, prepend_numbers=True)
+    return coder.code_document(numbered_doc)
+
+
+def print_coded_results(input_path: str, coded_doc: CodedDocument) -> None:
+    """Print every code (most severe first) with its examples, then the total number of examples."""
+    # ANSI colors keyed by severity: 2 -> red, 1 -> yellow, 0 -> green; anything else -> white
+    palette = {2: "\033[31m", 1: "\033[33m", 0: "\033[32m"}
+    fallback = "\033[37m"
+    reset, bold = "\033[0m", "\033[1m"
+
+    example_count = 0
+    # most severe codes are reported first
+    for item in sorted(coded_doc.codes, key=lambda entry: entry.severity, reverse=True):
+        tint = palette.get(item.severity, fallback)
+        print(f"{tint}[{item.severity}]: {item.name}{reset}: {item.definition}")
+        for ex in item.examples:
+            print(f"{bold}{input_path}{reset}:{ex.line}:{ex.line_end}\t{ex.reason}")
+            example_count += 1
+    print("\n")
+    print(f"Found {example_count} errors in {input_path}.")
+    print("\n")
+
+
+def get_log_summary(input_path: str) -> str:
+    """
+    Ask the OpenAI Responses API (model "gpt-4o") for a one-sentence summary
+    of the log at `input_path` and return the response's output text.
+    """
+    summarizer = OpenAI()
+    # the raw log is sent, without line-number prefixes
+    log_text = load_log_file(input_path, prepend_numbers=False).text
+    prompt = f"Summarize the following log file in one sentence.\n{log_text}"
+
+    reply = summarizer.responses.create(model="gpt-4o", input=prompt)
+
+    return reply.output_text
+
+
+def code_command(input_path: str) -> None:
+    """
+    Summarize, code and report on one log file; print "Invalid input path." otherwise.
+    """
+    if not os.path.isfile(input_path):
+        print("Invalid input path.")
+        return
+    print(f"Processing file: {input_path}")
+    print(get_log_summary(input_path))
+    result = code_log(input_path)
+    if result is None:
+        raise ValueError("Failed to code the document.")
+    print_coded_results(input_path, result)
+
+
+def lint_cli(args: Sequence[str]) -> None:
+    """
+    Entry point of the lint sub-command: args[0] is the invocation name and
+    the remaining items are parsed as its command line (one `logfile` path).
+    """
+    program = args[0]
+    remaining = args[1:]
+
+    about = f"{program} will analyze a console log. And detect errors/inefficiencies in the log files."
+    parser = argparse.ArgumentParser(prog=program, description=about)
+    parser.add_argument("logfile", type=str, help="Path to a log file.")
+
+    # argparse exits the process itself on a usage error
+    namespace = parser.parse_args(remaining)
+
+    code_command(namespace.logfile)
diff --git a/python/packages/agbench/src/agbench/linter/coders/__init__.py b/python/packages/agbench/src/agbench/linter/coders/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/packages/agbench/src/agbench/linter/coders/oai_coder.py b/python/packages/agbench/src/agbench/linter/coders/oai_coder.py
new file mode 100644
index 000000000..374093d3d
--- /dev/null
+++ b/python/packages/agbench/src/agbench/linter/coders/oai_coder.py
@@ -0,0 +1,210 @@
+import os
+import re
+
+from typing import List, Set, Optional
+from pydantic import BaseModel
+
+from openai import OpenAI
+
+from .._base import CodedDocument, Document, Code
+from .._base import BaseQualitativeCoder
+
+
+class CodeList(BaseModel):  # response schema passed as `response_format` to the chat-completions parse calls below
+    code_list: List[Code]
+
+
+def remove_control_characters(text: str) -> str:
+    """
+    Strip every ASCII control character (0x00-0x1F and 0x7F); this includes tab, newline and carriage return.
+    """
+    return text.translate(dict.fromkeys([*range(0x20), 0x7F]))
+
+
+class OAIQualitativeCoder(BaseQualitativeCoder):  # coder that asks an OpenAI chat model for a structured CodeList describing a document
+    DEFAULT_MODEL = "gpt-4o"
+
+    def __init__(self, cache_dir: str = ".cache", model: str = DEFAULT_MODEL, cache_enabled: bool = False) -> None:
+        self.client = OpenAI()  # constructed with no arguments; presumably credentials come from the environment — confirm
+        self.cache_dir = cache_dir
+        self.model = model
+        self.cache_enabled = cache_enabled
+
+    def code_document(
+        self,
+        doc: Document,
+        code_set: Optional[Set[Code]] = None,
+    ) -> Optional[CodedDocument]:  # raises ValueError when the model response yields no parsed codes
+        # get hash of the document
+        doc_hash = hash(doc)  # cache key; computed before the text is sanitized below
+        cache_file = os.path.join(self.cache_dir, f"{doc_hash}.json") if self.cache_enabled else None
+
+        if self.cache_enabled:
+            if not os.path.exists(self.cache_dir):
+                os.makedirs(self.cache_dir)
+            if cache_file and os.path.exists(cache_file):
+                with open(cache_file, "r") as f:
+                    cached_coded_doc_json = f.read()
+                    return CodedDocument.from_json(cached_coded_doc_json)  # cache hit: returned without calling the API
+
+        # sanitize the doc before passing it to openai
+        doc.text = remove_control_characters(doc.text)  # NOTE: this mutates the caller's Document in place
+
+        coded_document: Optional[CodedDocument] = None
+
+        if code_set is None:  # no code set supplied: the model is asked to come up with the codes itself
+            completion = self.client.beta.chat.completions.parse(
+                model=self.model,
+                messages=[
+                    {
+                        "role": "system",
+                        "content": """You are an expert qualitative researcher.
+
+Given a list of dcocuments containing errors below, generate a list of (error) codes.
+Each code should contains:
+- at least 3 words, max 4 word, hyphenated.
+
+For example, the name could be of the format "lack-of-word2",
+"failed-to-bar", "excessive-use-of-magenta". Name should adhere to
+Joseph M. Williams' writing principles of clarity, conciseness, and coherence.
+
+Ensure each code name is lower-case, hyphenated, and directly reflects the
+concept it represents. Avoid ambiguous or overly complex terms, and prioritize
+simplicity, precision, and readability in the naming.
+
+The code names should pass the 'clarity and grace' test by being easy to
+understand, descriptive, and reflective of the content they categorize.
+- suggest codes that are similar to good code names. avoid code names that are
+similar to bad code names.
+- The definition should be simple worded and practical. At least 2 sentences,
+ max 3. It should be written in past tense.
+
+It should convey how a labeller could apply this code to future logs, without
+mentioning the word "labeller". The definition should be specific enough to be
+useful in debugging. It should be very concrete. And should be well thought and
+make sense. Bull shitting will not earn you any points.
+
+- The examples should be a list. Each example should be descriptive between
+2-3 sentences. Examples should be concrete, informative and not vague. Provide
+at max 20 salient examples. Examples should contain a lot of detail about what
+happened and should refer to incidents in the log.
+
+- The list of codes must mutually exclusive.
+
+# GOOD EXAMPLES OF FINAL CODE NAMES/CLUSTERS
+* looped-without-progress
+* repeated-unsuccessful-actions
+* repeated-syntax-errors
+* exceeded-context-window-limits
+* encountered-security-risks
+* failure-to-switch-strategy
+* exceeded-resource-limits
+* attempted-to-handle-excessive-data
+* no-errors-detected
+These names are high-level but also concrete. They exactly mention the type of
+error, issue, gap that has been identified.
+
+## BAD EXAMPLES OF FINAL CODE NAMES/CLUSTERS
+* mismanaged-data-utilization -- too high level
+* incomplete-or-misguided-execution -- too high level
+* misaligned-agent-interactions -- too high level
+* mismanaged-task-strategies -- too high level
+* resource-inefficiencies -- vague
+* communication-issues -- vague
+* coordination-issues -- too high level and vague
+* operational-failures
+* execution-errors -- too high level
+* navigation-issues -- too concise
+* adaptive-failures -- too concise
+* successful-processes -- I dont like the word processes
+* system-constraints
+* configuration-issues
+* information-inaccuracies -- too high level
+* process-improvements -- vague, not an error
+* inadequate-error-response -- too high-level, unclear what kind of errors
+* specific-access-issues -- makes no sense
+* strategy-inefficiency -- strategy is too high level
+* error-management-gaps -- unclear what error management means
+* error-handling-deficiency -- unclear what kind of errors
+* coordination-breakdown -- unclear what coordination means
+* muddled-task-execution -- unclear what kind of tasks were muddled
+* task-completion-gaps -- too high level
+The above names are too high level and unclear. Please DO NOT use such names.
+    """,
+                    },
+                    {
+                        "role": "user",
+                        "content": doc.text,
+                    },
+                ],
+                response_format=CodeList,  # the reply is parsed into a CodeList
+            )
+
+            message = completion.choices[0].message
+            if message.parsed and len(message.parsed.code_list) > 0:
+                coded_document = CodedDocument(doc=doc, codes=set(message.parsed.code_list))
+            else:
+                print(message.refusal)  # show the refusal field of the message before failing
+                raise ValueError("Error in coding document with OpenAI")
+        else:  # a code set was supplied: the model is told to use only those codes
+            code_to_str = "\n".join(
+                [
+                    (
+                        f"\n---\nCode Name: {code.name}\n"
+                        f"Definition: {code.definition}\n"
+                        f"Examples: {code.examples}\n---\n"
+                    )
+                    for code in code_set
+                ]
+            )
+
+            completion = self.client.beta.chat.completions.parse(
+                model=self.model,
+                messages=[
+                    {
+                        "role": "system",
+                        "content": """You are an expert qualitative researcher.
+                        You can answer any questions about coding logs.""",
+                    },
+                    {
+                        "role": "user",
+                        "content": f"""
+## Context
+The text below shows a log containing errors. Your task is to code the log with
+the following codes. Generate a list of codes for the log below.
+
+Only use the codes from the list below. Do not create new codes.
+Modify the examples of the codes to fit the context of the log.
+
+Your example should be informative to narrow down the details of the error in
+the context of the example.
+
+## Codes
+
+{code_to_str}
+
+## Log
+
+{doc.text}
+""",
+                    },
+                ],
+                response_format=CodeList,
+            )
+
+            message = completion.choices[0].message
+            if message.parsed and len(message.parsed.code_list) > 0:
+                code_list = message.parsed.code_list
+                # filter out codes whose names are not in the code_set
+                code_set_names = {code.name for code in code_set}
+                code_list = [code for code in code_list if code.name in code_set_names]
+
+                coded_document = CodedDocument(doc=doc, codes=set(code_list))  # may hold an empty set if every returned code was filtered out
+
+        if coded_document is None:  # only reachable from the code_set branch, when nothing was parsed
+            raise ValueError("Error in coding document with OpenAI")
+
+        if self.cache_enabled and cache_file:
+            with open(cache_file, "w") as f:
+                f.write(coded_document.model_dump_json(indent=4))  # stored in the JSON form that CodedDocument.from_json reads back
+        return coded_document