Add linter functionality with CLI integration for log analysis

2025-12-26 14:38:50 +00:00 · 2025-03-19 14:02:04 -07:00 · 2025-03-19 14:02:04 -07:00 · 98a2827265
commit 98a2827265
parent 69292e6ff4
6 changed files with 389 additions and 0 deletions
--- a/python/packages/agbench/src/agbench/cli.py
+++ b/python/packages/agbench/src/agbench/cli.py
@ -7,6 +7,7 @@ from .remove_missing_cmd import remove_missing_cli
 from .run_cmd import run_cli
 from .tabulate_cmd import tabulate_cli
 from .version import __version__
+from .linter.cli import lint_cli


 class CommandSpec(TypedDict):
@ -33,6 +34,11 @@ def main(args: Optional[List[str]] = None) -> None:
            "description": "tabulate the results of a previous run",
            "function": tabulate_cli,
        },
+        {
+            "command": "lint",
+            "description": "lint the benchmark configuration",
+            "function": lint_cli,
+        },
        {
            "command": "remove_missing",
            "description": "remove folders with missing results",
--- a/python/packages/agbench/src/agbench/linter/init.py
+++ b/python/packages/agbench/src/agbench/linter/init.py
@ -0,0 +1,4 @@
+# __init__.py
+from ._base import Code, Document, CodedDocument, BaseQualitativeCoder
+
+__all__ = ["Code", "Document", "CodedDocument", "BaseQualitativeCoder"]
--- a/python/packages/agbench/src/agbench/linter/_base.py
+++ b/python/packages/agbench/src/agbench/linter/_base.py
@ -0,0 +1,81 @@
+import json
+import hashlib
+import re
+from typing import Protocol, List, Set, Optional
+from pydantic import BaseModel
+
+
+class Document(BaseModel):
+    text: str
+    name: Optional[str] = None
+
+    def __hash__(self) -> int:
+        return int(hashlib.md5(self.text.encode("utf-8")).hexdigest(), 16)
+
+
+class CodeExample(BaseModel):
+    """
+    Represents an example associated with a code.
+
+    Attributes:
+        line (int): The line number in the file where the code example starts.
+        line_end (int): The line number in the file  where the code example ends.
+        reason (str): A description explaining the purpose or context of the
+        code example.
+    """
+
+    line: int
+    line_end: int
+    reason: str
+
+
+class Code(BaseModel):
+    name: str
+    definition: str
+    examples: List[CodeExample]  # changed from List[str]
+    id: Optional[int] = None
+    merged_from: Optional[List[int]] = None
+
+    def __init__(
+        self,
+        name: str,
+        definition: str,
+        examples: List[CodeExample],
+        id: Optional[int] = None,
+        merged_from: Optional[List[int]] = None,
+    ):
+        super().__init__(name=name, definition=definition, examples=examples)
+        self.name = re.sub(r"[^a-z-]", "", self.name.lower().replace(" ", "-"))
+        self.id = int(
+            hashlib.md5((self.name + self.definition).encode("utf-8")).hexdigest(), 16
+        )
+        self.merged_from = None
+
+    def __hash__(self) -> int:
+        if self.id is None:
+            raise ValueError("Code ID is not set.")
+        return self.id
+
+    def add_merged_from(self, code_id: int) -> None:
+        if self.merged_from is None:
+            self.merged_from = []
+        if code_id not in self.merged_from:
+            self.merged_from.append(code_id)
+
+
+class CodedDocument(BaseModel):
+    doc: Document
+    codes: Set[Code]
+
+    @classmethod
+    def from_json(cls, json_str: str) -> "CodedDocument":
+        data = json.loads(json_str)
+        doc = Document(**data["doc"])
+        codes = {Code(**code) for code in data["codes"]}
+        return cls(doc=doc, codes=codes)
+
+
+class BaseQualitativeCoder(Protocol):
+    def code_document(
+        self, doc: Document, code_set: Optional[Set[Code]]
+    ) -> Optional[CodedDocument]: ...
--- a/python/packages/agbench/src/agbench/linter/cli.py
+++ b/python/packages/agbench/src/agbench/linter/cli.py
@ -0,0 +1,86 @@
+import os
+import argparse
+from typing import List, Sequence, Optional
+from ._base import Document, CodedDocument
+from .coders.oai_coder import OAIQualitativeCoder
+
+
+def prepend_line_numbers(lines: List[str]) -> List[str]:
+    """
+    Returns a list of strings with each line prefixed by its right-justified
+      line number.
+    """
+    width = len(str(len(lines)))
+    new_lines = [f"{i+1:>{width}}: {line}" for i, line in enumerate(lines)]
+    return new_lines
+
+
+def load_log_file(path: str, prepend_numbers: bool = False) -> Document:
+    with open(path, "r") as f:
+        lines = f.readlines()
+    if prepend_numbers:
+        lines = prepend_line_numbers(lines)
+
+    text = "".join(lines)
+    return Document(text=text, name=os.path.abspath(path))
+
+
+def code_log(path: str) -> Optional[CodedDocument]:
+    coder = OAIQualitativeCoder()
+
+    if os.path.isfile(path):
+        doc = load_log_file(path, prepend_numbers=True)
+        coded_doc = coder.code_document(doc)
+        return coded_doc
+    else:
+        raise FileNotFoundError(f"File {path} does not exist.")
+
+
+def print_coded_results(input_path: str, coded_doc: CodedDocument) -> None:
+    num_errors: int = 0
+    
+    for code in coded_doc.codes:
+        print(f"\033[31mCategory: {code.name}\033[0m: {code.definition}")
+        for example in code.examples:
+            print(
+                f"\033[1m{input_path}\033[0m:{example.line}"
+                f":{example.line_end}\t{example.reason}"
+            )
+            num_errors += 1
+    print("\n")
+    print(f"Found {num_errors} errors in {input_path}.")
+    print("\n")
+
+
+def code_command(input_path: str) -> None:
+    """
+    Process the given input path by coding log files.
+    """
+    if os.path.isfile(input_path):
+        coded_doc = code_log(input_path)
+        if coded_doc is None:
+            raise ValueError("Failed to code the document.")
+        print_coded_results(input_path, coded_doc)
+    else:
+        print("Invalid input path.")
+
+
+def lint_cli(args: Sequence[str]) -> None:
+    
+    invocation_cmd = args[0]
+    
+    args = args[1:]
+    
+    parser = argparse.ArgumentParser(
+        prog=invocation_cmd,
+        description=f"{invocation_cmd} will analyze a console log."
+        " And detect errors/inefficiencies in the log files."
+    )
+
+    parser.add_argument("logfile",
+                        type=str,
+                        help="Path to a log file.")
+
+    parsed_args = parser.parse_args(args)
+    
+    code_command(parsed_args.logfile)
--- a/python/packages/agbench/src/agbench/linter/coders/init.py
+++ b/python/packages/agbench/src/agbench/linter/coders/init.py
--- a/python/packages/agbench/src/agbench/linter/coders/oai_coder.py
+++ b/python/packages/agbench/src/agbench/linter/coders/oai_coder.py
@ -0,0 +1,212 @@
+import os
+import re
+
+from typing import List, Set, Optional
+from pydantic import BaseModel
+
+from openai import OpenAI
+
+from .._base import CodedDocument, Document, Code
+from .._base import BaseQualitativeCoder
+
+
+class CodeList(BaseModel):
+    code_list: List[Code]
+
+
+def remove_control_characters(text: str) -> str:
+    """
+    Remove control characters from the text.
+    """
+    return re.sub(r"[\x00-\x1F\x7F]", "", text)
+
+
+class OAIQualitativeCoder(BaseQualitativeCoder):
+    """Qualitative coder that asks an OpenAI chat model to produce codes for a document.
+
+    Results can optionally be cached on disk as JSON files named after the
+    document hash.
+    """
+
+    DEFAULT_MODEL = "gpt-4o"
+
+    def __init__(self, cache_dir: str = ".cache", model: str = DEFAULT_MODEL, cache_enabled: bool = False) -> None:
+        """Create the OpenAI client and store the cache and model settings.
+
+        Args:
+            cache_dir: Directory holding cached results; only used when
+                ``cache_enabled`` is true.
+            model: Model name passed to the completion request.
+            cache_enabled: Whether cached results are read and written.
+        """
+        self.client = OpenAI()
+        self.cache_dir = cache_dir
+        self.model = model
+        self.cache_enabled = cache_enabled
+
+    def code_document(
+        self,
+        doc: Document,
+        code_set: Optional[Set[Code]] = None,
+    ) -> Optional[CodedDocument]:
+        """Code ``doc`` with the OpenAI model and return the coded document.
+
+        When ``code_set`` is ``None`` the model is asked to generate a new list
+        of codes. Otherwise it is asked to apply only the given codes, and any
+        returned code whose name is not in ``code_set`` is dropped.
+
+        Args:
+            doc: Document to code. Its ``text`` is overwritten in place with
+                the sanitized text before the request is sent.
+            code_set: Optional set of existing codes to apply.
+
+        Returns:
+            The coded document; it is read from the cache instead of the
+            model when caching is enabled and a cache file for this document
+            already exists.
+
+        Raises:
+            ValueError: If the model response carries no parsed, non-empty
+                code list.
+        """
+        # get hash of the document
+        # (taken from the text as received, i.e. before it is sanitized below)
+        doc_hash = hash(doc)
+        cache_file = os.path.join(self.cache_dir, f"{doc_hash}.json") if self.cache_enabled else None
+
+        if self.cache_enabled:
+            if not os.path.exists(self.cache_dir):
+                os.makedirs(self.cache_dir)
+            # cache hit: return the stored result without calling the model
+            if cache_file and os.path.exists(cache_file):
+                with open(cache_file, "r") as f:
+                    cached_coded_doc_json = f.read()
+                    return CodedDocument.from_json(cached_coded_doc_json)
+
+        # sanitize the doc before passing it to openai
+        # NOTE(review): this mutates the caller's Document in place.
+        doc.text = remove_control_characters(doc.text)
+
+        coded_document: Optional[CodedDocument] = None
+
+        if code_set is None:
+            # no predefined codes: let the model propose its own code list
+            completion = self.client.beta.chat.completions.parse(
+                model=self.model,
+                messages=[
+                    {
+                        "role": "system",
+                        "content": """You are an expert qualitative researcher.
+
+Given a list of dcocuments containing errors below, generate a list of (error) codes.
+Each code should contains:
+- at least 3 words, max 4 word, hyphenated.
+
+For example, the name could be of the format "lack-of-word2",
+"failed-to-bar", "excessive-use-of-magenta". Name should adhere to
+Joseph M. Williams' writing principles of clarity, conciseness, and coherence.
+
+Ensure each code name is lower-case, hyphenated, and directly reflects the
+concept it represents. Avoid ambiguous or overly complex terms, and prioritize
+simplicity, precision, and readability in the naming.
+
+The code names should pass the 'clarity and grace' test by being easy to
+understand, descriptive, and reflective of the content they categorize.
+- suggest codes that are similar to good code names. avoid code names that are
+similar to bad code names.
+- The definition should be simple worded and practical. At least 2 sentences,
+ max 3. It should be written in past tense.
+
+It should convey how a labeller could apply this code to future logs, without
+mentioning the word "labeller". The definition should be specific enough to be
+useful in debugging. It should be very concrete. And should be well thought and
+make sense. Bull shitting will not earn you any points.
+
+- The examples should be a list. Each example should be descriptive between
+2-3 sentences. Examples should be concrete, informative and not vague. Provide
+at max 20 salient examples. Examples should contain a lot of detail about what
+happened and should refer to incidents in the log.
+
+- The list of codes must mutually exclusive.
+
+# GOOD EXAMPLES OF FINAL CODE NAMES/CLUSTERS
+* looped-without-progress
+* repeated-unsuccessful-actions
+* repeated-syntax-errors
+* exceeded-context-window-limits
+* encountered-security-risks
+* failure-to-switch-strategy
+* exceeded-resource-limits
+* attempted-to-handle-excessive-data
+* no-errors-detected
+These names are high-level but also concrete. They exactly mention the type of
+error, issue, gap that has been identified.
+
+## BAD EXAMPLES OF FINAL CODE NAMES/CLUSTERS
+* mismanaged-data-utilization -- too high level
+* incomplete-or-misguided-execution -- too high level
+* misaligned-agent-interactions -- too high level
+* mismanaged-task-strategies -- too high level
+* resource-inefficiencies -- vague
+* communication-issues -- vague
+* coordination-issues -- too high level and vague
+* operational-failures
+* execution-errors -- too high level
+* navigation-issues -- too concise
+* adaptive-failures -- too concise
+* successful-processes -- I dont like the word processes
+* system-constraints
+* configuration-issues
+* information-inaccuracies -- too high level
+* process-improvements -- vague, not an error
+* inadequate-error-response -- too high-level, unclear what kind of errors
+* specific-access-issues -- makes no sense
+* strategy-inefficiency -- strategy is too high level
+* error-management-gaps -- unclear what error management means
+* error-handling-deficiency -- unclear what kind of errors
+* coordination-breakdown -- unclear what coordination means
+* muddled-task-execution -- unclear what kind of tasks were muddled
+* task-completion-gaps -- too high level
+The above names are too high level and unclear. Please DO NOT use such names.
+    """,
+                    },
+                    {
+                        "role": "user",
+                        "content": doc.text,
+                    },
+                ],
+                response_format=CodeList,
+            )
+
+            message = completion.choices[0].message
+            if message.parsed and len(message.parsed.code_list) > 0:
+                coded_document = CodedDocument(
+                    doc=doc, codes=set(message.parsed.code_list)
+                )
+            else:
+                # show the refusal field of the response before failing
+                print(message.refusal)
+                raise ValueError("Error in coding document with OpenAI")
+        else:
+            # render the supplied codes as text blocks for the prompt
+            code_to_str = "\n".join(
+                [
+                    (
+                        f"\n---\nCode Name: {code.name}\n"
+                        f"Definition: {code.definition}\n"
+                        f"Examples: {code.examples}\n---\n"
+                    )
+                    for code in code_set
+                ]
+            )
+
+            completion = self.client.beta.chat.completions.parse(
+                model=self.model,
+                messages=[
+                    {
+                        "role": "system",
+                        "content": """You are an expert qualitative researcher.
+                        You can answer any questions about coding logs.""",
+                    },
+                    {
+                        "role": "user",
+                        "content": f"""
+## Context
+The text below shows a log containing errors. Your task is to code the log with
+the following codes. Generate a list of codes for the log below.
+
+Only use the codes from the list below. Do not create new codes.
+Modify the examples of the codes to fit the context of the log.
+
+Your example should be informative to narrow down the details of the error in
+the context of the example.
+
+## Codes
+
+{code_to_str}
+
+## Log
+
+{doc.text}
+""",
+                    },
+                ],
+                response_format=CodeList,
+            )
+
+            message = completion.choices[0].message
+            if message.parsed and len(message.parsed.code_list) > 0:
+                code_list = message.parsed.code_list
+                # filter out codes whose names are not in the code_set
+                code_set_names = {code.name for code in code_set}
+                code_list = [code for code in code_list if code.name in code_set_names]
+
+                coded_document = CodedDocument(doc=doc, codes=set(code_list))
+
+        # only reachable with None from the code_set branch, when the response
+        # had no parsed, non-empty code list
+        if coded_document is None:
+            raise ValueError("Error in coding document with OpenAI")
+    
+        # persist the result so the next call for this document hits the cache
+        if self.cache_enabled and cache_file:
+            with open(cache_file, "w") as f:
+                f.write(coded_document.model_dump_json(indent=4))
+        return coded_document