Add linter to AGBench (#6022)

This pull request introduces a new linting feature to the `agbench` benchmarking
package: a `lint` command that analyzes benchmark console logs and flags errors
and inefficiencies. The main changes include adding a new command to the CLI,
implementing the linter itself, and integrating it with the existing codebase.

### New Linting Feature:

* `python/packages/agbench/src/agbench/cli.py`:
  Added the `lint_cli` import and registered the new `lint` command in the
  `main` function's command table (a sketch of this dispatch pattern follows
  below).

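The registration follows `agbench`'s existing `CommandSpec` pattern. Below is an illustrative sketch of how such a command table is typically dispatched; `agbench`'s actual `main` body is not shown in this diff, so the `dispatch` helper and its argument handling are assumptions rather than the real implementation.

```python
# Illustrative sketch only: agbench's actual `main` is not shown in this diff,
# so everything here other than the CommandSpec shape and lint_cli's calling
# convention is an assumption.
import sys
from typing import Callable, List, Sequence, TypedDict


class CommandSpec(TypedDict):
    command: str
    description: str
    function: Callable[[Sequence[str]], None]


def dispatch(commands: List[CommandSpec], argv: Sequence[str]) -> None:
    # argv[0] is the program name and argv[1] the subcommand, e.g. "lint".
    name = argv[1]
    for spec in commands:
        if spec["command"] == name:
            # Pass "<prog> <subcommand>" as the invocation command, which is
            # what lint_cli uses for its usage/--help text.
            spec["function"]([f"{argv[0]} {name}", *argv[2:]])
            return
    print(f"Unknown command: {name}", file=sys.stderr)


# e.g. dispatch(commands, ["agbench", "lint", "console_log.txt"]) would call
# lint_cli(["agbench lint", "console_log.txt"]).
```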
### Linter Implementation:

* `python/packages/agbench/src/agbench/linter/__init__.py`:
  Added the imports that initialize the linter module.
* `python/packages/agbench/src/agbench/linter/_base.py`:
  Defined the core classes `Document`, `Code`, `CodeExample`, and
  `CodedDocument`, plus the `BaseQualitativeCoder` protocol.
* `python/packages/agbench/src/agbench/linter/cli.py`:
  Implemented the `lint_cli` function, which loads a log file, codes it with
  the qualitative coder, and prints the results (a short usage sketch follows
  this list).
* `python/packages/agbench/src/agbench/linter/coders/oai_coder.py`:
  Implemented the `OAIQualitativeCoder` class, which calls OpenAI to code
  documents and optionally caches the results.

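From the command line the feature is exposed as the `lint` subcommand (e.g. `agbench lint <logfile>`); the same pipeline can also be driven from Python. The following is a minimal sketch based on the modules added in this PR; `console_log.txt` is a placeholder path and an `OPENAI_API_KEY` must be available in the environment.

```python
# Minimal sketch of driving the linter from Python (mirrors what the `lint`
# command does internally). "console_log.txt" is a placeholder path.
from agbench.linter.cli import load_log_file, print_coded_results
from agbench.linter.coders.oai_coder import OAIQualitativeCoder

log_path = "console_log.txt"
doc = load_log_file(log_path, prepend_numbers=True)  # number lines so codes can reference them
coder = OAIQualitativeCoder()                        # defaults to gpt-4o, caching disabled
coded_doc = coder.code_document(doc)
if coded_doc is not None:
    print_coded_results(log_path, coded_doc)
```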
Example usage:

<img width="997" alt="image"
src="https://github.com/user-attachments/assets/6718688e-9917-4a43-a2f1-1105b030528d"
/>


<img width="999" alt="image"
src="https://github.com/user-attachments/assets/7fcb9c43-70f2-4fe7-ae29-5ad6a4ef2a16"
/>

> If you are in the VS Code terminal, you can click the links in the
terminal output to jump to the exact error.

---------

Co-authored-by: afourney <adamfo@microsoft.com>
Commit 878aa4c3fc (parent fef953e062), authored by gagb on 2025-03-20 12:05:42 -07:00 and committed by GitHub.
6 changed files with 407 additions and 0 deletions

`python/packages/agbench/src/agbench/cli.py`

@@ -7,6 +7,7 @@ from .remove_missing_cmd import remove_missing_cli
from .run_cmd import run_cli
from .tabulate_cmd import tabulate_cli
from .version import __version__
from .linter.cli import lint_cli


class CommandSpec(TypedDict):

@@ -33,6 +34,11 @@ def main(args: Optional[List[str]] = None) -> None:
            "description": "tabulate the results of a previous run",
            "function": tabulate_cli,
        },
        {
            "command": "lint",
            "description": "lint the benchmark configuration",
            "function": lint_cli,
        },
        {
            "command": "remove_missing",
            "description": "remove folders with missing results",

`python/packages/agbench/src/agbench/linter/__init__.py`

@@ -0,0 +1,4 @@
# __init__.py
from ._base import Code, Document, CodedDocument, BaseQualitativeCoder

__all__ = ["Code", "Document", "CodedDocument", "BaseQualitativeCoder"]

`python/packages/agbench/src/agbench/linter/_base.py`

@@ -0,0 +1,82 @@
import json
import hashlib
import re
from typing import Protocol, List, Set, Optional

from pydantic import BaseModel, Field


class Document(BaseModel):
    text: str = Field(..., description="Text content of the document.")
    name: Optional[str] = Field(None, description="Optional name of the document.")

    def __hash__(self) -> int:
        return int(hashlib.md5(self.text.encode("utf-8")).hexdigest(), 16)


class CodeExample(BaseModel):
    """
    Represents an example associated with a code.
    """

    reason: str = Field(
        ..., description="A two sentence, human-readable explanation why this example and lines relate to the code."
    )
    line_content: str = Field(
        ..., description="The exact content of the line where the error is found. This should be a single line."
    )
    line: int = Field(..., description="The most important line number where a human would say the error is.")
    line_end: int = Field(..., description="Line number where the issue ends.")


class Code(BaseModel):
    name: str = Field(..., description="Normalized unique name for the code (lowercase, hyphen separated).")
    definition: str = Field(..., description="Definition of the code.")
    examples: List[CodeExample] = Field(
        ..., description="List of code examples associated with the code. Cannot be empty."
    )
    severity: int = Field(
        ..., description="Severity rating of the error identified using the code. Valid values: 0, 1, 2."
    )
    id: Optional[int] = Field(None, description="Identifier computed using MD5 of name and definition.")
    merged_from: Optional[List[int]] = Field(None, description="List of code ids from which this code is merged.")

    def __init__(
        self,
        name: str,
        definition: str,
        examples: List[CodeExample],
        severity: int,
        id: Optional[int] = None,
        merged_from: Optional[List[int]] = None,
    ):
        super().__init__(name=name, definition=definition, examples=examples, severity=severity)
        self.name = re.sub(r"[^a-z-]", "", self.name.lower().replace(" ", "-"))
        self.id = int(hashlib.md5((self.name + self.definition).encode("utf-8")).hexdigest(), 16)
        self.merged_from = None

    def __hash__(self) -> int:
        if self.id is None:
            raise ValueError("Code ID is not set.")
        return self.id

    def add_merged_from(self, code_id: int) -> None:
        if self.merged_from is None:
            self.merged_from = []
        if code_id not in self.merged_from:
            self.merged_from.append(code_id)


class CodedDocument(BaseModel):
    doc: Document
    codes: Set[Code]

    @classmethod
    def from_json(cls, json_str: str) -> "CodedDocument":
        data = json.loads(json_str)
        doc = Document(**data["doc"])
        codes = {Code(**code) for code in data["codes"]}
        return cls(doc=doc, codes=codes)


class BaseQualitativeCoder(Protocol):
    def code_document(self, doc: Document, code_set: Optional[Set[Code]]) -> Optional[CodedDocument]: ...

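To make the data model concrete, here is a small illustrative sketch of how these classes fit together; the example values are invented, and only the class and field names come from `_base.py` above.

```python
# Hypothetical example values; only the classes/fields come from _base.py.
from agbench.linter import Code, CodedDocument, Document
from agbench.linter._base import CodeExample

doc = Document(text="1: Traceback (most recent call last): ...", name="console_log.txt")

code = Code(
    name="Repeated Syntax Errors",  # normalized to "repeated-syntax-errors" by Code.__init__
    definition="The agent re-ran code that failed with the same syntax error.",
    examples=[
        CodeExample(
            reason="The same SyntaxError appears twice with no fix attempted in between.",
            line_content="SyntaxError: invalid syntax",
            line=1,
            line_end=1,
        )
    ],
    severity=2,
)

coded = CodedDocument(doc=doc, codes={code})

# model_dump_json / from_json round-trip the coded document (this is what the cache stores).
restored = CodedDocument.from_json(coded.model_dump_json())
assert {c.name for c in restored.codes} == {"repeated-syntax-errors"}
```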
`python/packages/agbench/src/agbench/linter/cli.py`

@@ -0,0 +1,105 @@
import os
import argparse
from typing import List, Sequence, Optional

from openai import OpenAI

from ._base import Document, CodedDocument
from .coders.oai_coder import OAIQualitativeCoder


def prepend_line_numbers(lines: List[str]) -> List[str]:
    """
    Returns a list of strings with each line prefixed by its right-justified
    line number.
    """
    width = len(str(len(lines)))
    new_lines = [f"{i+1:>{width}}: {line}" for i, line in enumerate(lines)]
    return new_lines


def load_log_file(path: str, prepend_numbers: bool = False) -> Document:
    with open(path, "r") as f:
        lines = f.readlines()
        if prepend_numbers:
            lines = prepend_line_numbers(lines)
        text = "".join(lines)
        return Document(text=text, name=os.path.abspath(path))


def code_log(path: str) -> Optional[CodedDocument]:
    coder = OAIQualitativeCoder()
    if os.path.isfile(path):
        doc = load_log_file(path, prepend_numbers=True)
        coded_doc = coder.code_document(doc)
        return coded_doc
    else:
        raise FileNotFoundError(f"File {path} does not exist.")


def print_coded_results(input_path: str, coded_doc: CodedDocument) -> None:
    num_errors: int = 0

    # define map from severity to ANSI color
    severity_color_map = {2: "\033[31m", 1: "\033[33m", 0: "\033[32m"}

    # sort the codes by severity with the most severe first
    sorted_codes = sorted(coded_doc.codes, key=lambda x: x.severity, reverse=True)

    for code in sorted_codes:
        # select color based on severity, default to white if missing
        color = severity_color_map.get(code.severity, "\033[37m")
        print(f"{color}[{code.severity}]: {code.name}\033[0m: {code.definition}")
        for example in code.examples:
            print(f"\033[1m{input_path}\033[0m:{example.line}" f":{example.line_end}\t{example.reason}")
            num_errors += 1
        print("\n")

    print(f"Found {num_errors} errors in {input_path}.")
    print("\n")


def get_log_summary(input_path: str) -> str:
    """
    Generate a single sentence of summary for the given log file.
    """
    client = OpenAI()
    text = load_log_file(input_path, prepend_numbers=False).text
    response = client.responses.create(
        model="gpt-4o",
        input=f"Summarize the following log file in one sentence.\n{text}",
    )
    return response.output_text


def code_command(input_path: str) -> None:
    """
    Process the given input path by coding log files.
    """
    if os.path.isfile(input_path):
        print(f"Processing file: {input_path}")
        print(get_log_summary(input_path))
        coded_doc = code_log(input_path)
        if coded_doc is None:
            raise ValueError("Failed to code the document.")
        print_coded_results(input_path, coded_doc)
    else:
        print("Invalid input path.")


def lint_cli(args: Sequence[str]) -> None:
    invocation_cmd = args[0]
    args = args[1:]

    parser = argparse.ArgumentParser(
        prog=invocation_cmd,
        description=f"{invocation_cmd} will analyze a console log."
        " And detect errors/inefficiencies in the log files.",
    )
    parser.add_argument("logfile", type=str, help="Path to a log file.")

    parsed_args = parser.parse_args(args)
    code_command(parsed_args.logfile)

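For completeness, here is a hedged sketch of exercising `lint_cli` directly; the `"agbench lint"` invocation string and the log path are placeholders, and an OpenAI API key is required.

```python
# Direct invocation; args[0] is the invocation command echoed in usage/--help output.
from agbench.linter.cli import lint_cli, prepend_line_numbers

print(prepend_line_numbers(["first line\n", "second line\n"]))
# -> ['1: first line\n', '2: second line\n']

lint_cli(["agbench lint", "console_log.txt"])  # placeholder log path
```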
`python/packages/agbench/src/agbench/linter/coders/oai_coder.py`

@@ -0,0 +1,210 @@
import os
import re
from typing import List, Set, Optional

from pydantic import BaseModel
from openai import OpenAI

from .._base import CodedDocument, Document, Code
from .._base import BaseQualitativeCoder


class CodeList(BaseModel):
    code_list: List[Code]


def remove_control_characters(text: str) -> str:
    """
    Remove control characters from the text.
    """
    return re.sub(r"[\x00-\x1F\x7F]", "", text)


class OAIQualitativeCoder(BaseQualitativeCoder):
    DEFAULT_MODEL = "gpt-4o"

    def __init__(self, cache_dir: str = ".cache", model: str = DEFAULT_MODEL, cache_enabled: bool = False) -> None:
        self.client = OpenAI()
        self.cache_dir = cache_dir
        self.model = model
        self.cache_enabled = cache_enabled

    def code_document(
        self,
        doc: Document,
        code_set: Optional[Set[Code]] = None,
    ) -> Optional[CodedDocument]:
        # get hash of the document
        doc_hash = hash(doc)
        cache_file = os.path.join(self.cache_dir, f"{doc_hash}.json") if self.cache_enabled else None

        if self.cache_enabled:
            if not os.path.exists(self.cache_dir):
                os.makedirs(self.cache_dir)
            if cache_file and os.path.exists(cache_file):
                with open(cache_file, "r") as f:
                    cached_coded_doc_json = f.read()
                    return CodedDocument.from_json(cached_coded_doc_json)

        # sanitize the doc before passing it to openai
        doc.text = remove_control_characters(doc.text)

        coded_document: Optional[CodedDocument] = None
        if code_set is None:
            completion = self.client.beta.chat.completions.parse(
                model=self.model,
                messages=[
                    {
                        "role": "system",
                        "content": """You are an expert qualitative researcher.
Given a list of documents containing errors below, generate a list of (error) codes.
Each code should contain:
- at least 3 words, max 4 words, hyphenated.
For example, the name could be of the format "lack-of-word2",
"failed-to-bar", "excessive-use-of-magenta". Name should adhere to
Joseph M. Williams' writing principles of clarity, conciseness, and coherence.
Ensure each code name is lower-case, hyphenated, and directly reflects the
concept it represents. Avoid ambiguous or overly complex terms, and prioritize
simplicity, precision, and readability in the naming.
The code names should pass the 'clarity and grace' test by being easy to
understand, descriptive, and reflective of the content they categorize.
- suggest codes that are similar to good code names. avoid code names that are
similar to bad code names.
- The definition should be simple worded and practical. At least 2 sentences,
max 3. It should be written in past tense.
It should convey how a labeller could apply this code to future logs, without
mentioning the word "labeller". The definition should be specific enough to be
useful in debugging. It should be very concrete. And should be well thought and
make sense. Bull shitting will not earn you any points.
- The examples should be a list. Each example should be descriptive between
2-3 sentences. Examples should be concrete, informative and not vague. Provide
at max 20 salient examples. Examples should contain a lot of detail about what
happened and should refer to incidents in the log.
- The list of codes must be mutually exclusive.
# GOOD EXAMPLES OF FINAL CODE NAMES/CLUSTERS
* looped-without-progress
* repeated-unsuccessful-actions
* repeated-syntax-errors
* exceeded-context-window-limits
* encountered-security-risks
* failure-to-switch-strategy
* exceeded-resource-limits
* attempted-to-handle-excessive-data
* no-errors-detected
These names are high-level but also concrete. They exactly mention the type of
error, issue, gap that has been identified.
## BAD EXAMPLES OF FINAL CODE NAMES/CLUSTERS
* mismanaged-data-utilization -- too high level
* incomplete-or-misguided-execution -- too high level
* misaligned-agent-interactions -- too high level
* mismanaged-task-strategies -- too high level
* resource-inefficiencies -- vague
* communication-issues -- vague
* coordination-issues -- too high level and vague
* operational-failures
* execution-errors -- too high level
* navigation-issues -- too concise
* adaptive-failures -- too concise
* successful-processes -- I don't like the word processes
* system-constraints
* configuration-issues
* information-inaccuracies -- too high level
* process-improvements -- vague, not an error
* inadequate-error-response -- too high-level, unclear what kind of errors
* specific-access-issues -- makes no sense
* strategy-inefficiency -- strategy is too high level
* error-management-gaps -- unclear what error management means
* error-handling-deficiency -- unclear what kind of errors
* coordination-breakdown -- unclear what coordination means
* muddled-task-execution -- unclear what kind of tasks were muddled
* task-completion-gaps -- too high level
The above names are too high level and unclear. Please DO NOT use such names.
""",
                    },
                    {
                        "role": "user",
                        "content": doc.text,
                    },
                ],
                response_format=CodeList,
            )
            message = completion.choices[0].message
            if message.parsed and len(message.parsed.code_list) > 0:
                coded_document = CodedDocument(doc=doc, codes=set(message.parsed.code_list))
            else:
                print(message.refusal)
                raise ValueError("Error in coding document with OpenAI")
        else:
            code_to_str = "\n".join(
                [
                    (
                        f"\n---\nCode Name: {code.name}\n"
                        f"Definition: {code.definition}\n"
                        f"Examples: {code.examples}\n---\n"
                    )
                    for code in code_set
                ]
            )
            completion = self.client.beta.chat.completions.parse(
                model=self.model,
                messages=[
                    {
                        "role": "system",
                        "content": """You are an expert qualitative researcher.
You can answer any questions about coding logs.""",
                    },
                    {
                        "role": "user",
                        "content": f"""
## Context
The text below shows a log containing errors. Your task is to code the log with
the following codes. Generate a list of codes for the log below.
Only use the codes from the list below. Do not create new codes.
Modify the examples of the codes to fit the context of the log.
Your example should be informative to narrow down the details of the error in
the context of the example.
## Codes
{code_to_str}
## Log
{doc.text}
"""
                    },
                ],
                response_format=CodeList,
            )
            message = completion.choices[0].message
            if message.parsed and len(message.parsed.code_list) > 0:
                code_list = message.parsed.code_list
                # filter out codes whose names are not in the code_set
                code_set_names = {code.name for code in code_set}
                code_list = [code for code in code_list if code.name in code_set_names]
                coded_document = CodedDocument(doc=doc, codes=set(code_list))

        if coded_document is None:
            raise ValueError("Error in coding document with OpenAI")

        if self.cache_enabled and cache_file:
            with open(cache_file, "w") as f:
                f.write(coded_document.model_dump_json(indent=4))

        return coded_document
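Finally, a hedged sketch of the two `code_document` modes shown above (open coding with model-proposed codes vs. coding against a fixed code set), with caching enabled. The code set, log text, and paths below are illustrative only.

```python
# Illustrative only: the code set and log text are invented placeholders.
from agbench.linter import Code, Document
from agbench.linter._base import CodeExample
from agbench.linter.coders.oai_coder import OAIQualitativeCoder

coder = OAIQualitativeCoder(cache_dir=".cache", model="gpt-4o", cache_enabled=True)
doc = Document(text="... console log contents ...", name="console_log.txt")

# Mode 1: open coding -- the model proposes its own codes.
open_coded = coder.code_document(doc)

# Mode 2: closed coding -- only codes from a predefined set are kept.
fixed_codes = {
    Code(
        name="exceeded-context-window-limits",
        definition="The run failed because the model's context window was exceeded.",
        examples=[
            CodeExample(
                reason="The log shows a context-length error returned by the model provider.",
                line_content="Error: maximum context length exceeded",
                line=1,
                line_end=1,
            )
        ],
        severity=2,
    )
}
closed_coded = coder.code_document(doc, code_set=fixed_codes)

# With cache_enabled=True, results are written to .cache/<hash(doc)>.json and
# reused on the next call for a document with identical text.
```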