Mirror of https://github.com/HKUDS/LightRAG.git (synced 2025-07-03 07:04:04 +00:00)

Commit: Move batch_eval.py to ./reproduce/
parent 40b10e8fcf
commit c9a6256af8
```diff
@@ -1243,7 +1243,7 @@ Output the results in the following structure:
 ### Batch Eval
 
-To evaluate the performance of two RAG systems on high-level queries, LightRAG uses the following prompt, with the specific code available in `example/batch_eval.py`.
+To evaluate the performance of two RAG systems on high-level queries, LightRAG uses the following prompt, with the specific code available in `reproduce/batch_eval.py`.
 
 <details>
 <summary> Prompt </summary>
```
reproduce/batch_eval.py (new file, 108 lines)
@@ -0,0 +1,108 @@
```python
import re
import json

import jsonlines
from openai import OpenAI


def batch_eval(query_file, result1_file, result2_file, output_file_path):
    client = OpenAI()

    # Extract the generated questions from the query file.
    with open(query_file, "r") as f:
        data = f.read()

    queries = re.findall(r"- Question \d+: (.+)", data)

    # Load the answers produced by each RAG system; both files are JSON
    # arrays of objects whose "result" field holds the answer text.
    with open(result1_file, "r") as f:
        answers1 = json.load(f)
    answers1 = [i["result"] for i in answers1]

    with open(result2_file, "r") as f:
        answers2 = json.load(f)
    answers2 = [i["result"] for i in answers2]

    # Build one Batch API request per (query, answer1, answer2) triple.
    requests = []
    for i, (query, answer1, answer2) in enumerate(zip(queries, answers1, answers2)):
        sys_prompt = """
---Role---
You are an expert tasked with evaluating two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**.
"""

        # Note: the original prompt promised three criteria but omitted the
        # "Diversity" block from the JSON format below; it is restored here.
        prompt = f"""
You will evaluate two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**.

- **Comprehensiveness**: How much detail does the answer provide to cover all aspects and details of the question?
- **Diversity**: How varied and rich is the answer in providing different perspectives and insights on the question?
- **Empowerment**: How well does the answer help the reader understand and make informed judgments about the topic?

For each criterion, choose the better answer (either Answer 1 or Answer 2) and explain why. Then, select an overall winner based on these three categories.

Here is the question:
{query}

Here are the two answers:

**Answer 1:**
{answer1}

**Answer 2:**
{answer2}

Evaluate both answers using the three criteria listed above and provide detailed explanations for each criterion.

Output your evaluation in the following JSON format:

{{
    "Comprehensiveness": {{
        "Winner": "[Answer 1 or Answer 2]",
        "Explanation": "[Provide explanation here]"
    }},
    "Diversity": {{
        "Winner": "[Answer 1 or Answer 2]",
        "Explanation": "[Provide explanation here]"
    }},
    "Empowerment": {{
        "Winner": "[Answer 1 or Answer 2]",
        "Explanation": "[Provide explanation here]"
    }},
    "Overall Winner": {{
        "Winner": "[Answer 1 or Answer 2]",
        "Explanation": "[Summarize why this answer is the overall winner based on the three criteria]"
    }}
}}
"""

        request_data = {
            "custom_id": f"request-{i + 1}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o-mini",
                "messages": [
                    {"role": "system", "content": sys_prompt},
                    {"role": "user", "content": prompt},
                ],
            },
        }

        requests.append(request_data)

    # Write the requests as JSONL, the input format expected by the Batch API.
    with jsonlines.open(output_file_path, mode="w") as writer:
        for request in requests:
            writer.write(request)

    print(f"Batch API requests written to {output_file_path}")

    # Upload the JSONL file and start an asynchronous batch job.
    with open(output_file_path, "rb") as f:
        batch_input_file = client.files.create(file=f, purpose="batch")

    batch = client.batches.create(
        input_file_id=batch_input_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={"description": "nightly eval job"},
    )

    print(f"Batch {batch.id} has been created.")


if __name__ == "__main__":
    # The original call passed no arguments, which raises a TypeError; the
    # paths below are placeholders — substitute your own query and result files.
    batch_eval("queries.txt", "result1.json", "result2.json", "batch_requests.jsonl")
```
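For reference, here is a minimal sketch of the input formats the script expects, inferred from its parsing logic (`re.findall(r"- Question \d+: (.+)", data)` and `i["result"]`). All file names and contents are illustrative placeholders, and running it end-to-end requires an `OPENAI_API_KEY`:

```python
# Illustrative only: builds toy inputs in the shapes batch_eval() parses,
# then generates the batch request file. All file names are placeholders.
import json

# The query file is plain text; questions are extracted line by line
# with the regex r"- Question \d+: (.+)".
with open("queries.txt", "w") as f:
    f.write("- Question 1: How do the two systems handle multi-hop questions?\n")
    f.write("- Question 2: Which system cites its sources more reliably?\n")

# Each result file is a JSON array of objects with a "result" field,
# aligned by index with the questions above.
for name, text in [("result1.json", "Answer from system 1 ..."),
                   ("result2.json", "Answer from system 2 ...")]:
    with open(name, "w") as f:
        json.dump([{"result": text}, {"result": text}], f)

batch_eval("queries.txt", "result1.json", "result2.json", "batch_requests.jsonl")
```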
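Note that the script only submits the batch; it does not wait for or download the evaluation results. A possible follow-up step, sketched here against the standard OpenAI Python SDK (the batch id comes from the `Batch {batch.id} has been created.` message; the polling interval and output path are arbitrary choices):

```python
import time

from openai import OpenAI

client = OpenAI()


def download_batch_results(batch_id, out_path="batch_results.jsonl"):
    # Poll until the batch reaches a terminal state; a job may take up
    # to the 24h completion window to finish.
    while True:
        batch = client.batches.retrieve(batch_id)
        if batch.status in ("completed", "failed", "expired", "cancelled"):
            break
        time.sleep(60)

    if batch.status != "completed":
        raise RuntimeError(f"Batch ended with status {batch.status!r}")

    # Each line of the output file holds one JSON response, matched to its
    # request by the "custom_id" field ("request-1", "request-2", ...).
    content = client.files.content(batch.output_file_id)
    with open(out_path, "w") as f:
        f.write(content.text)
    print(f"Results written to {out_path}")
```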