From c9a6256af801de4fdb123c2053a6f22cc690d621 Mon Sep 17 00:00:00 2001
From: zrguo <49157727+LarFii@users.noreply.github.com>
Date: Tue, 27 May 2025 16:09:57 +0800
Subject: [PATCH] Move batch_eval.py to ./reproduce/

---
 README.md               |   2 +-
 reproduce/batch_eval.py | 119 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 120 insertions(+), 1 deletion(-)
 create mode 100644 reproduce/batch_eval.py

diff --git a/README.md b/README.md
index e45ca155..b9d6a31c 100644
--- a/README.md
+++ b/README.md
@@ -1243,7 +1243,7 @@ Output the results in the following structure:
 
 ### Batch Eval
 
-To evaluate the performance of two RAG systems on high-level queries, LightRAG uses the following prompt, with the specific code available in `example/batch_eval.py`.
+To evaluate the performance of two RAG systems on high-level queries, LightRAG uses the following prompt, with the specific code available in `reproduce/batch_eval.py`.
 
 <details>
 <summary> Prompt </summary>

diff --git a/reproduce/batch_eval.py b/reproduce/batch_eval.py
new file mode 100644
index 00000000..a85e1ede
--- /dev/null
+++ b/reproduce/batch_eval.py
@@ -0,0 +1,119 @@
+import re
+import json
+import sys
+
+import jsonlines
+from openai import OpenAI
+
+
+def batch_eval(query_file, result1_file, result2_file, output_file_path):
+    client = OpenAI()
+
+    with open(query_file, "r") as f:
+        data = f.read()
+
+    # Extract the questions, one per "- Question N: ..." line.
+    queries = re.findall(r"- Question \d+: (.+)", data)
+
+    # Each result file is a JSON list of objects carrying a "result" field.
+    with open(result1_file, "r") as f:
+        answers1 = json.load(f)
+    answers1 = [i["result"] for i in answers1]
+
+    with open(result2_file, "r") as f:
+        answers2 = json.load(f)
+    answers2 = [i["result"] for i in answers2]
+
+    # Build one Batch API request per (query, answer1, answer2) triple.
+    requests = []
+    for i, (query, answer1, answer2) in enumerate(zip(queries, answers1, answers2)):
+        sys_prompt = """
+        ---Role---
+        You are an expert tasked with evaluating two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**.
+        """
+
+        prompt = f"""
+        You will evaluate two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**.
+
+        - **Comprehensiveness**: How much detail does the answer provide to cover all aspects and details of the question?
+        - **Diversity**: How varied and rich is the answer in providing different perspectives and insights on the question?
+        - **Empowerment**: How well does the answer help the reader understand and make informed judgments about the topic?
+
+        For each criterion, choose the better answer (either Answer 1 or Answer 2) and explain why. Then, select an overall winner based on these three categories.
+
+        Here is the question:
+        {query}
+
+        Here are the two answers:
+
+        **Answer 1:**
+        {answer1}
+
+        **Answer 2:**
+        {answer2}
+
+        Evaluate both answers using the three criteria listed above and provide detailed explanations for each criterion.
+
+        Output your evaluation in the following JSON format:
+
+        {{
+            "Comprehensiveness": {{
+                "Winner": "[Answer 1 or Answer 2]",
+                "Explanation": "[Provide explanation here]"
+            }},
+            "Diversity": {{
+                "Winner": "[Answer 1 or Answer 2]",
+                "Explanation": "[Provide explanation here]"
+            }},
+            "Empowerment": {{
+                "Winner": "[Answer 1 or Answer 2]",
+                "Explanation": "[Provide explanation here]"
+            }},
+            "Overall Winner": {{
+                "Winner": "[Answer 1 or Answer 2]",
+                "Explanation": "[Summarize why this answer is the overall winner based on the three criteria]"
+            }}
+        }}
+        """
+
+        request_data = {
+            "custom_id": f"request-{i+1}",
+            "method": "POST",
+            "url": "/v1/chat/completions",
+            "body": {
+                "model": "gpt-4o-mini",
+                "messages": [
+                    {"role": "system", "content": sys_prompt},
+                    {"role": "user", "content": prompt},
+                ],
+            },
+        }
+
+        requests.append(request_data)
+
+    # The Batch API expects one JSON request object per line (JSONL).
+    with jsonlines.open(output_file_path, mode="w") as writer:
+        for request in requests:
+            writer.write(request)
+
+    print(f"Batch API requests written to {output_file_path}")
+
+    # Upload the JSONL file and create a batch job with a 24-hour window.
+    batch_input_file = client.files.create(
+        file=open(output_file_path, "rb"), purpose="batch"
+    )
+    batch_input_file_id = batch_input_file.id
+
+    batch = client.batches.create(
+        input_file_id=batch_input_file_id,
+        endpoint="/v1/chat/completions",
+        completion_window="24h",
+        metadata={"description": "nightly eval job"},
+    )
+
+    print(f"Batch {batch.id} has been created.")
+
+
+if __name__ == "__main__":
+    # Usage: python batch_eval.py <query_file> <result1_file> <result2_file> <output_file_path>
+    batch_eval(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
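
For reference, the script assumes two input formats, both implied by the code above: the query file is plain text with one question per "- Question N: ..." line (the regex in batch_eval matches nothing else), and each result file is a JSON array of objects from which only the "result" field is read. A minimal sketch of matching inputs, with hypothetical file names and placeholder text:

    # queries.txt -- parsed by re.findall(r"- Question \d+: (.+)", data)
    - Question 1: <first high-level question>
    - Question 2: <second high-level question>

    # result1.json / result2.json -- only the "result" field of each entry is used
    [
        {"result": "<system answer to question 1>"},
        {"result": "<system answer to question 2>"}
    ]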
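
The script only submits the batch; results arrive asynchronously within the 24-hour completion window and must be fetched separately. A minimal polling sketch using the same OpenAI v1 client (poll_batch and the 60-second interval are illustrative, not part of the committed script):

    import time

    from openai import OpenAI


    def poll_batch(batch_id, interval=60):
        # Illustrative helper: wait for the batch to reach a terminal state,
        # then return the raw JSONL output (one response object per line).
        client = OpenAI()
        while True:
            batch = client.batches.retrieve(batch_id)
            if batch.status == "completed":
                return client.files.content(batch.output_file_id).text
            if batch.status in ("failed", "expired", "cancelled"):
                raise RuntimeError(f"batch ended with status {batch.status!r}")
            time.sleep(interval)

Each output line echoes the request's custom_id ("request-1", "request-2", ...), so evaluations can be matched back to their queries regardless of completion order.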