Mirror of https://github.com/getzep/graphiti.git, synced 2025-06-27 02:00:02 +00:00
e2e graph builder eval (#343)
* add partial eval platform
* dedupe updates
* add e2e eval
* Update graphiti_core/prompts/eval.py
  Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>
* clear all outputs
* clear all outputs
* squash eval commits
* Update tests/evals/data/utils.py
  Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>
* add longmemeval disclaimer
* remove gitignore
* add copyright headers
* add cli
* Update tests/evals/data/longmemeval_data/README.md
  Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>
* Update tests/evals/eval_e2e_graph_building.py
  Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>
* updates

---------

Co-authored-by: jackaldenryan <jackaldenryan@gmail.com>
Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>
This commit is contained in:
parent 6aa25a1901
commit 5dce26722e
.gitignore (vendored): +3

@@ -164,3 +164,6 @@ cython_debug/
 ## Other
 # Cache files
 cache.db*
+
+# All DS_Store files
+.DS_Store
graphiti_core/prompts/eval.py

@@ -37,16 +37,28 @@ class EvalResponse(BaseModel):
     )


+class EvalAddEpisodeResults(BaseModel):
+    candidate_is_worse: bool = Field(
+        ...,
+        description='boolean if the baseline extraction is higher quality than the candidate extraction.',
+    )
+    reasoning: str = Field(
+        ..., description='why you determined the response was correct or incorrect'
+    )
+
+
 class Prompt(Protocol):
     qa_prompt: PromptVersion
     eval_prompt: PromptVersion
     query_expansion: PromptVersion
+    eval_add_episode_results: PromptVersion


 class Versions(TypedDict):
     qa_prompt: PromptFunction
     eval_prompt: PromptFunction
     query_expansion: PromptFunction
+    eval_add_episode_results: PromptFunction


 def query_expansion(context: dict[str, Any]) -> list[Message]:
@@ -112,8 +124,41 @@ def eval_prompt(context: dict[str, Any]) -> list[Message]:
     ]


+def eval_add_episode_results(context: dict[str, Any]) -> list[Message]:
+    sys_prompt = """You are a judge that determines whether a baseline graph building result from a list of messages is better
+    than a candidate graph building result based on the same messages."""
+
+    user_prompt = f"""
+    Given the following PREVIOUS MESSAGES and MESSAGE, determine if the BASELINE graph data extracted from the
+    conversation is higher quality than the CANDIDATE graph data extracted from the conversation.
+
+    Return False if the BASELINE extraction is better, and True otherwise. If the CANDIDATE extraction and
+    BASELINE extraction are nearly identical in quality, return True. Add your reasoning for your decision to the reasoning field
+
+    <PREVIOUS MESSAGES>
+    {context['previous_messages']}
+    </PREVIOUS MESSAGES>
+    <MESSAGE>
+    {context['message']}
+    </MESSAGE>
+
+    <BASELINE>
+    {context['baseline']}
+    </BASELINE>
+
+    <CANDIDATE>
+    {context['candidate']}
+    </CANDIDATE>
+    """
+    return [
+        Message(role='system', content=sys_prompt),
+        Message(role='user', content=user_prompt),
+    ]
+
+
 versions: Versions = {
     'qa_prompt': qa_prompt,
     'eval_prompt': eval_prompt,
     'query_expansion': query_expansion,
+    'eval_add_episode_results': eval_add_episode_results,
 }
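Usage note: a minimal sketch of exercising the new judge prompt together with the structured response model. The sample context values are invented for illustration; the generate_response/response_model pattern mirrors eval_graph in tests/evals/eval_e2e_graph_building.py below.

from graphiti_core.llm_client import OpenAIClient
from graphiti_core.prompts import prompt_library
from graphiti_core.prompts.eval import EvalAddEpisodeResults


async def judge_once(llm_client: OpenAIClient) -> bool:
    # Sample context; real callers pass graph extractions and conversation text.
    context = {
        'previous_messages': ['user: I moved to Lisbon last month.'],
        'message': 'user: I work remotely as a designer.',
        'baseline': {'nodes': ['user', 'Lisbon'], 'edges': ['user LIVES_IN Lisbon']},
        'candidate': {'nodes': ['user', 'Lisbon'], 'edges': ['user LIVES_IN Lisbon']},
    }
    response = await llm_client.generate_response(
        prompt_library.eval.eval_add_episode_results(context),
        response_model=EvalAddEpisodeResults,
    )
    # The response is dict-like; a missing key is treated as "candidate not
    # worse", matching eval_graph's default below.
    return response.get('candidate_is_worse', False)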
tests/evals/data/longmemeval_data/README.md (new file): +3

@@ -0,0 +1,3 @@
+The `longmemeval_oracle` dataset is an open-source dataset that we are using.
+We did not create this dataset and it can be found
+here: https://huggingface.co/datasets/xiaowu0162/longmemeval/blob/main/longmemeval_oracle.
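For orientation, the fields the eval code below reads from each dataset record look roughly like this. This is a sketch inferred from build_graph's usage of haystack_sessions and haystack_dates, not the dataset's documented schema; the message text and date are invented.

# Sketch of the record shape build_graph relies on (inferred from usage):
record = {
    'haystack_sessions': [   # one entry per session in the multi-session haystack
        [                    # each session: a list of {'role', 'content'} messages
            {'role': 'user', 'content': 'I adopted a cat named Miso.'},
            {'role': 'assistant', 'content': 'Congratulations on adopting Miso!'},
        ],
    ],
    'haystack_dates': [      # one date string per session; build_graph appends
        '2023/05/20 (Sat) 02:21',  # ' UTC' and parses '%Y/%m/%d (%a) %H:%M UTC'
    ],
}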
tests/evals/data/longmemeval_data/longmemeval_oracle.json (new file): +67042

File diff suppressed because one or more lines are too long.
tests/evals/eval_cli.py (new file): +39

@@ -0,0 +1,39 @@
+import argparse
+import asyncio
+
+from tests.evals.eval_e2e_graph_building import build_baseline_graph, eval_graph
+
+
+async def main():
+    parser = argparse.ArgumentParser(
+        description='Run eval_graph and optionally build_baseline_graph from the command line.'
+    )
+
+    parser.add_argument(
+        '--multi-session',
+        type=int,
+        nargs='+',
+        required=True,
+        help='List of integers representing multi-session values (e.g., 1 2 3)',
+    )
+    parser.add_argument('--session-length', type=int, required=True, help='Length of each session')
+    parser.add_argument(
+        '--build-baseline', action='store_true', help='If set, also runs build_baseline_graph'
+    )
+
+    args = parser.parse_args()
+
+    # Optionally run the async function
+    if args.build_baseline:
+        print('Running build_baseline_graph...')
+        await build_baseline_graph(
+            multi_session=args.multi_session, session_length=args.session_length
+        )
+
+    # Always call eval_graph
+    result = await eval_graph(multi_session=args.multi_session, session_length=args.session_length)
+    print('Result of eval_graph:', result)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
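A quick usage sketch for the new CLI. The invocation path is an assumption about the repo layout, and running it requires the Neo4j and OpenAI configuration the eval module below relies on.

# Shell invocation (assumed layout, run from the repo root):
#   python tests/evals/eval_cli.py --multi-session 0 1 2 --session-length 10 --build-baseline
#
# Programmatic equivalent, calling the same functions the CLI wraps:
import asyncio

from tests.evals.eval_e2e_graph_building import build_baseline_graph, eval_graph


async def run() -> None:
    await build_baseline_graph(multi_session=[0, 1, 2], session_length=10)
    score = await eval_graph(multi_session=[0, 1, 2], session_length=10)
    print('eval_graph score:', score)


asyncio.run(run())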
tests/evals/eval_e2e_graph_building.py (new file): +157

@@ -0,0 +1,157 @@
+"""
+Copyright 2024, Zep Software, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import json
+from datetime import datetime, timezone
+
+import pandas as pd
+
+from graphiti_core import Graphiti
+from graphiti_core.graphiti import AddEpisodeResults
+from graphiti_core.llm_client import LLMConfig, OpenAIClient
+from graphiti_core.nodes import EpisodeType
+from graphiti_core.prompts import prompt_library
+from graphiti_core.prompts.eval import EvalAddEpisodeResults
+from graphiti_core.utils.maintenance import clear_data
+from tests.test_graphiti_int import NEO4J_URI, NEO4j_PASSWORD, NEO4j_USER
+
+
+async def build_graph(
+    group_id_suffix: str, multi_session: list[int], session_length: int, graphiti: Graphiti
+) -> tuple[dict[str, list[AddEpisodeResults]], dict[str, list[str]]]:
+    # Get longmemeval dataset
+    lme_dataset_option = (
+        'data/longmemeval_data/longmemeval_oracle.json'  # Can be _oracle, _s, or _m
+    )
+    lme_dataset_df = pd.read_json(lme_dataset_option)
+
+    add_episode_results: dict[str, list[AddEpisodeResults]] = {}
+    add_episode_context: dict[str, list[str]] = {}
+    for multi_session_idx in multi_session:
+        multi_session = lme_dataset_df['haystack_sessions'].iloc[multi_session_idx]
+        multi_session_dates = lme_dataset_df['haystack_dates'].iloc[multi_session_idx]
+
+        user_id = 'lme_oracle_experiment_user_' + str(multi_session_idx)
+        await clear_data(graphiti.driver, [user_id])
+
+        add_episode_results[user_id] = []
+        add_episode_context[user_id] = []
+
+        message_count = 0
+        for session_idx, session in enumerate(multi_session):
+            for _, msg in enumerate(session):
+                if message_count >= session_length:
+                    continue
+                message_count += 1
+                date = multi_session_dates[session_idx] + ' UTC'
+                date_format = '%Y/%m/%d (%a) %H:%M UTC'
+                date_string = datetime.strptime(date, date_format).replace(tzinfo=timezone.utc)
+
+                episode_body = f'{msg["role"]}: {msg["content"]}'
+                results = await graphiti.add_episode(
+                    name='',
+                    episode_body=episode_body,
+                    reference_time=date_string,
+                    source=EpisodeType.message,
+                    source_description='',
+                    group_id=user_id + '_' + group_id_suffix,
+                )
+                for node in results.nodes:
+                    node.name_embedding = None
+                for edge in results.edges:
+                    edge.fact_embedding = None
+
+                add_episode_results[user_id].append(results)
+                add_episode_context[user_id].append(msg['content'])
+    return add_episode_results, add_episode_context
+
+
+async def build_baseline_graph(multi_session: list[int], session_length: int):
+    # Use gpt-4o for graph building baseline
+    llm_client = OpenAIClient(config=LLMConfig(model='gpt-4o'))
+    graphiti = Graphiti(NEO4J_URI, NEO4j_USER, NEO4j_PASSWORD, llm_client=llm_client)
+
+    add_episode_results, _ = await build_graph('baseline', multi_session, session_length, graphiti)
+
+    filename = 'baseline_graph_results.json'
+
+    serializable_baseline_graph_results = {
+        key: [item.model_dump(mode='json') for item in value]
+        for key, value in add_episode_results.items()
+    }
+
+    with open(filename, 'w') as file:
+        json.dump(serializable_baseline_graph_results, file, indent=4, default=str)
+
+
+async def eval_graph(multi_session: list[int], session_length: int, llm_client=None) -> float:
+    if llm_client is None:
+        llm_client = OpenAIClient()
+    graphiti = Graphiti(NEO4J_URI, NEO4j_USER, NEO4j_PASSWORD, llm_client=llm_client)
+    with open('baseline_graph_results.json') as file:
+        baseline_results_raw = json.load(file)
+
+    baseline_results: dict[str, list[AddEpisodeResults]] = {
+        key: [AddEpisodeResults(**item) for item in value]
+        for key, value in baseline_results_raw.items()
+    }
+    add_episode_results, add_episode_context = await build_graph(
+        'candidate', multi_session, session_length, graphiti
+    )
+
+    filename = 'candidate_graph_results.json'
+
+    candidate_baseline_graph_results = {
+        key: [item.model_dump(mode='json') for item in value]
+        for key, value in add_episode_results.items()
+    }
+
+    with open(filename, 'w') as file:
+        json.dump(candidate_baseline_graph_results, file, indent=4, default=str)
+
+    raw_score = 0
+    user_count = 0
+    for user_id in add_episode_results:
+        user_count += 1
+        user_raw_score = 0
+        print('add_episode_context: ', add_episode_context)
+        for baseline_result, add_episode_result, episodes in zip(
+            baseline_results[user_id],
+            add_episode_results[user_id],
+            add_episode_context[user_id],
+            strict=True,
+        ):
+            context = {
+                'baseline': baseline_result,
+                'candidate': add_episode_result,
+                'message': episodes[0],
+                'previous_messages': episodes[1:],
+            }
+            print(context)
+
+            llm_response = await llm_client.generate_response(
+                prompt_library.eval.eval_add_episode_results(context),
+                response_model=EvalAddEpisodeResults,
+            )
+
+            candidate_is_worse = llm_response.get('candidate_is_worse', False)
+            user_raw_score += 0 if candidate_is_worse else 1
+            print('llm_response:', llm_response)
+        user_score = user_raw_score / len(add_episode_results[user_id])
+        raw_score += user_score
+    score = raw_score / user_count
+
+    return score
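The score eval_graph reports is the mean over users of the fraction of episodes where the judge did not mark the candidate as worse. A minimal sketch of that arithmetic, with invented verdict values:

# verdicts maps each user to the judge's per-episode candidate_is_worse
# booleans (values here are invented for illustration).
verdicts: dict[str, list[bool]] = {
    'lme_oracle_experiment_user_0': [False, False, True],  # 1 of 3 marked worse
    'lme_oracle_experiment_user_1': [False, False],        # none marked worse
}

# Per-user score: fraction of episodes where the candidate was not worse.
user_scores = [
    sum(0 if worse else 1 for worse in per_episode) / len(per_episode)
    for per_episode in verdicts.values()
]
# Overall score: mean of the per-user scores.
score = sum(user_scores) / len(user_scores)
print(f'score = {score:.3f}')  # (2/3 + 2/2) / 2 ≈ 0.833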
tests/evals/pytest.ini (new file): +4

@@ -0,0 +1,4 @@
+[pytest]
+asyncio_default_fixture_loop_scope = function
+markers =
+    integration: marks tests as integration tests
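For reference, the integration marker registered above is applied and selected in the usual pytest way; the test name and body here are placeholders, selected with `pytest tests/evals -m integration`.

import pytest


@pytest.mark.integration
def test_graph_building_end_to_end():
    # Placeholder body; real integration tests would hit Neo4j and OpenAI.
    ...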
tests/evals/utils.py (new file): +39

@@ -0,0 +1,39 @@
+"""
+Copyright 2024, Zep Software, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import logging
+import sys
+
+
+def setup_logging():
+    # Create a logger
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)  # Set the logging level to INFO
+
+    # Create console handler and set level to INFO
+    console_handler = logging.StreamHandler(sys.stdout)
+    console_handler.setLevel(logging.INFO)
+
+    # Create formatter
+    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    # Add formatter to console handler
+    console_handler.setFormatter(formatter)
+
+    # Add console handler to logger
+    logger.addHandler(console_handler)
+
+    return logger
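A one-line usage sketch for the helper, assuming the module path above:

from tests.evals.utils import setup_logging

logger = setup_logging()
logger.info('starting e2e graph builder eval')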