graphiti/graphiti_core/helpers.py

"""
Copyright 2024, Zep Software, Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import asyncio
import os
from collections.abc import Coroutine
from datetime import datetime
from typing import Any

import numpy as np
from dotenv import load_dotenv
from neo4j import time as neo4j_time
from typing_extensions import LiteralString

load_dotenv()

# Value of the DEFAULT_DATABASE environment variable, or None when it is unset.
DEFAULT_DATABASE = os.getenv('DEFAULT_DATABASE', None)
# Environment values are strings, so bool() would treat ANY non-empty value
# (including 'false' or '0') as enabled. Compare against the usual falsy
# spellings instead; every other non-empty value still enables the flag.
USE_PARALLEL_RUNTIME = os.getenv('USE_PARALLEL_RUNTIME', '').strip().lower() not in (
    '',
    '0',
    'false',
    'no',
    'off',
)
# Default concurrency bound for semaphore_gather (overridable via the environment).
SEMAPHORE_LIMIT = int(os.getenv('SEMAPHORE_LIMIT', 20))
# Read from the MAX_REFLEXION_ITERATIONS environment variable; defaults to 0.
MAX_REFLEXION_ITERATIONS = int(os.getenv('MAX_REFLEXION_ITERATIONS', 0))
DEFAULT_PAGE_LIMIT = 20

# Cypher prefix that selects the parallel runtime; empty when the flag is off.
# NOTE(review): presumably prepended to query strings by callers - confirm.
RUNTIME_QUERY: LiteralString = (
    'CYPHER runtime = parallel parallelRuntimeSupport=all\n' if USE_PARALLEL_RUNTIME else ''
)


def parse_db_date(neo_date: neo4j_time.DateTime | None) -> datetime | None:
    """Convert a Neo4j ``DateTime`` into a native ``datetime``.

    A falsy argument (such as ``None``) yields ``None``; any other value is
    converted by calling its ``to_native()`` method.
    """
    if not neo_date:
        return None
    return neo_date.to_native()


# Translation table for lucene_sanitize, built once at import time instead of
# on every call. Each listed character maps to itself prefixed with a backslash:
#   + - & | ! ( ) { } [ ] ^ " ~ * ? : \ /   and the capitals O R N T A D
_LUCENE_ESCAPE_TABLE = str.maketrans(
    {char: '\\' + char for char in '+-&|!(){}[]^"~*?:\\/ORNTAD'}
)


def lucene_sanitize(query: str) -> str:
    """Escape Lucene special characters in ``query``.

    Every occurrence of ``+ - & | ! ( ) { } [ ] ^ " ~ * ? : \\ /`` is prefixed
    with a backslash. The uppercase letters ``O R N T A D`` are escaped as well.
    NOTE(review): presumably this keeps the keywords AND / OR / NOT / TO from
    being parsed as operators - confirm against the Lucene query syntax.

    Args:
        query: Raw query text.

    Returns:
        The query with each of the characters above backslash-escaped; all
        other characters are left unchanged.
    """
    return query.translate(_LUCENE_ESCAPE_TABLE)


def normalize_l2(embedding: list[float]):
    """Scale an embedding (or each row of a batch) to unit L2 norm.

    Args:
        embedding: A flat vector, or a nested sequence whose norm is taken
            along axis 1 (one vector per row for 2-D input).

    Returns:
        The normalized values as plain Python lists (via ``ndarray.tolist()``).
        A 1-D input with zero norm yields a list of ``0.0`` of the same length;
        zero-norm rows of a multi-dimensional input are returned unchanged.
    """
    embedding_array = np.array(embedding)
    if embedding_array.ndim == 1:
        norm = np.linalg.norm(embedding_array)
        if norm == 0:
            return [0.0] * len(embedding)
        return (embedding_array / norm).tolist()

    norm = np.linalg.norm(embedding_array, 2, axis=1, keepdims=True)
    # np.where evaluates both of its branches, so dividing by the raw norm would
    # compute 0/0 for zero-norm rows and emit a RuntimeWarning. Dividing those
    # rows by 1 leaves them unchanged without the invalid operation.
    safe_norm = np.where(norm == 0, 1.0, norm)
    return (embedding_array / safe_norm).tolist()


# Use this instead of asyncio.gather() to bound coroutines
async def semaphore_gather(
    *coroutines: Coroutine,
    max_coroutines: int = SEMAPHORE_LIMIT,
):
    """Await ``coroutines`` in batches of at most ``max_coroutines``.

    Coroutines are wrapped in input order and collected into a batch. Once the
    batch reaches ``max_coroutines`` entries it is awaited with
    ``asyncio.gather`` before any further coroutine is wrapped, so a batch must
    finish completely before the next one starts. Each wrapped coroutine also
    runs under a semaphore of size ``max_coroutines`` local to this call.

    Args:
        *coroutines: Coroutine objects to await.
        max_coroutines: Batch size and semaphore capacity.

    Returns:
        A list of results in the same order as ``coroutines``.

    Raises:
        Any exception raised by a coroutine propagates out of the
        ``asyncio.gather`` call for its batch; coroutines in later batches are
        then never awaited.
    """
    limiter = asyncio.Semaphore(max_coroutines)

    async def _guarded(coro: Coroutine) -> Any:
        async with limiter:
            return await coro

    collected = []
    pending = []
    for coro in coroutines:
        pending.append(_guarded(coro))
        if len(pending) < max_coroutines:
            continue
        # Batch is full: run it to completion before queuing anything else.
        collected += await asyncio.gather(*pending)
        pending = []

    # Flush whatever is left over in the final, partially filled batch.
    if pending:
        collected += await asyncio.gather(*pending)

    return collected
Search refactor + Community search (#111) * WIP * WIP * WIP * community search * WIP * WIP * integration tested * tests * tests * mypy * mypy * format 2024-09-16 14:03:05 -04:00			`"""`
			`Copyright 2024, Zep Software, Inc.`

			`Licensed under the Apache License, Version 2.0 (the "License");`
			`you may not use this file except in compliance with the License.`
			`You may obtain a copy of the License at`

			`http://www.apache.org/licenses/LICENSE-2.0`

			`Unless required by applicable law or agreed to in writing, software`
			`distributed under the License is distributed on an "AS IS" BASIS,`
			`WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`See the License for the specific language governing permissions and`
			`limitations under the License.`
			`"""`

Bounded semaphore - limiting concurrency (#244) * WIP * add semaphore * remove unused imports * remove unused imports * lower concurrency limit 2024-12-17 13:08:18 -05:00			`import asyncio`
Make default DB explicit (#195) * add default database * update * init tests * update test * bump version * removed unused imports 2024-10-21 12:33:32 -04:00			`import os`
Bounded semaphore - limiting concurrency (#244) * WIP * add semaphore * remove unused imports * remove unused imports * lower concurrency limit 2024-12-17 13:08:18 -05:00			`from collections.abc import Coroutine`
Add Missing Node and edge CRUD (#51) * add CRUD operations and fix search limit bugs * format * update tests * å * update tests to double limit call * add default field * format * import correct field 2024-08-27 16:18:01 -04:00			`from datetime import datetime`
update semaphore gather to use batches (#471) * update semaphore gather to use batches * batch semaphore update * remove return type 2025-05-12 14:00:38 -04:00			`from typing import Any`
Add Missing Node and edge CRUD (#51) * add CRUD operations and fix search limit bugs * format * update tests * å * update tests to double limit call * add default field * format * import correct field 2024-08-27 16:18:01 -04:00
Add mmr reranking (#180) * mmr start * add mmr function * normalize * add mmr options to search * update communities * build communities * format * clean up normalization * normalize in mmr * update 2024-10-08 13:55:10 -04:00			`import numpy as np`
load env in helper file (#196) * load env in helper file * bump version 2024-10-22 08:49:14 -04:00			`from dotenv import load_dotenv`
Add Missing Node and edge CRUD (#51) * add CRUD operations and fix search limit bugs * format * update tests * å * update tests to double limit call * add default field * format * import correct field 2024-08-27 16:18:01 -04:00			`from neo4j import time as neo4j_time`
Add episode refactor (#399) * partial refactor * get relevant nodes refactor * load edges updates * refactor triplets * not there yet * node search update * working refactor * updates * mypy * mypy 2025-04-26 00:24:23 -04:00			`from typing_extensions import LiteralString`
Add Missing Node and edge CRUD (#51) * add CRUD operations and fix search limit bugs * format * update tests * å * update tests to double limit call * add default field * format * import correct field 2024-08-27 16:18:01 -04:00
load env in helper file (#196) * load env in helper file * bump version 2024-10-22 08:49:14 -04:00			`load_dotenv()`

Make default DB explicit (#195) * add default database * update * init tests * update test * bump version * removed unused imports 2024-10-21 12:33:32 -04:00			`DEFAULT_DATABASE = os.getenv('DEFAULT_DATABASE', None)`
Bulk add nodes and edges (#205) * test * only use parallel runtime if set to true * add and test bulk add * remove group_ids * format * bump version * update readme 2024-10-31 12:31:37 -04:00			`USE_PARALLEL_RUNTIME = bool(os.getenv('USE_PARALLEL_RUNTIME', False))`
Bounded semaphore - limiting concurrency (#244) * WIP * add semaphore * remove unused imports * remove unused imports * lower concurrency limit 2024-12-17 13:08:18 -05:00			`SEMAPHORE_LIMIT = int(os.getenv('SEMAPHORE_LIMIT', 20))`
`add_episode()` refactor (#421) * temporal updates * update resolve nodes * dedupe edge updates * edge dedupe * extract attributes * update dynamic pydantic model * first pass of extract node attributes * no errors * bug fixes * bug fixes * prompt updates * prompt updates * updates * updates * remove unused imports * update tests based on changes * remove unused import 2025-04-30 12:08:52 -04:00			`MAX_REFLEXION_ITERATIONS = int(os.getenv('MAX_REFLEXION_ITERATIONS', 0))`
Pagination for get by group_id (#218) * add pagination to subgraphs * update pagination * update LiteralString import * cleanup * cleanup * update embedding dims 2024-12-02 11:17:37 -05:00			`DEFAULT_PAGE_LIMIT = 20`
Make default DB explicit (#195) * add default database * update * init tests * update test * bump version * removed unused imports 2024-10-21 12:33:32 -04:00
Add episode refactor (#399) * partial refactor * get relevant nodes refactor * load edges updates * refactor triplets * not there yet * node search update * working refactor * updates * mypy * mypy 2025-04-26 00:24:23 -04:00			`RUNTIME_QUERY: LiteralString = (`
			`'CYPHER runtime = parallel parallelRuntimeSupport=all\n' if USE_PARALLEL_RUNTIME else ''`
			`)`

Add Missing Node and edge CRUD (#51) * add CRUD operations and fix search limit bugs * format * update tests * å * update tests to double limit call * add default field * format * import correct field 2024-08-27 16:18:01 -04:00
			`def parse_db_date(neo_date: neo4j_time.DateTime \| None) -> datetime \| None:`
			`return neo_date.to_native() if neo_date else None`
Add MSC benchmark and improve search performance (#157) * test cases * test * benchmark * eval updates * improve search performance * remove data * formatting * add None type to config * update sanitization * push version * maketrans update * mypy 2024-09-26 16:12:38 -04:00

			`def lucene_sanitize(query: str) -> str:`
			`# Escape special characters from a query before passing into Lucene`
test escape characters (#171) * test escape characters * format * tests * run tests * copyright 2024-10-03 10:08:30 -04:00			`# + - && \|\| ! ( ) { } [ ] ^ " ~ * ? : \ /`
Add MSC benchmark and improve search performance (#157) * test cases * test * benchmark * eval updates * improve search performance * remove data * formatting * add None type to config * update sanitization * push version * maketrans update * mypy 2024-09-26 16:12:38 -04:00			`escape_map = str.maketrans(`
			`{`
			`'+': r'\+',`
			`'-': r'\-',`
			`'&': r'\&',`
			`'\|': r'\\|',`
			`'!': r'\!',`
			`'(': r'\(',`
			`')': r'\)',`
			`'{': r'\{',`
			`'}': r'\}',`
			`'[': r'\[',`
			`']': r'\]',`
			`'^': r'\^',`
			`'"': r'\"',`
			`'~': r'\~',`
			`'': r'\',`
			`'?': r'\?',`
			`':': r'\:',`
			`'\\': r'\\',`
test escape characters (#171) * test escape characters * format * tests * run tests * copyright 2024-10-03 10:08:30 -04:00			`'/': r'\/',`
update lucene escaping (#233) * update lucene escaping * update unit test 2024-12-09 10:36:46 -05:00			`'O': r'\O',`
			`'R': r'\R',`
			`'N': r'\N',`
			`'T': r'\T',`
			`'A': r'\A',`
			`'D': r'\D',`
Add MSC benchmark and improve search performance (#157) * test cases * test * benchmark * eval updates * improve search performance * remove data * formatting * add None type to config * update sanitization * push version * maketrans update * mypy 2024-09-26 16:12:38 -04:00			`}`
			`)`

			`sanitized = query.translate(escape_map)`
			`return sanitized`
Add mmr reranking (#180) * mmr start * add mmr function * normalize * add mmr options to search * update communities * build communities * format * clean up normalization * normalize in mmr * update 2024-10-08 13:55:10 -04:00

Set max tokens by prompt (#255) * set max tokens * update generic openai client * mypy updates * fix: dockerfile --------- Co-authored-by: paulpaliychuk <pavlo.paliychuk.ca@gmail.com> 2025-01-24 10:14:49 -05:00			`def normalize_l2(embedding: list[float]):`
Add mmr reranking (#180) * mmr start * add mmr function * normalize * add mmr options to search * update communities * build communities * format * clean up normalization * normalize in mmr * update 2024-10-08 13:55:10 -04:00			`embedding_array = np.array(embedding)`
			`if embedding_array.ndim == 1:`
			`norm = np.linalg.norm(embedding_array)`
			`if norm == 0:`
Set max tokens by prompt (#255) * set max tokens * update generic openai client * mypy updates * fix: dockerfile --------- Co-authored-by: paulpaliychuk <pavlo.paliychuk.ca@gmail.com> 2025-01-24 10:14:49 -05:00			`return [0.0] * len(embedding)`
Add mmr reranking (#180) * mmr start * add mmr function * normalize * add mmr options to search * update communities * build communities * format * clean up normalization * normalize in mmr * update 2024-10-08 13:55:10 -04:00			`return (embedding_array / norm).tolist()`
			`else:`
			`norm = np.linalg.norm(embedding_array, 2, axis=1, keepdims=True)`
			`return (np.where(norm == 0, embedding_array, embedding_array / norm)).tolist()`
Bounded semaphore - limiting concurrency (#244) * WIP * add semaphore * remove unused imports * remove unused imports * lower concurrency limit 2024-12-17 13:08:18 -05:00

			`# Use this instead of asyncio.gather() to bound coroutines`
update semaphore gather to use batches (#471) * update semaphore gather to use batches * batch semaphore update * remove return type 2025-05-12 14:00:38 -04:00			`async def semaphore_gather(`
			`*coroutines: Coroutine,`
			`max_coroutines: int = SEMAPHORE_LIMIT,`
			`):`
Bounded semaphore - limiting concurrency (#244) * WIP * add semaphore * remove unused imports * remove unused imports * lower concurrency limit 2024-12-17 13:08:18 -05:00			`semaphore = asyncio.Semaphore(max_coroutines)`

update semaphore gather to use batches (#471) * update semaphore gather to use batches * batch semaphore update * remove return type 2025-05-12 14:00:38 -04:00			`async def _wrap(coro: Coroutine) -> Any:`
Bounded semaphore - limiting concurrency (#244) * WIP * add semaphore * remove unused imports * remove unused imports * lower concurrency limit 2024-12-17 13:08:18 -05:00			`async with semaphore:`
update semaphore gather to use batches (#471) * update semaphore gather to use batches * batch semaphore update * remove return type 2025-05-12 14:00:38 -04:00			`return await coro`

			`results = []`
			`batch = []`
			`for coroutine in coroutines:`
			`batch.append(_wrap(coroutine))`
			`# once we hit max_coroutines, gather and clear the batch`
			`if len(batch) >= max_coroutines:`
			`results.extend(await asyncio.gather(*batch))`
			`batch.clear()`

			`# gather any remaining coroutines in the final batch`
			`if batch:`
			`results.extend(await asyncio.gather(*batch))`

			`return results`