# graphrag/tests/smoke/test_fixtures.py
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
import asyncio
import json
import logging
import os
import shutil
import subprocess
from collections.abc import Callable
from functools import wraps
from pathlib import Path
from typing import Any, ClassVar
from unittest import mock
import pandas as pd
import pytest
from graphrag.query.context_builder.community_context import (
    NO_COMMUNITY_RECORDS_WARNING,
)
from graphrag.storage.blob_pipeline_storage import BlobPipelineStorage

log = logging.getLogger(__name__)
debug = os.environ.get("DEBUG") is not None
gh_pages = os.environ.get("GH_PAGES") is not None
# cspell:disable-next-line well-known-key
WELL_KNOWN_AZURITE_CONNECTION_STRING = "DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1"
KNOWN_WARNINGS = [NO_COMMUNITY_RECORDS_WARNING]
def _load_fixtures():
    """Load all fixtures from the tests/fixtures folder."""
    params = []
    fixtures_path = Path("./tests/fixtures/")
    # use the min-csv smoke test to hydrate the docsite parquet artifacts (see gh-pages.yml)
    subfolders = ["min-csv"] if gh_pages else sorted(os.listdir(fixtures_path))
    for subfolder in subfolders:
        if not os.path.isdir(fixtures_path / subfolder):
            continue
        config_file = fixtures_path / subfolder / "config.json"
        params.append((subfolder, json.loads(config_file.read_bytes().decode("utf-8"))))

    return params[1:]  # drop the first fixture to disable the azure blob connection test

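# For reference, each fixture folder contains a config.json whose top-level
# keys become the parametrized test arguments below. A minimal sketch, with
# field names inferred from how this module consumes the config (values are
# illustrative, not copied from a real fixture):
#
# {
#     "input_path": "./tests/fixtures/min-csv",
#     "input_file_type": "text",
#     "slow": false,
#     "workflow_config": {
#         "create_base_text_units": {
#             "max_runtime": 300,
#             "expected_artifacts": ["create_base_text_units.parquet"],
#             "row_range": [1, 2000],
#             "nan_allowed_columns": []
#         }
#     },
#     "query_config": [{"method": "global", "query": "What are the top themes?"}]
# }
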
def pytest_generate_tests(metafunc):
    """Generate tests for all test functions in this module."""
    run_slow = metafunc.config.getoption("run_slow")
    configs = metafunc.cls.params[metafunc.function.__name__]
    if not run_slow:
        # Only run tests that are not marked as slow
        configs = [config for config in configs if not config[1].get("slow", False)]

    funcarglist = [params[1] for params in configs]
    id_list = [params[0] for params in configs]
    argnames = sorted(arg for arg in funcarglist[0] if arg != "slow")
    metafunc.parametrize(
        argnames,
        [[funcargs[name] for name in argnames] for funcargs in funcarglist],
        ids=id_list,
    )

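# Note: metafunc.config.getoption("run_slow") assumes a matching pytest option
# is registered elsewhere (presumably in a conftest.py, e.g. via
# parser.addoption("--run_slow", action="store_true")); fixtures marked
# "slow": true are only collected when that flag is passed.
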
def cleanup(skip: bool = False):
    """Decorator to clean up the output and cache folders after each test."""

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except AssertionError:
                raise  # let test failures propagate; cleanup still runs in finally
            finally:
                if not skip:
                    root = Path(kwargs["input_path"])
                    shutil.rmtree(root / "output", ignore_errors=True)
                    shutil.rmtree(root / "cache", ignore_errors=True)

        return wrapper

    return decorator

async def prepare_azurite_data(input_path: str, azure: dict) -> Callable[[], None]:
    """Prepare the data for the Azurite tests."""
    input_container = azure["input_container"]
    input_base_dir = azure.get("input_base_dir")

    root = Path(input_path)
    input_storage = BlobPipelineStorage(
        connection_string=WELL_KNOWN_AZURITE_CONNECTION_STRING,
        container_name=input_container,
    )
    # Bounce the container if it exists to clear out old run data
    input_storage._delete_container()  # noqa: SLF001
    input_storage._create_container()  # noqa: SLF001

    # Upload data files
    txt_files = list((root / "input").glob("*.txt"))
    csv_files = list((root / "input").glob("*.csv"))
    data_files = txt_files + csv_files
    for data_file in data_files:
        text = data_file.read_bytes().decode("utf-8")
        file_path = (
            str(Path(input_base_dir) / data_file.name)
            if input_base_dir
            else data_file.name
        )
        await input_storage.set(file_path, text, encoding="utf-8")

    # Return a disposer that tears the test container back down
    return lambda: input_storage._delete_container()  # noqa: SLF001

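# The Azurite-backed fixtures assume a local blob emulator is listening on
# 127.0.0.1:10000 (see WELL_KNOWN_AZURITE_CONNECTION_STRING above). One common
# way to start one, if you use Docker (not prescribed by this module):
#
#   docker run -p 10000:10000 mcr.microsoft.com/azure-storage/azurite
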
class TestIndexer:
    params: ClassVar[dict[str, list[tuple[str, dict[str, Any]]]]] = {
        "test_fixture": _load_fixtures()
    }

    def __run_indexer(
        self,
        root: Path,
        input_file_type: str,
    ):
        command = [
            "poetry",
            "run",
            "poe",
            "index",
            "--verbose" if debug else None,
            "--root",
            root.resolve().as_posix(),
            "--logger",
            "print",
        ]
        command = [arg for arg in command if arg]  # drop the None placeholder when not in debug mode
        log.info("running command: %s", " ".join(command))
        completion = subprocess.run(
            command, env={**os.environ, "GRAPHRAG_INPUT_FILE_TYPE": input_file_type}
        )
        assert completion.returncode == 0, (
            f"Indexer failed with return code: {completion.returncode}"
        )

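    # For illustration, the command list above expands to roughly the
    # following shell invocation (--verbose only when DEBUG is set; the
    # fixture path is an example):
    #
    #   GRAPHRAG_INPUT_FILE_TYPE=text poetry run poe index \
    #       --root ./tests/fixtures/min-csv --logger print
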
    def __assert_indexer_outputs(
        self, root: Path, workflow_config: dict[str, dict[str, Any]]
    ):
        output_path = root / "output"
        assert output_path.exists(), "output folder does not exist"

        # Check stats for all workflows
        stats = json.loads((output_path / "stats.json").read_bytes().decode("utf-8"))

        # Check that all expected workflows ran
        expected_workflows = set(workflow_config.keys())
        workflows = set(stats["workflows"].keys())
        assert workflows == expected_workflows, (
            f"Workflows missing from stats.json: {expected_workflows - workflows}. Unexpected workflows in stats.json: {workflows - expected_workflows}"
        )

        # Check runtime and expected artifacts for each workflow
        for workflow, config in workflow_config.items():
            workflow_artifacts = config.get("expected_artifacts", [])

            # [OPTIONAL] Check max runtime
            max_runtime = config.get("max_runtime", None)
            if max_runtime:
                assert stats["workflows"][workflow]["overall"] <= max_runtime, (
                    f"Expected max runtime of {max_runtime}, found: {stats['workflows'][workflow]['overall']} for workflow: {workflow}"
                )

            # Check expected artifacts
            for artifact in workflow_artifacts:
                if artifact.endswith(".parquet"):
                    output_df = pd.read_parquet(output_path / artifact)

                    # Check the number of rows is within the expected range
                    assert (
                        config["row_range"][0]
                        <= len(output_df)
                        <= config["row_range"][1]
                    ), (
                        f"Expected between {config['row_range'][0]} and {config['row_range'][1]}, found: {len(output_df)} for file: {artifact}"
                    )

                    # Select the columns where NaNs are not allowed, then flag any rows containing them
                    nan_df = output_df.loc[
                        :,
                        ~output_df.columns.isin(config.get("nan_allowed_columns", [])),
                    ]
                    nan_df = nan_df[nan_df.isna().any(axis=1)]
                    assert len(nan_df) == 0, (
                        f"Found {len(nan_df)} rows with NaN values for file: {artifact} on columns: {nan_df.columns[nan_df.isna().any()].tolist()}"
                    )

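    # stats.json, as consumed above, is expected to look roughly like the
    # following sketch (shape inferred from the assertions; workflow names and
    # timings are illustrative):
    #
    # {
    #     "workflows": {
    #         "create_base_text_units": {"overall": 12.3}
    #     }
    # }
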
    def __run_query(self, root: Path, query_config: dict[str, str]):
        command = [
            "poetry",
            "run",
            "poe",
            "query",
            "--root",
            root.resolve().as_posix(),
            "--method",
            query_config["method"],
            "--community-level",
            str(query_config.get("community_level", 2)),
            "--query",
            query_config["query"],
        ]
        log.info("running command: %s", " ".join(command))
        return subprocess.run(command, capture_output=True, text=True)

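    # For illustration, a query invocation built above looks roughly like
    # (fixture path and query text are examples):
    #
    #   poetry run poe query --root ./tests/fixtures/min-csv --method global \
    #       --community-level 2 --query "What are the top themes?"
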
    @cleanup(skip=debug)
    @mock.patch.dict(
        os.environ,
        {
            **os.environ,
            "BLOB_STORAGE_CONNECTION_STRING": os.getenv(
                "GRAPHRAG_CACHE_CONNECTION_STRING", WELL_KNOWN_AZURITE_CONNECTION_STRING
            ),
            "LOCAL_BLOB_STORAGE_CONNECTION_STRING": WELL_KNOWN_AZURITE_CONNECTION_STRING,
            "GRAPHRAG_CHUNK_SIZE": "1200",
            "GRAPHRAG_CHUNK_OVERLAP": "0",
            "AZURE_AI_SEARCH_URL_ENDPOINT": os.getenv("AZURE_AI_SEARCH_URL_ENDPOINT"),
            "AZURE_AI_SEARCH_API_KEY": os.getenv("AZURE_AI_SEARCH_API_KEY"),
        },
        clear=True,
    )
    @pytest.mark.timeout(800)
    def test_fixture(
        self,
        input_path: str,
        input_file_type: str,
        workflow_config: dict[str, dict[str, Any]],
        query_config: list[dict[str, str]],
    ):
        if workflow_config.get("skip"):
            print(f"skipping smoke test {input_path}")
            return

        azure = workflow_config.get("azure")
        root = Path(input_path)
        dispose = None
        if azure is not None:
            dispose = asyncio.run(prepare_azurite_data(input_path, azure))

        print("running indexer")
        self.__run_indexer(root, input_file_type)
        print("indexer complete")

        if dispose is not None:
            dispose()

        if not workflow_config.get("skip_assert"):
            print("performing dataset assertions")
            self.__assert_indexer_outputs(root, workflow_config)

        print("running queries")
        for query in query_config:
            result = self.__run_query(root, query)
            print(f"Query: {query}\nResponse: {result.stdout}")
            assert result.returncode == 0, "Query failed"
            assert result.stdout is not None, "Query returned no output"
            assert len(result.stdout) > 0, "Query returned empty output"