mirror of https://github.com/microsoft/graphrag.git
synced 2025-06-26 23:19:58 +00:00

commit a63eed0b76 (parent b409cef0ce)

    Attempt to update smoke tests for multi index query.
tests/fixtures/text/settings_input2.yml (new file, 61 lines)
@@ -0,0 +1,61 @@
models:
  default_chat_model:
    azure_auth_type: api_key
    type: ${GRAPHRAG_LLM_TYPE}
    api_key: ${GRAPHRAG_API_KEY}
    api_base: ${GRAPHRAG_API_BASE}
    api_version: ${GRAPHRAG_API_VERSION}
    deployment_name: ${GRAPHRAG_LLM_DEPLOYMENT_NAME}
    model: ${GRAPHRAG_LLM_MODEL}
    tokens_per_minute: ${GRAPHRAG_LLM_TPM}
    requests_per_minute: ${GRAPHRAG_LLM_RPM}
    model_supports_json: true
    concurrent_requests: 50
    async_mode: threaded
  default_embedding_model:
    azure_auth_type: api_key
    type: ${GRAPHRAG_EMBEDDING_TYPE}
    api_key: ${GRAPHRAG_API_KEY}
    api_base: ${GRAPHRAG_API_BASE}
    api_version: ${GRAPHRAG_API_VERSION}
    deployment_name: ${GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME}
    model: ${GRAPHRAG_EMBEDDING_MODEL}
    tokens_per_minute: null
    requests_per_minute: null
    concurrent_requests: 50
    async_mode: threaded

vector_store:
  default_vector_store2:
    type: "azure_ai_search"
    url: ${AZURE_AI_SEARCH_URL_ENDPOINT}
    api_key: ${AZURE_AI_SEARCH_API_KEY}
    container_name: "simple_text_ci2"

input:
  type: file # or blob
  file_type: text # [csv, text, json]
  base_dir: "./tests/fixtures/text/input2"
  file_encoding: utf-8
  file_pattern: ".*\\.txt$$"

output:
  type: file # [file, blob, cosmosdb]
  base_dir: "./tests/fixtures/text/output2"

extract_claims:
  enabled: true

community_reports:
  prompt: "prompts/community_report.txt"
  max_length: 2000
  max_input_length: 8000

snapshots:
  embeddings: True

drift_search:
  n_depth: 1
  drift_k_followups: 3
  primer_folds: 3
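Note: the ${...} placeholders above are resolved from environment variables when the settings file is loaded, so the CI environment supplies the keys, endpoints, and model names. A minimal sketch of that substitution, standard library only (this illustrates the behaviour; it is not the loader GraphRAG itself uses):

import os
import re


def expand_env_vars(text: str) -> str:
    """Replace ${VAR} placeholders with values from the environment.

    Unset variables are left as-is so that missing configuration is easy to spot.
    """
    pattern = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
    return pattern.sub(lambda m: os.environ.get(m.group(1), m.group(0)), text)


# Example: render the fixture before handing it to a YAML parser.
with open("tests/fixtures/text/settings_input2.yml", encoding="utf-8") as f:
    print(expand_env_vars(f.read()))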
tests/fixtures/text/settings_miq.yml (new file, 62 lines)
@@ -0,0 +1,62 @@
models:
  default_chat_model:
    azure_auth_type: api_key
    type: ${GRAPHRAG_LLM_TYPE}
    api_key: ${GRAPHRAG_API_KEY}
    api_base: ${GRAPHRAG_API_BASE}
    api_version: ${GRAPHRAG_API_VERSION}
    deployment_name: ${GRAPHRAG_LLM_DEPLOYMENT_NAME}
    model: ${GRAPHRAG_LLM_MODEL}
    tokens_per_minute: ${GRAPHRAG_LLM_TPM}
    requests_per_minute: ${GRAPHRAG_LLM_RPM}
    model_supports_json: true
    concurrent_requests: 50
    async_mode: threaded
  default_embedding_model:
    azure_auth_type: api_key
    type: ${GRAPHRAG_EMBEDDING_TYPE}
    api_key: ${GRAPHRAG_API_KEY}
    api_base: ${GRAPHRAG_API_BASE}
    api_version: ${GRAPHRAG_API_VERSION}
    deployment_name: ${GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME}
    model: ${GRAPHRAG_EMBEDDING_MODEL}
    tokens_per_minute: null
    requests_per_minute: null
    concurrent_requests: 50
    async_mode: threaded

outputs:
  index1:
    type: file # [file, blob, cosmosdb]
    base_dir: "./tests/fixtures/text/output"
  index2:
    type: file # [file, blob, cosmosdb]
    base_dir: "./tests/fixtures/text/output2"

vector_store:
  index1:
    type: "azure_ai_search"
    url: ${AZURE_AI_SEARCH_URL_ENDPOINT}
    api_key: ${AZURE_AI_SEARCH_API_KEY}
    container_name: "simple_text_ci"
  index2:
    type: "azure_ai_search"
    url: ${AZURE_AI_SEARCH_URL_ENDPOINT}
    api_key: ${AZURE_AI_SEARCH_API_KEY}
    container_name: "simple_text_ci2"

extract_claims:
  enabled: true

community_reports:
  prompt: "prompts/community_report.txt"
  max_length: 2000
  max_input_length: 8000

snapshots:
  embeddings: True

drift_search:
  n_depth: 1
  drift_k_followups: 3
  primer_folds: 3
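Note: settings_miq.yml is the multi-index query configuration; each index name under outputs: (index1, index2) has a same-named entry under vector_store: pointing at the matching Azure AI Search container. A small sanity-check sketch, assuming PyYAML is available (check_miq_config is an invented helper, not part of this commit):

from pathlib import Path

import yaml  # PyYAML, assumed to be available in the test environment


def check_miq_config(path: str) -> None:
    """Verify that every output index has a matching vector_store entry."""
    config = yaml.safe_load(Path(path).read_text(encoding="utf-8"))
    output_keys = set(config.get("outputs", {}))
    store_keys = set(config.get("vector_store", {}))
    assert output_keys == store_keys, (
        f"outputs and vector_store indexes differ: {output_keys ^ store_keys}"
    )


check_miq_config("tests/fixtures/text/settings_miq.yml")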
@@ -83,6 +83,7 @@ def cleanup(skip: bool = False):
                     root = Path(kwargs["input_path"])
                     shutil.rmtree(root / "output", ignore_errors=True)
                     shutil.rmtree(root / "cache", ignore_errors=True)
+                    shutil.rmtree(root / "output2", ignore_errors=True)

         return wrapper

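For context, cleanup is a decorator that removes run artifacts after each smoke test; this hunk only adds the new output2 folder to that list. A rough sketch of the decorator's overall shape, reconstructed for illustration rather than copied from the repository:

import shutil
from functools import wraps
from pathlib import Path


def cleanup(skip: bool = False):
    """Remove generated folders once the wrapped test finishes."""

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                func(*args, **kwargs)
            finally:
                if not skip:
                    root = Path(kwargs["input_path"])
                    for folder in ("output", "cache", "output2"):
                        shutil.rmtree(root / folder, ignore_errors=True)

        return wrapper

    return decorator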
@@ -118,6 +119,35 @@ async def prepare_azurite_data(input_path: str, azure: dict) -> Callable[[], None]:

     return lambda: input_storage._delete_container()  # noqa: SLF001

+
+async def prepare_azurite_data2(input_path: str, azure: dict) -> Callable[[], None]:
+    """Prepare the data for the Azurite tests."""
+    input_container = azure["input_container"]
+    input_base_dir = azure.get("input_base_dir")
+
+    root = Path(input_path)
+    input_storage = BlobPipelineStorage(
+        connection_string=WELL_KNOWN_AZURITE_CONNECTION_STRING,
+        container_name=input_container,
+    )
+    # Bounce the container if it exists to clear out old run data
+    input_storage._delete_container()  # noqa: SLF001
+    input_storage._create_container()  # noqa: SLF001
+
+    # Upload data files
+    txt_files = list((root / "input2").glob("*.txt"))
+    csv_files = list((root / "input2").glob("*.csv"))
+    data_files = txt_files + csv_files
+    for data_file in data_files:
+        text = data_file.read_bytes().decode("utf-8")
+        file_path = (
+            str(Path(input_base_dir) / data_file.name)
+            if input_base_dir
+            else data_file.name
+        )
+        await input_storage.set(file_path, text, encoding="utf-8")
+
+    return lambda: input_storage._delete_container()  # noqa: SLF001
+
+
 class TestIndexer:
     params: ClassVar[dict[str, list[tuple[str, dict[str, Any]]]]] = {
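prepare_azurite_data2 mirrors the existing prepare_azurite_data helper but uploads the fixture's input2 documents, and it hands back a callable that drops the container again. A minimal usage sketch (the azure values are placeholders; the real call site appears further down in this diff):

import asyncio

azure = {"input_container": "smoke-input2", "input_base_dir": None}  # placeholder values
dispose2 = asyncio.run(prepare_azurite_data2("./tests/fixtures/text", azure))
try:
    ...  # run the second indexer pass against the freshly populated container
finally:
    dispose2()  # tear the Azurite container down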
@@ -151,6 +181,35 @@ class TestIndexer:
             f"Indexer failed with return code: {completion.returncode}"
         )

+    def __run_indexer2(
+        self,
+        root: Path,
+        input_file_type: str,
+    ):
+        command = [
+            "poetry",
+            "run",
+            "poe",
+            "index",
+            "--verbose" if debug else None,
+            "--root",
+            root.resolve().as_posix(),
+            "--logger",
+            "print",
+            "--method",
+            "standard",
+            "--config",
+            root.resolve().as_posix() + "/settings_input2.yml",
+        ]
+        command = [arg for arg in command if arg]
+        log.info("running command ", " ".join(command))
+        completion = subprocess.run(
+            command, env={**os.environ, "GRAPHRAG_INPUT_FILE_TYPE": input_file_type}
+        )
+        assert completion.returncode == 0, (
+            f"Indexer failed with return code: {completion.returncode}"
+        )
+
     def __assert_indexer_outputs(
         self, root: Path, workflow_config: dict[str, dict[str, Any]]
     ):
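__run_indexer2 is essentially a second copy of the indexer runner, pointed at settings_input2.yml. For reference, a sketch that prints the command it assembles when debug is off (the root path is a placeholder):

root = "/path/to/tests/fixtures/text"  # placeholder
command = [
    "poetry", "run", "poe", "index",
    "--root", root,
    "--logger", "print",
    "--method", "standard",
    "--config", root + "/settings_input2.yml",
]
print(" ".join(command))
# poetry run poe index --root /path/to/tests/fixtures/text --logger print \
#   --method standard --config /path/to/tests/fixtures/text/settings_input2.yml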
@@ -202,6 +261,58 @@ class TestIndexer:
                         f"Found {len(nan_df)} rows with NaN values for file: {artifact} on columns: {nan_df.columns[nan_df.isna().any()].tolist()}"
                     )

+    def __assert_indexer_outputs2(
+        self, root: Path, workflow_config: dict[str, dict[str, Any]]
+    ):
+        output_path = root / "output2"
+
+        assert output_path.exists(), "output2 folder does not exist"
+
+        # Check stats for all workflow
+        stats = json.loads((output_path / "stats.json").read_bytes().decode("utf-8"))
+
+        # Check all workflows run
+        expected_workflows = set(workflow_config.keys())
+        workflows = set(stats["workflows"].keys())
+        assert workflows == expected_workflows, (
+            f"Workflows missing from stats.json: {expected_workflows - workflows}. Unexpected workflows in stats.json: {workflows - expected_workflows}"
+        )
+
+        # [OPTIONAL] Check runtime
+        for workflow, config in workflow_config.items():
+            # Check expected artifacts
+            workflow_artifacts = config.get("expected_artifacts", [])
+            # Check max runtime
+            max_runtime = config.get("max_runtime", None)
+            if max_runtime:
+                assert stats["workflows"][workflow]["overall"] <= max_runtime, (
+                    f"Expected max runtime of {max_runtime}, found: {stats['workflows'][workflow]['overall']} for workflow: {workflow}"
+                )
+            # Check expected artifacts
+            for artifact in workflow_artifacts:
+                if artifact.endswith(".parquet"):
+                    output_df = pd.read_parquet(output_path / artifact)
+
+                    # Check number of rows between range
+                    assert (
+                        config["row_range"][0]
+                        <= len(output_df)
+                        <= config["row_range"][1]
+                    ), (
+                        f"Expected between {config['row_range'][0]} and {config['row_range'][1]}, found: {len(output_df)} for file: {artifact}"
+                    )
+
+                    # Get non-nan rows
+                    nan_df = output_df.loc[
+                        :,
+                        ~output_df.columns.isin(config.get("nan_allowed_columns", [])),
+                    ]
+                    nan_df = nan_df[nan_df.isna().any(axis=1)]
+                    assert len(nan_df) == 0, (
+                        f"Found {len(nan_df)} rows with NaN values for file: {artifact} on columns: {nan_df.columns[nan_df.isna().any()].tolist()}"
+                    )
+
+
     def __run_query(self, root: Path, query_config: dict[str, str]):
         command = [
             "poetry",
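Both assertion helpers read their expectations from workflow_config, which maps each workflow name to row ranges, NaN rules, and an optional runtime limit for its artifacts. A hypothetical entry, inferred from how the helper indexes the dict (the workflow name, artifact, and numbers are invented for illustration):

workflow_config = {
    "create_final_entities": {                        # workflow name as it appears in stats.json
        "expected_artifacts": ["entities.parquet"],   # parquet outputs to validate
        "row_range": (1, 2500),                       # inclusive bounds on the row count
        "nan_allowed_columns": ["description"],       # columns permitted to contain NaN
        "max_runtime": 300,                           # optional per-workflow budget, in seconds
    },
}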
@@ -221,6 +332,27 @@ class TestIndexer:
         log.info("running command ", " ".join(command))
         return subprocess.run(command, capture_output=True, text=True)

+    def __run_multi_index_query(self, root: Path, query_config: dict[str, str]):
+        command = [
+            "poetry",
+            "run",
+            "poe",
+            "query",
+            "--root",
+            root.resolve().as_posix(),
+            "--method",
+            query_config["method"],
+            "--community-level",
+            str(query_config.get("community_level", 2)),
+            "--query",
+            query_config["query"],
+            "--config",
+            root.resolve().as_posix() + "/settings_miq.yml",
+        ]
+
+        log.info("running command ", " ".join(command))
+        return subprocess.run(command, capture_output=True, text=True)
+
     @cleanup(skip=debug)
     @mock.patch.dict(
         os.environ,
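__run_multi_index_query runs the same queries as __run_query but against settings_miq.yml, so the query resolves both indexes. Each query_config entry carries the method, the query text, and optionally a community level (defaulting to 2). A hypothetical entry (values invented for illustration):

query_config = [
    {
        "method": "global",                       # search method handed to `poe query`
        "query": "Who are the main characters?",  # free-text question
        "community_level": 2,                     # optional; defaults to 2 when omitted
    },
]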
@@ -262,10 +394,25 @@ class TestIndexer:
         if dispose is not None:
             dispose()

+        dispose2 = None
+        if azure is not None:
+            dispose2 = asyncio.run(prepare_azurite_data2(input_path, azure))
+
+        print("running indexer")
+        self.__run_indexer2(root, input_file_type)
+        print("indexer complete")
+
+        if dispose2 is not None:
+            dispose2()
+
         if not workflow_config.get("skip_assert"):
             print("performing dataset assertions")
             self.__assert_indexer_outputs(root, workflow_config)

+        if not workflow_config.get("skip_assert"):
+            print("performing dataset assertions")
+            self.__assert_indexer_outputs2(root, workflow_config)
+
         print("running queries")
         for query in query_config:
             result = self.__run_query(root, query)
@@ -274,3 +421,12 @@ class TestIndexer:
             assert result.returncode == 0, "Query failed"
             assert result.stdout is not None, "Query returned no output"
             assert len(result.stdout) > 0, "Query returned empty output"
+
+        print("running multi_index_queries")
+        for query in query_config:
+            result = self.__run_multi_index_query(root, query)
+            print(f"Query: {query}\nResponse: {result.stdout}")
+
+            assert result.returncode == 0, "Query failed"
+            assert result.stdout is not None, "Query returned no output"
+            assert len(result.stdout) > 0, "Query returned empty output"