graphrag/tests/verbs/util.py

# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
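"""Shared helpers for the verb tests.

Illustrative usage sketch (the workflow module, ``build_steps``, and
``workflow_name`` below are assumptions for demonstration, not exports
of this module):

    from graphrag.index.workflows.v1.create_base_entity_graph import (
        build_steps,
        workflow_name,
    )

    async def test_create_base_entity_graph():
        input_tables = load_input_tables(["workflow:create_summarized_entities"])
        expected = load_expected(workflow_name)
        config = get_config_for_workflow(workflow_name)
        steps = remove_disabled_steps(build_steps(config))
        actual = await get_workflow_output(input_tables, {"steps": steps})
        compare_outputs(actual, expected)
"""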
from typing import cast

import pandas as pd
from datashaper import Workflow
from pandas.testing import assert_series_equal

from graphrag.config import create_graphrag_config
from graphrag.index import (
    PipelineWorkflowConfig,
    PipelineWorkflowStep,
    create_pipeline_config,
)


def load_input_tables(inputs: list[str]) -> dict[str, pd.DataFrame]:
    """Harvest all the referenced input IDs from the workflow being tested and pass them here."""
    # stick all the inputs in a map - Workflow looks them up by name
    input_tables: dict[str, pd.DataFrame] = {}

    # all workflows implicitly receive the `input` source, which is formatted as a dataframe after loading from storage.
    # we'll simulate that by loading one of our output parquets and converting it back to an equivalent dataframe,
    # so we aren't dealing with storage vagaries (which would become an integration test)
    source = pd.read_parquet("tests/verbs/data/create_base_documents.parquet")
    source.rename(columns={"raw_content": "text"}, inplace=True)
    input_tables["source"] = cast(pd.DataFrame, source[["id", "text", "title"]])

    for input in inputs:
        # remove the workflow: prefix if it exists, because that is not part of the actual table filename
        name = input.replace("workflow:", "")
        input_tables[input] = pd.read_parquet(f"tests/verbs/data/{name}.parquet")

    return input_tables


def load_expected(output: str) -> pd.DataFrame:
    """Pass in the workflow output (generally the workflow name)."""
    return pd.read_parquet(f"tests/verbs/data/{output}.parquet")


def get_config_for_workflow(name: str) -> PipelineWorkflowConfig:
    """Instantiate the bare minimum config needed to get a default workflow config for testing."""
    config = create_graphrag_config()
    pipeline_config = create_pipeline_config(config)
    # find the named workflow within the generated pipeline config
    result = next(conf for conf in pipeline_config.workflows if conf.name == name)
    return cast(PipelineWorkflowConfig, result.config)


async def get_workflow_output(
    input_tables: dict[str, pd.DataFrame], schema: dict
) -> pd.DataFrame:
    """Pass in the input tables and the workflow schema."""
    # the bare minimum workflow is the pipeline schema and table context
    workflow = Workflow(
        schema=schema,
        input_tables=input_tables,
    )

    await workflow.run()

    # if there's only one output, it is the default here, no name required
    return cast(pd.DataFrame, workflow.output())


def compare_outputs(
    actual: pd.DataFrame, expected: pd.DataFrame, columns: list[str] | None = None
) -> None:
    """Compare the actual and expected dataframes, optionally specifying columns to compare.

    This uses assert_series_equal since we are sometimes intentionally omitting columns from the actual output.
    """
    cols = expected.columns if columns is None else columns

    assert len(actual) == len(expected), (
        f"Expected: {len(expected)} rows, Actual: {len(actual)} rows"
    )

    for column in cols:
        assert column in actual.columns
        try:
            # dtypes can differ since the test data is read from parquet and our workflow runs in memory
            assert_series_equal(actual[column], expected[column], check_dtype=False)
        except AssertionError:
            print("Expected:")
            print(expected[column])
            print("Actual:")
            print(actual[column])
            raise


def remove_disabled_steps(
    steps: list[PipelineWorkflowStep],
) -> list[PipelineWorkflowStep]:
    """Filter out any steps whose config sets `enabled` to False."""
    return [step for step in steps if step.get("enabled", True)]
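

# Illustrative sketch of remove_disabled_steps: the step shapes below are a
# minimal assumption for demonstration, not taken from a real workflow.
#
#     steps: list[PipelineWorkflowStep] = [
#         {"verb": "entity_extract", "enabled": False},
#         {"verb": "cluster_graph"},
#     ]
#     assert remove_disabled_steps(steps) == [{"verb": "cluster_graph"}]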