graphrag/tests/verbs/test_finalize_graph.py

# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

from graphrag.config.create_graphrag_config import create_graphrag_config
from graphrag.data_model.schemas import (
    ENTITIES_FINAL_COLUMNS,
    RELATIONSHIPS_FINAL_COLUMNS,
)
from graphrag.index.workflows.finalize_graph import (
    run_workflow,
)
from graphrag.utils.storage import load_table_from_storage, write_table_to_storage

from .util import (
    DEFAULT_MODEL_CONFIG,
    create_test_context,
    load_test_table,
)


async def test_finalize_graph():
    """Run finalize_graph with the default config and check the stored outputs.

    Verifies the row counts of the "entities" and "relationships" tables,
    that the x/y columns sum to zero, and that every expected final column
    is present on each table.
    """
    context = await _prep_tables()
    config = create_graphrag_config({"models": DEFAULT_MODEL_CONFIG})

    await run_workflow(config, context)

    entities_out = await load_table_from_storage("entities", context.output_storage)
    relationships_out = await load_table_from_storage(
        "relationships", context.output_storage
    )

    assert len(entities_out) == 291
    assert len(relationships_out) == 452

    # with the default configuration we do not embed/umap, so the x and y
    # coordinates are expected to total zero
    for axis in ("x", "y"):
        assert entities_out[axis].sum() == 0

    missing_entity_columns = [
        name for name in ENTITIES_FINAL_COLUMNS if name not in entities_out.columns
    ]
    assert not missing_entity_columns

    missing_relationship_columns = [
        name
        for name in RELATIONSHIPS_FINAL_COLUMNS
        if name not in relationships_out.columns
    ]
    assert not missing_relationship_columns


async def test_finalize_graph_umap():
    """Run finalize_graph with graph embedding and umap switched on.

    Same checks as the default-config test, except the x/y columns are
    expected to sum to something other than zero.
    """
    context = await _prep_tables()
    config = create_graphrag_config({"models": DEFAULT_MODEL_CONFIG})

    # turn on both embedding and umap before running the workflow
    config.embed_graph.enabled = True
    config.umap.enabled = True

    await run_workflow(config, context)

    entities_out = await load_table_from_storage("entities", context.output_storage)
    relationships_out = await load_table_from_storage(
        "relationships", context.output_storage
    )

    assert len(entities_out) == 291
    assert len(relationships_out) == 452

    # umap should produce coordinates whose totals are not zero
    for axis in ("x", "y"):
        assert entities_out[axis].sum() != 0

    missing_entity_columns = [
        name for name in ENTITIES_FINAL_COLUMNS if name not in entities_out.columns
    ]
    assert not missing_entity_columns

    missing_relationship_columns = [
        name
        for name in RELATIONSHIPS_FINAL_COLUMNS
        if name not in relationships_out.columns
    ]
    assert not missing_relationship_columns


async def _prep_tables():
    """Build a test context whose output storage holds pre-finalization tables.

    Loads the "entities" and "relationships" test tables, removes the columns
    that would not be on the workflow inputs, writes each table to the
    context's output storage, and returns the context.
    """
    context = await create_test_context(
        storage=["entities", "relationships"],
    )

    # final-only fields to strip from each table, processed in this order
    final_only_columns = (
        ("entities", ["x", "y", "degree"]),
        ("relationships", ["combined_degree"]),
    )
    for table_name, columns in final_only_columns:
        table = load_test_table(table_name)
        table.drop(columns=columns, inplace=True)
        await write_table_to_storage(table, table_name, context.output_storage)

    return context
Add more verb tests (#1773) * Add NLP verb test * Add finalize_graph tests * Add more thorough final column assertions 2025-02-27 09:31:46 -08:00			`# Copyright (c) 2024 Microsoft Corporation.`
			`# Licensed under the MIT License`

			`from graphrag.config.create_graphrag_config import create_graphrag_config`
			`from graphrag.data_model.schemas import (`
			`ENTITIES_FINAL_COLUMNS,`
			`RELATIONSHIPS_FINAL_COLUMNS,`
			`)`
			`from graphrag.index.workflows.finalize_graph import (`
			`run_workflow,`
			`)`
			`from graphrag.utils.storage import load_table_from_storage, write_table_to_storage`

			`from .util import (`
			`DEFAULT_MODEL_CONFIG,`
			`create_test_context,`
			`load_test_table,`
			`)`


			`async def test_finalize_graph():`
			`context = await _prep_tables()`

			`config = create_graphrag_config({"models": DEFAULT_MODEL_CONFIG})`

Context property bag ("state") (#1774) * Add pipeline state property bag to run context * Move state creation out of context util * Move callbacks into PipelineRunContext * Semver * Rename state.json to context.json to avoid confusion with stats.json * Expand smoke test row count * Add util to create storage and cache 2025-02-28 09:31:48 -08:00			`await run_workflow(config, context)`
Add more verb tests (#1773) * Add NLP verb test * Add finalize_graph tests * Add more thorough final column assertions 2025-02-27 09:31:46 -08:00
Pipeline registration (#1940) * Move covariate run conditional * All pipeline registration * Fix method name construction * Rename context storage -> output_storage * Rename OutputConfig as generic StorageConfig * Reuse Storage model under InputConfig * Move input storage creation out of document loading * Move document loading into workflows * Semver * Fix smoke test config for new workflows * Fix unit tests --------- Co-authored-by: Alonso Guevara <alonsog@microsoft.com> 2025-06-12 16:14:39 -07:00			`nodes_actual = await load_table_from_storage("entities", context.output_storage)`
			`edges_actual = await load_table_from_storage(`
			`"relationships", context.output_storage`
			`)`
Add more verb tests (#1773) * Add NLP verb test * Add finalize_graph tests * Add more thorough final column assertions 2025-02-27 09:31:46 -08:00
Fix graph creation (#1905) * Add edge weight to all graph creation * Semver 2025-04-29 18:18:49 -07:00			`assert len(nodes_actual) == 291`
			`assert len(edges_actual) == 452`
Add more verb tests (#1773) * Add NLP verb test * Add finalize_graph tests * Add more thorough final column assertions 2025-02-27 09:31:46 -08:00
			`# x and y will be zero with the default configuration, because we do not embed/umap`
			`assert nodes_actual["x"].sum() == 0`
			`assert nodes_actual["y"].sum() == 0`

			`for column in ENTITIES_FINAL_COLUMNS:`
			`assert column in nodes_actual.columns`
			`for column in RELATIONSHIPS_FINAL_COLUMNS:`
			`assert column in edges_actual.columns`


			`async def test_finalize_graph_umap():`
			`context = await _prep_tables()`

			`config = create_graphrag_config({"models": DEFAULT_MODEL_CONFIG})`

			`config.embed_graph.enabled = True`
			`config.umap.enabled = True`

Context property bag ("state") (#1774) * Add pipeline state property bag to run context * Move state creation out of context util * Move callbacks into PipelineRunContext * Semver * Rename state.json to context.json to avoid confusion with stats.json * Expand smoke test row count * Add util to create storage and cache 2025-02-28 09:31:48 -08:00			`await run_workflow(config, context)`
Add more verb tests (#1773) * Add NLP verb test * Add finalize_graph tests * Add more thorough final column assertions 2025-02-27 09:31:46 -08:00
Pipeline registration (#1940) * Move covariate run conditional * All pipeline registration * Fix method name construction * Rename context storage -> output_storage * Rename OutputConfig as generic StorageConfig * Reuse Storage model under InputConfig * Move input storage creation out of document loading * Move document loading into workflows * Semver * Fix smoke test config for new workflows * Fix unit tests --------- Co-authored-by: Alonso Guevara <alonsog@microsoft.com> 2025-06-12 16:14:39 -07:00			`nodes_actual = await load_table_from_storage("entities", context.output_storage)`
			`edges_actual = await load_table_from_storage(`
			`"relationships", context.output_storage`
			`)`
Add more verb tests (#1773) * Add NLP verb test * Add finalize_graph tests * Add more thorough final column assertions 2025-02-27 09:31:46 -08:00
Fix graph creation (#1905) * Add edge weight to all graph creation * Semver 2025-04-29 18:18:49 -07:00			`assert len(nodes_actual) == 291`
			`assert len(edges_actual) == 452`
Add more verb tests (#1773) * Add NLP verb test * Add finalize_graph tests * Add more thorough final column assertions 2025-02-27 09:31:46 -08:00
			`# x and y should have some value other than zero due to umap`
			`assert nodes_actual["x"].sum() != 0`
			`assert nodes_actual["y"].sum() != 0`

			`for column in ENTITIES_FINAL_COLUMNS:`
			`assert column in nodes_actual.columns`
			`for column in RELATIONSHIPS_FINAL_COLUMNS:`
			`assert column in edges_actual.columns`


			`async def _prep_tables():`
			`context = await create_test_context(`
			`storage=["entities", "relationships"],`
			`)`

			`# edit the tables to eliminate final fields that wouldn't be on the inputs`
			`entities = load_test_table("entities")`
			`entities.drop(columns=["x", "y", "degree"], inplace=True)`
Pipeline registration (#1940) * Move covariate run conditional * All pipeline registration * Fix method name construction * Rename context storage -> output_storage * Rename OutputConfig as generic StorageConfig * Reuse Storage model under InputConfig * Move input storage creation out of document loading * Move document loading into workflows * Semver * Fix smoke test config for new workflows * Fix unit tests --------- Co-authored-by: Alonso Guevara <alonsog@microsoft.com> 2025-06-12 16:14:39 -07:00			`await write_table_to_storage(entities, "entities", context.output_storage)`
Add more verb tests (#1773) * Add NLP verb test * Add finalize_graph tests * Add more thorough final column assertions 2025-02-27 09:31:46 -08:00			`relationships = load_test_table("relationships")`
			`relationships.drop(columns=["combined_degree"], inplace=True)`
Pipeline registration (#1940) * Move covariate run conditional * All pipeline registration * Fix method name construction * Rename context storage -> output_storage * Rename OutputConfig as generic StorageConfig * Reuse Storage model under InputConfig * Move input storage creation out of document loading * Move document loading into workflows * Semver * Fix smoke test config for new workflows * Fix unit tests --------- Co-authored-by: Alonso Guevara <alonsog@microsoft.com> 2025-06-12 16:14:39 -07:00			`await write_table_to_storage(relationships, "relationships", context.output_storage)`
Add more verb tests (#1773) * Add NLP verb test * Add finalize_graph tests * Add more thorough final column assertions 2025-02-27 09:31:46 -08:00			`return context`