# graphrag/tests/verbs/test_create_final_documents.py

# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

from graphrag.callbacks.noop_verb_callbacks import NoopVerbCallbacks
from graphrag.config.create_graphrag_config import create_graphrag_config
from graphrag.index.workflows.create_final_documents import (
    run_workflow,
    workflow_name,
)
from graphrag.utils.storage import load_table_from_storage

from .util import (
    compare_outputs,
    create_test_context,
    load_test_table,
)


async def test_create_final_documents():
    """Run the workflow with the default config and compare against the expected table.

    The expected table is loaded by workflow name, the workflow is run in a
    test context created with ``storage=["create_base_text_units"]``, and the
    table it stored under the workflow name is compared to the expected one.
    """
    expected_table = load_test_table(workflow_name)

    test_context = await create_test_context(storage=["create_base_text_units"])
    settings = create_graphrag_config()

    await run_workflow(settings, test_context, NoopVerbCallbacks())

    # Read back whatever the workflow stored under its own name.
    produced_table = await load_table_from_storage(
        workflow_name,
        test_context.storage,
    )

    compare_outputs(produced_table, expected_table)


async def test_create_final_documents_with_attribute_columns():
    """Run the workflow with ``title`` configured as a document attribute column.

    The stored output is compared to the expected table on the shared columns
    only; the presence of ``attributes`` and the absence of ``title`` are
    asserted separately.
    """
    expected_table = load_test_table(workflow_name)

    test_context = await create_test_context(storage=["create_base_text_units"])

    settings = create_graphrag_config()
    settings.input.document_attribute_columns = ["title"]

    await run_workflow(settings, test_context, NoopVerbCallbacks())

    produced_table = await load_table_from_storage(
        workflow_name,
        test_context.storage,
    )

    # The expected test table has no "attributes" column, so the comparison is
    # restricted to the columns both tables share.
    shared_columns = ["id", "human_readable_id", "text", "text_unit_ids"]
    compare_outputs(produced_table, expected_table, columns=shared_columns)

    # "title" should have been dropped and "attributes" added, giving the four
    # shared columns plus "attributes".
    output_columns = produced_table.columns
    assert len(output_columns) == 5
    assert "title" not in output_columns
    assert "attributes" in output_columns