graphrag/tests/unit/indexing/verbs/text/test_split.py

# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
import unittest

import pandas as pd
import pytest

from graphrag.index.operations.split_text import split_text


class TestTextSplit(unittest.TestCase):
    """Tests for ``split_text`` called with ``","`` as the separator.

    Every test builds a DataFrame with a single ``"in"`` column, asks
    ``split_text`` to read ``"in"`` and write ``"out"``, and then checks the
    ``"out"`` value of each row via ``to_dict(orient="records")``.
    """

    def test_empty_string(self):
        """An empty string is expected to produce an empty list."""
        frame = pd.DataFrame([{"in": ""}])
        result = split_text(frame, "in", "out", ",").to_dict(orient="records")

        assert len(result) == 1
        assert result[0]["out"] == []

    def test_string_without_seperator(self):
        """Text containing no separator is expected to stay a single item."""
        frame = pd.DataFrame([{"in": "test_string_without_seperator"}])
        result = split_text(frame, "in", "out", ",").to_dict(orient="records")

        assert len(result) == 1
        assert result[0]["out"] == ["test_string_without_seperator"]

    def test_string_with_seperator(self):
        """Text containing the separator is expected to be split on it."""
        frame = pd.DataFrame([{"in": "test_1,test_2"}])
        result = split_text(frame, "in", "out", ",").to_dict(orient="records")

        assert len(result) == 1
        assert result[0]["out"] == ["test_1", "test_2"]

    def test_row_with_list_as_column(self):
        """A cell that already holds a list is expected to come back unchanged."""
        frame = pd.DataFrame([{"in": ["test_1", "test_2"]}])
        result = split_text(frame, "in", "out", ",").to_dict(orient="records")

        assert len(result) == 1
        assert result[0]["out"] == ["test_1", "test_2"]

    def test_non_string_column_throws_error(self):
        """A cell holding an integer is expected to raise ``TypeError``."""
        frame = pd.DataFrame([{"in": 5}])
        with pytest.raises(TypeError):
            split_text(frame, "in", "out", ",").to_dict(orient="records")

    def test_more_than_one_row_returns_correctly(self):
        """Each row is expected to be split independently, in input order."""
        frame = pd.DataFrame([{"in": "row_1_1,row_1_2"}, {"in": "row_2_1,row_2_2"}])
        result = split_text(frame, "in", "out", ",").to_dict(orient="records")

        assert len(result) == 2
        assert result[0]["out"] == ["row_1_1", "row_1_2"]
        assert result[1]["out"] == ["row_2_1", "row_2_2"]
Initial Release 2024-07-01 15:25:30 -06:00			`# Copyright (c) 2024 Microsoft Corporation.`
			`# Licensed under the MIT License`
			`import unittest`

			`import pandas as pd`
			`import pytest`

Migrate helper verbs (#1248) * Remove genid * Move snapshot_rows * Move snapshot * Delete spread_json * Delete unzip * Delete zip * Move unpack_graph * Move compute_edge_combined_degree * Delete create_graph * Delete concat * Delete text replace * Delete text_translate * Move text_split * Inline aggregate override * Move cluster_graph * Move merge_graphs * Semver * Move text_chunk * Move layout_graph and fix some __init__s * Move extract_covariates * Rename text_split -> split_text * Move extract_entities * Move summarize_descriptions * Rename text_chunk -> chunk_text * Move community report creation * Remove verb-level packing operators * Streamline some naming * Streamline param name/order * Move mock LLM data to tests * Fixed missed rename * Update some strategy refs * Rename run_gi * Inject mock responses into integ test config 2024-10-09 13:46:44 -07:00			`from graphrag.index.operations.split_text import split_text`
Initial Release 2024-07-01 15:25:30 -06:00

			`class TestTextSplit(unittest.TestCase):`
			`def test_empty_string(self):`
			`input = pd.DataFrame([{"in": ""}])`
Migrate helper verbs (#1248) * Remove genid * Move snapshot_rows * Move snapshot * Delete spread_json * Delete unzip * Delete zip * Move unpack_graph * Move compute_edge_combined_degree * Delete create_graph * Delete concat * Delete text replace * Delete text_translate * Move text_split * Inline aggregate override * Move cluster_graph * Move merge_graphs * Semver * Move text_chunk * Move layout_graph and fix some __init__s * Move extract_covariates * Rename text_split -> split_text * Move extract_entities * Move summarize_descriptions * Rename text_chunk -> chunk_text * Move community report creation * Remove verb-level packing operators * Streamline some naming * Streamline param name/order * Move mock LLM data to tests * Fixed missed rename * Update some strategy refs * Rename run_gi * Inject mock responses into integ test config 2024-10-09 13:46:44 -07:00			`result = split_text(input, "in", "out", ",").to_dict(orient="records")`
Initial Release 2024-07-01 15:25:30 -06:00
			`assert len(result) == 1`
			`assert result[0]["out"] == []`

			`def test_string_without_seperator(self):`
			`input = pd.DataFrame([{"in": "test_string_without_seperator"}])`
Migrate helper verbs (#1248) * Remove genid * Move snapshot_rows * Move snapshot * Delete spread_json * Delete unzip * Delete zip * Move unpack_graph * Move compute_edge_combined_degree * Delete create_graph * Delete concat * Delete text replace * Delete text_translate * Move text_split * Inline aggregate override * Move cluster_graph * Move merge_graphs * Semver * Move text_chunk * Move layout_graph and fix some __init__s * Move extract_covariates * Rename text_split -> split_text * Move extract_entities * Move summarize_descriptions * Rename text_chunk -> chunk_text * Move community report creation * Remove verb-level packing operators * Streamline some naming * Streamline param name/order * Move mock LLM data to tests * Fixed missed rename * Update some strategy refs * Rename run_gi * Inject mock responses into integ test config 2024-10-09 13:46:44 -07:00			`result = split_text(input, "in", "out", ",").to_dict(orient="records")`
Initial Release 2024-07-01 15:25:30 -06:00
			`assert len(result) == 1`
			`assert result[0]["out"] == ["test_string_without_seperator"]`

			`def test_string_with_seperator(self):`
			`input = pd.DataFrame([{"in": "test_1,test_2"}])`
Migrate helper verbs (#1248) * Remove genid * Move snapshot_rows * Move snapshot * Delete spread_json * Delete unzip * Delete zip * Move unpack_graph * Move compute_edge_combined_degree * Delete create_graph * Delete concat * Delete text replace * Delete text_translate * Move text_split * Inline aggregate override * Move cluster_graph * Move merge_graphs * Semver * Move text_chunk * Move layout_graph and fix some __init__s * Move extract_covariates * Rename text_split -> split_text * Move extract_entities * Move summarize_descriptions * Rename text_chunk -> chunk_text * Move community report creation * Remove verb-level packing operators * Streamline some naming * Streamline param name/order * Move mock LLM data to tests * Fixed missed rename * Update some strategy refs * Rename run_gi * Inject mock responses into integ test config 2024-10-09 13:46:44 -07:00			`result = split_text(input, "in", "out", ",").to_dict(orient="records")`
Initial Release 2024-07-01 15:25:30 -06:00
			`assert len(result) == 1`
			`assert result[0]["out"] == ["test_1", "test_2"]`

			`def test_row_with_list_as_column(self):`
			`input = pd.DataFrame([{"in": ["test_1", "test_2"]}])`
Migrate helper verbs (#1248) * Remove genid * Move snapshot_rows * Move snapshot * Delete spread_json * Delete unzip * Delete zip * Move unpack_graph * Move compute_edge_combined_degree * Delete create_graph * Delete concat * Delete text replace * Delete text_translate * Move text_split * Inline aggregate override * Move cluster_graph * Move merge_graphs * Semver * Move text_chunk * Move layout_graph and fix some __init__s * Move extract_covariates * Rename text_split -> split_text * Move extract_entities * Move summarize_descriptions * Rename text_chunk -> chunk_text * Move community report creation * Remove verb-level packing operators * Streamline some naming * Streamline param name/order * Move mock LLM data to tests * Fixed missed rename * Update some strategy refs * Rename run_gi * Inject mock responses into integ test config 2024-10-09 13:46:44 -07:00			`result = split_text(input, "in", "out", ",").to_dict(orient="records")`
Initial Release 2024-07-01 15:25:30 -06:00
			`assert len(result) == 1`
			`assert result[0]["out"] == ["test_1", "test_2"]`

			`def test_non_string_column_throws_error(self):`
			`input = pd.DataFrame([{"in": 5}])`
			`with pytest.raises(TypeError):`
Migrate helper verbs (#1248) * Remove genid * Move snapshot_rows * Move snapshot * Delete spread_json * Delete unzip * Delete zip * Move unpack_graph * Move compute_edge_combined_degree * Delete create_graph * Delete concat * Delete text replace * Delete text_translate * Move text_split * Inline aggregate override * Move cluster_graph * Move merge_graphs * Semver * Move text_chunk * Move layout_graph and fix some __init__s * Move extract_covariates * Rename text_split -> split_text * Move extract_entities * Move summarize_descriptions * Rename text_chunk -> chunk_text * Move community report creation * Remove verb-level packing operators * Streamline some naming * Streamline param name/order * Move mock LLM data to tests * Fixed missed rename * Update some strategy refs * Rename run_gi * Inject mock responses into integ test config 2024-10-09 13:46:44 -07:00			`split_text(input, "in", "out", ",").to_dict(orient="records")`
Initial Release 2024-07-01 15:25:30 -06:00
			`def test_more_than_one_row_returns_correctly(self):`
			`input = pd.DataFrame([{"in": "row_1_1,row_1_2"}, {"in": "row_2_1,row_2_2"}])`
Migrate helper verbs (#1248) * Remove genid * Move snapshot_rows * Move snapshot * Delete spread_json * Delete unzip * Delete zip * Move unpack_graph * Move compute_edge_combined_degree * Delete create_graph * Delete concat * Delete text replace * Delete text_translate * Move text_split * Inline aggregate override * Move cluster_graph * Move merge_graphs * Semver * Move text_chunk * Move layout_graph and fix some __init__s * Move extract_covariates * Rename text_split -> split_text * Move extract_entities * Move summarize_descriptions * Rename text_chunk -> chunk_text * Move community report creation * Remove verb-level packing operators * Streamline some naming * Streamline param name/order * Move mock LLM data to tests * Fixed missed rename * Update some strategy refs * Rename run_gi * Inject mock responses into integ test config 2024-10-09 13:46:44 -07:00			`result = split_text(input, "in", "out", ",").to_dict(orient="records")`
Initial Release 2024-07-01 15:25:30 -06:00
			`assert len(result) == 2`
			`assert result[0]["out"] == ["row_1_1", "row_1_2"]`
			`assert result[1]["out"] == ["row_2_1", "row_2_2"]`