Nathan Evans 61b3d6d56a
Migrate helper verbs (#1248)
* Remove genid

* Move snapshot_rows

* Move snapshot

* Delete spread_json

* Delete unzip

* Delete zip

* Move unpack_graph

* Move compute_edge_combined_degree

* Delete create_graph

* Delete concat

* Delete text replace

* Delete text_translate

* Move text_split

* Inline aggregate override

* Move cluster_graph

* Move merge_graphs

* Semver

* Move text_chunk

* Move layout_graph and fix some __init__s

* Move extract_covariates

* Rename text_split -> split_text

* Move extract_entities

* Move summarize_descriptions

* Rename text_chunk -> chunk_text

* Move community report creation

* Remove verb-level packing operators

* Streamline some naming

* Streamline param name/order

* Move mock LLM data to tests

* Fixed missed rename

* Update some strategy refs

* Rename run_gi

* Inject mock responses into integ test config
2024-10-09 13:46:44 -07:00

52 lines
1.8 KiB
Python

# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
import unittest
import pandas as pd
import pytest
from graphrag.index.operations.split_text import split_text
class TestTextSplit(unittest.TestCase):
    """Tests for ``split_text`` applied to one column of a DataFrame.

    Each test builds a frame with a single ``"in"`` column, calls
    ``split_text(frame, "in", "out", ",")`` and inspects the ``"out"`` value
    of the resulting records.
    """

    @staticmethod
    def _split_to_records(rows):
        """Build a DataFrame from *rows*, split it, and return record dicts.

        Args:
            rows: A list of ``{"in": value}`` dicts, one per DataFrame row.

        Returns:
            The ``split_text`` result converted with
            ``to_dict(orient="records")``.
        """
        # Named `frame` rather than `input` so the builtin is not shadowed.
        frame = pd.DataFrame(rows)
        return split_text(frame, "in", "out", ",").to_dict(orient="records")

    def test_empty_string(self):
        """An empty string is expected to produce an empty list."""
        records = self._split_to_records([{"in": ""}])

        assert len(records) == 1
        assert records[0]["out"] == []

    def test_string_without_seperator(self):
        """Text without the separator is expected to come back as one item."""
        records = self._split_to_records(
            [{"in": "test_string_without_seperator"}]
        )

        assert len(records) == 1
        assert records[0]["out"] == ["test_string_without_seperator"]

    def test_string_with_seperator(self):
        """Text containing the separator is expected to be split on it."""
        records = self._split_to_records([{"in": "test_1,test_2"}])

        assert len(records) == 1
        assert records[0]["out"] == ["test_1", "test_2"]

    def test_row_with_list_as_column(self):
        """A cell that already holds a list is expected to keep its items."""
        records = self._split_to_records([{"in": ["test_1", "test_2"]}])

        assert len(records) == 1
        assert records[0]["out"] == ["test_1", "test_2"]

    def test_non_string_column_throws_error(self):
        """A cell holding an integer is expected to raise ``TypeError``."""
        # Build the frame outside the `raises` block so that only the call
        # under test is allowed to raise.
        frame = pd.DataFrame([{"in": 5}])

        with pytest.raises(TypeError):
            split_text(frame, "in", "out", ",").to_dict(orient="records")

    def test_more_than_one_row_returns_correctly(self):
        """Each row is expected to be split independently, in row order."""
        records = self._split_to_records(
            [{"in": "row_1_1,row_1_2"}, {"in": "row_2_1,row_2_2"}]
        )

        assert len(records) == 2
        assert records[0]["out"] == ["row_1_1", "row_1_2"]
        assert records[1]["out"] == ["row_2_1", "row_2_2"]