# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
|
|
|
|
import unittest
|
|
|
|
|
|
|
|
import pandas as pd
|
|
|
|
import pytest
|
|
|
|
|
2024-10-09 13:46:44 -07:00
|
|
|
from graphrag.index.operations.split_text import split_text
|
2024-07-01 15:25:30 -06:00
|
|
|
|
|
|
|
|
|
|
|
class TestTextSplit(unittest.TestCase):
    """Checks on ``split_text`` reading column "in" and writing column "out"."""

    @staticmethod
    def _split_to_records(frame):
        """Call ``split_text`` on ``frame`` and return the result as row dicts.

        The call always uses "in" as the source column, "out" as the target
        column and "," as the fourth argument (presumably the separator,
        judging by the test names).
        """
        split = split_text(frame, "in", "out", ",")
        return split.to_dict(orient="records")

    def test_empty_string(self):
        """An empty string yields one row whose "out" value is an empty list."""
        frame = pd.DataFrame([{"in": ""}])

        records = self._split_to_records(frame)

        assert len(records) == 1
        assert records[0]["out"] == []

    def test_string_without_seperator(self):
        """A string containing no "," comes back as a one-element list."""
        frame = pd.DataFrame([{"in": "test_string_without_seperator"}])

        records = self._split_to_records(frame)

        assert len(records) == 1
        assert records[0]["out"] == ["test_string_without_seperator"]

    def test_string_with_seperator(self):
        """A string containing "," is broken into its two parts."""
        frame = pd.DataFrame([{"in": "test_1,test_2"}])

        records = self._split_to_records(frame)

        assert len(records) == 1
        assert records[0]["out"] == ["test_1", "test_2"]

    def test_row_with_list_as_column(self):
        """A cell that already holds a list produces the same list in "out"."""
        frame = pd.DataFrame([{"in": ["test_1", "test_2"]}])

        records = self._split_to_records(frame)

        assert len(records) == 1
        assert records[0]["out"] == ["test_1", "test_2"]

    def test_non_string_column_throws_error(self):
        """An integer cell is expected to raise ``TypeError``."""
        frame = pd.DataFrame([{"in": 5}])

        # Both the split and the conversion stay inside the context manager,
        # so an error raised by either step satisfies the expectation.
        with pytest.raises(TypeError):
            self._split_to_records(frame)

    def test_more_than_one_row_returns_correctly(self):
        """Each input row is split independently and row order is kept."""
        frame = pd.DataFrame([{"in": "row_1_1,row_1_2"}, {"in": "row_2_1,row_2_2"}])

        records = self._split_to_records(frame)

        assert len(records) == 2
        assert records[0]["out"] == ["row_1_1", "row_1_2"]
        assert records[1]["out"] == ["row_2_1", "row_2_2"]
|