refactor!: rename TextDocumentSplitter to DocumentSplitter (#6223)

* rename TextDocumentSplitter to DocumentSplitter

* reno

* fix init
Stefano Fiorucci authored 2023-11-03 11:33:20 +01:00, committed by GitHub
parent 6e2dbdc320
commit 063d27c522
8 changed files with 36 additions and 30 deletions

e2e/preview/pipelines/test_preprocessing_pipeline.py

@@ -3,7 +3,7 @@ import json
 from haystack.preview import Pipeline
 from haystack.preview.components.embedders import SentenceTransformersDocumentEmbedder
 from haystack.preview.components.file_converters import TextFileToDocument
-from haystack.preview.components.preprocessors import TextDocumentSplitter, DocumentCleaner, DocumentLanguageClassifier
+from haystack.preview.components.preprocessors import DocumentSplitter, DocumentCleaner, DocumentLanguageClassifier
 from haystack.preview.components.routers import FileTypeRouter
 from haystack.preview.components.writers import DocumentWriter
 from haystack.preview.document_stores import InMemoryDocumentStore
@@ -18,7 +18,7 @@ def test_preprocessing_pipeline(tmp_path):
     preprocessing_pipeline.add_component(instance=DocumentLanguageClassifier(), name="language_classifier")
     preprocessing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner")
     preprocessing_pipeline.add_component(
-        instance=TextDocumentSplitter(split_by="sentence", split_length=1), name="splitter"
+        instance=DocumentSplitter(split_by="sentence", split_length=1), name="splitter"
     )
     preprocessing_pipeline.add_component(
         instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
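Note: a minimal sketch of wiring the renamed component into a pipeline, using only the imports and parameters visible in the hunk above; the connect() call and its socket names are an assumption, not part of this diff:

from haystack.preview import Pipeline
from haystack.preview.components.preprocessors import DocumentCleaner, DocumentSplitter

pipeline = Pipeline()
pipeline.add_component(instance=DocumentCleaner(), name="cleaner")
pipeline.add_component(instance=DocumentSplitter(split_by="sentence", split_length=1), name="splitter")
# Socket names assumed: both components consume and produce "documents".
pipeline.connect("cleaner.documents", "splitter.documents")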

haystack/preview/components/preprocessors/__init__.py

@@ -1,6 +1,6 @@
-from haystack.preview.components.preprocessors.text_document_cleaner import DocumentCleaner
-from haystack.preview.components.preprocessors.text_document_splitter import TextDocumentSplitter
+from haystack.preview.components.preprocessors.document_cleaner import DocumentCleaner
+from haystack.preview.components.preprocessors.document_splitter import DocumentSplitter
 from haystack.preview.components.preprocessors.document_language_classifier import DocumentLanguageClassifier
 from haystack.preview.components.preprocessors.text_language_classifier import TextLanguageClassifier
 
-__all__ = ["TextDocumentSplitter", "DocumentCleaner", "TextLanguageClassifier", "DocumentLanguageClassifier"]
+__all__ = ["DocumentSplitter", "DocumentCleaner", "TextLanguageClassifier", "DocumentLanguageClassifier"]

haystack/preview/components/preprocessors/document_splitter.py (renamed from text_document_splitter.py)

@@ -7,7 +7,7 @@ from haystack.preview import component, Document
 
 @component
-class TextDocumentSplitter:
+class DocumentSplitter:
     """
     Splits a list of text documents into a list of text documents with shorter texts.
     This is useful for splitting documents with long texts that otherwise would not fit into the maximum text length of language models.
@@ -45,13 +45,13 @@ class TextDocumentSplitter:
         """
         if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
-            raise TypeError("TextDocumentSplitter expects a List of Documents as input.")
+            raise TypeError("DocumentSplitter expects a List of Documents as input.")
         split_docs = []
         for doc in documents:
             if doc.content is None:
                 raise ValueError(
-                    f"TextDocumentSplitter only works with text documents but document.content for document ID {doc.id} is None."
+                    f"DocumentSplitter only works with text documents but document.content for document ID {doc.id} is None."
                 )
             units = self._split_into_units(doc.content, self.split_by)
             text_splits = self._concatenate_units(units, self.split_length, self.split_overlap)
@@ -69,7 +69,7 @@ class TextDocumentSplitter:
             split_at = " "
         else:
             raise NotImplementedError(
-                "TextDocumentSplitter only supports 'passage', 'sentence' or 'word' split_by options."
+                "DocumentSplitter only supports 'passage', 'sentence' or 'word' split_by options."
            )
         units = text.split(split_at)
         # Add the delimiter back to all units except the last one
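Note: the unit-splitting step touched above is easy to see in isolation. A standalone sketch (the helper name is illustrative, not the component's verbatim code): split on the chosen delimiter, then re-attach it to every unit except the last so no characters are lost.

def split_into_units(text: str, split_at: str) -> list[str]:
    units = text.split(split_at)
    # Add the delimiter back to all units except the last one
    for i in range(len(units) - 1):
        units[i] += split_at
    return units

assert split_into_units("One. Two. Three.", ". ") == ["One. ", "Two. ", "Three."]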

pyproject.toml

@@ -98,7 +98,7 @@ preview = [
     "Jinja2",
     "openai",
     "pyyaml",
-    "more-itertools",  # TextDocumentSplitter
+    "more-itertools",  # DocumentSplitter
 ]
 inference = [
     "transformers[torch,sentencepiece]==4.34.1",

releasenotes/notes/… (new release note)

@@ -0,0 +1,6 @@
+---
+preview:
+  - |
+    rename `TextDocumentSplitter` to `DocumentSplitter`, to allow a better
+    distinction between Components that operate on text and those that operate
+    on Documents.
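For downstream code, the rename is a one-line change. A minimal before/after sketch based on the import paths shown in this diff:

# Before this commit:
# from haystack.preview.components.preprocessors import TextDocumentSplitter
# splitter = TextDocumentSplitter(split_by="sentence", split_length=1)

# After this commit:
from haystack.preview.components.preprocessors import DocumentSplitter

splitter = DocumentSplitter(split_by="sentence", split_length=1)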

test/preview/components/preprocessors/test_document_cleaner.py

@@ -10,9 +10,9 @@ class TestDocumentCleaner:
 
     @pytest.mark.unit
     def test_init(self):
         cleaner = DocumentCleaner()
-        assert cleaner.remove_empty_lines == True
-        assert cleaner.remove_extra_whitespaces == True
-        assert cleaner.remove_repeated_substrings == False
+        assert cleaner.remove_empty_lines is True
+        assert cleaner.remove_extra_whitespaces is True
+        assert cleaner.remove_repeated_substrings is False
         assert cleaner.remove_substrings is None
         assert cleaner.remove_regex is None

test/preview/components/preprocessors/test_document_splitter.py (renamed from test_text_document_splitter.py)

@@ -1,48 +1,48 @@
 import pytest
 
 from haystack.preview import Document
-from haystack.preview.components.preprocessors import TextDocumentSplitter
+from haystack.preview.components.preprocessors import DocumentSplitter
 
 
-class TestTextDocumentSplitter:
+class TestDocumentSplitter:
     @pytest.mark.unit
     def test_non_text_document(self):
         with pytest.raises(
-            ValueError, match="TextDocumentSplitter only works with text documents but document.content for document ID"
+            ValueError, match="DocumentSplitter only works with text documents but document.content for document ID"
         ):
-            splitter = TextDocumentSplitter()
+            splitter = DocumentSplitter()
             splitter.run(documents=[Document()])
 
     @pytest.mark.unit
     def test_single_doc(self):
-        with pytest.raises(TypeError, match="TextDocumentSplitter expects a List of Documents as input."):
-            splitter = TextDocumentSplitter()
+        with pytest.raises(TypeError, match="DocumentSplitter expects a List of Documents as input."):
+            splitter = DocumentSplitter()
             splitter.run(documents=Document())
 
     @pytest.mark.unit
     def test_empty_list(self):
-        splitter = TextDocumentSplitter()
+        splitter = DocumentSplitter()
         res = splitter.run(documents=[])
         assert res == {"documents": []}
 
     @pytest.mark.unit
     def test_unsupported_split_by(self):
         with pytest.raises(ValueError, match="split_by must be one of 'word', 'sentence' or 'passage'."):
-            TextDocumentSplitter(split_by="unsupported")
+            DocumentSplitter(split_by="unsupported")
 
     @pytest.mark.unit
     def test_unsupported_split_length(self):
         with pytest.raises(ValueError, match="split_length must be greater than 0."):
-            TextDocumentSplitter(split_length=0)
+            DocumentSplitter(split_length=0)
 
     @pytest.mark.unit
     def test_unsupported_split_overlap(self):
         with pytest.raises(ValueError, match="split_overlap must be greater than or equal to 0."):
-            TextDocumentSplitter(split_overlap=-1)
+            DocumentSplitter(split_overlap=-1)
 
     @pytest.mark.unit
     def test_split_by_word(self):
-        splitter = TextDocumentSplitter(split_by="word", split_length=10)
+        splitter = DocumentSplitter(split_by="word", split_length=10)
         result = splitter.run(
             documents=[
                 Document(
@@ -56,7 +56,7 @@ class TestTextDocumentSplitter:
 
     @pytest.mark.unit
     def test_split_by_word_multiple_input_docs(self):
-        splitter = TextDocumentSplitter(split_by="word", split_length=10)
+        splitter = DocumentSplitter(split_by="word", split_length=10)
         result = splitter.run(
             documents=[
                 Document(
@@ -76,7 +76,7 @@ class TestTextDocumentSplitter:
 
     @pytest.mark.unit
     def test_split_by_sentence(self):
-        splitter = TextDocumentSplitter(split_by="sentence", split_length=1)
+        splitter = DocumentSplitter(split_by="sentence", split_length=1)
         result = splitter.run(
             documents=[
                 Document(
@@ -91,7 +91,7 @@ class TestTextDocumentSplitter:
 
     @pytest.mark.unit
     def test_split_by_passage(self):
-        splitter = TextDocumentSplitter(split_by="passage", split_length=1)
+        splitter = DocumentSplitter(split_by="passage", split_length=1)
         result = splitter.run(
             documents=[
                 Document(
@@ -106,7 +106,7 @@ class TestTextDocumentSplitter:
 
     @pytest.mark.unit
     def test_split_by_word_with_overlap(self):
-        splitter = TextDocumentSplitter(split_by="word", split_length=10, split_overlap=2)
+        splitter = DocumentSplitter(split_by="word", split_length=10, split_overlap=2)
         result = splitter.run(
             documents=[
                 Document(
@@ -120,7 +120,7 @@ class TestTextDocumentSplitter:
 
     @pytest.mark.unit
     def test_source_id_stored_in_metadata(self):
-        splitter = TextDocumentSplitter(split_by="word", split_length=10)
+        splitter = DocumentSplitter(split_by="word", split_length=10)
         doc1 = Document(content="This is a text with some words.")
         doc2 = Document(content="This is a different text with some words.")
         result = splitter.run(documents=[doc1, doc2])
@@ -129,7 +129,7 @@ class TestTextDocumentSplitter:
 
     @pytest.mark.unit
     def test_copy_metadata(self):
-        splitter = TextDocumentSplitter(split_by="word", split_length=10)
+        splitter = DocumentSplitter(split_by="word", split_length=10)
         documents = [
             Document(content="Text.", meta={"name": "doc 0"}),
             Document(content="Text.", meta={"name": "doc 1"}),
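A rough usage sketch of the overlap option exercised in test_split_by_word_with_overlap; the chunking behavior described in the comment is an assumption from the parameter names, since the full test body is not shown in these hunks:

from haystack.preview import Document
from haystack.preview.components.preprocessors import DocumentSplitter

# Assumed semantics: chunks of split_length=10 words, with consecutive
# chunks sharing split_overlap=2 words.
splitter = DocumentSplitter(split_by="word", split_length=10, split_overlap=2)
result = splitter.run(documents=[Document(content="word " * 18)])
for doc in result["documents"]:
    print(repr(doc.content))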