refactor!: rename TextDocumentSplitter to DocumentSplitter (#6223)

* rename TextDocumentSplitter to DocumentSplitter
* reno
* fix init

parent 6e2dbdc320
commit 063d27c522
@@ -3,7 +3,7 @@ import json
 from haystack.preview import Pipeline
 from haystack.preview.components.embedders import SentenceTransformersDocumentEmbedder
 from haystack.preview.components.file_converters import TextFileToDocument
-from haystack.preview.components.preprocessors import TextDocumentSplitter, DocumentCleaner, DocumentLanguageClassifier
+from haystack.preview.components.preprocessors import DocumentSplitter, DocumentCleaner, DocumentLanguageClassifier
 from haystack.preview.components.routers import FileTypeRouter
 from haystack.preview.components.writers import DocumentWriter
 from haystack.preview.document_stores import InMemoryDocumentStore
@@ -18,7 +18,7 @@ def test_preprocessing_pipeline(tmp_path):
     preprocessing_pipeline.add_component(instance=DocumentLanguageClassifier(), name="language_classifier")
     preprocessing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner")
     preprocessing_pipeline.add_component(
-        instance=TextDocumentSplitter(split_by="sentence", split_length=1), name="splitter"
+        instance=DocumentSplitter(split_by="sentence", split_length=1), name="splitter"
     )
     preprocessing_pipeline.add_component(
         instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
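
For readers following the hunk above, a minimal sketch of the pipeline-assembly API it exercises; the DocumentCleaner-to-DocumentSplitter wiring and the connect call are illustrative assumptions, not part of this diff:

from haystack.preview import Pipeline
from haystack.preview.components.preprocessors import DocumentCleaner, DocumentSplitter

# Components are registered under a name, as in the e2e test above.
pipeline = Pipeline()
pipeline.add_component(instance=DocumentCleaner(), name="cleaner")
pipeline.add_component(instance=DocumentSplitter(split_by="sentence", split_length=1), name="splitter")
# Assumption: output and input sockets are wired by "component.socket" names.
pipeline.connect("cleaner.documents", "splitter.documents")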
@@ -1,6 +1,6 @@
-from haystack.preview.components.preprocessors.text_document_cleaner import DocumentCleaner
-from haystack.preview.components.preprocessors.text_document_splitter import TextDocumentSplitter
+from haystack.preview.components.preprocessors.document_cleaner import DocumentCleaner
+from haystack.preview.components.preprocessors.document_splitter import DocumentSplitter
 from haystack.preview.components.preprocessors.document_language_classifier import DocumentLanguageClassifier
 from haystack.preview.components.preprocessors.text_language_classifier import TextLanguageClassifier

-__all__ = ["TextDocumentSplitter", "DocumentCleaner", "TextLanguageClassifier", "DocumentLanguageClassifier"]
+__all__ = ["DocumentSplitter", "DocumentCleaner", "TextLanguageClassifier", "DocumentLanguageClassifier"]
@@ -7,7 +7,7 @@ from haystack.preview import component, Document


 @component
-class TextDocumentSplitter:
+class DocumentSplitter:
     """
     Splits a list of text documents into a list of text documents with shorter texts.
     This is useful for splitting documents with long texts that otherwise would not fit into the maximum text length of language models.
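
The docstring above states what the component does; a minimal usage sketch under the new name (input text and parameters are illustrative):

from haystack.preview import Document
from haystack.preview.components.preprocessors import DocumentSplitter

# Split each input Document into one-sentence Documents.
splitter = DocumentSplitter(split_by="sentence", split_length=1)
result = splitter.run(documents=[Document(content="First sentence. Second sentence.")])
# Per the tests in this diff, run() returns a dict: result["documents"].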
@@ -45,13 +45,13 @@ class TextDocumentSplitter:
         """

         if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
-            raise TypeError("TextDocumentSplitter expects a List of Documents as input.")
+            raise TypeError("DocumentSplitter expects a List of Documents as input.")

         split_docs = []
         for doc in documents:
             if doc.content is None:
                 raise ValueError(
-                    f"TextDocumentSplitter only works with text documents but document.content for document ID {doc.id} is None."
+                    f"DocumentSplitter only works with text documents but document.content for document ID {doc.id} is None."
                 )
             units = self._split_into_units(doc.content, self.split_by)
             text_splits = self._concatenate_units(units, self.split_length, self.split_overlap)
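
The validation in the hunk above has two failure modes, now reported under the new name; a short sketch of how each triggers, mirroring the unit tests later in this diff:

from haystack.preview import Document
from haystack.preview.components.preprocessors import DocumentSplitter

splitter = DocumentSplitter()
try:
    splitter.run(documents=Document())  # not a list of Documents
except TypeError as err:
    print(err)  # DocumentSplitter expects a List of Documents as input.
try:
    splitter.run(documents=[Document()])  # Document whose content is None
except ValueError as err:
    print(err)  # DocumentSplitter only works with text documents but ...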
@@ -69,7 +69,7 @@ class TextDocumentSplitter:
            split_at = " "
        else:
            raise NotImplementedError(
-                "TextDocumentSplitter only supports 'passage', 'sentence' or 'word' split_by options."
+                "DocumentSplitter only supports 'passage', 'sentence' or 'word' split_by options."
            )
        units = text.split(split_at)
        # Add the delimiter back to all units except the last one
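
The trailing comment in the hunk above ("add the delimiter back") is what keeps splitting lossless; a standalone sketch of that idea (a hypothetical helper, not the commit's exact code):

def split_keeping_delimiter(text, split_at):
    # Split on the delimiter, then re-attach it to every unit except the last,
    # so that joining the units reproduces the original text exactly.
    units = text.split(split_at)
    return [unit + split_at for unit in units[:-1]] + units[-1:]

assert "".join(split_keeping_delimiter("one two three", " ")) == "one two three"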
@@ -98,7 +98,7 @@ preview = [
     "Jinja2",
     "openai",
     "pyyaml",
-    "more-itertools",  # TextDocumentSplitter
+    "more-itertools",  # DocumentSplitter
 ]
 inference = [
     "transformers[torch,sentencepiece]==4.34.1",
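
The dependency comment above ties more-itertools to DocumentSplitter, presumably for windowed chunking in _concatenate_units; a hedged sketch of that use (an assumption, not the commit's code):

from more_itertools import windowed

def concatenate_units(units, split_length, split_overlap):
    # Slide a window of split_length units, stepping by length minus overlap,
    # so that consecutive chunks share split_overlap units.
    step = split_length - split_overlap
    return ["".join(w) for w in windowed(units, split_length, fillvalue="", step=step)]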
@@ -0,0 +1,6 @@
+---
+preview:
+  - |
+    rename `TextDocumentSplitter` to `DocumentSplitter`, to allow a better
+    distinction between Components that operate on text and those that operate
+    on Documents.
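
Because the rename is breaking (note the refactor! prefix), downstream code needs a one-line migration, sketched here:

# Before this commit:
# from haystack.preview.components.preprocessors import TextDocumentSplitter
# splitter = TextDocumentSplitter(split_by="word", split_length=10)

# After this commit; only the class name changes, parameters are unchanged:
from haystack.preview.components.preprocessors import DocumentSplitter
splitter = DocumentSplitter(split_by="word", split_length=10)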
@@ -10,9 +10,9 @@ class TestDocumentCleaner:
     @pytest.mark.unit
     def test_init(self):
         cleaner = DocumentCleaner()
-        assert cleaner.remove_empty_lines == True
-        assert cleaner.remove_extra_whitespaces == True
-        assert cleaner.remove_repeated_substrings == False
+        assert cleaner.remove_empty_lines is True
+        assert cleaner.remove_extra_whitespaces is True
+        assert cleaner.remove_repeated_substrings is False
         assert cleaner.remove_substrings is None
         assert cleaner.remove_regex is None

@@ -1,48 +1,48 @@
 import pytest

 from haystack.preview import Document
-from haystack.preview.components.preprocessors import TextDocumentSplitter
+from haystack.preview.components.preprocessors import DocumentSplitter


-class TestTextDocumentSplitter:
+class TestDocumentSplitter:
     @pytest.mark.unit
     def test_non_text_document(self):
         with pytest.raises(
-            ValueError, match="TextDocumentSplitter only works with text documents but document.content for document ID"
+            ValueError, match="DocumentSplitter only works with text documents but document.content for document ID"
         ):
-            splitter = TextDocumentSplitter()
+            splitter = DocumentSplitter()
             splitter.run(documents=[Document()])

     @pytest.mark.unit
     def test_single_doc(self):
-        with pytest.raises(TypeError, match="TextDocumentSplitter expects a List of Documents as input."):
-            splitter = TextDocumentSplitter()
+        with pytest.raises(TypeError, match="DocumentSplitter expects a List of Documents as input."):
+            splitter = DocumentSplitter()
             splitter.run(documents=Document())

     @pytest.mark.unit
     def test_empty_list(self):
-        splitter = TextDocumentSplitter()
+        splitter = DocumentSplitter()
         res = splitter.run(documents=[])
         assert res == {"documents": []}

     @pytest.mark.unit
     def test_unsupported_split_by(self):
         with pytest.raises(ValueError, match="split_by must be one of 'word', 'sentence' or 'passage'."):
-            TextDocumentSplitter(split_by="unsupported")
+            DocumentSplitter(split_by="unsupported")

     @pytest.mark.unit
     def test_unsupported_split_length(self):
         with pytest.raises(ValueError, match="split_length must be greater than 0."):
-            TextDocumentSplitter(split_length=0)
+            DocumentSplitter(split_length=0)

     @pytest.mark.unit
     def test_unsupported_split_overlap(self):
         with pytest.raises(ValueError, match="split_overlap must be greater than or equal to 0."):
-            TextDocumentSplitter(split_overlap=-1)
+            DocumentSplitter(split_overlap=-1)

     @pytest.mark.unit
     def test_split_by_word(self):
-        splitter = TextDocumentSplitter(split_by="word", split_length=10)
+        splitter = DocumentSplitter(split_by="word", split_length=10)
         result = splitter.run(
             documents=[
                 Document(
@@ -56,7 +56,7 @@ class TestTextDocumentSplitter:

     @pytest.mark.unit
     def test_split_by_word_multiple_input_docs(self):
-        splitter = TextDocumentSplitter(split_by="word", split_length=10)
+        splitter = DocumentSplitter(split_by="word", split_length=10)
         result = splitter.run(
             documents=[
                 Document(
@@ -76,7 +76,7 @@ class TestTextDocumentSplitter:

     @pytest.mark.unit
     def test_split_by_sentence(self):
-        splitter = TextDocumentSplitter(split_by="sentence", split_length=1)
+        splitter = DocumentSplitter(split_by="sentence", split_length=1)
         result = splitter.run(
             documents=[
                 Document(
@@ -91,7 +91,7 @@ class TestTextDocumentSplitter:

     @pytest.mark.unit
     def test_split_by_passage(self):
-        splitter = TextDocumentSplitter(split_by="passage", split_length=1)
+        splitter = DocumentSplitter(split_by="passage", split_length=1)
         result = splitter.run(
             documents=[
                 Document(
@@ -106,7 +106,7 @@ class TestTextDocumentSplitter:

     @pytest.mark.unit
     def test_split_by_word_with_overlap(self):
-        splitter = TextDocumentSplitter(split_by="word", split_length=10, split_overlap=2)
+        splitter = DocumentSplitter(split_by="word", split_length=10, split_overlap=2)
         result = splitter.run(
             documents=[
                 Document(
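
For the overlap test above, the expected arithmetic under standard sliding-window semantics (the exact expected strings live in the truncated test body): with split_length=10 and split_overlap=2, each chunk starts 8 words after the previous one and repeats its last 2 words. A small illustration:

# Illustrative only: 15 words, window of 10, overlap of 2.
words = ["w%d" % i for i in range(15)]
step = 10 - 2
chunks = [words[i:i + 10] for i in range(0, len(words), step)]
print(len(chunks))    # 2: words[0:10] and words[8:15]
print(chunks[1][:2])  # ['w8', 'w9'], the 2-word overlap with the first chunk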
@@ -120,7 +120,7 @@ class TestTextDocumentSplitter:

     @pytest.mark.unit
     def test_source_id_stored_in_metadata(self):
-        splitter = TextDocumentSplitter(split_by="word", split_length=10)
+        splitter = DocumentSplitter(split_by="word", split_length=10)
         doc1 = Document(content="This is a text with some words.")
         doc2 = Document(content="This is a different text with some words.")
         result = splitter.run(documents=[doc1, doc2])
@@ -129,7 +129,7 @@ class TestTextDocumentSplitter:

     @pytest.mark.unit
     def test_copy_metadata(self):
-        splitter = TextDocumentSplitter(split_by="word", split_length=10)
+        splitter = DocumentSplitter(split_by="word", split_length=10)
         documents = [
             Document(content="Text.", meta={"name": "doc 0"}),
             Document(content="Text.", meta={"name": "doc 1"}),