feat: integrate two ready-made SuperComponents from haystack-experimental (#9235)
* Add super component decorator
* Add reno
* MultiFileConverter
* Add DocumentPreprocessor
* Add reno
* Add tests and change doc preprocessor to split first then clean
* Remove code from merge
* Add to pydoc and missing test file
* PR comments
* Lint fix
* Fix mypy
* Fix mypy
* Add comment
* PR comments
* Update haystack/components/converters/multi_file_converter.py (review suggestions; Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>)
* Update haystack/components/preprocessors/document_preprocessor.py (review suggestions; Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>)
* PR comments
* PR comment

---------

Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
This commit is contained in:
parent 4279d7e149
commit 19cf220136
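Taken together, the commit adds the convert-then-preprocess half of an indexing pipeline as two drop-in components. A minimal sketch of how they chain, based only on the APIs added in this diff (file names are illustrative, and the optional converter dependencies must be installed):

```python
from haystack.components.converters import MultiFileConverter
from haystack.components.preprocessors import DocumentPreprocessor

converter = MultiFileConverter()
preprocessor = DocumentPreprocessor(split_by="word", split_length=250)
converter.warm_up()
preprocessor.warm_up()

# Route each source to the right converter by mime type, join into one list.
docs = converter.run(sources=["report.pdf", "notes.md"], meta={})["documents"]
# Split first, then clean (the order this PR settled on).
processed = preprocessor.run(documents=docs)["documents"]
```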
@@ -10,6 +10,7 @@ loaders:
 "json",
 "markdown",
 "msg",
+"multi_file_converter",
 "openapi_functions",
 "output_adapter",
 "pdfminer",
@@ -5,6 +5,7 @@ loaders:
 "csv_document_cleaner",
 "csv_document_splitter",
 "document_cleaner",
+"document_preprocessor",
 "document_splitter",
 "hierarchical_document_splitter",
 "recursive_splitter",
@@ -15,6 +15,7 @@ _import_structure = {
     "json": ["JSONConverter"],
     "markdown": ["MarkdownToDocument"],
     "msg": ["MSGToDocument"],
+    "multi_file_converter": ["MultiFileConverter"],
     "openapi_functions": ["OpenAPIServiceToFunctions"],
     "output_adapter": ["OutputAdapter"],
     "pdfminer": ["PDFMinerToDocument"],
@@ -33,6 +34,7 @@ if TYPE_CHECKING:
     from .json import JSONConverter
     from .markdown import MarkdownToDocument
     from .msg import MSGToDocument
+    from .multi_file_converter import MultiFileConverter
     from .openapi_functions import OpenAPIServiceToFunctions
     from .output_adapter import OutputAdapter
     from .pdfminer import PDFMinerToDocument
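Both `__init__.py` hunks follow the package's lazy-import convention: a name is listed in `_import_structure` so it is resolved on first access, and re-imported under `TYPE_CHECKING` so static checkers still see it. A generic sketch of the idea (illustrative only; haystack wires this up through its own lazy importer, not this exact code):

```python
# Sketch of a lazy package __init__ using PEP 562 module __getattr__.
import importlib
from typing import TYPE_CHECKING

_import_structure = {"multi_file_converter": ["MultiFileConverter"]}

if TYPE_CHECKING:
    from .multi_file_converter import MultiFileConverter


def __getattr__(name: str):
    # Resolve e.g. "MultiFileConverter" to its submodule on first access.
    for module, names in _import_structure.items():
        if name in names:
            return getattr(importlib.import_module(f".{module}", __name__), name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```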
haystack/components/converters/multi_file_converter.py (new file, 118 lines)
@@ -0,0 +1,118 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from enum import Enum

from haystack import Pipeline, super_component
from haystack.components.converters import (
    CSVToDocument,
    DOCXToDocument,
    HTMLToDocument,
    JSONConverter,
    PPTXToDocument,
    PyPDFToDocument,
    TextFileToDocument,
    XLSXToDocument,
)
from haystack.components.joiners import DocumentJoiner
from haystack.components.routers import FileTypeRouter


class ConverterMimeType(str, Enum):
    CSV = "text/csv"
    DOCX = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    HTML = "text/html"
    JSON = "application/json"
    MD = "text/markdown"
    TEXT = "text/plain"
    PDF = "application/pdf"
    PPTX = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    XLSX = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"


@super_component
class MultiFileConverter:
    """
    A file converter that handles conversion of multiple file types.

    The MultiFileConverter handles the following file types:
    - CSV
    - DOCX
    - HTML
    - JSON
    - MD
    - TEXT
    - PDF (no OCR)
    - PPTX
    - XLSX

    Usage example:
    ```python
    from haystack.components.converters import MultiFileConverter

    converter = MultiFileConverter()
    converter.run(sources=["test.txt", "test.pdf"], meta={})
    ```
    """

    def __init__(self, encoding: str = "utf-8", json_content_key: str = "content") -> None:
        """
        Initialize the MultiFileConverter.

        :param encoding: The encoding to use when reading files.
        :param json_content_key: The key in the JSON source whose value is used as the document's content
            when converting JSON files.
        """
        self.encoding = encoding
        self.json_content_key = json_content_key

        # initialize components
        router = FileTypeRouter(
            mime_types=[mime_type.value for mime_type in ConverterMimeType],
            # Ensure common extensions are registered. Tests on Windows fail otherwise.
            additional_mimetypes={
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
                "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
            },
        )

        # Create pipeline and add components
        pp = Pipeline()

        # We use type ignore here to avoid type checking errors
        # This is due to how the run method within the Component protocol is defined
        pp.add_component("router", router)  # type: ignore[arg-type]
        pp.add_component("docx", DOCXToDocument(link_format="markdown"))  # type: ignore[arg-type]
        pp.add_component(
            "html",
            HTMLToDocument(  # type: ignore[arg-type]
                extraction_kwargs={"output_format": "markdown", "include_tables": True, "include_links": True}
            ),
        )
        pp.add_component("json", JSONConverter(content_key=self.json_content_key))  # type: ignore[arg-type]
        pp.add_component("md", TextFileToDocument(encoding=self.encoding))  # type: ignore[arg-type]
        pp.add_component("text", TextFileToDocument(encoding=self.encoding))  # type: ignore[arg-type]
        pp.add_component("pdf", PyPDFToDocument())  # type: ignore[arg-type]
        pp.add_component("pptx", PPTXToDocument())  # type: ignore[arg-type]
        pp.add_component("xlsx", XLSXToDocument())  # type: ignore[arg-type]
        pp.add_component("joiner", DocumentJoiner())  # type: ignore[arg-type]
        pp.add_component("csv", CSVToDocument(encoding=self.encoding))  # type: ignore[arg-type]

        # Route each mime type to the matching converter: the component names above are the
        # lowercased enum member names, e.g. str(ConverterMimeType.PDF) == "ConverterMimeType.PDF",
        # which lower() and rsplit(".") reduce to "pdf".
        for mime_type in ConverterMimeType:
            pp.connect(f"router.{mime_type.value}", str(mime_type).lower().rsplit(".", maxsplit=1)[-1])

        pp.connect("docx.documents", "joiner.documents")
        pp.connect("html.documents", "joiner.documents")
        pp.connect("json.documents", "joiner.documents")
        pp.connect("md.documents", "joiner.documents")
        pp.connect("text.documents", "joiner.documents")
        pp.connect("pdf.documents", "joiner.documents")
        pp.connect("pptx.documents", "joiner.documents")

        pp.connect("csv.documents", "joiner.documents")
        pp.connect("xlsx.documents", "joiner.documents")

        self.pipeline = pp
        self.output_mapping = {"joiner.documents": "documents", "router.unclassified": "unclassified"}
        self.input_mapping = {"sources": ["router.sources"], "meta": ["router.meta"]}
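The three closing assignments are the whole contract of the `@super_component` decorator: any class whose `__init__` sets `self.pipeline` plus the input/output mappings becomes a regular, runnable component. A minimal sketch of the same pattern with a hypothetical one-component wrapper (the class below is illustrative, not part of this PR):

```python
from haystack import Pipeline, super_component
from haystack.components.preprocessors import DocumentCleaner


@super_component
class CleanerOnly:
    """Hypothetical SuperComponent wrapping a single DocumentCleaner."""

    def __init__(self) -> None:
        pp = Pipeline()
        pp.add_component("cleaner", DocumentCleaner())

        self.pipeline = pp
        # Expose cleaner's "documents" sockets as the component's own.
        self.input_mapping = {"documents": ["cleaner.documents"]}
        self.output_mapping = {"cleaner.documents": "documents"}
```

`CleanerOnly().run(documents=[...])` then behaves like a plain component; the same recipe scales to the multi-branch pipeline above.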
@@ -11,6 +11,7 @@ _import_structure = {
     "csv_document_cleaner": ["CSVDocumentCleaner"],
     "csv_document_splitter": ["CSVDocumentSplitter"],
     "document_cleaner": ["DocumentCleaner"],
+    "document_preprocessor": ["DocumentPreprocessor"],
     "document_splitter": ["DocumentSplitter"],
     "hierarchical_document_splitter": ["HierarchicalDocumentSplitter"],
     "recursive_splitter": ["RecursiveDocumentSplitter"],
@@ -21,6 +22,7 @@ if TYPE_CHECKING:
     from .csv_document_cleaner import CSVDocumentCleaner
     from .csv_document_splitter import CSVDocumentSplitter
     from .document_cleaner import DocumentCleaner
+    from .document_preprocessor import DocumentPreprocessor
     from .document_splitter import DocumentSplitter
     from .hierarchical_document_splitter import HierarchicalDocumentSplitter
     from .recursive_splitter import RecursiveDocumentSplitter
haystack/components/preprocessors/document_preprocessor.py (new file, 192 lines)
@@ -0,0 +1,192 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Callable, Dict, List, Literal, Optional

from haystack import Pipeline, default_from_dict, default_to_dict, super_component
from haystack.components.preprocessors.document_cleaner import DocumentCleaner
from haystack.components.preprocessors.document_splitter import DocumentSplitter, Language
from haystack.utils import deserialize_callable, serialize_callable


@super_component
class DocumentPreprocessor:
    """
    A SuperComponent that first splits and then cleans documents.

    This component consists of a DocumentSplitter followed by a DocumentCleaner in a single pipeline.
    It takes a list of documents as input and returns a processed list of documents.

    Usage example:
    ```python
    from haystack import Document
    from haystack.components.preprocessors import DocumentPreprocessor

    doc = Document(content="I love pizza!")
    preprocessor = DocumentPreprocessor()
    result = preprocessor.run(documents=[doc])
    print(result["documents"])
    ```
    """

    def __init__(  # noqa: PLR0913 (too-many-arguments)
        self,
        *,
        # --- DocumentSplitter arguments ---
        split_by: Literal["function", "page", "passage", "period", "word", "line", "sentence"] = "word",
        split_length: int = 250,
        split_overlap: int = 0,
        split_threshold: int = 0,
        splitting_function: Optional[Callable[[str], List[str]]] = None,
        respect_sentence_boundary: bool = False,
        language: Language = "en",
        use_split_rules: bool = True,
        extend_abbreviations: bool = True,
        # --- DocumentCleaner arguments ---
        remove_empty_lines: bool = True,
        remove_extra_whitespaces: bool = True,
        remove_repeated_substrings: bool = False,
        keep_id: bool = False,
        remove_substrings: Optional[List[str]] = None,
        remove_regex: Optional[str] = None,
        unicode_normalization: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None,
        ascii_only: bool = False,
    ) -> None:
        """
        Initialize a DocumentPreprocessor that first splits and then cleans documents.

        **Splitter Parameters**:
        :param split_by: The unit of splitting: "function", "page", "passage", "period", "word", "line", or "sentence".
        :param split_length: The maximum number of units (words, lines, pages, and so on) in each split.
        :param split_overlap: The number of overlapping units between consecutive splits.
        :param split_threshold: The minimum number of units per split. If a split is smaller than this, it's merged
            with the previous split.
        :param splitting_function: A custom function for splitting if `split_by="function"`.
        :param respect_sentence_boundary: If `True`, splits by words but tries not to break inside a sentence.
        :param language: Language used by the sentence tokenizer if `split_by="sentence"` or
            `respect_sentence_boundary=True`.
        :param use_split_rules: Whether to apply additional splitting heuristics for the sentence splitter.
        :param extend_abbreviations: Whether to extend the sentence splitter with curated abbreviations for certain
            languages.

        **Cleaner Parameters**:
        :param remove_empty_lines: If `True`, removes empty lines.
        :param remove_extra_whitespaces: If `True`, removes extra whitespaces.
        :param remove_repeated_substrings: If `True`, removes repeated substrings like headers/footers across pages.
        :param keep_id: If `True`, keeps the original document IDs.
        :param remove_substrings: A list of strings to remove from the document content.
        :param remove_regex: A regex pattern whose matches will be removed from the document content.
        :param unicode_normalization: Unicode normalization form to apply to the text, for example `"NFC"`.
        :param ascii_only: If `True`, converts text to ASCII only.
        """
        # Store arguments for serialization
        self.remove_empty_lines = remove_empty_lines
        self.remove_extra_whitespaces = remove_extra_whitespaces
        self.remove_repeated_substrings = remove_repeated_substrings
        self.keep_id = keep_id
        self.remove_substrings = remove_substrings
        self.remove_regex = remove_regex
        self.unicode_normalization = unicode_normalization
        self.ascii_only = ascii_only

        self.split_by = split_by
        self.split_length = split_length
        self.split_overlap = split_overlap
        self.split_threshold = split_threshold
        self.splitting_function = splitting_function
        self.respect_sentence_boundary = respect_sentence_boundary
        self.language = language
        self.use_split_rules = use_split_rules
        self.extend_abbreviations = extend_abbreviations

        # Instantiate sub-components
        splitter = DocumentSplitter(
            split_by=self.split_by,
            split_length=self.split_length,
            split_overlap=self.split_overlap,
            split_threshold=self.split_threshold,
            splitting_function=self.splitting_function,
            respect_sentence_boundary=self.respect_sentence_boundary,
            language=self.language,
            use_split_rules=self.use_split_rules,
            extend_abbreviations=self.extend_abbreviations,
        )

        cleaner = DocumentCleaner(
            remove_empty_lines=self.remove_empty_lines,
            remove_extra_whitespaces=self.remove_extra_whitespaces,
            remove_repeated_substrings=self.remove_repeated_substrings,
            keep_id=self.keep_id,
            remove_substrings=self.remove_substrings,
            remove_regex=self.remove_regex,
            unicode_normalization=self.unicode_normalization,
            ascii_only=self.ascii_only,
        )

        # Build the Pipeline
        pp = Pipeline()

        # We use type ignore here to avoid type checking errors
        # This is due to how the run method within the Component protocol is defined
        pp.add_component("splitter", splitter)  # type: ignore[arg-type]
        pp.add_component("cleaner", cleaner)  # type: ignore[arg-type]

        # Connect the splitter output to cleaner
        pp.connect("splitter.documents", "cleaner.documents")
        self.pipeline = pp

        # Define how pipeline inputs/outputs map to sub-component inputs/outputs
        self.input_mapping = {
            # The pipeline input "documents" feeds into "splitter.documents"
            "documents": ["splitter.documents"]
        }
        # The pipeline output "documents" comes from "cleaner.documents"
        self.output_mapping = {"cleaner.documents": "documents"}

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize SuperComponent to a dictionary.

        :return:
            Dictionary with serialized data.
        """
        splitting_function = None
        if self.splitting_function is not None:
            splitting_function = serialize_callable(self.splitting_function)

        return default_to_dict(
            self,
            remove_empty_lines=self.remove_empty_lines,
            remove_extra_whitespaces=self.remove_extra_whitespaces,
            remove_repeated_substrings=self.remove_repeated_substrings,
            keep_id=self.keep_id,
            remove_substrings=self.remove_substrings,
            remove_regex=self.remove_regex,
            unicode_normalization=self.unicode_normalization,
            ascii_only=self.ascii_only,
            split_by=self.split_by,
            split_length=self.split_length,
            split_overlap=self.split_overlap,
            split_threshold=self.split_threshold,
            splitting_function=splitting_function,
            respect_sentence_boundary=self.respect_sentence_boundary,
            language=self.language,
            use_split_rules=self.use_split_rules,
            extend_abbreviations=self.extend_abbreviations,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DocumentPreprocessor":
        """
        Deserializes the SuperComponent from a dictionary.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized SuperComponent.
        """
        # Guard against None: to_dict serializes splitting_function as None when unset,
        # and deserialize_callable expects a dotted-path string.
        if data["init_parameters"].get("splitting_function"):
            data["init_parameters"]["splitting_function"] = deserialize_callable(
                data["init_parameters"]["splitting_function"]
            )

        return default_from_dict(cls, data)
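The custom `to_dict`/`from_dict` pair exists mainly so `splitting_function` survives serialization: `serialize_callable` stores the callable as a dotted import path, and `deserialize_callable` re-imports it. A small round-trip sketch under that assumption (the function name is illustrative):

```python
from haystack.components.preprocessors.document_preprocessor import DocumentPreprocessor


def split_on_pipe(text: str) -> list[str]:
    # Must live at module level so serialize_callable can record its import path.
    return [part for part in text.split("|") if part.strip()]


pre = DocumentPreprocessor(split_by="function", splitting_function=split_on_pipe)
data = pre.to_dict()
# data["init_parameters"]["splitting_function"] is now a dotted-path string,
# so the whole dict is YAML/JSON friendly.
restored = DocumentPreprocessor.from_dict(data)
```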
@@ -0,0 +1,9 @@
---
highlights: >
  Two ready-made SuperComponents simplify document preprocessing: MultiFileConverter and DocumentPreprocessor.
  For example, if you have all extra dependencies for file conversion installed, you can run the following:
  ```python
  from haystack.components.converters import MultiFileConverter
  converter = MultiFileConverter()
  converter.run(sources=["test.txt", "test.pdf"], meta={})
  ```
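The highlight only demonstrates the converter; a matching sketch for the second new SuperComponent, with parameter values mirroring the tests below (chosen for illustration):

```python
from haystack import Document
from haystack.components.preprocessors import DocumentPreprocessor

preprocessor = DocumentPreprocessor(split_by="word", split_length=3, split_overlap=1)
preprocessor.warm_up()

result = preprocessor.run(documents=[Document(content="This is a test document. It has multiple sentences.")])
print(result["documents"])  # chunks of at most three words, cleaned
```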
test/components/converters/test_multi_file_converter.py (new file, 145 lines)
@@ -0,0 +1,145 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import pytest

from haystack import Document, Pipeline
from haystack.core.pipeline.base import component_to_dict, component_from_dict
from haystack.core.component.component import Component
from haystack.dataclasses import ByteStream
from haystack.components.converters.multi_file_converter import MultiFileConverter


@pytest.fixture
def converter():
    converter = MultiFileConverter()
    converter.warm_up()
    return converter


class TestMultiFileConverter:
    def test_init_default_params(self, converter):
        """Test initialization with default parameters"""
        assert converter.encoding == "utf-8"
        assert converter.json_content_key == "content"
        assert isinstance(converter, Component)

    def test_init_custom_params(self, converter):
        """Test initialization with custom parameters"""
        converter = MultiFileConverter(encoding="latin-1", json_content_key="text")
        assert converter.encoding == "latin-1"
        assert converter.json_content_key == "text"

    def test_to_dict(self, converter):
        """Test serialization to dictionary"""
        data = component_to_dict(converter, "converter")
        assert data == {
            "type": "haystack.components.converters.multi_file_converter.MultiFileConverter",
            "init_parameters": {"encoding": "utf-8", "json_content_key": "content"},
        }

    def test_from_dict(self):
        """Test deserialization from dictionary"""
        data = {
            "type": "haystack.components.converters.multi_file_converter.MultiFileConverter",
            "init_parameters": {"encoding": "latin-1", "json_content_key": "text"},
        }
        conv = component_from_dict(MultiFileConverter, data, "converter")
        assert conv.encoding == "latin-1"
        assert conv.json_content_key == "text"

    @pytest.mark.parametrize(
        "suffix,file_path",
        [
            ("csv", "csv/sample_1.csv"),
            ("docx", "docx/sample_docx.docx"),
            ("html", "html/what_is_haystack.html"),
            ("json", "json/json_conversion_testfile.json"),
            ("md", "markdown/sample.md"),
            ("pdf", "pdf/sample_pdf_1.pdf"),
            ("pptx", "pptx/sample_pptx.pptx"),
            ("txt", "txt/doc_1.txt"),
            ("xlsx", "xlsx/table_empty_rows_and_columns.xlsx"),
        ],
    )
    @pytest.mark.integration
    def test_run(self, test_files_path, converter, suffix, file_path):
        unclassified_bytestream = ByteStream(b"unclassified content")
        unclassified_bytestream.meta["content_type"] = "unknown_type"

        paths = [test_files_path / file_path, unclassified_bytestream]

        output = converter.run(sources=paths)
        docs = output["documents"]
        unclassified = output["unclassified"]

        assert len(docs) == 1
        assert isinstance(docs[0], Document)
        assert docs[0].content is not None
        assert docs[0].meta["file_path"].endswith(suffix)

        assert len(unclassified) == 1
        assert isinstance(unclassified[0], ByteStream)
        assert unclassified[0].meta["content_type"] == "unknown_type"

    def test_run_with_meta(self, test_files_path, converter):
        """Test conversion with metadata"""
        paths = [test_files_path / "txt" / "doc_1.txt"]
        meta = {"language": "en", "author": "test"}
        output = converter.run(sources=paths, meta=meta)
        docs = output["documents"]
        assert docs[0].meta["language"] == "en"
        assert docs[0].meta["author"] == "test"

    def test_run_with_bytestream(self, test_files_path, converter):
        """Test converting ByteStream input"""
        bytestream = ByteStream(data=b"test content", mime_type="text/plain", meta={"file_path": "test.txt"})
        output = converter.run(sources=[bytestream])
        docs = output["documents"]
        assert len(docs) == 1
        assert docs[0].content == "test content"
        assert docs[0].meta["file_path"] == "test.txt"

    def test_run_error_handling(self, test_files_path, converter, caplog):
        """Test error handling for non-existent files"""
        paths = [test_files_path / "non_existent.txt"]
        with caplog.at_level("WARNING"):
            output = converter.run(sources=paths)
            assert "Could not read" in caplog.text
            assert len(output["documents"]) == 0

    @pytest.mark.integration
    def test_run_all_file_types(self, test_files_path, converter):
        """Test converting all supported file types in parallel"""
        paths = [
            test_files_path / "csv" / "sample_1.csv",
            test_files_path / "docx" / "sample_docx.docx",
            test_files_path / "html" / "what_is_haystack.html",
            test_files_path / "json" / "json_conversion_testfile.json",
            test_files_path / "markdown" / "sample.md",
            test_files_path / "txt" / "doc_1.txt",
            test_files_path / "pdf" / "sample_pdf_1.pdf",
            test_files_path / "pptx" / "sample_pptx.pptx",
            test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx",
        ]
        output = converter.run(sources=paths)
        docs = output["documents"]

        # Verify we got a document for each file
        assert len(docs) == len(paths)
        assert all(isinstance(doc, Document) for doc in docs)

    @pytest.mark.integration
    def test_run_in_pipeline(self, test_files_path, converter):
        pipeline = Pipeline(max_runs_per_component=1)
        pipeline.add_component("converter", converter)

        paths = [test_files_path / "txt" / "doc_1.txt", test_files_path / "pdf" / "sample_pdf_1.pdf"]

        output = pipeline.run(data={"sources": paths})
        docs = output["converter"]["documents"]

        assert len(docs) == 2
        assert all(isinstance(doc, Document) for doc in docs)
        assert all(doc.content is not None for doc in docs)
test/components/preprocessors/test_document_preprocessor.py (new file, 128 lines)
@@ -0,0 +1,128 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from unittest.mock import patch
import pytest

from haystack import Document, Pipeline
from haystack.components.preprocessors.document_preprocessor import DocumentPreprocessor


class TestDocumentPreprocessor:
    @pytest.fixture
    def preprocessor(self) -> DocumentPreprocessor:
        return DocumentPreprocessor(
            # Cleaner parameters
            remove_empty_lines=True,
            remove_extra_whitespaces=True,
            remove_repeated_substrings=False,
            keep_id=True,
            # Splitter parameters
            split_by="word",
            split_length=3,
            split_overlap=1,
            respect_sentence_boundary=False,
            language="en",
        )

    def test_init(self, preprocessor: DocumentPreprocessor) -> None:
        assert isinstance(preprocessor.pipeline, Pipeline)
        assert preprocessor.input_mapping == {"documents": ["splitter.documents"]}
        assert preprocessor.output_mapping == {"cleaner.documents": "documents"}

        cleaner = preprocessor.pipeline.get_component("cleaner")
        assert cleaner.remove_empty_lines is True
        assert cleaner.remove_extra_whitespaces is True
        assert cleaner.remove_repeated_substrings is False
        assert cleaner.keep_id is True

        splitter = preprocessor.pipeline.get_component("splitter")
        assert splitter.split_by == "word"
        assert splitter.split_length == 3
        assert splitter.split_overlap == 1
        assert splitter.respect_sentence_boundary is False
        assert splitter.language == "en"

    def test_from_dict(self) -> None:
        preprocessor = DocumentPreprocessor.from_dict(
            {
                "init_parameters": {
                    "remove_empty_lines": True,
                    "remove_extra_whitespaces": True,
                    "remove_repeated_substrings": False,
                    "keep_id": True,
                    "split_by": "word",
                    "split_length": 3,
                    "split_overlap": 1,
                    "respect_sentence_boundary": False,
                    "language": "en",
                },
                "type": "haystack.components.preprocessors.document_preprocessor.DocumentPreprocessor",
            }
        )
        assert isinstance(preprocessor, DocumentPreprocessor)

    def test_to_dict(self, preprocessor: DocumentPreprocessor) -> None:
        expected = {
            "init_parameters": {
                "remove_empty_lines": True,
                "remove_extra_whitespaces": True,
                "remove_repeated_substrings": False,
                "keep_id": True,
                "remove_substrings": None,
                "remove_regex": None,
                "unicode_normalization": None,
                "ascii_only": False,
                "split_by": "word",
                "split_length": 3,
                "split_overlap": 1,
                "split_threshold": 0,
                "splitting_function": None,
                "respect_sentence_boundary": False,
                "language": "en",
                "use_split_rules": True,
                "extend_abbreviations": True,
            },
            "type": "haystack.components.preprocessors.document_preprocessor.DocumentPreprocessor",
        }
        assert preprocessor.to_dict() == expected

    def test_warm_up(self, preprocessor: DocumentPreprocessor) -> None:
        with patch.object(preprocessor.pipeline, "warm_up") as mock_warm_up:
            preprocessor.warm_up()
            mock_warm_up.assert_called_once()

    def test_run(self, preprocessor: DocumentPreprocessor) -> None:
        documents = [
            Document(content="This is a test document. It has multiple sentences."),
            Document(content="Another test document with some content."),
        ]

        preprocessor.warm_up()
        result = preprocessor.run(documents=documents)

        # Check that we got processed documents back
        assert "documents" in result
        processed_docs = result["documents"]
        assert len(processed_docs) > len(documents)  # Should have more docs due to splitting

        # Check that the content was cleaned and split
        for doc in processed_docs:
            assert doc.content.strip() == doc.content
            assert len(doc.content.split()) <= 3  # Split length of 3 words
            assert doc.id is not None

    def test_run_with_custom_splitting_function(self) -> None:
        def custom_split(text: str) -> list[str]:
            return [t for t in text.split(".") if t.strip() != ""]

        preprocessor = DocumentPreprocessor(split_by="function", splitting_function=custom_split, split_length=1)

        documents = [Document(content="First sentence. Second sentence. Third sentence.")]
        preprocessor.warm_up()
        result = preprocessor.run(documents=documents)

        processed_docs = result["documents"]
        assert len(processed_docs) == 3  # Should be split into 3 sentences
        assert all("." not in doc.content for doc in processed_docs)  # Each doc should be a single sentence
test/test_files/json/json_conversion_testfile.json (new file, 1 line)
@@ -0,0 +1 @@
{"content": "Content from a json file"}