mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-04 11:07:52 +00:00
feat: Add TikaDocumentConverter (2.0) (#5847)
* Add TikaFileToDocument component
* Add tests
* Add tika service to CI
* Add release note
* Change name
* PR feedback
* Fix naming in tests
* Fix tika version in CI
* Update tests

Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>
This commit is contained in:
parent
4da43b6b05
commit
9a4373bf8e
9
.github/workflows/tests.yml
vendored
9
.github/workflows/tests.yml
vendored
@ -831,8 +831,13 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [ubuntu-latest, windows-latest]
|
||||
os: [ubuntu-latest]
|
||||
runs-on: ${{ matrix.os }}
|
||||
services:
|
||||
tika:
|
||||
image: apache/tika:2.9.0.0
|
||||
ports:
|
||||
- 9998:9998
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
@ -848,7 +853,7 @@ jobs:
|
||||
|
||||
- name: Install Haystack
|
||||
# FIXME Use haystack-ai dependency list
|
||||
run: pip install .[dev,inference] langdetect
|
||||
run: pip install .[dev,inference,file-conversion] langdetect
|
||||
|
||||
- name: Run tests
|
||||
run: |
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
from haystack.preview.components.file_converters.txt import TextFileToDocument
|
||||
from haystack.preview.components.file_converters.tika import TikaDocumentConverter
|
||||
|
||||
__all__ = ["TextFileToDocument"]
|
||||
__all__ = ["TextFileToDocument", "TikaDocumentConverter"]
|
||||
|
||||
85
haystack/preview/components/file_converters/tika.py
Normal file
85
haystack/preview/components/file_converters/tika.py
Normal file
@ -0,0 +1,85 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Union, Dict, Any
|
||||
|
||||
from haystack.preview.lazy_imports import LazyImport
|
||||
from haystack.preview import component, Document, default_to_dict, default_from_dict
|
||||
|
||||
|
||||
with LazyImport("Run 'pip install farm-haystack[file-conversion]' or 'pip install tika'") as tika_import:
|
||||
from tika import parser as tika_parser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@component
class TikaDocumentConverter:
    """
    Convert files of various formats (pdf, docx, html, etc.) into Documents.

    Parsing is delegated to [Apache Tika](https://tika.apache.org/), so a running Tika server
    must be reachable at the configured URL.
    """

    def __init__(self, tika_url: str = "http://localhost:9998/tika", id_hash_keys: Optional[List[str]] = None):
        """
        Create a TikaDocumentConverter component.

        :param tika_url: URL of the Tika server. Default: `"http://localhost:9998/tika"`
        :param id_hash_keys: Names of the Document attributes used to generate the Document ID.
            Useful to avoid duplicate Documents in a DocumentStore when texts alone are not unique:
            for example, `["text", "category"]` builds the ID from the text and the content of the
            `category` field. Default: `None`
        """
        # Fail early with the install hint if the optional `tika` dependency is missing.
        tika_import.check()
        self.tika_url = tika_url
        self.id_hash_keys = id_hash_keys if id_hash_keys else []

    @component.output_types(documents=List[Document])
    def run(self, paths: List[Union[str, Path]], id_hash_keys: Optional[List[str]] = None):
        """
        Convert files to Documents.

        Files for which Tika extracts no content are skipped with a warning. Any exception raised
        while converting a file is logged as an error and that file is skipped as well.

        :param paths: A list of paths to the files to convert.
        :param id_hash_keys: Names of the Document attributes used to generate the Document ID
            (for example `["text", "category"]`). If not set, the id_hash_keys passed to the
            constructor will be used. Default: `None`
        :return: A dictionary with a single `documents` key holding the list of converted Documents.
        """
        id_hash_keys = id_hash_keys or self.id_hash_keys
        # Forward id_hash_keys to Document only when there is something to forward, so that
        # Document falls back to its own default otherwise.
        document_kwargs = {"id_hash_keys": id_hash_keys} if id_hash_keys else {}

        documents = []
        for raw_path in paths:
            file_path = Path(raw_path)
            try:
                content = tika_parser.from_file(file_path.as_posix(), self.tika_url)["content"]
                if content:
                    documents.append(Document(text=content, **document_kwargs))
                else:
                    logger.warning(
                        "Skipping file at '%s' as Tika was not able to extract any content.", str(file_path)
                    )
            except Exception as e:
                # Best effort: one failing file must not abort the conversion of the others.
                logger.error("Could not convert file at '%s' to Document. Error: %s", str(file_path), e)

        return {"documents": documents}

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.
        """
        init_parameters = {"tika_url": self.tika_url, "id_hash_keys": self.id_hash_keys}
        return default_to_dict(self, **init_parameters)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "TikaDocumentConverter":
        """
        Deserialize this component from a dictionary.
        """
        return default_from_dict(cls, data)
|
||||
@ -0,0 +1,4 @@
|
||||
---
|
||||
preview:
|
||||
- |
|
||||
Add TikaDocumentConverter component to convert files of different types to Documents.
|
||||
@ -0,0 +1,102 @@
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from haystack.preview.components.file_converters.tika import TikaDocumentConverter
|
||||
|
||||
|
||||
class TestTikaDocumentConverter:
    """Unit and integration tests for TikaDocumentConverter."""

    # Dotted path of the Tika parsing call that the unit tests replace with a mock.
    _FROM_FILE = "haystack.preview.components.file_converters.tika.tika_parser.from_file"

    @pytest.mark.unit
    def test_to_dict(self):
        """Default init parameters are serialized, with id_hash_keys as an empty list."""
        serialized = TikaDocumentConverter().to_dict()
        expected = {
            "type": "TikaDocumentConverter",
            "init_parameters": {"tika_url": "http://localhost:9998/tika", "id_hash_keys": []},
        }
        assert serialized == expected

    @pytest.mark.unit
    def test_to_dict_with_custom_init_parameters(self):
        """Custom init parameters are serialized unchanged."""
        converter = TikaDocumentConverter(tika_url="http://localhost:1234/tika", id_hash_keys=["text", "category"])
        expected = {
            "type": "TikaDocumentConverter",
            "init_parameters": {"tika_url": "http://localhost:1234/tika", "id_hash_keys": ["text", "category"]},
        }
        assert converter.to_dict() == expected

    @pytest.mark.unit
    def test_from_dict(self):
        """A serialized component is restored with the same init parameters."""
        converter = TikaDocumentConverter.from_dict(
            {
                "type": "TikaDocumentConverter",
                "init_parameters": {"tika_url": "http://localhost:9998/tika", "id_hash_keys": ["text", "category"]},
            }
        )
        assert converter.tika_url == "http://localhost:9998/tika"
        assert converter.id_hash_keys == ["text", "category"]

    @pytest.mark.unit
    def test_run(self):
        """The content returned by the (mocked) Tika parser becomes the Document text."""
        converter = TikaDocumentConverter()
        with patch(self._FROM_FILE, return_value={"content": "Content of mock_file.pdf"}):
            result = converter.run(paths=["mock_file.pdf"])

        documents = result["documents"]
        assert len(documents) == 1
        assert documents[0].text == "Content of mock_file.pdf"

    @pytest.mark.unit
    def test_run_logs_warning_if_content_empty(self, caplog):
        """A warning is logged when the parser returns empty content."""
        converter = TikaDocumentConverter()
        with patch(self._FROM_FILE, return_value={"content": ""}), caplog.at_level("WARNING"):
            converter.run(paths=["mock_file.pdf"])

        assert "Skipping file at 'mock_file.pdf' as Tika was not able to extract any content." in caplog.text

    @pytest.mark.unit
    def test_run_logs_error(self, caplog):
        """An exception raised by the parser is logged as an error rather than propagated."""
        converter = TikaDocumentConverter()
        with patch(self._FROM_FILE, side_effect=Exception("Some error")), caplog.at_level("ERROR"):
            converter.run(paths=["mock_file.pdf"])

        assert "Could not convert file at 'mock_file.pdf' to Document. Error: Some error" in caplog.text

    @pytest.mark.integration
    def test_run_with_txt_files(self, preview_samples_path):
        """Plain-text sample files are converted, one Document per file, in input order."""
        txt_dir = preview_samples_path / "txt"
        documents = TikaDocumentConverter().run(paths=[txt_dir / "doc_1.txt", txt_dir / "doc_2.txt"])["documents"]

        assert len(documents) == 2
        assert "Some text for testing.\nTwo lines in here." in documents[0].text
        assert "This is a test line.\n123 456 789\n987 654 321" in documents[1].text

    @pytest.mark.integration
    def test_run_with_pdf_file(self, preview_samples_path):
        """PDF sample files are converted and contain text from several of their pages."""
        pdf_dir = preview_samples_path / "pdf"
        documents = TikaDocumentConverter().run(paths=[pdf_dir / "sample_pdf_1.pdf", pdf_dir / "sample_pdf_2.pdf"])[
            "documents"
        ]

        assert len(documents) == 2
        for snippet in ("A sample PDF file", "Page 2 of Sample PDF", "Page 4 of Sample PDF"):
            assert snippet in documents[0].text
        for snippet in (
            "First Page",
            "Wiki engines usually allow content to be written using a simplified markup language",
            "This section needs additional citations for verification.",
            "This would make it easier for other users to find the article.",
        ):
            assert snippet in documents[1].text

    @pytest.mark.integration
    def test_run_with_docx_file(self, preview_samples_path):
        """The DOCX sample file is converted and contains text from several of its pages."""
        docx_path = preview_samples_path / "docx" / "sample_docx.docx"
        documents = TikaDocumentConverter().run(paths=[docx_path])["documents"]

        assert len(documents) == 1
        for snippet in ("Sample Docx File", "Now we are in Page 2", "Page 3 was empty this is page 4"):
            assert snippet in documents[0].text
|
||||
BIN
test/preview/test_files/docx/sample_docx.docx
Normal file
BIN
test/preview/test_files/docx/sample_docx.docx
Normal file
Binary file not shown.
BIN
test/preview/test_files/pdf/sample_pdf_2.pdf
Normal file
BIN
test/preview/test_files/pdf/sample_pdf_2.pdf
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user