feat: Add TikaDocumentConverter (2.0) (#5847)

* Add TikaFileToDocument component

* Add tests

* Add tika service to CI

* Add release note

* Change name

* PR feedback

* Fix naming in tests

* Fix tika version in CI

* Update tests

---------

Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>
This commit is contained in:
bogdankostic 2023-09-25 11:47:21 +02:00 committed by GitHub
parent 4da43b6b05
commit 9a4373bf8e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 200 additions and 3 deletions

View File

@ -831,8 +831,13 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest]
os: [ubuntu-latest]
runs-on: ${{ matrix.os }}
services:
tika:
image: apache/tika:2.9.0.0
ports:
- 9998:9998
steps:
- uses: actions/checkout@v4
@ -848,7 +853,7 @@ jobs:
- name: Install Haystack
# FIXME Use haystack-ai dependency list
run: pip install .[dev,inference] langdetect
run: pip install .[dev,inference,file-conversion] langdetect
- name: Run tests
run: |

View File

@ -1,3 +1,4 @@
from haystack.preview.components.file_converters.txt import TextFileToDocument
from haystack.preview.components.file_converters.tika import TikaDocumentConverter
__all__ = ["TextFileToDocument"]
__all__ = ["TextFileToDocument", "TikaDocumentConverter"]

View File

@ -0,0 +1,85 @@
import logging
from pathlib import Path
from typing import Optional, List, Union, Dict, Any
from haystack.preview.lazy_imports import LazyImport
from haystack.preview import component, Document, default_to_dict, default_from_dict
# `tika` is installed through the `file-conversion` extra (see the install hint in the message below),
# so it is imported inside a LazyImport context. The resulting `tika_import` handle is checked in
# `TikaDocumentConverter.__init__` via `tika_import.check()`.
with LazyImport("Run 'pip install farm-haystack[file-conversion]' or 'pip install tika'") as tika_import:
from tika import parser as tika_parser
# Module-level logger; the converter uses it to report skipped files and failed conversions.
logger = logging.getLogger(__name__)
@component
class TikaDocumentConverter:
    """
    Component that turns files of various formats (pdf, docx, html, etc.) into Documents.

    The actual parsing is delegated to [Apache Tika](https://tika.apache.org/), which means a
    Tika server has to be running and reachable at the configured URL.
    """

    def __init__(self, tika_url: str = "http://localhost:9998/tika", id_hash_keys: Optional[List[str]] = None):
        """
        Create a TikaDocumentConverter component.

        :param tika_url: URL of the Tika server. Default: `"http://localhost:9998/tika"`
        :param id_hash_keys: Names of the Document attributes used to generate the Document ID.
            Use this when texts alone are not unique but you still want to avoid duplicate Documents
            in your DocumentStore: for example, `["text", "category"]` builds the ID from the text
            and the content of the `category` field. Default: `None`
        """
        # Verify that the lazily imported `tika` package is available before doing anything else.
        tika_import.check()
        self.tika_url = tika_url
        self.id_hash_keys = id_hash_keys or []

    @component.output_types(documents=List[Document])
    def run(self, paths: List[Union[str, Path]], id_hash_keys: Optional[List[str]] = None):
        """
        Convert files to Documents.

        Files for which Tika returns no content are skipped with a warning. Files whose conversion
        raises an exception are logged as errors and skipped as well.

        :param paths: A list of paths to the files to convert.
        :param id_hash_keys: Names of the Document attributes used to generate the Document ID
            (for example `["text", "category"]`). If not set, the id_hash_keys passed to the
            constructor will be used. Default: `None`
        :return: A dictionary with the key `documents` holding the list of converted Documents.
        """
        effective_hash_keys = id_hash_keys or self.id_hash_keys
        # Only forward `id_hash_keys` to Document when there is something to forward.
        document_kwargs = {"id_hash_keys": effective_hash_keys} if effective_hash_keys else {}

        converted = []
        for source in paths:
            file_path = Path(source)
            try:
                text = tika_parser.from_file(file_path.as_posix(), self.tika_url)["content"]
                if not text:
                    logger.warning(
                        "Skipping file at '%s' as Tika was not able to extract any content.", str(file_path)
                    )
                    continue
                converted.append(Document(text=text, **document_kwargs))
            except Exception as error:
                # Best effort: one failing file must not abort the conversion of the remaining ones.
                logger.error("Could not convert file at '%s' to Document. Error: %s", str(file_path), error)

        return {"documents": converted}

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.
        """
        init_parameters = {"tika_url": self.tika_url, "id_hash_keys": self.id_hash_keys}
        return default_to_dict(self, **init_parameters)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "TikaDocumentConverter":
        """
        Deserialize this component from a dictionary.
        """
        return default_from_dict(cls, data)

View File

@ -0,0 +1,4 @@
---
preview:
- |
Add TikaDocumentConverter component to convert files of different types (PDF, DOCX, HTML, etc.) to Documents using a running Apache Tika server.

View File

@ -0,0 +1,102 @@
from unittest.mock import patch
import pytest
from haystack.preview.components.file_converters.tika import TikaDocumentConverter
class TestTikaDocumentConverter:
    """Unit and integration tests for TikaDocumentConverter."""

    # Patch target for the Tika parser call made inside the converter's `run` method.
    _FROM_FILE = "haystack.preview.components.file_converters.tika.tika_parser.from_file"

    @pytest.mark.unit
    def test_to_dict(self):
        """A converter built with defaults serializes its default init parameters."""
        expected = {
            "type": "TikaDocumentConverter",
            "init_parameters": {"tika_url": "http://localhost:9998/tika", "id_hash_keys": []},
        }
        assert TikaDocumentConverter().to_dict() == expected

    @pytest.mark.unit
    def test_to_dict_with_custom_init_parameters(self):
        """Custom init parameters show up unchanged in the serialized form."""
        converter = TikaDocumentConverter(tika_url="http://localhost:1234/tika", id_hash_keys=["text", "category"])
        expected = {
            "type": "TikaDocumentConverter",
            "init_parameters": {"tika_url": "http://localhost:1234/tika", "id_hash_keys": ["text", "category"]},
        }
        assert converter.to_dict() == expected

    @pytest.mark.unit
    def test_from_dict(self):
        """Deserialization restores the init parameters on the instance."""
        serialized = {
            "type": "TikaDocumentConverter",
            "init_parameters": {"tika_url": "http://localhost:9998/tika", "id_hash_keys": ["text", "category"]},
        }
        converter = TikaDocumentConverter.from_dict(serialized)
        assert converter.tika_url == "http://localhost:9998/tika"
        assert converter.id_hash_keys == ["text", "category"]

    @pytest.mark.unit
    def test_run(self):
        """The content returned by the (mocked) Tika parser becomes the Document text."""
        converter = TikaDocumentConverter()
        with patch(self._FROM_FILE, return_value={"content": "Content of mock_file.pdf"}):
            result = converter.run(paths=["mock_file.pdf"])
        documents = result["documents"]
        assert len(documents) == 1
        assert documents[0].text == "Content of mock_file.pdf"

    @pytest.mark.unit
    def test_run_logs_warning_if_content_empty(self, caplog):
        """Empty content from the (mocked) Tika parser is reported with a warning."""
        converter = TikaDocumentConverter()
        with patch(self._FROM_FILE, return_value={"content": ""}), caplog.at_level("WARNING"):
            converter.run(paths=["mock_file.pdf"])
            assert "Skipping file at 'mock_file.pdf' as Tika was not able to extract any content." in caplog.text

    @pytest.mark.unit
    def test_run_logs_error(self, caplog):
        """An exception raised by the (mocked) Tika parser is logged as an error."""
        converter = TikaDocumentConverter()
        with patch(self._FROM_FILE, side_effect=Exception("Some error")), caplog.at_level("ERROR"):
            converter.run(paths=["mock_file.pdf"])
            assert "Could not convert file at 'mock_file.pdf' to Document. Error: Some error" in caplog.text

    @pytest.mark.integration
    def test_run_with_txt_files(self, preview_samples_path):
        """Plain-text sample files are converted without mocking the parser."""
        txt_paths = [preview_samples_path / "txt" / file_name for file_name in ("doc_1.txt", "doc_2.txt")]
        documents = TikaDocumentConverter().run(paths=txt_paths)["documents"]
        assert len(documents) == 2
        assert "Some text for testing.\nTwo lines in here." in documents[0].text
        assert "This is a test line.\n123 456 789\n987 654 321" in documents[1].text

    @pytest.mark.integration
    def test_run_with_pdf_file(self, preview_samples_path):
        """PDF sample files are converted without mocking the parser."""
        pdf_paths = [preview_samples_path / "pdf" / file_name for file_name in ("sample_pdf_1.pdf", "sample_pdf_2.pdf")]
        documents = TikaDocumentConverter().run(paths=pdf_paths)["documents"]
        assert len(documents) == 2

        first_text = documents[0].text
        for snippet in ("A sample PDF file", "Page 2 of Sample PDF", "Page 4 of Sample PDF"):
            assert snippet in first_text

        second_text = documents[1].text
        for snippet in (
            "First Page",
            "Wiki engines usually allow content to be written using a simplified markup language",
            "This section needs additional citations for verification.",
            "This would make it easier for other users to find the article.",
        ):
            assert snippet in second_text

    @pytest.mark.integration
    def test_run_with_docx_file(self, preview_samples_path):
        """A DOCX sample file is converted without mocking the parser."""
        docx_path = preview_samples_path / "docx" / "sample_docx.docx"
        documents = TikaDocumentConverter().run(paths=[docx_path])["documents"]
        assert len(documents) == 1

        docx_text = documents[0].text
        for snippet in ("Sample Docx File", "Now we are in Page 2", "Page 3 was empty this is page 4"):
            assert snippet in docx_text

Binary file not shown.

Binary file not shown.