feat: Add TikaDocumentConverter (2.0) (#5847)

* Add TikaFileToDocument component
* Add tests
* Add tika service to CI
* Add release note
* Change name
* PR feedback
* Fix naming in tests
* Fix tika version in CI
* Update tests

---------

Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>
2026-01-04 11:07:52 +00:00 · 2023-09-25 11:47:21 +02:00 · 2023-09-25 11:47:21 +02:00 · 9a4373bf8e
commit 9a4373bf8e
parent 4da43b6b05
7 changed files with 200 additions and 3 deletions
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@ -831,8 +831,13 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        os: [ubuntu-latest, windows-latest]
+        os: [ubuntu-latest]
    runs-on: ${{ matrix.os }}
+    services:
+      tika:
+        image: apache/tika:2.9.0.0
+        ports:
+          - 9998:9998
    steps:
      - uses: actions/checkout@v4

@ -848,7 +853,7 @@ jobs:

      - name: Install Haystack
        # FIXME Use haystack-ai dependency list
-        run: pip install .[dev,inference] langdetect
+        run: pip install .[dev,inference,file-conversion] langdetect

      - name: Run tests
        run: |
--- a/haystack/preview/components/file_converters/__init__.py
+++ b/haystack/preview/components/file_converters/__init__.py
@ -1,3 +1,4 @@
 from haystack.preview.components.file_converters.txt import TextFileToDocument
+from haystack.preview.components.file_converters.tika import TikaDocumentConverter

-__all__ = ["TextFileToDocument"]
+__all__ = ["TextFileToDocument", "TikaDocumentConverter"]
--- a/haystack/preview/components/file_converters/tika.py
+++ b/haystack/preview/components/file_converters/tika.py
@ -0,0 +1,85 @@
+import logging
+from pathlib import Path
+from typing import Optional, List, Union, Dict, Any
+
+from haystack.preview.lazy_imports import LazyImport
+from haystack.preview import component, Document, default_to_dict, default_from_dict
+
+
+with LazyImport("Run 'pip install farm-haystack[file-conversion]' or 'pip install tika'") as tika_import:
+    from tika import parser as tika_parser
+
+logger = logging.getLogger(__name__)
+
+
+@component
+class TikaDocumentConverter:
+    """
+    A component for converting files of different types (pdf, docx, html, etc.) to Documents.
+    This component uses [Apache Tika](https://tika.apache.org/) for parsing the files and, therefore,
+    requires a running Tika server.
+
+    Only the text extracted by Tika is stored in the resulting Documents. Files that fail to parse, or
+    for which Tika returns no content, are logged and skipped instead of raising an error.
+    """
+
+    def __init__(self, tika_url: str = "http://localhost:9998/tika", id_hash_keys: Optional[List[str]] = None):
+        """
+        Create a TikaDocumentConverter component.
+
+        :param tika_url: URL of the Tika server. Default: `"http://localhost:9998/tika"`
+        :param id_hash_keys: Names of the Document attributes used to generate the Document ID
+            (for example `["text", "category"]`). Use this when the text alone is not unique but you still
+            want to avoid duplicate Documents in your DocumentStore: with the example above, the ID is
+            generated from the text together with the content of the `category` field.
+            Default: `None`, which is stored as an empty list.
+        """
+        # Presumably fails with the install hint above if the optional `tika` package is missing
+        # (behavior of LazyImport.check() is defined outside this file).
+        tika_import.check()
+        self.tika_url = tika_url
+        self.id_hash_keys = id_hash_keys or []
+
+    @component.output_types(documents=List[Document])
+    def run(self, paths: List[Union[str, Path]], id_hash_keys: Optional[List[str]] = None):
+        """
+        Convert files to Documents.
+
+        Each path is sent to the Tika server at `tika_url`. A file is skipped with a warning when Tika
+        extracts no content, and skipped with an error log when any exception occurs while converting it,
+        so a single bad file does not abort the whole call.
+
+        :param paths: A list of paths to the files to convert.
+        :param id_hash_keys: Names of the Document attributes used to generate the Document ID
+            (for example `["text", "category"]`). With this example, the ID is generated from the text
+            together with the content of the `category` field.
+            If not set (`None` or an empty list), the id_hash_keys passed to the constructor will be used.
+            Default: `None`
+        :return: A dictionary with the key `documents`, holding one Document per successfully converted
+            file, in the order of `paths`.
+        """
+        id_hash_keys = id_hash_keys or self.id_hash_keys
+
+        documents = []
+        for path in paths:
+            path = Path(path)
+            try:
+                parsed_file = tika_parser.from_file(path.as_posix(), self.tika_url)
+                extracted_text = parsed_file["content"]
+                if not extracted_text:
+                    logger.warning("Skipping file at '%s' as Tika was not able to extract any content.", str(path))
+                    continue
+                # Only forward id_hash_keys when there is something to forward; otherwise the
+                # Document is built without the argument.
+                if id_hash_keys:
+                    document = Document(text=extracted_text, id_hash_keys=id_hash_keys)
+                else:
+                    document = Document(text=extracted_text)
+                documents.append(document)
+            except Exception as e:
+                # Best effort: log the failure and continue with the remaining files.
+                logger.error("Could not convert file at '%s' to Document. Error: %s", str(path), e)
+
+        return {"documents": documents}
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+
+        :return: The result of `default_to_dict` called with this component's `tika_url` and `id_hash_keys`.
+        """
+        return default_to_dict(self, tika_url=self.tika_url, id_hash_keys=self.id_hash_keys)
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "TikaDocumentConverter":
+        """
+        Deserialize this component from a dictionary.
+
+        :param data: The dictionary to deserialize from; it is passed unchanged to `default_from_dict`.
+        :return: The deserialized component.
+        """
+        return default_from_dict(cls, data)
--- a/releasenotes/notes/add-tikaconverter-2.0-ef637f93114e6c96.yaml
+++ b/releasenotes/notes/add-tikaconverter-2.0-ef637f93114e6c96.yaml
@ -0,0 +1,4 @@
+---
+preview:
+  - |
+    Add TikaDocumentConverter component to convert files of different types to Documents.
--- a/test/preview/components/file_converters/test_tika_doc_converter.py
+++ b/test/preview/components/file_converters/test_tika_doc_converter.py
@ -0,0 +1,102 @@
+from unittest.mock import patch
+
+import pytest
+
+from haystack.preview.components.file_converters.tika import TikaDocumentConverter
+
+
+# Tests for TikaDocumentConverter. The unit tests patch `tika_parser.from_file`; the integration
+# tests call it unpatched, using the component's default `tika_url`.
+class TestTikaDocumentConverter:
+    # Default construction serializes the default URL and an empty id_hash_keys list.
+    @pytest.mark.unit
+    def test_to_dict(self):
+        component = TikaDocumentConverter()
+        data = component.to_dict()
+        assert data == {
+            "type": "TikaDocumentConverter",
+            "init_parameters": {"tika_url": "http://localhost:9998/tika", "id_hash_keys": []},
+        }
+
+    # Custom init parameters are serialized as given.
+    @pytest.mark.unit
+    def test_to_dict_with_custom_init_parameters(self):
+        component = TikaDocumentConverter(tika_url="http://localhost:1234/tika", id_hash_keys=["text", "category"])
+        data = component.to_dict()
+        assert data == {
+            "type": "TikaDocumentConverter",
+            "init_parameters": {"tika_url": "http://localhost:1234/tika", "id_hash_keys": ["text", "category"]},
+        }
+
+    # Deserialization restores both init parameters on the instance.
+    @pytest.mark.unit
+    def test_from_dict(self):
+        data = {
+            "type": "TikaDocumentConverter",
+            "init_parameters": {"tika_url": "http://localhost:9998/tika", "id_hash_keys": ["text", "category"]},
+        }
+        component = TikaDocumentConverter.from_dict(data)
+        assert component.tika_url == "http://localhost:9998/tika"
+        assert component.id_hash_keys == ["text", "category"]
+
+    # Happy path: the mocked parser returns content, which becomes the Document text.
+    @pytest.mark.unit
+    def test_run(self):
+        component = TikaDocumentConverter()
+        with patch("haystack.preview.components.file_converters.tika.tika_parser.from_file") as mock_tika_parser:
+            mock_tika_parser.return_value = {"content": "Content of mock_file.pdf"}
+            documents = component.run(paths=["mock_file.pdf"])["documents"]
+
+        assert len(documents) == 1
+        assert documents[0].text == "Content of mock_file.pdf"
+
+    # Empty content from the parser must produce the "Skipping file" warning.
+    @pytest.mark.unit
+    def test_run_logs_warning_if_content_empty(self, caplog):
+        component = TikaDocumentConverter()
+        with patch("haystack.preview.components.file_converters.tika.tika_parser.from_file") as mock_tika_parser:
+            mock_tika_parser.return_value = {"content": ""}
+            with caplog.at_level("WARNING"):
+                component.run(paths=["mock_file.pdf"])
+                assert "Skipping file at 'mock_file.pdf' as Tika was not able to extract any content." in caplog.text
+
+    # An exception raised by the parser must be logged as an error rather than propagated.
+    @pytest.mark.unit
+    def test_run_logs_error(self, caplog):
+        component = TikaDocumentConverter()
+        with patch("haystack.preview.components.file_converters.tika.tika_parser.from_file") as mock_tika_parser:
+            mock_tika_parser.side_effect = Exception("Some error")
+            with caplog.at_level("ERROR"):
+                component.run(paths=["mock_file.pdf"])
+                assert "Could not convert file at 'mock_file.pdf' to Document. Error: Some error" in caplog.text
+
+    # Integration: converts two plain-text sample files.
+    # `preview_samples_path` is a fixture defined outside this file — presumably the sample files directory.
+    @pytest.mark.integration
+    def test_run_with_txt_files(self, preview_samples_path):
+        component = TikaDocumentConverter()
+        output = component.run(
+            paths=[preview_samples_path / "txt" / "doc_1.txt", preview_samples_path / "txt" / "doc_2.txt"]
+        )
+        documents = output["documents"]
+        assert len(documents) == 2
+        assert "Some text for testing.\nTwo lines in here." in documents[0].text
+        assert "This is a test line.\n123 456 789\n987 654 321" in documents[1].text
+
+    # Integration: converts two PDF samples and checks for known phrases in each.
+    @pytest.mark.integration
+    def test_run_with_pdf_file(self, preview_samples_path):
+        component = TikaDocumentConverter()
+        output = component.run(
+            paths=[preview_samples_path / "pdf" / "sample_pdf_1.pdf", preview_samples_path / "pdf" / "sample_pdf_2.pdf"]
+        )
+        documents = output["documents"]
+        assert len(documents) == 2
+        assert "A sample PDF file" in documents[0].text
+        assert "Page 2 of Sample PDF" in documents[0].text
+        assert "Page 4 of Sample PDF" in documents[0].text
+        assert "First Page" in documents[1].text
+        assert (
+            "Wiki engines usually allow content to be written using a simplified markup language" in documents[1].text
+        )
+        assert "This section needs additional citations for verification." in documents[1].text
+        assert "This would make it easier for other users to find the article." in documents[1].text
+
+    # Integration: converts a DOCX sample and checks for known phrases.
+    @pytest.mark.integration
+    def test_run_with_docx_file(self, preview_samples_path):
+        component = TikaDocumentConverter()
+        output = component.run(paths=[preview_samples_path / "docx" / "sample_docx.docx"])
+        documents = output["documents"]
+        assert len(documents) == 1
+        assert "Sample Docx File" in documents[0].text
+        assert "Now we are in Page 2" in documents[0].text
+        assert "Page 3 was empty this is page 4" in documents[0].text
--- a/test/preview/test_files/docx/sample_docx.docx
+++ b/test/preview/test_files/docx/sample_docx.docx
--- a/test/preview/test_files/pdf/sample_pdf_2.pdf
+++ b/test/preview/test_files/pdf/sample_pdf_2.pdf