Add AzureConverter to support table parsing from documents (#1813)

* Add FormRecognizerConverter

* Change signature of convert method + change return type of all converters (usage sketch below)

* Adapt preprocessing util to new return type of converters

* Parametrize number of lines used for surrounding context of table

* Change name from FormRecognizerConverter to AzureConverter

* Set version of azure-ai-formrecognizer package

* Change tutorial 8 based on new return type of converters

* Add tests

* Add latest docstring and tutorial changes

* Fix typo

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
bogdankostic 2021-11-29 18:44:20 +01:00 committed by GitHub
parent c29f960c47
commit eb5f7bb4c0
16 changed files with 254 additions and 44 deletions
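
The central API change behind these bullets is that every converter's `convert()` now returns a `List[Dict[str, Any]]` instead of a single dict, so that converters which extract tables (like the new `AzureConverter`) can emit several documents per file. A minimal sketch of what this means for callers (file paths and the endpoint/key values are placeholders; the imports follow the updated test file below):

```python
from haystack.nodes import TextConverter, AzureConverter

# Text-only converters still produce one document, now wrapped in a one-element list,
# so existing call sites simply index with [0]:
converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
doc_txt = converter.convert(file_path="my_document.txt", meta=None)[0]

# AzureConverter may return several documents per file:
# one per extracted table, plus one text document for the remaining content.
azure_converter = AzureConverter(endpoint="<your-endpoint>", credential_key="<your-key>")
docs = azure_converter.convert(file_path="my_document.pdf")
```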

View File

@@ -73,13 +73,13 @@ For converting PDFs, try changing the encoding to UTF-8 if the conversion isn't
# Here are some examples of how you would use file converters
converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None)
doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None)[0]
converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)
doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)[0]
converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=["en"])
doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)
doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)[0]
```

View File

@@ -15,6 +15,7 @@ from haystack.nodes.file_converter import (
    TikaConverter,
    TikaXHTMLParser,
    TextConverter,
    AzureConverter
)
from haystack.nodes.other import Docs2Answers, JoinDocuments
from haystack.nodes.preprocessor import BasePreProcessor, PreProcessor

View File

@@ -5,3 +5,4 @@ from haystack.nodes.file_converter.markdown import MarkdownConverter
from haystack.nodes.file_converter.pdf import PDFToTextConverter, PDFToTextOCRConverter
from haystack.nodes.file_converter.tika import TikaConverter, TikaXHTMLParser
from haystack.nodes.file_converter.txt import TextConverter
from haystack.nodes.file_converter.azure import AzureConverter

View File

@@ -0,0 +1,191 @@
import logging
from pathlib import Path
from typing import List, Optional, Dict, Any
from collections import defaultdict
import json

from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzeResult
from azure.core.credentials import AzureKeyCredential

from haystack.nodes.file_converter import BaseConverter

logger = logging.getLogger(__name__)


class AzureConverter(BaseConverter):
    """
    File converter that makes use of Microsoft Azure's Form Recognizer service
    (https://azure.microsoft.com/en-us/services/form-recognizer/).
    This Converter extracts both text and tables.
    Supported file formats are: PDF, JPEG, PNG, BMP and TIFF.

    In order to be able to use this Converter, you need an active Azure account
    and a Form Recognizer or Cognitive Services resource.
    (Here you can find information on how to set this up:
    https://docs.microsoft.com/en-us/azure/applied-ai-services/form-recognizer/quickstarts/try-v3-python-sdk#prerequisites)
    """

    def __init__(self,
                 endpoint: str,
                 credential_key: str,
                 model_id: str = "prebuilt-document",
                 valid_languages: Optional[List[str]] = None,
                 save_json: bool = False,
                 surrounding_context_len: int = 3,
                 ):
        """
        :param endpoint: Your Form Recognizer or Cognitive Services resource's endpoint.
        :param credential_key: Your Form Recognizer or Cognitive Services resource's subscription key.
        :param model_id: The identifier of the model you want to use to extract information out of your file.
                         Default: "prebuilt-document". General purpose models are "prebuilt-document"
                         and "prebuilt-layout".
                         List of available prebuilt models:
                         https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.2.0b1/index.html#documentanalysisclient
        :param valid_languages: Validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add tests for encoding errors. If the extracted text is
                                not one of the valid languages, it is likely an encoding error resulting
                                in garbled text.
        :param save_json: Whether to save the output of the Form Recognizer to a JSON file.
        :param surrounding_context_len: Number of lines before and after a table to extract as surrounding context.
        """
        # save init parameters to enable export of component config as YAML
        self.set_config(endpoint=endpoint, credential_key=credential_key, model_id=model_id,
                        valid_languages=valid_languages, save_json=save_json,
                        surrounding_context_len=surrounding_context_len)

        self.document_analysis_client = DocumentAnalysisClient(endpoint=endpoint,
                                                               credential=AzureKeyCredential(credential_key))
        self.model_id = model_id
        self.valid_languages = valid_languages
        self.save_json = save_json
        self.surrounding_context_len = surrounding_context_len

        super().__init__(valid_languages=valid_languages)

    def convert(self,
                file_path: Path,
                meta: Optional[Dict[str, str]] = None,
                remove_numeric_tables: Optional[bool] = None,
                valid_languages: Optional[List[str]] = None,
                encoding: Optional[str] = "utf-8",
                pages: Optional[str] = None,
                known_language: Optional[str] = None,
                ) -> List[Dict[str, Any]]:
        """
        Extract text and tables from a PDF, JPEG, PNG, BMP or TIFF file using Azure's Form Recognizer service.

        :param file_path: Path to the file you want to convert.
        :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
                     Can be any custom keys and values.
        :param remove_numeric_tables: Not applicable.
        :param valid_languages: Validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add tests for encoding errors. If the extracted text is
                                not one of the valid languages, it is likely an encoding error resulting
                                in garbled text.
        :param encoding: Not applicable.
        :param pages: Custom page numbers for multi-page documents (PDF/TIFF). Input the page numbers and/or ranges
                      of pages you want to get in the result. For a range of pages, use a hyphen,
                      like pages=1-3, 5-6. Separate each page number or range with a comma.
        :param known_language: Locale hint of the input document.
                               See supported locales here: https://aka.ms/azsdk/formrecognizer/supportedlocales.
        """
        if valid_languages is None:
            valid_languages = self.valid_languages

        with open(file_path, "rb") as file:
            poller = self.document_analysis_client.begin_analyze_document(self.model_id, file, pages=pages,
                                                                          locale=known_language)
            result = poller.result()

        tables = self._convert_tables(result, meta)
        text = self._convert_text(result, meta)
        docs = tables + [text]

        if valid_languages:
            file_text = text["content"] + " ".join([cell for table in tables for row in table["content"] for cell in row])
            if not self.validate_language(file_text):
                logger.warning(
                    f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
                    f"been decoded in the correct text format."
                )

        if self.save_json:
            with open(str(file_path) + ".json", "w") as json_file:
                json.dump(result.to_dict(), json_file, indent=2)

        return docs

    def _convert_tables(self, result: AnalyzeResult, meta: Optional[Dict[str, str]]) -> List[Dict[str, Any]]:
        converted_tables = []

        for table in result.tables:
            # Initialize table with empty cells
            table_list = [[""] * table.column_count for _ in range(table.row_count)]

            for cell in table.cells:
                # Remove ':selected:'/':unselected:' tags from cell's content
                cell.content = cell.content.replace(":selected:", "")
                cell.content = cell.content.replace(":unselected:", "")
                for c in range(cell.column_span):
                    for r in range(cell.row_span):
                        table_list[cell.row_index + r][cell.column_index + c] = cell.content

            caption = ""
            # Check if all column names are the same -> exclude these cells and use as caption
            if all(col_name == table_list[0][0] for col_name in table_list[0]):
                caption = table_list[0][0]
                table_list.pop(0)

            # Get preceding context of table
            table_beginning_page = next(page for page in result.pages
                                        if page.page_number == table.bounding_regions[0].page_number)
            table_start_offset = table.spans[0].offset
            preceding_lines = [line.content for line in table_beginning_page.lines
                               if line.spans[0].offset < table_start_offset]
            preceding_context = f"{caption}\n".strip() + "\n".join(preceding_lines[-self.surrounding_context_len:])

            # Get following context
            table_end_page = table_beginning_page if len(table.bounding_regions) == 1 else \
                next(page for page in result.pages
                     if page.page_number == table.bounding_regions[-1].page_number)
            table_end_offset = table_start_offset + table.spans[0].length
            following_lines = [line.content for line in table_end_page.lines if line.spans[0].offset > table_end_offset]
            following_context = "\n".join(following_lines[:self.surrounding_context_len])

            if isinstance(meta, dict):
                meta["preceding_context"] = preceding_context
                meta["following_context"] = following_context
            else:
                meta = {"preceding_context": preceding_context, "following_context": following_context}

            converted_tables.append({"content": table_list, "content_type": "table", "meta": meta})

        return converted_tables

    def _convert_text(self, result: AnalyzeResult, meta: Optional[Dict[str, str]]) -> Dict[str, Any]:
        text = ""
        table_spans_by_page = defaultdict(list)
        for table in result.tables:
            table_spans_by_page[table.bounding_regions[0].page_number].append(table.spans[0])

        for page in result.pages:
            tables_on_page = table_spans_by_page[page.page_number]
            for line in page.lines:
                in_table = False
                # Check if line is part of a table
                for table in tables_on_page:
                    if table.offset <= line.spans[0].offset <= table.offset + table.length:
                        in_table = True
                        break
                if in_table:
                    continue
                text += f"{line.content}\n"
            text += "\f"

        return {"content": text, "content_type": "text", "meta": meta}

View File

@@ -48,7 +48,7 @@ class BaseConverter(BaseComponent):
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "utf-8",
) -> Dict[str, Any]:
) -> List[Dict[str, Any]]:
"""
Convert a file to a dictionary containing the text and any associated meta data.
@@ -101,14 +101,11 @@ class BaseConverter(BaseComponent):
documents: list = []
for file_path, file_meta in zip(file_paths, meta):
documents.append(
self.convert(
file_path=file_path,
meta=file_meta,
remove_numeric_tables=remove_numeric_tables,
valid_languages=valid_languages,
)
)
for doc in self.convert(file_path=file_path,
meta=file_meta,
remove_numeric_tables=remove_numeric_tables,
valid_languages=valid_languages):
documents.append(doc)
result = {"documents": documents}
return result, "output_1"

View File

@@ -18,7 +18,7 @@ class DocxToTextConverter(BaseConverter):
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = None,
) -> Dict[str, Any]:
) -> List[Dict[str, Any]]:
"""
Extract text from a .docx file.
Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here.
@@ -51,4 +51,4 @@ class DocxToTextConverter(BaseConverter):
paragraphs = [para.text for para in file.paragraphs]
text = "\n".join(paragraphs)
document = {"content": text, "content_type": "text", "meta": meta}
return document
return [document]

View File

@@ -84,7 +84,7 @@ class ImageToTextConverter(BaseConverter):
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "utf-8",
) -> Dict[str, Any]:
) -> List[Dict[str, Any]]:
"""
Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)
@@ -142,7 +142,7 @@ class ImageToTextConverter(BaseConverter):
text = "\f".join(cleaned_pages)
document = {"content": text, "meta": meta}
return document
return [document]
def _image_to_text(self, image: PpmImageFile) -> List[str]:
"""

View File

@@ -17,7 +17,7 @@ class MarkdownConverter(BaseConverter):
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "utf-8",
) -> Dict[str, Any]:
) -> List[Dict[str, Any]]:
"""
Reads text from a txt file and executes optional preprocessing steps.
@@ -33,7 +33,7 @@ class MarkdownConverter(BaseConverter):
markdown_text = f.read()
text = self.markdown_to_text(markdown_text)
document = {"content": text, "content_type": "text", "meta": meta}
return document
return [document]
# Following code snippet is copied from https://gist.github.com/lorey/eb15a7f3338f959a78cc3661fbc255fe
@staticmethod

View File

@@ -63,7 +63,7 @@ class PDFToTextConverter(BaseConverter):
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "Latin1",
) -> Dict[str, Any]:
) -> List[Dict[str, Any]]:
"""
Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)
@@ -136,7 +136,7 @@ class PDFToTextConverter(BaseConverter):
text = "\f".join(cleaned_pages)
document = {"content": text, "content_type": "text", "meta": meta}
return document
return [document]
def _read_pdf(
self, file_path: Path, layout: bool, encoding: Optional[str] = "Latin1"
@@ -197,7 +197,7 @@ class PDFToTextOCRConverter(BaseConverter):
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "utf-8",
) -> Dict[str, Any]:
) -> List[Dict[str, Any]]:
"""
Convert a file to a dictionary containing the text and any associated meta data.
@@ -226,11 +226,11 @@ class PDFToTextOCRConverter(BaseConverter):
dir=os.path.dirname(os.path.realpath(__file__)), suffix=".jpeg"
)
image.save(temp_img.name)
pages.append(self.image_2_text.convert(temp_img.name)["content"])
pages.append(self.image_2_text.convert(temp_img.name)[0]["content"])
except Exception as exception:
logger.error(f"File {file_path} has an error \n {exception}")
raw_text = "\f".join(pages)
document = {"content": raw_text, "meta": meta}
return document
return [document]

View File

@@ -87,7 +87,7 @@ class TikaConverter(BaseConverter):
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = None,
) -> Dict[str, Any]:
) -> List[Dict[str, Any]]:
"""
:param file_path: path of the file to convert
:param meta: dictionary of meta data key-value pairs to append in the returned document.
@@ -150,4 +150,4 @@ class TikaConverter(BaseConverter):
text = "\f".join(cleaned_pages)
document = {"content": text, "content_type": "text", "meta": {**parsed["metadata"], **(meta or {})}}
return document
return [document]

View File

@@ -16,7 +16,7 @@ class TextConverter(BaseConverter):
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "utf-8",
) -> Dict[str, Any]:
) -> List[Dict[str, Any]]:
"""
Reads text from a txt file and executes optional preprocessing steps.
@@ -75,4 +75,4 @@ class TextConverter(BaseConverter):
text = "".join(cleaned_pages)
document = {"content": text, "content_type": "text", "meta": meta}
return document
return [document]

View File

@@ -64,7 +64,7 @@ def convert_files_to_dicts(
file_path=path,
meta=None,
encoding=encoding,
)
)[0] # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single dict
text = document["content"]
if clean_func:
@@ -119,7 +119,7 @@ def tika_convert_files_to_dicts(
documents = []
for path in file_paths:
logger.info('Converting {}'.format(path))
document = converter.convert(path)
document = converter.convert(path)[0] # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single dict
meta = document["meta"] or {}
meta["name"] = path.name
text = document["content"]
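
As the added comments point out, `convert_files_to_dicts` and `tika_convert_files_to_dicts` keep yielding one dictionary per input file, because the text-only converters they dispatch to now return single-element lists. A rough sketch of the unchanged caller-facing behavior (the `haystack.utils` import path is assumed from the package layout of this release):

```python
from haystack.utils import convert_files_to_dicts

dicts = convert_files_to_dicts(dir_path="data/preprocessing_tutorial")
print(len(dicts))                 # still one dict per input file
print(dicts[0]["content"][:100])
```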

View File

@@ -62,4 +62,5 @@ mmh3
weaviate-client==2.5.0
ray==1.5.0
dataclasses-json
quantulum3
quantulum3
azure-ai-formrecognizer==3.2.0b2

View File

@@ -1,11 +1,10 @@
from pathlib import Path
import os
import pytest
from haystack.file_converter import MarkdownConverter
from haystack.file_converter.docx import DocxToTextConverter
from haystack.file_converter.pdf import PDFToTextConverter, PDFToTextOCRConverter
from haystack.file_converter.tika import TikaConverter
from haystack.nodes import MarkdownConverter, DocxToTextConverter, PDFToTextConverter, PDFToTextOCRConverter, \
TikaConverter, AzureConverter
@pytest.mark.tika
@@ -15,7 +14,7 @@ from haystack.file_converter.tika import TikaConverter
)
def test_convert(Converter):
converter = Converter()
document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))[0]
pages = document["content"].split("\f")
assert len(pages) == 4 # the sample PDF file has four pages.
assert pages[0] != "" # the page 1 of PDF contains text.
@@ -33,7 +32,7 @@ def test_convert(Converter):
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_table_removal(Converter):
converter = Converter(remove_numeric_tables=True)
document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))[0]
pages = document["content"].split("\f")
# assert numeric rows are removed from the table.
assert "324" not in pages[0]
@@ -60,11 +59,31 @@ def test_language_validation(Converter, caplog):
def test_docx_converter():
converter = DocxToTextConverter()
document = converter.convert(file_path=Path("samples/docx/sample_docx.docx"))
document = converter.convert(file_path=Path("samples/docx/sample_docx.docx"))[0]
assert document["content"].startswith("Sample Docx File")
def test_markdown_converter():
converter = MarkdownConverter()
document = converter.convert(file_path=Path("samples/markdown/sample.md"))
document = converter.convert(file_path=Path("samples/markdown/sample.md"))[0]
assert document["content"].startswith("What to build with Haystack")
def test_azure_converter():
    # Check if the Form Recognizer endpoint and credential key are set as environment variables
    if "AZURE_FORMRECOGNIZER_ENDPOINT" in os.environ and "AZURE_FORMRECOGNIZER_KEY" in os.environ:
        converter = AzureConverter(endpoint=os.environ["AZURE_FORMRECOGNIZER_ENDPOINT"],
                                   credential_key=os.environ["AZURE_FORMRECOGNIZER_KEY"],
                                   save_json=True,
                                   )

        docs = converter.convert(file_path="samples/pdf/sample_pdf_1.pdf")
        assert len(docs) == 2

        assert docs[0]["content_type"] == "table"
        assert len(docs[0]["content"]) == 5  # number of rows
        assert len(docs[0]["content"][0]) == 5  # number of columns, Form Recognizer assumes there are 5 columns
        assert docs[0]["content"][0] == ['', 'Column 1', '', 'Column 2', 'Column 3']
        assert docs[0]["content"][4] == ['D', '$54.35', '', '$6345.', '']

        assert docs[1]["content_type"] == "text"
        assert docs[1]["content"].startswith("A sample PDF file")

View File

@@ -160,13 +160,13 @@
"# Here are some examples of how you would use file converters\n",
"\n",
"converter = TextConverter(remove_numeric_tables=True, valid_languages=[\"en\"])\n",
"doc_txt = converter.convert(file_path=\"data/preprocessing_tutorial/classics.txt\", meta=None)\n",
"doc_txt = converter.convert(file_path=\"data/preprocessing_tutorial/classics.txt\", meta=None)[0]\n",
"\n",
"converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=[\"en\"])\n",
"doc_pdf = converter.convert(file_path=\"data/preprocessing_tutorial/bert.pdf\", meta=None)\n",
"doc_pdf = converter.convert(file_path=\"data/preprocessing_tutorial/bert.pdf\", meta=None)[0]\n",
"\n",
"converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=[\"en\"])\n",
"doc_docx = converter.convert(file_path=\"data/preprocessing_tutorial/heavy_metal.docx\", meta=None)\n"
"doc_docx = converter.convert(file_path=\"data/preprocessing_tutorial/heavy_metal.docx\", meta=None)[0]\n"
],
"outputs": [],
"metadata": {

View File

@@ -42,13 +42,13 @@ def tutorial8_preprocessing():
# Here are some examples of how you would use file converters
converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None)
doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None)[0]
converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)
doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)[0]
converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=["en"])
doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)
doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)[0]
# Haystack also has a convenience function that will automatically apply the right converter to each file in a directory.