diff --git a/docs/_src/tutorials/tutorials/8.md b/docs/_src/tutorials/tutorials/8.md
index 0fd97754f..430e05d84 100644
--- a/docs/_src/tutorials/tutorials/8.md
+++ b/docs/_src/tutorials/tutorials/8.md
@@ -73,13 +73,13 @@ For converting PDFs, try changing the encoding to UTF-8 if the conversion isn't
 # Here are some examples of how you would use file converters
 
 converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
-doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None)
+doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None)[0]
 
 converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
-doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)
+doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)[0]
 
 converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=["en"])
-doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)
+doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)[0]
 ```
diff --git a/haystack/nodes/__init__.py b/haystack/nodes/__init__.py
index 4390d78a5..cd3adcbca 100644
--- a/haystack/nodes/__init__.py
+++ b/haystack/nodes/__init__.py
@@ -15,6 +15,7 @@ from haystack.nodes.file_converter import (
     TikaConverter,
     TikaXHTMLParser,
     TextConverter,
+    AzureConverter
 )
 from haystack.nodes.other import Docs2Answers, JoinDocuments
 from haystack.nodes.preprocessor import BasePreProcessor, PreProcessor
diff --git a/haystack/nodes/file_converter/__init__.py b/haystack/nodes/file_converter/__init__.py
index 6f24ba205..3429d3ce7 100644
--- a/haystack/nodes/file_converter/__init__.py
+++ b/haystack/nodes/file_converter/__init__.py
@@ -5,3 +5,4 @@ from haystack.nodes.file_converter.markdown import MarkdownConverter
 from haystack.nodes.file_converter.pdf import PDFToTextConverter, PDFToTextOCRConverter
 from haystack.nodes.file_converter.tika import TikaConverter, TikaXHTMLParser
 from haystack.nodes.file_converter.txt import TextConverter
+from haystack.nodes.file_converter.azure import AzureConverter
diff --git a/haystack/nodes/file_converter/azure.py b/haystack/nodes/file_converter/azure.py
new file mode 100644
index 000000000..7d3629b9a
--- /dev/null
+++ b/haystack/nodes/file_converter/azure.py
@@ -0,0 +1,191 @@
+import logging
+from pathlib import Path
+from typing import List, Optional, Dict, Any
+from collections import defaultdict
+import json
+
+from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzeResult
+from azure.core.credentials import AzureKeyCredential
+
+from haystack.nodes.file_converter import BaseConverter
+
+logger = logging.getLogger(__name__)
+
+
+class AzureConverter(BaseConverter):
+    """
+    File converter that makes use of Microsoft Azure's Form Recognizer service
+    (https://azure.microsoft.com/en-us/services/form-recognizer/).
+    This Converter extracts both text and tables.
+    Supported file formats are: PDF, JPEG, PNG, BMP and TIFF.
+
+    To use this Converter, you need an active Azure account
+    and a Form Recognizer or Cognitive Services resource.
+    (Here you can find information on how to set this up:
+    https://docs.microsoft.com/en-us/azure/applied-ai-services/form-recognizer/quickstarts/try-v3-python-sdk#prerequisites)
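+
+    Example (a minimal sketch; the endpoint, key, and file name below are placeholders):
+
+        converter = AzureConverter(endpoint="https://<your-resource>.cognitiveservices.azure.com/",
+                                   credential_key="<your-key>")
+        docs = converter.convert(file_path="my_file.pdf")  # a list of table documents plus one text document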
+
+    """
+
+    def __init__(self,
+                 endpoint: str,
+                 credential_key: str,
+                 model_id: str = "prebuilt-document",
+                 valid_languages: Optional[List[str]] = None,
+                 save_json: bool = False,
+                 surrounding_context_len: int = 3,
+                 ):
+        """
+        :param endpoint: Your Form Recognizer or Cognitive Services resource's endpoint.
+        :param credential_key: Your Form Recognizer or Cognitive Services resource's subscription key.
+        :param model_id: The identifier of the model you want to use to extract information out of your file.
+                         Default: "prebuilt-document". General purpose models are "prebuilt-document"
+                         and "prebuilt-layout".
+                         List of available prebuilt models:
+                         https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.2.0b1/index.html#documentanalysisclient
+        :param valid_languages: Validate languages from a list of languages specified in the ISO 639-1
+                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
+                                This option can be used to add a test for encoding errors. If the extracted text is
+                                not one of the valid languages, it is likely an encoding error resulting
+                                in garbled text.
+        :param save_json: Whether to save the output of the Form Recognizer to a JSON file.
+        :param surrounding_context_len: Number of lines before and after a table to extract as surrounding context.
+        """
+        # save init parameters to enable export of component config as YAML
+        self.set_config(endpoint=endpoint, credential_key=credential_key, model_id=model_id,
+                        valid_languages=valid_languages, save_json=save_json,
+                        surrounding_context_len=surrounding_context_len)
+
+        self.document_analysis_client = DocumentAnalysisClient(endpoint=endpoint,
+                                                               credential=AzureKeyCredential(credential_key))
+        self.model_id = model_id
+        self.valid_languages = valid_languages
+        self.save_json = save_json
+        self.surrounding_context_len = surrounding_context_len
+
+        super().__init__(valid_languages=valid_languages)
+
+    def convert(self,
+                file_path: Path,
+                meta: Optional[Dict[str, str]] = None,
+                remove_numeric_tables: Optional[bool] = None,
+                valid_languages: Optional[List[str]] = None,
+                encoding: Optional[str] = "utf-8",
+                pages: Optional[str] = None,
+                known_language: Optional[str] = None,
+                ) -> List[Dict[str, Any]]:
+        """
+        Extract text and tables from a PDF, JPEG, PNG, BMP or TIFF file using Azure's Form Recognizer service.
+
+        :param file_path: Path to the file you want to convert.
+        :param meta: Optional dictionary with metadata that will be attached to all resulting documents.
+                     Can be any custom keys and values.
+        :param remove_numeric_tables: Not applicable.
+        :param valid_languages: Validate languages from a list of languages specified in the ISO 639-1
+                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
+                                This option can be used to add a test for encoding errors. If the extracted text is
+                                not one of the valid languages, it is likely an encoding error resulting
+                                in garbled text.
+        :param encoding: Not applicable.
+        :param pages: Custom page numbers for multi-page documents (PDF/TIFF). Input the page numbers and/or ranges
+                      of pages you want to get in the result. For a range of pages, use a hyphen,
+                      like pages="1-3, 5-6". Separate each page number or range with a comma.
+        :param known_language: Locale hint of the input document.
+                               See supported locales here: https://aka.ms/azsdk/formrecognizer/supportedlocales.
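+
+        Example (a sketch; the file name, page range, and locale are illustrative):
+
+            docs = converter.convert(file_path="invoice.pdf", pages="1-2", known_language="en-US")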
+ """ + + if valid_languages is None: + valid_languages = self.valid_languages + + with open(file_path, "rb") as file: + poller = self.document_analysis_client.begin_analyze_document(self.model_id, file, pages=pages, + locale=known_language) + result = poller.result() + + tables = self._convert_tables(result, meta) + text = self._convert_text(result, meta) + docs = tables + [text] + + if valid_languages: + file_text = text["content"] + " ".join([cell for table in tables for row in table["content"] for cell in row]) + if not self.validate_language(file_text): + logger.warning( + f"The language for {file_path} is not one of {self.valid_languages}. The file may not have " + f"been decoded in the correct text format." + ) + + if self.save_json: + with open(str(file_path) + ".json", "w") as json_file: + json.dump(result.to_dict(), json_file, indent=2) + + return docs + + def _convert_tables(self, result: AnalyzeResult, meta: Optional[Dict[str, str]]) -> List[Dict[str, Any]]: + converted_tables = [] + + for table in result.tables: + # Initialize table with empty cells + table_list = [[""] * table.column_count for _ in range(table.row_count)] + + for cell in table.cells: + # Remove ':selected:'/':unselected:' tags from cell's content + cell.content = cell.content.replace(":selected:", "") + cell.content = cell.content.replace(":unselected:", "") + + for c in range(cell.column_span): + for r in range(cell.row_span): + table_list[cell.row_index + r][cell.column_index + c] = cell.content + + caption = "" + # Check if all column names are the same -> exclude these cells and use as caption + if all(col_name == table_list[0][0] for col_name in table_list[0]): + caption = table_list[0][0] + table_list.pop(0) + + # Get preceding context of table + table_beginning_page = next(page for page in result.pages + if page.page_number == table.bounding_regions[0].page_number) + table_start_offset = table.spans[0].offset + preceding_lines = [line.content for line in table_beginning_page.lines + if line.spans[0].offset < table_start_offset] + preceding_context = f"{caption}\n".strip() + "\n".join(preceding_lines[-self.surrounding_context_len:]) + + # Get following context + table_end_page = table_beginning_page if len(table.bounding_regions) == 1 else \ + next(page for page in result.pages + if page.page_number == table.bounding_regions[-1].page_number) + table_end_offset = table_start_offset + table.spans[0].length + following_lines = [line.content for line in table_end_page.lines if line.spans[0].offset > table_end_offset] + following_context = "\n".join(following_lines[:self.surrounding_context_len]) + + if isinstance(meta, dict): + meta["preceding_context"] = preceding_context + meta["following_context"] = following_context + else: + meta = {"preceding_context": preceding_context, "following_context": following_context} + + converted_tables.append({"content": table_list, "content_type": "table", "meta": meta}) + + return converted_tables + + def _convert_text(self, result: AnalyzeResult, meta: Optional[Dict[str, str]]) -> Dict[str, Any]: + text = "" + table_spans_by_page = defaultdict(list) + for table in result.tables: + table_spans_by_page[table.bounding_regions[0].page_number].append(table.spans[0]) + + for page in result.pages: + tables_on_page = table_spans_by_page[page.page_number] + for line in page.lines: + in_table = False + # Check if line is part of a table + for table in tables_on_page: + if table.offset <= line.spans[0].offset <= table.offset + table.length: + in_table = True + break + if in_table: 
+        converted_tables = []
+
+        for table in result.tables:
+            # Initialize table with empty cells
+            table_list = [[""] * table.column_count for _ in range(table.row_count)]
+
+            for cell in table.cells:
+                # Remove ':selected:'/':unselected:' tags from cell's content
+                cell.content = cell.content.replace(":selected:", "")
+                cell.content = cell.content.replace(":unselected:", "")
+
+                for c in range(cell.column_span):
+                    for r in range(cell.row_span):
+                        table_list[cell.row_index + r][cell.column_index + c] = cell.content
+
+            caption = ""
+            # Check if all column names are the same -> exclude these cells and use as caption
+            if all(col_name == table_list[0][0] for col_name in table_list[0]):
+                caption = table_list[0][0]
+                table_list.pop(0)
+
+            # Get preceding context of table
+            table_beginning_page = next(page for page in result.pages
+                                        if page.page_number == table.bounding_regions[0].page_number)
+            table_start_offset = table.spans[0].offset
+            preceding_lines = [line.content for line in table_beginning_page.lines
+                               if line.spans[0].offset < table_start_offset]
+            preceding_context = (f"{caption}\n" + "\n".join(preceding_lines[-self.surrounding_context_len:])).strip()
+
+            # Get following context
+            table_end_page = table_beginning_page if len(table.bounding_regions) == 1 else \
+                next(page for page in result.pages
+                     if page.page_number == table.bounding_regions[-1].page_number)
+            table_end_offset = table_start_offset + table.spans[0].length
+            following_lines = [line.content for line in table_end_page.lines if line.spans[0].offset > table_end_offset]
+            following_context = "\n".join(following_lines[:self.surrounding_context_len])
+
+            if isinstance(meta, dict):
+                meta["preceding_context"] = preceding_context
+                meta["following_context"] = following_context
+            else:
+                meta = {"preceding_context": preceding_context, "following_context": following_context}
+
+            converted_tables.append({"content": table_list, "content_type": "table", "meta": meta})
+
+        return converted_tables
+
+    def _convert_text(self, result: AnalyzeResult, meta: Optional[Dict[str, str]]) -> Dict[str, Any]:
+        text = ""
+        table_spans_by_page = defaultdict(list)
+        for table in result.tables:
+            table_spans_by_page[table.bounding_regions[0].page_number].append(table.spans[0])
+
+        for page in result.pages:
+            tables_on_page = table_spans_by_page[page.page_number]
+            for line in page.lines:
+                in_table = False
+                # Check if line is part of a table
+                for table in tables_on_page:
+                    if table.offset <= line.spans[0].offset <= table.offset + table.length:
+                        in_table = True
+                        break
+                if in_table:
+                    continue
+                text += f"{line.content}\n"
+            text += "\f"
+
+        return {"content": text, "content_type": "text", "meta": meta}
diff --git a/haystack/nodes/file_converter/base.py b/haystack/nodes/file_converter/base.py
index f6523835c..86290d024 100644
--- a/haystack/nodes/file_converter/base.py
+++ b/haystack/nodes/file_converter/base.py
@@ -48,7 +48,7 @@ class BaseConverter(BaseComponent):
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = "utf-8",
-    ) -> Dict[str, Any]:
+    ) -> List[Dict[str, Any]]:
         """
         Convert a file to a dictionary containing the text and any associated meta data.
@@ -101,14 +101,11 @@
         documents: list = []
         for file_path, file_meta in zip(file_paths, meta):
-            documents.append(
-                self.convert(
-                    file_path=file_path,
-                    meta=file_meta,
-                    remove_numeric_tables=remove_numeric_tables,
-                    valid_languages=valid_languages,
-                )
-            )
+            for doc in self.convert(file_path=file_path,
+                                    meta=file_meta,
+                                    remove_numeric_tables=remove_numeric_tables,
+                                    valid_languages=valid_languages):
+                documents.append(doc)
 
         result = {"documents": documents}
         return result, "output_1"
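Since `BaseConverter.run()` now flattens whatever `convert()` returns, converters that emit several documents per file, such as the new `AzureConverter`, plug into indexing pipelines without further changes. A minimal sketch of the new contract (the endpoint, key, and file name are placeholders, and the call assumes `run()` normalizes the omitted `meta` argument as the surrounding code suggests):

```python
from pathlib import Path
from haystack.nodes import AzureConverter

converter = AzureConverter(endpoint="https://<your-resource>.cognitiveservices.azure.com/",
                           credential_key="<your-key>")
result, _ = converter.run(file_paths=[Path("my_file.pdf")])
# result["documents"] holds one text document plus one document per detected table
print(len(result["documents"]))
```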
diff --git a/haystack/nodes/file_converter/docx.py b/haystack/nodes/file_converter/docx.py
index 748de5349..dbfbc7565 100644
--- a/haystack/nodes/file_converter/docx.py
+++ b/haystack/nodes/file_converter/docx.py
@@ -18,7 +18,7 @@
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = None,
-    ) -> Dict[str, Any]:
+    ) -> List[Dict[str, Any]]:
         """
         Extract text from a .docx file.
         Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here.
@@ -51,4 +51,4 @@
         paragraphs = [para.text for para in file.paragraphs]
         text = "\n".join(paragraphs)
         document = {"content": text, "content_type": "text", "meta": meta}
-        return document
+        return [document]
diff --git a/haystack/nodes/file_converter/image.py b/haystack/nodes/file_converter/image.py
index efbe360eb..a645d76a8 100644
--- a/haystack/nodes/file_converter/image.py
+++ b/haystack/nodes/file_converter/image.py
@@ -84,7 +84,7 @@ class ImageToTextConverter(BaseConverter):
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = "utf-8",
-    ) -> Dict[str, Any]:
+    ) -> List[Dict[str, Any]]:
         """
         Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)
@@ -142,7 +142,7 @@
         text = "\f".join(cleaned_pages)
         document = {"content": text, "meta": meta}
-        return document
+        return [document]
 
     def _image_to_text(self, image: PpmImageFile) -> List[str]:
         """
diff --git a/haystack/nodes/file_converter/markdown.py b/haystack/nodes/file_converter/markdown.py
index 567d1e338..0eb96ea16 100644
--- a/haystack/nodes/file_converter/markdown.py
+++ b/haystack/nodes/file_converter/markdown.py
@@ -17,7 +17,7 @@ class MarkdownConverter(BaseConverter):
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = "utf-8",
-    ) -> Dict[str, Any]:
+    ) -> List[Dict[str, Any]]:
         """
         Reads text from a txt file and executes optional preprocessing steps.
@@ -33,7 +33,7 @@
             markdown_text = f.read()
         text = self.markdown_to_text(markdown_text)
         document = {"content": text, "content_type": "text", "meta": meta}
-        return document
+        return [document]
 
     # Following code snippet is copied from https://gist.github.com/lorey/eb15a7f3338f959a78cc3661fbc255fe
     @staticmethod
diff --git a/haystack/nodes/file_converter/pdf.py b/haystack/nodes/file_converter/pdf.py
index e19348ee4..0b397acb3 100644
--- a/haystack/nodes/file_converter/pdf.py
+++ b/haystack/nodes/file_converter/pdf.py
@@ -63,7 +63,7 @@ class PDFToTextConverter(BaseConverter):
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = "Latin1",
-    ) -> Dict[str, Any]:
+    ) -> List[Dict[str, Any]]:
         """
         Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)
@@ -136,7 +136,7 @@
         text = "\f".join(cleaned_pages)
         document = {"content": text, "content_type": "text", "meta": meta}
-        return document
+        return [document]
 
     def _read_pdf(
         self, file_path: Path, layout: bool, encoding: Optional[str] = "Latin1"
@@ -197,7 +197,7 @@ class PDFToTextOCRConverter(BaseConverter):
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = "utf-8",
-    ) -> Dict[str, Any]:
+    ) -> List[Dict[str, Any]]:
         """
         Convert a file to a dictionary containing the text and any associated meta data.
@@ -226,11 +226,11 @@
                 dir=os.path.dirname(os.path.realpath(__file__)), suffix=".jpeg"
             )
             image.save(temp_img.name)
-            pages.append(self.image_2_text.convert(temp_img.name)["content"])
+            pages.append(self.image_2_text.convert(temp_img.name)[0]["content"])
         except Exception as exception:
             logger.error(f"File {file_path} has an error \n {exception}")
 
         raw_text = "\f".join(pages)
         document = {"content": raw_text, "meta": meta}
 
-        return document
+        return [document]
diff --git a/haystack/nodes/file_converter/tika.py b/haystack/nodes/file_converter/tika.py
index 56de672e8..4b22b29d9 100644
--- a/haystack/nodes/file_converter/tika.py
+++ b/haystack/nodes/file_converter/tika.py
@@ -87,7 +87,7 @@ class TikaConverter(BaseConverter):
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = None,
-    ) -> Dict[str, Any]:
+    ) -> List[Dict[str, Any]]:
         """
         :param file_path: path of the file to convert
         :param meta: dictionary of meta data key-value pairs to append in the returned document.
@@ -150,4 +150,4 @@
         text = "\f".join(cleaned_pages)
 
         document = {"content": text, "content_type": "text", "meta": {**parsed["metadata"], **(meta or {})}}
-        return document
+        return [document]
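Each of the single-document converters above now wraps its result in a one-element list, so callers either index with `[0]` (as the tutorials below do) or iterate, which also covers multi-document converters like `AzureConverter`. A sketch (the file name is illustrative):

```python
from pathlib import Path
from haystack.nodes import PDFToTextConverter

converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
# works the same whether a converter yields one document or several
for doc in converter.convert(file_path=Path("my_file.pdf")):
    print(doc["content_type"], len(doc["content"]))
```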
diff --git a/haystack/nodes/file_converter/txt.py b/haystack/nodes/file_converter/txt.py
index e2b02b055..42e339b98 100644
--- a/haystack/nodes/file_converter/txt.py
+++ b/haystack/nodes/file_converter/txt.py
@@ -16,7 +16,7 @@ class TextConverter(BaseConverter):
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = "utf-8",
-    ) -> Dict[str, Any]:
+    ) -> List[Dict[str, Any]]:
         """
         Reads text from a txt file and executes optional preprocessing steps.
@@ -75,4 +75,4 @@
         text = "".join(cleaned_pages)
         document = {"content": text, "content_type": "text", "meta": meta}
-        return document
+        return [document]
diff --git a/haystack/utils/preprocessing.py b/haystack/utils/preprocessing.py
index 4f8a3305e..664288bd7 100644
--- a/haystack/utils/preprocessing.py
+++ b/haystack/utils/preprocessing.py
@@ -64,7 +64,7 @@ def convert_files_to_dicts(
                 file_path=path,
                 meta=None,
                 encoding=encoding,
-            )
+            )[0]  # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single dict
             text = document["content"]
 
             if clean_func:
@@ -119,7 +119,7 @@ def tika_convert_files_to_dicts(
     documents = []
     for path in file_paths:
         logger.info('Converting {}'.format(path))
-        document = converter.convert(path)
+        document = converter.convert(path)[0]  # TikaConverter returns a list containing a single dict
         meta = document["meta"] or {}
         meta["name"] = path.name
         text = document["content"]
diff --git a/requirements.txt b/requirements.txt
index e08b27c38..35e32cf16 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -62,4 +62,5 @@ mmh3
 weaviate-client==2.5.0
 ray==1.5.0
 dataclasses-json
-quantulum3
\ No newline at end of file
+quantulum3
+azure-ai-formrecognizer==3.2.0b2
\ No newline at end of file
diff --git a/test/test_file_converter.py b/test/test_file_converter.py
index 79c9a7964..cd244f9c9 100644
--- a/test/test_file_converter.py
+++ b/test/test_file_converter.py
@@ -1,11 +1,10 @@
 from pathlib import Path
+import os
 
 import pytest
 
-from haystack.file_converter import MarkdownConverter
-from haystack.file_converter.docx import DocxToTextConverter
-from haystack.file_converter.pdf import PDFToTextConverter, PDFToTextOCRConverter
-from haystack.file_converter.tika import TikaConverter
+from haystack.nodes import MarkdownConverter, DocxToTextConverter, PDFToTextConverter, PDFToTextOCRConverter, \
+    TikaConverter, AzureConverter
 
 
 @pytest.mark.tika
@@ -15,7 +14,7 @@ @pytest.mark.parametrize(
 )
 def test_convert(Converter):
     converter = Converter()
-    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))[0]
     pages = document["content"].split("\f")
     assert len(pages) == 4  # the sample PDF file has four pages.
     assert pages[0] != ""  # the page 1 of PDF contains text.
@@ -33,7 +32,7 @@ @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
 def test_table_removal(Converter):
     converter = Converter(remove_numeric_tables=True)
-    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))[0]
     pages = document["content"].split("\f")
 
     # assert numeric rows are removed from the table.
assert "324" not in pages[0] @@ -60,11 +59,31 @@ def test_language_validation(Converter, caplog): def test_docx_converter(): converter = DocxToTextConverter() - document = converter.convert(file_path=Path("samples/docx/sample_docx.docx")) + document = converter.convert(file_path=Path("samples/docx/sample_docx.docx"))[0] assert document["content"].startswith("Sample Docx File") def test_markdown_converter(): converter = MarkdownConverter() - document = converter.convert(file_path=Path("samples/markdown/sample.md")) + document = converter.convert(file_path=Path("samples/markdown/sample.md"))[0] assert document["content"].startswith("What to build with Haystack") + + +def test_azure_converter(): + # Check if Form Recognizer endpoint and credential key in environment variables + if "AZURE_FORMRECOGNIZER_ENDPOINT" in os.environ and "AZURE_FORMRECOGNIZER_KEY" in os.environ: + converter = AzureConverter(endpoint=os.environ["AZURE_FORMRECOGNIZER_ENDPOINT"], + credential_key=os.environ["AZURE_FORMRECOGNIZER_KEY"], + save_json=True, + ) + + docs = converter.convert(file_path="samples/pdf/sample_pdf_1.pdf") + assert len(docs) == 2 + assert docs[0]["content_type"] == "table" + assert len(docs[0]["content"]) == 5 # number of rows + assert len(docs[0]["content"][0]) == 5 # number of columns, Form Recognizer assumes there are 5 columns + assert docs[0]["content"][0] == ['', 'Column 1', '', 'Column 2', 'Column 3'] + assert docs[0]["content"][4] == ['D', '$54.35', '', '$6345.', ''] + + assert docs[1]["content_type"] == "text" + assert docs[1]["content"].startswith("A sample PDF file") diff --git a/tutorials/Tutorial8_Preprocessing.ipynb b/tutorials/Tutorial8_Preprocessing.ipynb index ab57d26c0..3b519d793 100644 --- a/tutorials/Tutorial8_Preprocessing.ipynb +++ b/tutorials/Tutorial8_Preprocessing.ipynb @@ -160,13 +160,13 @@ "# Here are some examples of how you would use file converters\n", "\n", "converter = TextConverter(remove_numeric_tables=True, valid_languages=[\"en\"])\n", - "doc_txt = converter.convert(file_path=\"data/preprocessing_tutorial/classics.txt\", meta=None)\n", + "doc_txt = converter.convert(file_path=\"data/preprocessing_tutorial/classics.txt\", meta=None)[0]\n", "\n", "converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=[\"en\"])\n", - "doc_pdf = converter.convert(file_path=\"data/preprocessing_tutorial/bert.pdf\", meta=None)\n", + "doc_pdf = converter.convert(file_path=\"data/preprocessing_tutorial/bert.pdf\", meta=None)[0]\n", "\n", "converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=[\"en\"])\n", - "doc_docx = converter.convert(file_path=\"data/preprocessing_tutorial/heavy_metal.docx\", meta=None)\n" + "doc_docx = converter.convert(file_path=\"data/preprocessing_tutorial/heavy_metal.docx\", meta=None)[0]\n" ], "outputs": [], "metadata": { diff --git a/tutorials/Tutorial8_Preprocessing.py b/tutorials/Tutorial8_Preprocessing.py index a9e687c0b..7d19931b4 100644 --- a/tutorials/Tutorial8_Preprocessing.py +++ b/tutorials/Tutorial8_Preprocessing.py @@ -42,13 +42,13 @@ def tutorial8_preprocessing(): # Here are some examples of how you would use file converters converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"]) - doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None) + doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None)[0] converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"]) - doc_pdf = 
diff --git a/tutorials/Tutorial8_Preprocessing.ipynb b/tutorials/Tutorial8_Preprocessing.ipynb
index ab57d26c0..3b519d793 100644
--- a/tutorials/Tutorial8_Preprocessing.ipynb
+++ b/tutorials/Tutorial8_Preprocessing.ipynb
@@ -160,13 +160,13 @@
    "# Here are some examples of how you would use file converters\n",
    "\n",
    "converter = TextConverter(remove_numeric_tables=True, valid_languages=[\"en\"])\n",
-   "doc_txt = converter.convert(file_path=\"data/preprocessing_tutorial/classics.txt\", meta=None)\n",
+   "doc_txt = converter.convert(file_path=\"data/preprocessing_tutorial/classics.txt\", meta=None)[0]\n",
    "\n",
    "converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=[\"en\"])\n",
-   "doc_pdf = converter.convert(file_path=\"data/preprocessing_tutorial/bert.pdf\", meta=None)\n",
+   "doc_pdf = converter.convert(file_path=\"data/preprocessing_tutorial/bert.pdf\", meta=None)[0]\n",
    "\n",
    "converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=[\"en\"])\n",
-   "doc_docx = converter.convert(file_path=\"data/preprocessing_tutorial/heavy_metal.docx\", meta=None)\n"
+   "doc_docx = converter.convert(file_path=\"data/preprocessing_tutorial/heavy_metal.docx\", meta=None)[0]\n"
    ],
    "outputs": [],
    "metadata": {
diff --git a/tutorials/Tutorial8_Preprocessing.py b/tutorials/Tutorial8_Preprocessing.py
index a9e687c0b..7d19931b4 100644
--- a/tutorials/Tutorial8_Preprocessing.py
+++ b/tutorials/Tutorial8_Preprocessing.py
@@ -42,13 +42,13 @@ def tutorial8_preprocessing():
     # Here are some examples of how you would use file converters
 
     converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
-    doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None)
+    doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None)[0]
 
     converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
-    doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)
+    doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)[0]
 
     converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=["en"])
-    doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)
+    doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)[0]
 
     # Haystack also has a convenience function that will automatically apply the right converter to each file in a directory.
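The convenience function referenced in that last comment is `convert_files_to_dicts` from `haystack.utils`, whose call sites were updated earlier in this patch. A minimal sketch of calling it (the directory path is illustrative):

```python
from haystack.utils import convert_files_to_dicts

# picks a suitable converter per file extension and returns a list of document dicts
dicts = convert_files_to_dicts(dir_path="data/preprocessing_tutorial", split_paragraphs=True)
```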