Add AzureConverter to support table parsing from documents (#1813)

* Add FormRecognizerConverter

* Change signature of convert method + change return type of all converters (usage sketch below)

* Adapt preprocessing util to new return type of converters

* Parametrize number of lines used for surrounding context of table

* Change name from FormRecognizerConverter to AzureConverter

* Set version of azure-ai-formrecognizer package

* Change tutorial 8 based on new return type of converters

* Add tests

* Add latest docstring and tutorial changes

* Fix typo

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
bogdankostic 2021-11-29 18:44:20 +01:00 committed by GitHub
parent c29f960c47
commit eb5f7bb4c0
16 changed files with 254 additions and 44 deletions
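
The central API change behind these bullets is that every converter's `convert()` now returns a `List[Dict[str, Any]]` instead of a single dict, so that converters which extract tables (like the new `AzureConverter`) can emit several documents per file. A minimal sketch of what this means for callers (file paths and the endpoint/key values are placeholders; the imports follow the updated test file below):

```python
from haystack.nodes import TextConverter, AzureConverter

# Text-only converters still produce one document, now wrapped in a one-element list,
# so existing call sites simply index with [0]:
converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
doc_txt = converter.convert(file_path="my_document.txt", meta=None)[0]

# AzureConverter may return several documents per file:
# one per extracted table, plus one text document for the remaining content.
azure_converter = AzureConverter(endpoint="<your-endpoint>", credential_key="<your-key>")
docs = azure_converter.convert(file_path="my_document.pdf")
```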

View File

@@ -73,13 +73,13 @@ For converting PDFs, try changing the encoding to UTF-8 if the conversion isn't
# Here are some examples of how you would use file converters
converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None)
doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None)[0]
converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)
doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)[0]
converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=["en"])
doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)
doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)[0]
```

View File

@@ -15,6 +15,7 @@ from haystack.nodes.file_converter import (
    TikaConverter,
    TikaXHTMLParser,
    TextConverter,
    AzureConverter
)
from haystack.nodes.other import Docs2Answers, JoinDocuments
from haystack.nodes.preprocessor import BasePreProcessor, PreProcessor

View File

@@ -5,3 +5,4 @@ from haystack.nodes.file_converter.markdown import MarkdownConverter
from haystack.nodes.file_converter.pdf import PDFToTextConverter, PDFToTextOCRConverter
from haystack.nodes.file_converter.tika import TikaConverter, TikaXHTMLParser
from haystack.nodes.file_converter.txt import TextConverter
from haystack.nodes.file_converter.azure import AzureConverter

View File

@@ -0,0 +1,191 @@
import logging
from pathlib import Path
from typing import List, Optional, Dict, Any
from collections import defaultdict
import json

from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzeResult
from azure.core.credentials import AzureKeyCredential

from haystack.nodes.file_converter import BaseConverter

logger = logging.getLogger(__name__)


class AzureConverter(BaseConverter):
    """
    File converter that makes use of Microsoft Azure's Form Recognizer service
    (https://azure.microsoft.com/en-us/services/form-recognizer/).
    This Converter extracts both text and tables.
    Supported file formats are: PDF, JPEG, PNG, BMP and TIFF.

    In order to be able to use this Converter, you need an active Azure account
    and a Form Recognizer or Cognitive Services resource.
    (Here you can find information on how to set this up:
    https://docs.microsoft.com/en-us/azure/applied-ai-services/form-recognizer/quickstarts/try-v3-python-sdk#prerequisites)
    """

    def __init__(self,
                 endpoint: str,
                 credential_key: str,
                 model_id: str = "prebuilt-document",
                 valid_languages: Optional[List[str]] = None,
                 save_json: bool = False,
                 surrounding_context_len: int = 3,
                 ):
        """
        :param endpoint: Your Form Recognizer or Cognitive Services resource's endpoint.
        :param credential_key: Your Form Recognizer or Cognitive Services resource's subscription key.
        :param model_id: The identifier of the model you want to use to extract information out of your file.
                         Default: "prebuilt-document". General purpose models are "prebuilt-document"
                         and "prebuilt-layout".
                         List of available prebuilt models:
                         https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.2.0b1/index.html#documentanalysisclient
        :param valid_languages: Validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add tests for encoding errors. If the extracted text is
                                not one of the valid languages, it is likely an encoding error resulting
                                in garbled text.
        :param save_json: Whether to save the output of the Form Recognizer to a JSON file.
        :param surrounding_context_len: Number of lines before and after a table to extract as surrounding context.
        """
        # save init parameters to enable export of component config as YAML
        self.set_config(endpoint=endpoint, credential_key=credential_key, model_id=model_id,
                        valid_languages=valid_languages, save_json=save_json,
                        surrounding_context_len=surrounding_context_len)

        self.document_analysis_client = DocumentAnalysisClient(endpoint=endpoint,
                                                               credential=AzureKeyCredential(credential_key))
        self.model_id = model_id
        self.valid_languages = valid_languages
        self.save_json = save_json
        self.surrounding_context_len = surrounding_context_len

        super().__init__(valid_languages=valid_languages)

    def convert(self,
                file_path: Path,
                meta: Optional[Dict[str, str]] = None,
                remove_numeric_tables: Optional[bool] = None,
                valid_languages: Optional[List[str]] = None,
                encoding: Optional[str] = "utf-8",
                pages: Optional[str] = None,
                known_language: Optional[str] = None,
                ) -> List[Dict[str, Any]]:
        """
        Extract text and tables from a PDF, JPEG, PNG, BMP or TIFF file using Azure's Form Recognizer service.

        :param file_path: Path to the file you want to convert.
        :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
                     Can be any custom keys and values.
        :param remove_numeric_tables: Not applicable.
        :param valid_languages: Validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add tests for encoding errors. If the extracted text is
                                not one of the valid languages, it is likely an encoding error resulting
                                in garbled text.
        :param encoding: Not applicable.
        :param pages: Custom page numbers for multi-page documents (PDF/TIFF). Input the page numbers and/or ranges
                      of pages you want to get in the result. For a range of pages, use a hyphen,
                      like pages=1-3, 5-6. Separate each page number or range with a comma.
        :param known_language: Locale hint of the input document.
                               See supported locales here: https://aka.ms/azsdk/formrecognizer/supportedlocales.
        """
        if valid_languages is None:
            valid_languages = self.valid_languages

        with open(file_path, "rb") as file:
            poller = self.document_analysis_client.begin_analyze_document(self.model_id, file, pages=pages,
                                                                          locale=known_language)
            result = poller.result()

        tables = self._convert_tables(result, meta)
        text = self._convert_text(result, meta)
        docs = tables + [text]

        if valid_languages:
            file_text = text["content"] + " ".join([cell for table in tables for row in table["content"] for cell in row])
            if not self.validate_language(file_text):
                logger.warning(
                    f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
                    f"been decoded in the correct text format."
                )

        if self.save_json:
            with open(str(file_path) + ".json", "w") as json_file:
                json.dump(result.to_dict(), json_file, indent=2)

        return docs

    def _convert_tables(self, result: AnalyzeResult, meta: Optional[Dict[str, str]]) -> List[Dict[str, Any]]:
        converted_tables = []

        for table in result.tables:
            # Initialize table with empty cells
            table_list = [[""] * table.column_count for _ in range(table.row_count)]

            for cell in table.cells:
                # Remove ':selected:'/':unselected:' tags from cell's content
                cell.content = cell.content.replace(":selected:", "")
                cell.content = cell.content.replace(":unselected:", "")
                for c in range(cell.column_span):
                    for r in range(cell.row_span):
                        table_list[cell.row_index + r][cell.column_index + c] = cell.content

            caption = ""
            # Check if all column names are the same -> exclude these cells and use as caption
            if all(col_name == table_list[0][0] for col_name in table_list[0]):
                caption = table_list[0][0]
                table_list.pop(0)

            # Get preceding context of table
            table_beginning_page = next(page for page in result.pages
                                        if page.page_number == table.bounding_regions[0].page_number)
            table_start_offset = table.spans[0].offset
            preceding_lines = [line.content for line in table_beginning_page.lines
                               if line.spans[0].offset < table_start_offset]
            preceding_context = f"{caption}\n".strip() + "\n".join(preceding_lines[-self.surrounding_context_len:])

            # Get following context
            table_end_page = table_beginning_page if len(table.bounding_regions) == 1 else \
                next(page for page in result.pages
                     if page.page_number == table.bounding_regions[-1].page_number)
            table_end_offset = table_start_offset + table.spans[0].length
            following_lines = [line.content for line in table_end_page.lines if line.spans[0].offset > table_end_offset]
            following_context = "\n".join(following_lines[:self.surrounding_context_len])

            if isinstance(meta, dict):
                meta["preceding_context"] = preceding_context
                meta["following_context"] = following_context
            else:
                meta = {"preceding_context": preceding_context, "following_context": following_context}

            converted_tables.append({"content": table_list, "content_type": "table", "meta": meta})

        return converted_tables

    def _convert_text(self, result: AnalyzeResult, meta: Optional[Dict[str, str]]) -> Dict[str, Any]:
        text = ""
        table_spans_by_page = defaultdict(list)
        for table in result.tables:
            table_spans_by_page[table.bounding_regions[0].page_number].append(table.spans[0])

        for page in result.pages:
            tables_on_page = table_spans_by_page[page.page_number]
            for line in page.lines:
                in_table = False
                # Check if line is part of a table
                for table in tables_on_page:
                    if table.offset <= line.spans[0].offset <= table.offset + table.length:
                        in_table = True
                        break
                if in_table:
                    continue
                text += f"{line.content}\n"
            text += "\f"

        return {"content": text, "content_type": "text", "meta": meta}

View File

@@ -48,7 +48,7 @@ class BaseConverter(BaseComponent):
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "utf-8",
) -> Dict[str, Any]:
) -> List[Dict[str, Any]]:
"""
Convert a file to a dictionary containing the text and any associated meta data.
@@ -101,14 +101,11 @@ class BaseConverter(BaseComponent):
documents: list = []
for file_path, file_meta in zip(file_paths, meta):
documents.append(
self.convert(
file_path=file_path,
meta=file_meta,
remove_numeric_tables=remove_numeric_tables,
valid_languages=valid_languages,
)
)
for doc in self.convert(file_path=file_path,
meta=file_meta,
remove_numeric_tables=remove_numeric_tables,
valid_languages=valid_languages):
documents.append(doc)
result = {"documents": documents}
return result, "output_1"

View File

@@ -18,7 +18,7 @@ class DocxToTextConverter(BaseConverter):
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = None,
) -> Dict[str, Any]:
) -> List[Dict[str, Any]]:
"""
Extract text from a .docx file.
Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here.
@@ -51,4 +51,4 @@ class DocxToTextConverter(BaseConverter):
paragraphs = [para.text for para in file.paragraphs]
text = "\n".join(paragraphs)
document = {"content": text, "content_type": "text", "meta": meta}
return document
return [document]

View File

@@ -84,7 +84,7 @@ class ImageToTextConverter(BaseConverter):
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "utf-8",
) -> Dict[str, Any]:
) -> List[Dict[str, Any]]:
"""
Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)
@@ -142,7 +142,7 @@ class ImageToTextConverter(BaseConverter):
text = "\f".join(cleaned_pages)
document = {"content": text, "meta": meta}
return document
return [document]
def _image_to_text(self, image: PpmImageFile) -> List[str]:
"""

View File

@@ -17,7 +17,7 @@ class MarkdownConverter(BaseConverter):
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "utf-8",
) -> Dict[str, Any]:
) -> List[Dict[str, Any]]:
"""
Reads text from a txt file and executes optional preprocessing steps.
@@ -33,7 +33,7 @@ class MarkdownConverter(BaseConverter):
markdown_text = f.read()
text = self.markdown_to_text(markdown_text)
document = {"content": text, "content_type": "text", "meta": meta}
return document
return [document]
# Following code snippet is copied from https://gist.github.com/lorey/eb15a7f3338f959a78cc3661fbc255fe
@staticmethod

View File

@@ -63,7 +63,7 @@ class PDFToTextConverter(BaseConverter):
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "Latin1",
) -> Dict[str, Any]:
) -> List[Dict[str, Any]]:
"""
Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)
@@ -136,7 +136,7 @@ class PDFToTextConverter(BaseConverter):
text = "\f".join(cleaned_pages)
document = {"content": text, "content_type": "text", "meta": meta}
return document
return [document]
def _read_pdf(
self, file_path: Path, layout: bool, encoding: Optional[str] = "Latin1"
@@ -197,7 +197,7 @@ class PDFToTextOCRConverter(BaseConverter):
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "utf-8",
) -> Dict[str, Any]:
) -> List[Dict[str, Any]]:
"""
Convert a file to a dictionary containing the text and any associated meta data.
@@ -226,11 +226,11 @@ class PDFToTextOCRConverter(BaseConverter):
dir=os.path.dirname(os.path.realpath(__file__)), suffix=".jpeg"
)
image.save(temp_img.name)
pages.append(self.image_2_text.convert(temp_img.name)["content"])
pages.append(self.image_2_text.convert(temp_img.name)[0]["content"])
except Exception as exception:
logger.error(f"File {file_path} has an error \n {exception}")
raw_text = "\f".join(pages)
document = {"content": raw_text, "meta": meta}
return document
return [document]

View File

@@ -87,7 +87,7 @@ class TikaConverter(BaseConverter):
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = None,
) -> Dict[str, Any]:
) -> List[Dict[str, Any]]:
"""
:param file_path: path of the file to convert
:param meta: dictionary of meta data key-value pairs to append in the returned document.
@@ -150,4 +150,4 @@ class TikaConverter(BaseConverter):
text = "\f".join(cleaned_pages)
document = {"content": text, "content_type": "text", "meta": {**parsed["metadata"], **(meta or {})}}
return document
return [document]

View File

@@ -16,7 +16,7 @@ class TextConverter(BaseConverter):
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "utf-8",
) -> Dict[str, Any]:
) -> List[Dict[str, Any]]:
"""
Reads text from a txt file and executes optional preprocessing steps.
@@ -75,4 +75,4 @@ class TextConverter(BaseConverter):
text = "".join(cleaned_pages)
document = {"content": text, "content_type": "text", "meta": meta}
return document
return [document]

View File

@@ -64,7 +64,7 @@ def convert_files_to_dicts(
file_path=path,
meta=None,
encoding=encoding,
)
)[0] # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single dict
text = document["content"]
if clean_func:
@@ -119,7 +119,7 @@ def tika_convert_files_to_dicts(
documents = []
for path in file_paths:
logger.info('Converting {}'.format(path))
document = converter.convert(path)
document = converter.convert(path)[0] # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single dict
meta = document["meta"] or {}
meta["name"] = path.name
text = document["content"]
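
As the added comments point out, `convert_files_to_dicts` and `tika_convert_files_to_dicts` keep yielding one dictionary per input file, because the text-only converters they dispatch to now return single-element lists. A rough sketch of the unchanged caller-facing behavior (the `haystack.utils` import path is assumed from the package layout of this release):

```python
from haystack.utils import convert_files_to_dicts

dicts = convert_files_to_dicts(dir_path="data/preprocessing_tutorial")
print(len(dicts))                 # still one dict per input file
print(dicts[0]["content"][:100])
```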

View File

@@ -62,4 +62,5 @@ mmh3
weaviate-client==2.5.0
ray==1.5.0
dataclasses-json
quantulum3
quantulum3
azure-ai-formrecognizer==3.2.0b2

View File

@@ -1,11 +1,10 @@
from pathlib import Path
import os
import pytest
from haystack.file_converter import MarkdownConverter
from haystack.file_converter.docx import DocxToTextConverter
from haystack.file_converter.pdf import PDFToTextConverter, PDFToTextOCRConverter
from haystack.file_converter.tika import TikaConverter
from haystack.nodes import MarkdownConverter, DocxToTextConverter, PDFToTextConverter, PDFToTextOCRConverter, \
TikaConverter, AzureConverter
@pytest.mark.tika
@@ -15,7 +14,7 @@ from haystack.file_converter.tika import TikaConverter
)
def test_convert(Converter):
converter = Converter()
document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))[0]
pages = document["content"].split("\f")
assert len(pages) == 4 # the sample PDF file has four pages.
assert pages[0] != "" # the page 1 of PDF contains text.
@@ -33,7 +32,7 @@ def test_convert(Converter):
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_table_removal(Converter):
converter = Converter(remove_numeric_tables=True)
document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))[0]
pages = document["content"].split("\f")
# assert numeric rows are removed from the table.
assert "324" not in pages[0]
@@ -60,11 +59,31 @@ def test_language_validation(Converter, caplog):
def test_docx_converter():
converter = DocxToTextConverter()
document = converter.convert(file_path=Path("samples/docx/sample_docx.docx"))
document = converter.convert(file_path=Path("samples/docx/sample_docx.docx"))[0]
assert document["content"].startswith("Sample Docx File")
def test_markdown_converter():
converter = MarkdownConverter()
document = converter.convert(file_path=Path("samples/markdown/sample.md"))
document = converter.convert(file_path=Path("samples/markdown/sample.md"))[0]
assert document["content"].startswith("What to build with Haystack")
def test_azure_converter():
    # Check if the Form Recognizer endpoint and credential key are set as environment variables
    if "AZURE_FORMRECOGNIZER_ENDPOINT" in os.environ and "AZURE_FORMRECOGNIZER_KEY" in os.environ:
        converter = AzureConverter(endpoint=os.environ["AZURE_FORMRECOGNIZER_ENDPOINT"],
                                   credential_key=os.environ["AZURE_FORMRECOGNIZER_KEY"],
                                   save_json=True,
                                   )

        docs = converter.convert(file_path="samples/pdf/sample_pdf_1.pdf")
        assert len(docs) == 2

        assert docs[0]["content_type"] == "table"
        assert len(docs[0]["content"]) == 5  # number of rows
        assert len(docs[0]["content"][0]) == 5  # number of columns, Form Recognizer assumes there are 5 columns
        assert docs[0]["content"][0] == ['', 'Column 1', '', 'Column 2', 'Column 3']
        assert docs[0]["content"][4] == ['D', '$54.35', '', '$6345.', '']

        assert docs[1]["content_type"] == "text"
        assert docs[1]["content"].startswith("A sample PDF file")

View File

@@ -160,13 +160,13 @@
"# Here are some examples of how you would use file converters\n",
"\n",
"converter = TextConverter(remove_numeric_tables=True, valid_languages=[\"en\"])\n",
"doc_txt = converter.convert(file_path=\"data/preprocessing_tutorial/classics.txt\", meta=None)\n",
"doc_txt = converter.convert(file_path=\"data/preprocessing_tutorial/classics.txt\", meta=None)[0]\n",
"\n",
"converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=[\"en\"])\n",
"doc_pdf = converter.convert(file_path=\"data/preprocessing_tutorial/bert.pdf\", meta=None)\n",
"doc_pdf = converter.convert(file_path=\"data/preprocessing_tutorial/bert.pdf\", meta=None)[0]\n",
"\n",
"converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=[\"en\"])\n",
"doc_docx = converter.convert(file_path=\"data/preprocessing_tutorial/heavy_metal.docx\", meta=None)\n"
"doc_docx = converter.convert(file_path=\"data/preprocessing_tutorial/heavy_metal.docx\", meta=None)[0]\n"
],
"outputs": [],
"metadata": {

View File

@@ -42,13 +42,13 @@ def tutorial8_preprocessing():
# Here are some examples of how you would use file converters
converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None)
doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None)[0]
converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)
doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)[0]
converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=["en"])
doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)
doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)[0]
# Haystack also has a convenience function that will automatically apply the right converter to each file in a directory.