Add AzureConverter to support table parsing from documents (#1813)
* Add FormRecognizerConverter
* Change signature of convert method + change return type of all converters
* Adapt preprocessing util to new return type of converters
* Parametrize number of lines used for surrounding context of table
* Change name from FormRecognizerConverter to AzureConverter
* Set version of azure-ai-formrecognizer package
* Change tutorial 8 based on new return type of converters
* Add tests
* Add latest docstring and tutorial changes
* Fix typo

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
Parent: c29f960c47
Commit: eb5f7bb4c0
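The central change is the return contract: every converter's `convert()` now returns a `List[Dict[str, Any]]` instead of a single dict, which is why single-document converters are indexed with `[0]` throughout this diff. A minimal sketch of the new contract (the file path is a placeholder, not part of this commit):

```python
# Sketch only: demonstrates the new list return type of converters.
from haystack.nodes import TextConverter

converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
docs = converter.convert(file_path="my_file.txt", meta=None)  # placeholder path; result is now a list

doc = docs[0]  # TextConverter returns a single-element list; AzureConverter may return several documents
print(doc["content_type"], doc["content"][:100])
```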
@@ -73,13 +73,13 @@ For converting PDFs, try changing the encoding to UTF-8 if the conversion isn't
 # Here are some examples of how you would use file converters
 
 converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
-doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None)
+doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None)[0]
 
 converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
-doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)
+doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)[0]
 
 converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=["en"])
-doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)
+doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)[0]
 
 ```
@@ -15,6 +15,7 @@ from haystack.nodes.file_converter import (
     TikaConverter,
     TikaXHTMLParser,
     TextConverter,
+    AzureConverter
 )
 from haystack.nodes.other import Docs2Answers, JoinDocuments
 from haystack.nodes.preprocessor import BasePreProcessor, PreProcessor
@@ -5,3 +5,4 @@ from haystack.nodes.file_converter.markdown import MarkdownConverter
 from haystack.nodes.file_converter.pdf import PDFToTextConverter, PDFToTextOCRConverter
 from haystack.nodes.file_converter.tika import TikaConverter, TikaXHTMLParser
 from haystack.nodes.file_converter.txt import TextConverter
+from haystack.nodes.file_converter.azure import AzureConverter
haystack/nodes/file_converter/azure.py (new file, +191 lines)
@@ -0,0 +1,191 @@
+import logging
+from pathlib import Path
+from typing import List, Optional, Dict, Any
+from collections import defaultdict
+import json
+
+from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzeResult
+from azure.core.credentials import AzureKeyCredential
+
+from haystack.nodes.file_converter import BaseConverter
+
+logger = logging.getLogger(__name__)
+
+
+class AzureConverter(BaseConverter):
+    """
+    File converter that makes use of Microsoft Azure's Form Recognizer service
+    (https://azure.microsoft.com/en-us/services/form-recognizer/).
+    This Converter extracts both text and tables.
+    Supported file formats are: PDF, JPEG, PNG, BMP and TIFF.
+
+    To use this Converter, you need an active Azure account
+    and a Form Recognizer or Cognitive Services resource.
+    (Here you can find information on how to set this up:
+    https://docs.microsoft.com/en-us/azure/applied-ai-services/form-recognizer/quickstarts/try-v3-python-sdk#prerequisites)
+    """
+
+    def __init__(self,
+                 endpoint: str,
+                 credential_key: str,
+                 model_id: str = "prebuilt-document",
+                 valid_languages: Optional[List[str]] = None,
+                 save_json: bool = False,
+                 surrounding_context_len: int = 3,
+                 ):
+        """
+        :param endpoint: Your Form Recognizer or Cognitive Services resource's endpoint.
+        :param credential_key: Your Form Recognizer or Cognitive Services resource's subscription key.
+        :param model_id: The identifier of the model you want to use to extract information out of your file.
+                         Default: "prebuilt-document". General-purpose models are "prebuilt-document"
+                         and "prebuilt-layout".
+                         List of available prebuilt models:
+                         https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.2.0b1/index.html#documentanalysisclient
+        :param valid_languages: Validate languages from a list of languages specified in the ISO 639-1
+                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
+                                This option can be used to add a test for encoding errors. If the extracted text is
+                                not one of the valid languages, there is likely an encoding error resulting
+                                in garbled text.
+        :param save_json: Whether to save the output of the Form Recognizer to a JSON file.
+        :param surrounding_context_len: Number of lines before and after a table to extract as surrounding context.
+        """
+        # save init parameters to enable export of component config as YAML
+        self.set_config(endpoint=endpoint, credential_key=credential_key, model_id=model_id,
+                        valid_languages=valid_languages, save_json=save_json,
+                        surrounding_context_len=surrounding_context_len)
+
+        self.document_analysis_client = DocumentAnalysisClient(endpoint=endpoint,
+                                                               credential=AzureKeyCredential(credential_key))
+        self.model_id = model_id
+        self.valid_languages = valid_languages
+        self.save_json = save_json
+        self.surrounding_context_len = surrounding_context_len
+
+        super().__init__(valid_languages=valid_languages)
+
+    def convert(self,
+                file_path: Path,
+                meta: Optional[Dict[str, str]] = None,
+                remove_numeric_tables: Optional[bool] = None,
+                valid_languages: Optional[List[str]] = None,
+                encoding: Optional[str] = "utf-8",
+                pages: Optional[str] = None,
+                known_language: Optional[str] = None,
+                ) -> List[Dict[str, Any]]:
+        """
+        Extract text and tables from a PDF, JPEG, PNG, BMP or TIFF file using Azure's Form Recognizer service.
+
+        :param file_path: Path to the file you want to convert.
+        :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
+                     Can be any custom keys and values.
+        :param remove_numeric_tables: Not applicable.
+        :param valid_languages: Validate languages from a list of languages specified in the ISO 639-1
+                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
+                                This option can be used to add a test for encoding errors. If the extracted text is
+                                not one of the valid languages, there is likely an encoding error resulting
+                                in garbled text.
+        :param encoding: Not applicable.
+        :param pages: Custom page numbers for multi-page documents (PDF/TIFF). Input the page numbers and/or ranges
+                      of pages you want to get in the result. For a range of pages, use a hyphen,
+                      like pages="1-3, 5-6". Separate each page number or range with a comma.
+        :param known_language: Locale hint of the input document.
+                               See supported locales here: https://aka.ms/azsdk/formrecognizer/supportedlocales.
+        """
+        if valid_languages is None:
+            valid_languages = self.valid_languages
+
+        with open(file_path, "rb") as file:
+            poller = self.document_analysis_client.begin_analyze_document(self.model_id, file, pages=pages,
+                                                                          locale=known_language)
+            result = poller.result()
+
+        tables = self._convert_tables(result, meta)
+        text = self._convert_text(result, meta)
+        docs = tables + [text]
+
+        if valid_languages:
+            file_text = text["content"] + " ".join([cell for table in tables for row in table["content"] for cell in row])
+            if not self.validate_language(file_text):
+                logger.warning(
+                    f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
+                    f"been decoded in the correct text format."
+                )
+
+        if self.save_json:
+            with open(str(file_path) + ".json", "w") as json_file:
+                json.dump(result.to_dict(), json_file, indent=2)
+
+        return docs
+
+    def _convert_tables(self, result: AnalyzeResult, meta: Optional[Dict[str, str]]) -> List[Dict[str, Any]]:
+        converted_tables = []
+
+        for table in result.tables:
+            # Initialize table with empty cells
+            table_list = [[""] * table.column_count for _ in range(table.row_count)]
+
+            for cell in table.cells:
+                # Remove ':selected:'/':unselected:' tags from cell's content
+                cell.content = cell.content.replace(":selected:", "")
+                cell.content = cell.content.replace(":unselected:", "")
+
+                for c in range(cell.column_span):
+                    for r in range(cell.row_span):
+                        table_list[cell.row_index + r][cell.column_index + c] = cell.content
+
+            caption = ""
+            # Check if all column names are the same -> exclude these cells and use as caption
+            if all(col_name == table_list[0][0] for col_name in table_list[0]):
+                caption = table_list[0][0]
+                table_list.pop(0)
+
+            # Get preceding context of table
+            table_beginning_page = next(page for page in result.pages
+                                        if page.page_number == table.bounding_regions[0].page_number)
+            table_start_offset = table.spans[0].offset
+            preceding_lines = [line.content for line in table_beginning_page.lines
+                               if line.spans[0].offset < table_start_offset]
+            preceding_context = f"{caption}\n".strip() + "\n".join(preceding_lines[-self.surrounding_context_len:])
+
+            # Get following context
+            table_end_page = table_beginning_page if len(table.bounding_regions) == 1 else \
+                next(page for page in result.pages
+                     if page.page_number == table.bounding_regions[-1].page_number)
+            table_end_offset = table_start_offset + table.spans[0].length
+            following_lines = [line.content for line in table_end_page.lines
+                               if line.spans[0].offset > table_end_offset]
+            following_context = "\n".join(following_lines[:self.surrounding_context_len])
+
+            if isinstance(meta, dict):
+                meta["preceding_context"] = preceding_context
+                meta["following_context"] = following_context
+            else:
+                meta = {"preceding_context": preceding_context, "following_context": following_context}
+
+            converted_tables.append({"content": table_list, "content_type": "table", "meta": meta})
+
+        return converted_tables
+
+    def _convert_text(self, result: AnalyzeResult, meta: Optional[Dict[str, str]]) -> Dict[str, Any]:
+        text = ""
+        table_spans_by_page = defaultdict(list)
+        for table in result.tables:
+            table_spans_by_page[table.bounding_regions[0].page_number].append(table.spans[0])
+
+        for page in result.pages:
+            tables_on_page = table_spans_by_page[page.page_number]
+            for line in page.lines:
+                in_table = False
+                # Check if line is part of a table
+                for table in tables_on_page:
+                    if table.offset <= line.spans[0].offset <= table.offset + table.length:
+                        in_table = True
+                        break
+                if in_table:
+                    continue
+                text += f"{line.content}\n"
+            text += "\f"
+
+        return {"content": text, "content_type": "text", "meta": meta}
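For orientation, a usage sketch of the new converter; the endpoint, key, and file path are placeholders and assume an existing Form Recognizer resource:

```python
# Hypothetical usage of AzureConverter; all credential values below are placeholders.
from haystack.nodes.file_converter import AzureConverter

converter = AzureConverter(
    endpoint="https://<your-resource>.cognitiveservices.azure.com/",  # placeholder endpoint
    credential_key="<your-form-recognizer-key>",                      # placeholder key
    surrounding_context_len=3,  # lines of context captured around each table
)

docs = converter.convert(file_path="invoice.pdf")  # placeholder file
tables = [d for d in docs if d["content_type"] == "table"]
text_doc = [d for d in docs if d["content_type"] == "text"][0]

# A table document stores rows as lists of cell strings; the surrounding lines land in meta.
if tables:
    for row in tables[0]["content"]:
        print(row)
    print(tables[0]["meta"]["preceding_context"])
```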
@@ -48,7 +48,7 @@ class BaseConverter(BaseComponent):
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = "utf-8",
-    ) -> Dict[str, Any]:
+    ) -> List[Dict[str, Any]]:
         """
         Convert a file to a dictionary containing the text and any associated meta data.
 
@@ -101,14 +101,11 @@ class BaseConverter(BaseComponent):
 
         documents: list = []
         for file_path, file_meta in zip(file_paths, meta):
-            documents.append(
-                self.convert(
-                    file_path=file_path,
-                    meta=file_meta,
-                    remove_numeric_tables=remove_numeric_tables,
-                    valid_languages=valid_languages,
-                )
-            )
+            for doc in self.convert(file_path=file_path,
+                                    meta=file_meta,
+                                    remove_numeric_tables=remove_numeric_tables,
+                                    valid_languages=valid_languages):
+                documents.append(doc)
 
         result = {"documents": documents}
         return result, "output_1"
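The `run()` rewrite above flattens each per-file list of documents into one `documents` list, since a single `convert()` call can now yield several documents. As a sketch (a hypothetical helper, not the committed code), the same flattening can be written with `extend`:

```python
from typing import Any, Dict, List

def flatten_converted(converter, file_paths, metas) -> List[Dict[str, Any]]:
    # Hypothetical helper mirroring the new BaseConverter.run() loop: each
    # convert() call returns a list, so results are extended (flattened).
    documents: List[Dict[str, Any]] = []
    for file_path, file_meta in zip(file_paths, metas):
        documents.extend(converter.convert(file_path=file_path, meta=file_meta))
    return documents
```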
@@ -18,7 +18,7 @@ class DocxToTextConverter(BaseConverter):
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = None,
-    ) -> Dict[str, Any]:
+    ) -> List[Dict[str, Any]]:
         """
         Extract text from a .docx file.
         Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here.
@@ -51,4 +51,4 @@ class DocxToTextConverter(BaseConverter):
         paragraphs = [para.text for para in file.paragraphs]
         text = "\n".join(paragraphs)
         document = {"content": text, "content_type": "text", "meta": meta}
-        return document
+        return [document]
@@ -84,7 +84,7 @@ class ImageToTextConverter(BaseConverter):
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = "utf-8",
-    ) -> Dict[str, Any]:
+    ) -> List[Dict[str, Any]]:
         """
         Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)
 
@@ -142,7 +142,7 @@ class ImageToTextConverter(BaseConverter):
 
         text = "\f".join(cleaned_pages)
         document = {"content": text, "meta": meta}
-        return document
+        return [document]
 
     def _image_to_text(self, image: PpmImageFile) -> List[str]:
         """
@@ -17,7 +17,7 @@ class MarkdownConverter(BaseConverter):
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = "utf-8",
-    ) -> Dict[str, Any]:
+    ) -> List[Dict[str, Any]]:
         """
         Reads text from a txt file and executes optional preprocessing steps.
 
@@ -33,7 +33,7 @@ class MarkdownConverter(BaseConverter):
             markdown_text = f.read()
         text = self.markdown_to_text(markdown_text)
         document = {"content": text, "content_type": "text", "meta": meta}
-        return document
+        return [document]
 
     # Following code snippet is copied from https://gist.github.com/lorey/eb15a7f3338f959a78cc3661fbc255fe
     @staticmethod
@@ -63,7 +63,7 @@ class PDFToTextConverter(BaseConverter):
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = "Latin1",
-    ) -> Dict[str, Any]:
+    ) -> List[Dict[str, Any]]:
         """
         Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)
 
@@ -136,7 +136,7 @@ class PDFToTextConverter(BaseConverter):
 
         text = "\f".join(cleaned_pages)
         document = {"content": text, "content_type": "text", "meta": meta}
-        return document
+        return [document]
 
     def _read_pdf(
         self, file_path: Path, layout: bool, encoding: Optional[str] = "Latin1"
@@ -197,7 +197,7 @@ class PDFToTextOCRConverter(BaseConverter):
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = "utf-8",
-    ) -> Dict[str, Any]:
+    ) -> List[Dict[str, Any]]:
         """
         Convert a file to a dictionary containing the text and any associated meta data.
 
@@ -226,11 +226,11 @@ class PDFToTextOCRConverter(BaseConverter):
                     dir=os.path.dirname(os.path.realpath(__file__)), suffix=".jpeg"
                 )
                 image.save(temp_img.name)
-                pages.append(self.image_2_text.convert(temp_img.name)["content"])
+                pages.append(self.image_2_text.convert(temp_img.name)[0]["content"])
         except Exception as exception:
             logger.error(f"File {file_path} has an error \n {exception}")
 
         raw_text = "\f".join(pages)
         document = {"content": raw_text, "meta": meta}
 
-        return document
+        return [document]
@@ -87,7 +87,7 @@ class TikaConverter(BaseConverter):
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = None,
-    ) -> Dict[str, Any]:
+    ) -> List[Dict[str, Any]]:
         """
         :param file_path: path of the file to convert
         :param meta: dictionary of meta data key-value pairs to append in the returned document.
@@ -150,4 +150,4 @@ class TikaConverter(BaseConverter):
 
         text = "\f".join(cleaned_pages)
         document = {"content": text, "content_type": "text", "meta": {**parsed["metadata"], **(meta or {})}}
-        return document
+        return [document]
@@ -16,7 +16,7 @@ class TextConverter(BaseConverter):
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = "utf-8",
-    ) -> Dict[str, Any]:
+    ) -> List[Dict[str, Any]]:
         """
         Reads text from a txt file and executes optional preprocessing steps.
 
@@ -75,4 +75,4 @@ class TextConverter(BaseConverter):
 
         text = "".join(cleaned_pages)
         document = {"content": text, "content_type": "text", "meta": meta}
-        return document
+        return [document]
@@ -64,7 +64,7 @@ def convert_files_to_dicts(
                 file_path=path,
                 meta=None,
                 encoding=encoding,
-            )
+            )[0]  # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single dict
             text = document["content"]
 
             if clean_func:
@@ -119,7 +119,7 @@ def tika_convert_files_to_dicts(
     documents = []
    for path in file_paths:
        logger.info('Converting {}'.format(path))
-        document = converter.convert(path)
+        document = converter.convert(path)[0]  # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single dict
        meta = document["meta"] or {}
        meta["name"] = path.name
        text = document["content"]
@@ -62,4 +62,5 @@ mmh3
 weaviate-client==2.5.0
 ray==1.5.0
 dataclasses-json
-quantulum3
+quantulum3
+azure-ai-formrecognizer==3.2.0b2
@@ -1,11 +1,10 @@
 from pathlib import Path
+import os
 
 import pytest
 
-from haystack.file_converter import MarkdownConverter
-from haystack.file_converter.docx import DocxToTextConverter
-from haystack.file_converter.pdf import PDFToTextConverter, PDFToTextOCRConverter
-from haystack.file_converter.tika import TikaConverter
+from haystack.nodes import MarkdownConverter, DocxToTextConverter, PDFToTextConverter, PDFToTextOCRConverter, \
+    TikaConverter, AzureConverter
 
 
 @pytest.mark.tika
@@ -15,7 +14,7 @@ from haystack.file_converter.tika import TikaConverter
 )
 def test_convert(Converter):
     converter = Converter()
-    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))[0]
     pages = document["content"].split("\f")
     assert len(pages) == 4  # the sample PDF file has four pages.
     assert pages[0] != ""  # the page 1 of PDF contains text.
@@ -33,7 +32,7 @@ def test_convert(Converter):
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
 def test_table_removal(Converter):
     converter = Converter(remove_numeric_tables=True)
-    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))[0]
     pages = document["content"].split("\f")
     # assert numeric rows are removed from the table.
     assert "324" not in pages[0]
@@ -60,11 +59,31 @@ def test_language_validation(Converter, caplog):
 
 def test_docx_converter():
     converter = DocxToTextConverter()
-    document = converter.convert(file_path=Path("samples/docx/sample_docx.docx"))
+    document = converter.convert(file_path=Path("samples/docx/sample_docx.docx"))[0]
     assert document["content"].startswith("Sample Docx File")
 
 
 def test_markdown_converter():
     converter = MarkdownConverter()
-    document = converter.convert(file_path=Path("samples/markdown/sample.md"))
+    document = converter.convert(file_path=Path("samples/markdown/sample.md"))[0]
     assert document["content"].startswith("What to build with Haystack")
+
+
+def test_azure_converter():
+    # Check if the Form Recognizer endpoint and credential key are in the environment variables
+    if "AZURE_FORMRECOGNIZER_ENDPOINT" in os.environ and "AZURE_FORMRECOGNIZER_KEY" in os.environ:
+        converter = AzureConverter(endpoint=os.environ["AZURE_FORMRECOGNIZER_ENDPOINT"],
+                                   credential_key=os.environ["AZURE_FORMRECOGNIZER_KEY"],
+                                   save_json=True,
+                                   )
+
+        docs = converter.convert(file_path="samples/pdf/sample_pdf_1.pdf")
+        assert len(docs) == 2
+        assert docs[0]["content_type"] == "table"
+        assert len(docs[0]["content"]) == 5  # number of rows
+        assert len(docs[0]["content"][0]) == 5  # number of columns, Form Recognizer assumes there are 5 columns
+        assert docs[0]["content"][0] == ['', 'Column 1', '', 'Column 2', 'Column 3']
+        assert docs[0]["content"][4] == ['D', '$54.35', '', '$6345.', '']
+
+        assert docs[1]["content_type"] == "text"
+        assert docs[1]["content"].startswith("A sample PDF file")
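The assertions above pin down the returned shapes: a table document's `content` is a list of rows, each a list of cell strings, with the text document following it. A small, illustrative helper (the name and formatting are ours, not part of the commit) for rendering such a document:

```python
# Illustrative only: render a table document returned by AzureConverter as plain text.
from typing import Any, Dict

def table_doc_to_text(table_doc: Dict[str, Any]) -> str:
    rows = table_doc["content"]  # list of rows, each a list of cell strings
    return "\n".join(" | ".join(row) for row in rows)
```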
@@ -160,13 +160,13 @@
 "# Here are some examples of how you would use file converters\n",
 "\n",
 "converter = TextConverter(remove_numeric_tables=True, valid_languages=[\"en\"])\n",
-"doc_txt = converter.convert(file_path=\"data/preprocessing_tutorial/classics.txt\", meta=None)\n",
+"doc_txt = converter.convert(file_path=\"data/preprocessing_tutorial/classics.txt\", meta=None)[0]\n",
 "\n",
 "converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=[\"en\"])\n",
-"doc_pdf = converter.convert(file_path=\"data/preprocessing_tutorial/bert.pdf\", meta=None)\n",
+"doc_pdf = converter.convert(file_path=\"data/preprocessing_tutorial/bert.pdf\", meta=None)[0]\n",
 "\n",
 "converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=[\"en\"])\n",
-"doc_docx = converter.convert(file_path=\"data/preprocessing_tutorial/heavy_metal.docx\", meta=None)\n"
+"doc_docx = converter.convert(file_path=\"data/preprocessing_tutorial/heavy_metal.docx\", meta=None)[0]\n"
 ],
 "outputs": [],
 "metadata": {
@@ -42,13 +42,13 @@ def tutorial8_preprocessing():
     # Here are some examples of how you would use file converters
 
     converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
-    doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None)
+    doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None)[0]
 
     converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
-    doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)
+    doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)[0]
 
     converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=["en"])
-    doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)
+    doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)[0]
 
     # Haystack also has a convenience function that will automatically apply the right converter to each file in a directory.