From 72fe43a7cca34729bbaf2fe658f31cfb8042c1bb Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 12 Jun 2023 12:23:32 +0200 Subject: [PATCH] build: Move Azure's Form Recognizer dependency to extras (#5096) * build: Move Azure's Form Recognizer dependency to extras * try catch imports for AzureConverter * assign None to failed imports * use lazy import * use forward reference in type hints --- haystack/nodes/file_converter/azure.py | 19 +++++++++++++------ pyproject.toml | 2 +- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/haystack/nodes/file_converter/azure.py b/haystack/nodes/file_converter/azure.py index dda04b91e..8ddbcd049 100644 --- a/haystack/nodes/file_converter/azure.py +++ b/haystack/nodes/file_converter/azure.py @@ -5,17 +5,21 @@ from collections import defaultdict import json import copy -from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzeResult -from azure.core.credentials import AzureKeyCredential import pandas as pd +from haystack.lazy_imports import LazyImport from haystack.nodes.file_converter.base import BaseConverter from haystack.errors import HaystackError from haystack.schema import Document - logger = logging.getLogger(__name__) +with LazyImport( + message="Run 'pip install farm-haystack[file-conversion]' or 'pip install " "azure-ai-formrecognizer>=3.2.0b2'" +) as azure_import: + from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzeResult + from azure.core.credentials import AzureKeyCredential + class AzureConverter(BaseConverter): """ @@ -70,6 +74,9 @@ class AzureConverter(BaseConverter): :param add_page_number: Adds the number of the page a table occurs in to the Document's meta field `"page"`. 
""" + # ensure the required dependencies were actually imported + azure_import.check() + super().__init__(valid_languages=valid_languages, id_hash_keys=id_hash_keys) self.document_analysis_client = DocumentAnalysisClient( @@ -179,7 +186,7 @@ class AzureConverter(BaseConverter): def _convert_tables_and_text( self, - result: AnalyzeResult, + result: "AnalyzeResult", meta: Optional[Dict[str, Any]], valid_languages: Optional[List[str]], file_path: Path, @@ -209,7 +216,7 @@ class AzureConverter(BaseConverter): return docs def _convert_tables( - self, result: AnalyzeResult, meta: Optional[Dict[str, Any]], id_hash_keys: Optional[List[str]] = None + self, result: "AnalyzeResult", meta: Optional[Dict[str, Any]], id_hash_keys: Optional[List[str]] = None ) -> List[Document]: converted_tables: List[Document] = [] @@ -310,7 +317,7 @@ class AzureConverter(BaseConverter): return converted_tables def _convert_text( - self, result: AnalyzeResult, meta: Optional[Dict[str, str]], id_hash_keys: Optional[List[str]] = None + self, result: "AnalyzeResult", meta: Optional[Dict[str, str]], id_hash_keys: Optional[List[str]] = None ) -> Document: text = "" table_spans_by_page = defaultdict(list) diff --git a/pyproject.toml b/pyproject.toml index 11fb76e61..7ca3082f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,7 +61,6 @@ dependencies = [ "networkx", # graphs library "quantulum3", # quantities extraction from text "posthog", # telemetry - "azure-ai-formrecognizer>=3.2.0b2", # forms reader # audio's espnet-model-zoo requires huggingface-hub version <0.8 while we need >=0.5 to be able to use create_repo in FARMReader "huggingface-hub>=0.5.0", "tenacity", # retry decorator @@ -143,6 +142,7 @@ preprocessing = [ "langdetect", # for language classification ] file-conversion = [ + "azure-ai-formrecognizer>=3.2.0b2", # Microsoft Azure's Form Recognizer service (text and table extractor) "python-docx", "tika", # Apache Tika (text & metadata extractor) "beautifulsoup4",