mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-09 14:23:43 +00:00
update docstrings (#8225)
This commit is contained in:
parent
bcc4104729
commit
9427d7aee6
@ -27,16 +27,16 @@ with LazyImport(message="Run 'pip install \"azure-ai-formrecognizer>=3.2.0b2\"'"
|
|||||||
@component
|
@component
|
||||||
class AzureOCRDocumentConverter:
|
class AzureOCRDocumentConverter:
|
||||||
"""
|
"""
|
||||||
Convert files to documents using Azure's Document Intelligence service.
|
Converts files to documents using Azure's Document Intelligence service.
|
||||||
|
|
||||||
Supported file formats are: PDF, JPEG, PNG, BMP, TIFF, DOCX, XLSX, PPTX, and HTML.
|
Supported file formats are: PDF, JPEG, PNG, BMP, TIFF, DOCX, XLSX, PPTX, and HTML.
|
||||||
|
|
||||||
In order to be able to use this component, you need an active Azure account
|
To use this component, you need an active Azure account
|
||||||
and a Document Intelligence or Cognitive Services resource. Follow the steps described in the [Azure documentation]
|
and a Document Intelligence or Cognitive Services resource. For help with setting up your resource, see
|
||||||
(https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api)
|
[Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api).
|
||||||
to set up your resource.
|
|
||||||
|
### Usage example
|
||||||
|
|
||||||
Usage example:
|
|
||||||
```python
|
```python
|
||||||
from haystack.components.converters import AzureOCRDocumentConverter
|
from haystack.components.converters import AzureOCRDocumentConverter
|
||||||
from haystack.utils import Secret
|
from haystack.utils import Secret
|
||||||
@ -61,30 +61,28 @@ class AzureOCRDocumentConverter:
|
|||||||
threshold_y: Optional[float] = 0.05,
|
threshold_y: Optional[float] = 0.05,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Create an AzureOCRDocumentConverter component.
|
Creates an AzureOCRDocumentConverter component.
|
||||||
|
|
||||||
:param endpoint:
|
:param endpoint:
|
||||||
The endpoint of your Azure resource.
|
The endpoint of your Azure resource.
|
||||||
:param api_key:
|
:param api_key:
|
||||||
The key of your Azure resource.
|
The API key of your Azure resource.
|
||||||
:param model_id:
|
:param model_id:
|
||||||
The model ID of the model you want to use. Please refer to [Azure documentation]
|
The ID of the model you want to use. For a list of available models, see [Azure documentation]
|
||||||
(https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/choose-model-feature)
|
(https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/choose-model-feature).
|
||||||
for a list of available models. Default: `"prebuilt-read"`.
|
:param preceding_context_len: Number of lines before a table to include as preceding context
|
||||||
:param preceding_context_len: Number of lines before a table to extract as preceding context
|
(this will be added to the metadata).
|
||||||
(will be returned as part of metadata).
|
:param following_context_len: Number of lines after a table to include as subsequent context (
|
||||||
:param following_context_len: Number of lines after a table to extract as subsequent context (
|
this will be added to the metadata).
|
||||||
will be returned as part of metadata).
|
:param merge_multiple_column_headers: If `True`, merges multiple column header rows into a single row.
|
||||||
:param merge_multiple_column_headers: Some tables contain more than one row as a column header
|
:param page_layout: The type of reading order to follow. Possible options:
|
||||||
(i.e., column description).
|
- `natural`: Uses the natural reading order determined by Azure.
|
||||||
This parameter lets you choose, whether to merge multiple column header rows to a single row.
|
- `single_column`: Groups all lines with the same height on the page based on a threshold
|
||||||
:param page_layout: The type of reading order to follow. If "natural" is chosen then the natural reading order
|
determined by `threshold_y`.
|
||||||
determined by Azure will be used. If "single_column" is chosen then all lines with the same height on the
|
:param threshold_y: Only relevant if `page_layout` is set to `single_column`.
|
||||||
page will be grouped together based on a threshold determined by `threshold_y`.
|
The threshold, in inches, to determine if two recognized PDF elements are grouped into a
|
||||||
:param threshold_y: The threshold to determine if two recognized elements in a PDF should be grouped into a
|
single line. This is crucial for section headers or numbers which may be spatially separated
|
||||||
single line. This is especially relevant for section headers or numbers which may be spatially separated
|
from the remaining text on the horizontal axis.
|
||||||
on the horizontal axis from the remaining text. The threshold is specified in units of inches.
|
|
||||||
This is only relevant if "single_column" is chosen for `page_layout`.
|
|
||||||
"""
|
"""
|
||||||
azure_import.check()
|
azure_import.check()
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user