2024-05-03 09:11:55 -07:00
|
|
|
"""Provides partitioning with automatic file-type detection."""
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
2023-04-12 14:31:01 -04:00
|
|
|
import io
|
2024-05-10 14:37:07 -05:00
|
|
|
from typing import IO, Any, Callable, Literal, Optional
|
2023-04-12 14:31:01 -04:00
|
|
|
|
|
|
|
import requests
|
2023-01-09 16:15:14 -05:00
|
|
|
|
2024-05-03 09:11:55 -07:00
|
|
|
from unstructured.documents.elements import DataSourceMetadata, Element
|
2023-05-15 13:23:19 -05:00
|
|
|
from unstructured.file_utils.filetype import (
|
|
|
|
FILETYPE_TO_MIMETYPE,
|
|
|
|
STR_TO_FILETYPE,
|
|
|
|
FileType,
|
|
|
|
detect_filetype,
|
2023-07-25 15:59:45 -04:00
|
|
|
is_json_processable,
|
2023-05-15 13:23:19 -05:00
|
|
|
)
|
2023-04-13 11:04:15 -04:00
|
|
|
from unstructured.logger import logger
|
2023-04-12 14:31:01 -04:00
|
|
|
from unstructured.partition.common import exactly_one
|
2023-01-09 16:15:14 -05:00
|
|
|
from unstructured.partition.email import partition_email
|
|
|
|
from unstructured.partition.html import partition_html
|
2023-03-09 03:36:01 +09:00
|
|
|
from unstructured.partition.json import partition_json
|
2024-05-03 09:11:55 -07:00
|
|
|
from unstructured.partition.lang import check_language_args
|
2023-01-13 16:39:53 -05:00
|
|
|
from unstructured.partition.text import partition_text
|
2023-11-15 21:41:02 -08:00
|
|
|
from unstructured.partition.utils.constants import PartitionStrategy
|
2023-05-18 11:40:12 -04:00
|
|
|
from unstructured.partition.xml import partition_xml
|
2023-08-01 11:31:13 -04:00
|
|
|
from unstructured.utils import dependency_exists
|
|
|
|
|
2024-05-03 09:11:55 -07:00
|
|
|
# Registry of partitioners that need optional dependencies, keyed by the short
# document-type name that `_get_partition_with_extras()` looks up. An entry is
# added only when the `dependency_exists()` check for its third-party package(s)
# passes, so a missing key means "extras not installed" for that document type.
PARTITION_WITH_EXTRAS_MAP: dict[str, Callable[..., list[Element]]] = {}

if dependency_exists("pandas"):
    from unstructured.partition.csv import partition_csv
    from unstructured.partition.tsv import partition_tsv

    PARTITION_WITH_EXTRAS_MAP["csv"] = partition_csv
    PARTITION_WITH_EXTRAS_MAP["tsv"] = partition_tsv


if dependency_exists("docx"):
    from unstructured.partition.doc import partition_doc
    from unstructured.partition.docx import partition_docx

    PARTITION_WITH_EXTRAS_MAP["doc"] = partition_doc
    PARTITION_WITH_EXTRAS_MAP["docx"] = partition_docx


# ODT is gated on both packages.
if dependency_exists("docx") and dependency_exists("pypandoc"):
    from unstructured.partition.odt import partition_odt

    PARTITION_WITH_EXTRAS_MAP["odt"] = partition_odt


if dependency_exists("pypandoc"):
    from unstructured.partition.epub import partition_epub

    PARTITION_WITH_EXTRAS_MAP["epub"] = partition_epub


if dependency_exists("pypandoc"):
    from unstructured.partition.org import partition_org
    from unstructured.partition.rst import partition_rst
    from unstructured.partition.rtf import partition_rtf

    PARTITION_WITH_EXTRAS_MAP["org"] = partition_org
    PARTITION_WITH_EXTRAS_MAP["rst"] = partition_rst
    PARTITION_WITH_EXTRAS_MAP["rtf"] = partition_rtf


if dependency_exists("markdown"):
    from unstructured.partition.md import partition_md

    PARTITION_WITH_EXTRAS_MAP["md"] = partition_md


if dependency_exists("msg_parser"):
    from unstructured.partition.msg import partition_msg

    PARTITION_WITH_EXTRAS_MAP["msg"] = partition_msg


# PDF partitioning is registered only when every package in this list is present.
pdf_imports = ["pdf2image", "pdfminer", "PIL"]
if all(dependency_exists(dep) for dep in pdf_imports):
    from unstructured.partition.pdf import partition_pdf

    PARTITION_WITH_EXTRAS_MAP["pdf"] = partition_pdf


if dependency_exists("unstructured_inference"):
    from unstructured.partition.image import partition_image

    PARTITION_WITH_EXTRAS_MAP["image"] = partition_image


if dependency_exists("pptx"):
    from unstructured.partition.ppt import partition_ppt
    from unstructured.partition.pptx import partition_pptx

    PARTITION_WITH_EXTRAS_MAP["ppt"] = partition_ppt
    PARTITION_WITH_EXTRAS_MAP["pptx"] = partition_pptx


if dependency_exists("pandas") and dependency_exists("openpyxl"):
    from unstructured.partition.xlsx import partition_xlsx

    PARTITION_WITH_EXTRAS_MAP["xlsx"] = partition_xlsx
|
|
|
|
|
|
|
|
|
2024-01-17 17:50:36 -05:00
|
|
|
# File types that `partition()` routes to the "image" partitioner.
IMAGE_FILETYPES = [
    FileType.HEIC,
    FileType.PNG,
    FileType.JPG,
    FileType.TIFF,
    FileType.BMP,
]
|
|
|
|
|
|
|
|
|
2023-08-21 23:00:21 -04:00
|
|
|
def _get_partition_with_extras(
|
|
|
|
doc_type: str,
|
2024-05-03 09:11:55 -07:00
|
|
|
partition_with_extras_map: Optional[dict[str, Callable[..., list[Element]]]] = None,
|
2023-08-21 23:00:21 -04:00
|
|
|
):
|
|
|
|
if partition_with_extras_map is None:
|
|
|
|
partition_with_extras_map = PARTITION_WITH_EXTRAS_MAP
|
|
|
|
_partition_func = partition_with_extras_map.get(doc_type)
|
|
|
|
if _partition_func is None:
|
|
|
|
raise ImportError(
|
|
|
|
f"partition_{doc_type} is not available. "
|
|
|
|
f"Install the {doc_type} dependencies with "
|
|
|
|
f'pip install "unstructured[{doc_type}]"',
|
|
|
|
)
|
|
|
|
return _partition_func
|
|
|
|
|
2023-01-09 16:15:14 -05:00
|
|
|
|
2023-02-08 10:11:15 -05:00
|
|
|
def partition(
    filename: Optional[str] = None,
    content_type: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    file_filename: Optional[str] = None,
    url: Optional[str] = None,
    include_page_breaks: bool = False,
    strategy: str = PartitionStrategy.AUTO,
    encoding: Optional[str] = None,
    paragraph_grouper: Optional[Callable[[str], str]] | Literal[False] = None,
    # NOTE: `headers` and `skip_infer_table_types` keep their mutable defaults for interface
    # compatibility; this function only reads them and must never mutate them.
    headers: dict[str, str] = {},
    skip_infer_table_types: list[str] = ["pdf", "jpg", "png", "heic"],
    ssl_verify: bool = True,
    ocr_languages: Optional[str] = None,  # changing to optional for deprecation
    languages: Optional[list[str]] = None,
    detect_language_per_element: bool = False,
    pdf_infer_table_structure: bool = False,
    extract_images_in_pdf: bool = False,
    extract_image_block_types: Optional[list[str]] = None,
    extract_image_block_output_dir: Optional[str] = None,
    extract_image_block_to_payload: bool = False,
    xml_keep_tags: bool = False,
    data_source_metadata: Optional[DataSourceMetadata] = None,
    metadata_filename: Optional[str] = None,
    request_timeout: Optional[int] = None,
    hi_res_model_name: Optional[str] = None,
    model_name: Optional[str] = None,  # to be deprecated
    date_from_file_object: bool = False,
    starting_page_number: int = 1,
    **kwargs: Any,
) -> list[Element]:
    """Partitions a document into its constituent elements. Will use libmagic to determine
    the file's type and route it to the appropriate partitioning function. Applies the default
    parameters for each partitioning function. Use the document-type specific partitioning
    functions if you need access to additional kwarg options.

    Exactly one of `filename`, `file` and `url` must be provided.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    content_type
        A string defining the file content in MIME type
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    file_filename
        Deprecated alias for `metadata_filename`. Passing both raises ValueError.
    metadata_filename
        When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
    url
        The url for a remote document. Pass in content_type if you want partition to treat
        the document as a specific content_type.
    include_page_breaks
        If True, the output will include page breaks if the filetype supports it
    strategy
        The strategy to use for partitioning PDF/image. Uses a layout detection model if set
        to 'hi_res', otherwise partition simply extracts the text from the document
        and processes it.
    encoding
        The encoding method used to decode the text input. If None, utf-8 will be used.
    paragraph_grouper
        Forwarded unchanged to `partition_text`; only used for plain-text documents.
    headers
        The headers to be used in conjunction with the HTTP request if URL is set.
    skip_infer_table_types
        The document types that you want to skip table extraction with.
    ssl_verify
        If the URL parameter is set, determines whether or not partition uses SSL verification
        in the HTTP request.
    ocr_languages
        Marked for deprecation. Combined with `languages` by `check_language_args`.
    languages
        The languages present in the document, for use in partitioning and/or OCR. For partitioning
        image or pdf documents with Tesseract, you'll first need to install the appropriate
        Tesseract language pack. For other partitions, language is detected using naive Bayesian
        filter via `langdetect`. Multiple languages indicates text could be in either language.
        Additional Parameters:
            detect_language_per_element
                Detect language per element instead of at the document level.
    pdf_infer_table_structure
        Deprecated! Use `skip_infer_table_types` to opt out of table extraction for any document
        type.
        If True and strategy=hi_res, any Table Elements extracted from a PDF will include an
        additional metadata field, "text_as_html," where the value (string) is just a
        transformation of the data into an HTML <table>.
        The "text" field for a partitioned Table Element is always present, whether True or False.
    extract_images_in_pdf
        Only applicable if `strategy=hi_res`.
        If True, any detected images will be saved in the path specified by
        'extract_image_block_output_dir' or stored as base64 encoded data within metadata fields.
        Deprecation Note: This parameter is marked for deprecation. Future versions will use
        'extract_image_block_types' for broader extraction capabilities.
    extract_image_block_types
        Only applicable if `strategy=hi_res`.
        Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
        saved in the path specified by 'extract_image_block_output_dir' or stored as base64
        encoded data within metadata fields.
    extract_image_block_to_payload
        Only applicable if `strategy=hi_res`.
        If True, images of the element type(s) defined in 'extract_image_block_types' will be
        encoded as base64 data and stored in two metadata fields: 'image_base64' and
        'image_mime_type'.
        This parameter facilitates the inclusion of element data directly within the payload,
        especially for web-based applications or APIs.
    extract_image_block_output_dir
        Only applicable if `strategy=hi_res` and `extract_image_block_to_payload=False`.
        The filesystem path for saving images of the element type(s)
        specified in 'extract_image_block_types'.
    xml_keep_tags
        If True, will retain the XML tags in the output. Otherwise it will simply extract
        the text from within the tags. Only applies to partition_xml.
    data_source_metadata
        Stored as `metadata.data_source` on every returned element.
    request_timeout
        The timeout for the HTTP request if URL is set. Defaults to None meaning no timeout and
        requests will block indefinitely.
    hi_res_model_name
        The layout detection model used when partitioning strategy is set to `hi_res`.
    model_name
        The layout detection model used when partitioning strategy is set to `hi_res`. To be
        deprecated in favor of `hi_res_model_name`.
    date_from_file_object
        Applies only when providing file via `file` parameter. If this option is True and inference
        from message header failed, attempt to infer last_modified metadata from bytes,
        otherwise set it to None.
    starting_page_number
        Indicates what page number should be assigned to the first page in the document.
        This information will be reflected in elements' metadata and can be especially
        useful when partitioning a document that is part of a larger document.

    Returns
    -------
    The elements produced by the selected partitioner (an empty list for an empty file), each
    with `metadata.url`, `metadata.data_source` and `metadata.filetype` set by this function.

    Raises
    ------
    ValueError
        If both `metadata_filename` and `file_filename` are given, if a JSON document does not
        conform to the Unstructured schema, or if the detected file type is not supported.
    ImportError
        If the partitioner for the detected file type needs extras that are not installed.
    """
    exactly_one(file=file, filename=filename, url=url)

    if metadata_filename and file_filename:
        raise ValueError(
            "Only one of metadata_filename and file_filename can be specified. "
            "metadata_filename is preferred. file_filename is marked for deprecation.",
        )

    if file_filename is not None:
        metadata_filename = file_filename
        logger.warning(
            "The file_filename kwarg will be deprecated in a future version of unstructured. "
            "Please use metadata_filename instead.",
        )
    # These two travel to every partitioner through **kwargs; setdefault keeps any value the
    # caller already placed in kwargs.
    kwargs.setdefault("metadata_filename", metadata_filename)
    kwargs.setdefault("date_from_file_object", date_from_file_object)

    if pdf_infer_table_structure:
        logger.warning(
            "The pdf_infer_table_structure kwarg is deprecated. Please use skip_infer_table_types "
            "instead."
        )

    languages = check_language_args(languages or [], ocr_languages)

    if url is not None:
        # Remote document: download it and take the file type from the response/content_type.
        file, filetype = file_and_type_from_url(
            url=url,
            content_type=content_type,
            headers=headers,
            ssl_verify=ssl_verify,
            request_timeout=request_timeout,
        )
    else:
        if headers != {}:
            logger.warning(
                "The headers kwarg is set but the url kwarg is not. "
                "The headers kwarg will be ignored.",
            )
        filetype = detect_filetype(
            filename=filename,
            file=file,
            file_filename=metadata_filename,
            content_type=content_type,
            encoding=encoding,
        )

    # Rewind so the partitioner reads from the start, whatever type detection consumed.
    if file is not None:
        file.seek(0)

    infer_table_structure = decide_table_extraction(
        filetype,
        skip_infer_table_types,
        pdf_infer_table_structure,
    )

    if filetype == FileType.DOC:
        _partition_doc = _get_partition_with_extras("doc")
        elements = _partition_doc(
            filename=filename,
            file=file,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            starting_page_number=starting_page_number,
            **kwargs,
        )
    elif filetype == FileType.DOCX:
        _partition_docx = _get_partition_with_extras("docx")
        elements = _partition_docx(
            filename=filename,
            file=file,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            starting_page_number=starting_page_number,
            **kwargs,
        )
    elif filetype == FileType.ODT:
        _partition_odt = _get_partition_with_extras("odt")
        elements = _partition_odt(
            filename=filename,
            file=file,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            starting_page_number=starting_page_number,
            **kwargs,
        )
    elif filetype == FileType.EML:
        elements = partition_email(
            filename=filename,
            file=file,
            encoding=encoding,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
        )
    elif filetype == FileType.MSG:
        _partition_msg = _get_partition_with_extras("msg")
        elements = _partition_msg(
            filename=filename,
            file=file,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
        )
    elif filetype == FileType.HTML:
        elements = partition_html(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            encoding=encoding,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
        )
    elif filetype == FileType.XML:
        elements = partition_xml(
            filename=filename,
            file=file,
            encoding=encoding,
            xml_keep_tags=xml_keep_tags,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
        )
    elif filetype == FileType.EPUB:
        _partition_epub = _get_partition_with_extras("epub")
        elements = _partition_epub(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
        )
    elif filetype == FileType.ORG:
        _partition_org = _get_partition_with_extras("org")
        elements = _partition_org(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
        )
    elif filetype == FileType.RST:
        _partition_rst = _get_partition_with_extras("rst")
        elements = _partition_rst(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
        )
    elif filetype == FileType.MD:
        _partition_md = _get_partition_with_extras("md")
        elements = _partition_md(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
        )
    elif filetype == FileType.PDF:
        _partition_pdf = _get_partition_with_extras("pdf")
        elements = _partition_pdf(
            filename=filename,
            file=file,
            # Any URL has already been downloaded into `file`, so the partitioner gets None.
            url=None,
            include_page_breaks=include_page_breaks,
            infer_table_structure=infer_table_structure,
            strategy=strategy,
            languages=languages,
            # `model_name` is the legacy spelling; the newer argument wins when both are given.
            hi_res_model_name=hi_res_model_name or model_name,
            extract_images_in_pdf=extract_images_in_pdf,
            extract_image_block_types=extract_image_block_types,
            extract_image_block_output_dir=extract_image_block_output_dir,
            extract_image_block_to_payload=extract_image_block_to_payload,
            starting_page_number=starting_page_number,
            **kwargs,
        )
    elif filetype in IMAGE_FILETYPES:
        _partition_image = _get_partition_with_extras("image")
        elements = _partition_image(
            filename=filename,
            file=file,
            url=None,
            include_page_breaks=include_page_breaks,
            infer_table_structure=infer_table_structure,
            strategy=strategy,
            languages=languages,
            hi_res_model_name=hi_res_model_name or model_name,
            extract_images_in_pdf=extract_images_in_pdf,
            extract_image_block_types=extract_image_block_types,
            extract_image_block_output_dir=extract_image_block_output_dir,
            extract_image_block_to_payload=extract_image_block_to_payload,
            starting_page_number=starting_page_number,
            **kwargs,
        )
    elif filetype == FileType.TXT:
        elements = partition_text(
            filename=filename,
            file=file,
            encoding=encoding,
            paragraph_grouper=paragraph_grouper,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
        )
    elif filetype == FileType.RTF:
        _partition_rtf = _get_partition_with_extras("rtf")
        elements = _partition_rtf(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
        )
    elif filetype == FileType.PPT:
        _partition_ppt = _get_partition_with_extras("ppt")
        elements = _partition_ppt(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
        )
    elif filetype == FileType.PPTX:
        _partition_pptx = _get_partition_with_extras("pptx")
        elements = _partition_pptx(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            starting_page_number=starting_page_number,
            **kwargs,
        )
    elif filetype == FileType.JSON:
        if not is_json_processable(filename=filename, file=file):
            raise ValueError(
                "Detected a JSON file that does not conform to the Unstructured schema. "
                "partition_json currently only processes serialized Unstructured output.",
            )
        elements = partition_json(filename=filename, file=file, **kwargs)
    elif (filetype == FileType.XLSX) or (filetype == FileType.XLS):
        # Both spreadsheet types are handled by the partitioner registered under "xlsx".
        _partition_xlsx = _get_partition_with_extras("xlsx")
        elements = _partition_xlsx(
            filename=filename,
            file=file,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            starting_page_number=starting_page_number,
            **kwargs,
        )
    elif filetype == FileType.CSV:
        _partition_csv = _get_partition_with_extras("csv")
        elements = _partition_csv(
            filename=filename,
            file=file,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
        )
    elif filetype == FileType.TSV:
        _partition_tsv = _get_partition_with_extras("tsv")
        elements = _partition_tsv(
            filename=filename,
            file=file,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
        )
    elif filetype == FileType.EMPTY:
        elements = []
    else:
        msg = "Invalid file" if not filename else f"Invalid file {filename}"
        raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")

    for element in elements:
        element.metadata.url = url
        element.metadata.data_source = data_source_metadata
        if content_type is not None:
            # A caller-supplied content_type takes precedence over the detected type; a
            # content_type with no STR_TO_FILETYPE entry leaves the filetype metadata as None.
            out_filetype = STR_TO_FILETYPE.get(content_type)
            element.metadata.filetype = (
                FILETYPE_TO_MIMETYPE[out_filetype] if out_filetype is not None else None
            )
        else:
            element.metadata.filetype = FILETYPE_TO_MIMETYPE[filetype]

    return elements
|
|
|
|
|
|
|
|
|
|
|
|
def file_and_type_from_url(
    url: str,
    content_type: Optional[str] = None,
    headers: Optional[dict[str, str]] = None,
    ssl_verify: bool = True,
    request_timeout: Optional[int] = None,
) -> tuple[io.BytesIO, Optional[FileType]]:
    """Download `url` and detect the file-type of the response body.

    Args:
        url: Address fetched with an HTTP GET request.
        content_type: MIME-type to use for detection. When omitted (or empty), the media-type
            of the response `Content-Type` header is used instead.
        headers: Extra request headers forwarded to `requests.get()`. `None` sends no extra
            headers.
        ssl_verify: Forwarded to `requests.get()` as `verify`.
        request_timeout: Forwarded to `requests.get()` as `timeout`.

    Returns:
        A pair of (in-memory file containing the response body, the value returned by
        `detect_filetype()` for that file).
    """
    response = requests.get(url, headers=headers, verify=ssl_verify, timeout=request_timeout)
    file = io.BytesIO(response.content)

    # -- Per RFC 9110 the Content-Type header may carry parameters after the media-type, e.g.
    # -- `text/html; charset=UTF-8`. Only the media-type part is a MIME-type; passing the whole
    # -- header value on would make detection fail for such responses.
    media_type, *parameters = (
        part.strip() for part in response.headers.get("Content-Type", "").split(";")
    )
    content_type = content_type or media_type.lower()

    # -- The character set is the `charset` parameter of Content-Type. The Content-Encoding
    # -- header names a content *coding* such as `gzip` or `br`, not a text encoding, so it
    # -- must not be used here.
    encoding = "utf-8"
    for parameter in parameters:
        name, _, value = parameter.partition("=")
        if name.strip().lower() == "charset":
            # -- parameter values may be quoted, e.g. `charset="utf-8"` --
            encoding = value.strip().strip("\"'") or encoding
            break

    filetype = detect_filetype(file=file, content_type=content_type, encoding=encoding)
    return file, filetype
|
2023-07-27 13:33:36 -04:00
|
|
|
|
|
|
|
|
|
|
|
def decide_table_extraction(
    filetype: Optional[FileType],
    skip_infer_table_types: list[str],
    pdf_infer_table_structure: bool,
) -> bool:
    """Decide whether table structure should be inferred for a document of `filetype`.

    The lower-cased name of `filetype` (or `None` when `filetype` is falsy) is looked up in
    `skip_infer_table_types`; a type that is not listed gets table extraction. For PDFs,
    `pdf_infer_table_structure` additionally enables extraction on its own.
    """
    type_name = filetype.name.lower() if filetype else None
    not_skipped = type_name not in skip_infer_table_types

    if type_name != "pdf":
        return not_skipped

    # -- Backwards compatibility: `pdf_infer_table_structure` is slated for removal in favor
    # -- of relying solely on `skip_infer_table_types` for every file type. Until that happens
    # -- a PDF consults `pdf_infer_table_structure` first and the skip-list second.
    return pdf_infer_table_structure or not_skipped
|