mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-26 17:02:31 +00:00
fix: isolate metadata imports to doctype (#1671)
In a different PR, some no-extras tests started failing with import errors when something innocuous was imported from `unstructured.file_utils.metadata`. This turned out to be because of the top-level, doctype-specific imports in that file. Importing a general metadata object shouldn't require installation of modules like `PIL`, `docx`, and `openpyxl`. To fix, I moved these imports inside the functions that use them, and added the `requires_dependencies` decorator to those functions. #### Testing: You should be able to run something like: ```python from unstructured.file_utils.metadata import Metadata ``` without `openpyxl` installed.
This commit is contained in:
parent
6b7fe4469f
commit
b9fa20ab46
@ -1,8 +1,8 @@
|
|||||||
## 0.10.20-dev4
|
## 0.10.20-dev5
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
* **Align to top left when shrinking bounding boxes for `xy-curt` sorting:** Update `shrink_bbox()` to keep top left rather than center
|
* **Align to top left when shrinking bounding boxes for `xy-cut` sorting:** Update `shrink_bbox()` to keep top left rather than center.
|
||||||
* **Add visualization script to annotate elements** This script is often used to analyze/visualize elements with coordinates (e.g. partition_pdf()).
|
* **Add visualization script to annotate elements** This script is often used to analyze/visualize elements with coordinates (e.g. partition_pdf()).
|
||||||
* **Adds data source properties to the Jira connector** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc.
|
* **Adds data source properties to the Jira connector** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc.
|
||||||
* **Improve title detection in pptx documents** The default title textboxes on a pptx slide are now categorized as titles.
|
* **Improve title detection in pptx documents** The default title textboxes on a pptx slide are now categorized as titles.
|
||||||
@ -15,6 +15,7 @@ setting UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true is needed.
|
|||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
|
* **Prevent the `metadata` module from importing unnecessary dependencies** Problem: The `metadata` module had several top-level imports that were only used in and applicable to code related to specific document types, even though the module also contains many general-purpose functions. As a result, general-purpose functions couldn't be used without unnecessary dependencies being installed. Fix: moved 3rd party dependency top-level imports to inside the functions in which they are used and applied a decorator to check that the dependency is installed and emit a helpful error message if not.
|
||||||
* **Fixes category_depth None value for Title elements** Problem: `Title` elements from `chipper` get `category_depth`= None even when `Headline` and/or `Subheadline` elements are present in the same page. Fix: all `Title` elements with `category_depth` = None should be set to have a depth of 0 instead iff there are `Headline` and/or `Subheadline` element-types present. Importance: `Title` elements should be equivalent to HTML `H1` when nested headings are present; otherwise, `category_depth` metadata can be ambiguous within elements in a page.
|
* **Fixes category_depth None value for Title elements** Problem: `Title` elements from `chipper` get `category_depth`= None even when `Headline` and/or `Subheadline` elements are present in the same page. Fix: all `Title` elements with `category_depth` = None should be set to have a depth of 0 instead iff there are `Headline` and/or `Subheadline` element-types present. Importance: `Title` elements should be equivalent to HTML `H1` when nested headings are present; otherwise, `category_depth` metadata can be ambiguous within elements in a page.
|
||||||
* **Tweak `xy-cut` ordering output to be more column friendly** This results in the order of elements more closely reflecting natural reading order which benefits downstream applications. While element ordering from `xy-cut` is usually mostly correct when ordering multi-column documents, sometimes elements from a RHS column will appear before elements in a LHS column. Fix: add swapped `xy-cut` ordering by sorting by X coordinate first and then Y coordinate.
|
* **Tweak `xy-cut` ordering output to be more column friendly** This results in the order of elements more closely reflecting natural reading order which benefits downstream applications. While element ordering from `xy-cut` is usually mostly correct when ordering multi-column documents, sometimes elements from a RHS column will appear before elements in a LHS column. Fix: add swapped `xy-cut` ordering by sorting by X coordinate first and then Y coordinate.
|
||||||
* **Fixes badly initialized Formula** Problem: YoloX contain new types of elements, when loading a document that contain formulas a new element of that class
|
* **Fixes badly initialized Formula** Problem: YoloX contain new types of elements, when loading a document that contain formulas a new element of that class
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.10.20-dev4" # pragma: no cover
|
__version__ = "0.10.20-dev5" # pragma: no cover
|
||||||
|
@ -3,10 +3,7 @@ import io
|
|||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import IO, Any, Dict, Final, Optional
|
from typing import IO, Any, Dict, Final, Optional
|
||||||
|
|
||||||
import docx
|
from unstructured.utils import requires_dependencies
|
||||||
import openpyxl
|
|
||||||
from PIL import Image
|
|
||||||
from PIL.ExifTags import TAGS
|
|
||||||
|
|
||||||
# NOTE(robison) - ref: https://www.media.mit.edu/pia/Research/deepview/exif.html
|
# NOTE(robison) - ref: https://www.media.mit.edu/pia/Research/deepview/exif.html
|
||||||
EXIF_DATETIME_FMT: Final[str] = "%Y:%m:%d %H:%M:%S"
|
EXIF_DATETIME_FMT: Final[str] = "%Y:%m:%d %H:%M:%S"
|
||||||
@ -39,11 +36,14 @@ class Metadata:
|
|||||||
return self.__dict__
|
return self.__dict__
|
||||||
|
|
||||||
|
|
||||||
|
@requires_dependencies("docx")
|
||||||
def get_docx_metadata(
|
def get_docx_metadata(
|
||||||
filename: str = "",
|
filename: str = "",
|
||||||
file: Optional[IO[bytes]] = None,
|
file: Optional[IO[bytes]] = None,
|
||||||
) -> Metadata:
|
) -> Metadata:
|
||||||
"""Extracts document metadata from a Microsoft .docx document."""
|
"""Extracts document metadata from a Microsoft .docx document."""
|
||||||
|
import docx
|
||||||
|
|
||||||
if filename:
|
if filename:
|
||||||
doc = docx.Document(filename)
|
doc = docx.Document(filename)
|
||||||
elif file:
|
elif file:
|
||||||
@ -72,11 +72,14 @@ def get_docx_metadata(
|
|||||||
return metadata
|
return metadata
|
||||||
|
|
||||||
|
|
||||||
|
@requires_dependencies("openpyxl")
|
||||||
def get_xlsx_metadata(
|
def get_xlsx_metadata(
|
||||||
filename: str = "",
|
filename: str = "",
|
||||||
file: Optional[IO[bytes]] = None,
|
file: Optional[IO[bytes]] = None,
|
||||||
) -> Metadata:
|
) -> Metadata:
|
||||||
"""Extracts document metadata from a Microsoft .xlsx document."""
|
"""Extracts document metadata from a Microsoft .xlsx document."""
|
||||||
|
import openpyxl
|
||||||
|
|
||||||
if filename:
|
if filename:
|
||||||
workbook = openpyxl.load_workbook(filename)
|
workbook = openpyxl.load_workbook(filename)
|
||||||
elif file:
|
elif file:
|
||||||
@ -106,11 +109,15 @@ def get_xlsx_metadata(
|
|||||||
return metadata
|
return metadata
|
||||||
|
|
||||||
|
|
||||||
|
@requires_dependencies("PIL")
|
||||||
def get_jpg_metadata(
|
def get_jpg_metadata(
|
||||||
filename: str = "",
|
filename: str = "",
|
||||||
file: Optional[IO[bytes]] = None,
|
file: Optional[IO[bytes]] = None,
|
||||||
) -> Metadata:
|
) -> Metadata:
|
||||||
"""Extracts metadata from a JPG image, including EXIF metadata."""
|
"""Extracts metadata from a JPG image, including EXIF metadata."""
|
||||||
|
from PIL import Image
|
||||||
|
from PIL.ExifTags import TAGS
|
||||||
|
|
||||||
if filename:
|
if filename:
|
||||||
image = Image.open(filename)
|
image = Image.open(filename)
|
||||||
elif file:
|
elif file:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user