mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-09 15:07:30 +00:00
feat: extract metadata from .docx, .xlsx, and .jpg (#113)
* add python-docx dependency * added function for extracting metadata from word documents * add openpyxl * added get_jpg_metadata; fixed typing * bump changelog * added pillow to dependencies
This commit is contained in:
parent
e0a76effff
commit
b14f6ac9bd
@ -1,9 +1,10 @@
|
|||||||
## 0.3.5-dev2
|
## 0.3.5-dev3
|
||||||
|
|
||||||
* Add new pattern to recognize plain text dash bullets
|
* Add new pattern to recognize plain text dash bullets
|
||||||
* Add test for bullet patterns
|
* Add test for bullet patterns
|
||||||
* Fix for `partition_html` that allows for processing `div` tags that have both text and child
|
* Fix for `partition_html` that allows for processing `div` tags that have both text and child
|
||||||
elements
|
elements
|
||||||
|
* Add ability to extract document metadata from `.docx`, `.xlsx`, and `.jpg` files.
|
||||||
|
|
||||||
## 0.3.4
|
## 0.3.4
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#
|
#
|
||||||
# This file is autogenerated by pip-compile with python 3.8
|
# This file is autogenerated by pip-compile with python 3.10
|
||||||
# To update, run:
|
# To update, run:
|
||||||
#
|
#
|
||||||
# pip-compile requirements/build.in
|
# pip-compile requirements/build.in
|
||||||
@ -22,8 +22,6 @@ idna==3.4
|
|||||||
# via requests
|
# via requests
|
||||||
imagesize==1.4.1
|
imagesize==1.4.1
|
||||||
# via sphinx
|
# via sphinx
|
||||||
importlib-metadata==5.0.0
|
|
||||||
# via sphinx
|
|
||||||
jinja2==3.1.2
|
jinja2==3.1.2
|
||||||
# via sphinx
|
# via sphinx
|
||||||
markupsafe==2.1.1
|
markupsafe==2.1.1
|
||||||
@ -60,5 +58,3 @@ sphinxcontrib-serializinghtml==1.1.5
|
|||||||
# via sphinx
|
# via sphinx
|
||||||
urllib3==1.26.12
|
urllib3==1.26.12
|
||||||
# via requests
|
# via requests
|
||||||
zipp==3.10.0
|
|
||||||
# via importlib-metadata
|
|
||||||
|
|||||||
@ -234,3 +234,39 @@ for reading in a document with an XSLT stylesheet is as follows:
|
|||||||
If you read from a stylesheet ``HTMLDocument`` will use the ``etree.XMLParser`` by default
|
If you read from a stylesheet ``HTMLDocument`` will use the ``etree.XMLParser`` by default
|
||||||
instead of the ``etree.HTMLParser`` because ``HTMLDocument`` assumes you want to convert
|
instead of the ``etree.HTMLParser`` because ``HTMLDocument`` assumes you want to convert
|
||||||
your raw XML to HTML.
|
your raw XML to HTML.
|
||||||
|
|
||||||
|
|
||||||
|
##################################
|
||||||
|
Extracting Metadata from Documents
|
||||||
|
##################################
|
||||||
|
|
||||||
|
The ``unstructured`` library includes utilities for extracting metadata from
|
||||||
|
documents. Currently, there is support for extracting metadata from ``.docx``,
|
||||||
|
``.xlsx``, and ``.jpg`` documents. When you call these functions, the return type
|
||||||
|
is a ``Metadata`` data class that you can convert to a dictionary by calling the
|
||||||
|
``to_dict()`` method. If you extract metadata from a ``.jpg`` document, the output
|
||||||
|
will include EXIF metadata in the ``exif_data`` attribute, if it is available.
|
||||||
|
Here is an example of how to use the metadata extraction functionality:
|
||||||
|
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.file_utils.metadata import get_jpg_metadata
|
||||||
|
|
||||||
|
filename = "example-docs/example.jpg"
|
||||||
|
metadata = get_jpg_metadata(filename=filename)
|
||||||
|
|
||||||
|
|
||||||
|
You can also pass in a file-like object with:
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.file_utils.metadata import get_jpg_metadata
|
||||||
|
|
||||||
|
filename = "example-docs/example.jpg"
|
||||||
|
with open(filename, "rb") as f:
|
||||||
|
metadata = get_jpg_metadata(file=f)
|
||||||
|
|
||||||
|
|
||||||
|
To extract metadata from ``.docx`` or ``.xlsx``, use ``get_docx_metadata`` and
|
||||||
|
``get_xlsx_metadata``. The interfaces are the same as ``get_jpg_metadata``.
|
||||||
|
|||||||
BIN
example-docs/example.jpg
Normal file
BIN
example-docs/example.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 32 KiB |
BIN
example-docs/fake-excel.xlsx
Normal file
BIN
example-docs/fake-excel.xlsx
Normal file
Binary file not shown.
BIN
example-docs/fake.docx
Normal file
BIN
example-docs/fake.docx
Normal file
Binary file not shown.
@ -1,5 +1,5 @@
|
|||||||
#
|
#
|
||||||
# This file is autogenerated by pip-compile with python 3.8
|
# This file is autogenerated by pip-compile with python 3.10
|
||||||
# To update, run:
|
# To update, run:
|
||||||
#
|
#
|
||||||
# pip-compile --output-file=requirements/base.txt
|
# pip-compile --output-file=requirements/base.txt
|
||||||
@ -16,6 +16,8 @@ click==8.1.3
|
|||||||
# via nltk
|
# via nltk
|
||||||
deprecated==1.2.13
|
deprecated==1.2.13
|
||||||
# via argilla
|
# via argilla
|
||||||
|
et-xmlfile==1.1.0
|
||||||
|
# via openpyxl
|
||||||
h11==0.9.0
|
h11==0.9.0
|
||||||
# via httpcore
|
# via httpcore
|
||||||
httpcore==0.11.1
|
httpcore==0.11.1
|
||||||
@ -27,7 +29,9 @@ idna==3.4
|
|||||||
joblib==1.2.0
|
joblib==1.2.0
|
||||||
# via nltk
|
# via nltk
|
||||||
lxml==4.9.1
|
lxml==4.9.1
|
||||||
# via unstructured (setup.py)
|
# via
|
||||||
|
# python-docx
|
||||||
|
# unstructured (setup.py)
|
||||||
monotonic==1.6
|
monotonic==1.6
|
||||||
# via argilla
|
# via argilla
|
||||||
nltk==3.7
|
nltk==3.7
|
||||||
@ -36,16 +40,22 @@ numpy==1.23.5
|
|||||||
# via
|
# via
|
||||||
# argilla
|
# argilla
|
||||||
# pandas
|
# pandas
|
||||||
|
openpyxl==3.0.10
|
||||||
|
# via unstructured (setup.py)
|
||||||
packaging==21.3
|
packaging==21.3
|
||||||
# via argilla
|
# via argilla
|
||||||
pandas==1.5.2
|
pandas==1.5.2
|
||||||
# via argilla
|
# via argilla
|
||||||
|
pillow==9.3.0
|
||||||
|
# via unstructured (setup.py)
|
||||||
pydantic==1.10.2
|
pydantic==1.10.2
|
||||||
# via argilla
|
# via argilla
|
||||||
pyparsing==3.0.9
|
pyparsing==3.0.9
|
||||||
# via packaging
|
# via packaging
|
||||||
python-dateutil==2.8.2
|
python-dateutil==2.8.2
|
||||||
# via pandas
|
# via pandas
|
||||||
|
python-docx==0.8.11
|
||||||
|
# via unstructured (setup.py)
|
||||||
pytz==2022.6
|
pytz==2022.6
|
||||||
# via pandas
|
# via pandas
|
||||||
regex==2022.10.31
|
regex==2022.10.31
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#
|
#
|
||||||
# This file is autogenerated by pip-compile with python 3.8
|
# This file is autogenerated by pip-compile with python 3.10
|
||||||
# To update, run:
|
# To update, run:
|
||||||
#
|
#
|
||||||
# pip-compile requirements/build.in
|
# pip-compile requirements/build.in
|
||||||
@ -22,8 +22,6 @@ idna==3.4
|
|||||||
# via requests
|
# via requests
|
||||||
imagesize==1.4.1
|
imagesize==1.4.1
|
||||||
# via sphinx
|
# via sphinx
|
||||||
importlib-metadata==5.0.0
|
|
||||||
# via sphinx
|
|
||||||
jinja2==3.1.2
|
jinja2==3.1.2
|
||||||
# via sphinx
|
# via sphinx
|
||||||
markupsafe==2.1.1
|
markupsafe==2.1.1
|
||||||
@ -60,5 +58,3 @@ sphinxcontrib-serializinghtml==1.1.5
|
|||||||
# via sphinx
|
# via sphinx
|
||||||
urllib3==1.26.12
|
urllib3==1.26.12
|
||||||
# via requests
|
# via requests
|
||||||
zipp==3.10.0
|
|
||||||
# via importlib-metadata
|
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#
|
#
|
||||||
# This file is autogenerated by pip-compile with python 3.8
|
# This file is autogenerated by pip-compile with python 3.10
|
||||||
# To update, run:
|
# To update, run:
|
||||||
#
|
#
|
||||||
# pip-compile requirements/dev.in
|
# pip-compile requirements/dev.in
|
||||||
@ -40,10 +40,6 @@ executing==1.0.0
|
|||||||
# via stack-data
|
# via stack-data
|
||||||
fastjsonschema==2.16.2
|
fastjsonschema==2.16.2
|
||||||
# via nbformat
|
# via nbformat
|
||||||
importlib-metadata==5.0.0
|
|
||||||
# via nbconvert
|
|
||||||
importlib-resources==5.10.0
|
|
||||||
# via jsonschema
|
|
||||||
ipykernel==6.15.3
|
ipykernel==6.15.3
|
||||||
# via
|
# via
|
||||||
# ipywidgets
|
# ipywidgets
|
||||||
@ -143,8 +139,6 @@ pickleshare==0.7.5
|
|||||||
# via ipython
|
# via ipython
|
||||||
pip-tools==6.10.0
|
pip-tools==6.10.0
|
||||||
# via -r requirements/dev.in
|
# via -r requirements/dev.in
|
||||||
pkgutil-resolve-name==1.3.10
|
|
||||||
# via jsonschema
|
|
||||||
platformdirs==2.5.4
|
platformdirs==2.5.4
|
||||||
# via jupyter-core
|
# via jupyter-core
|
||||||
prometheus-client==0.14.1
|
prometheus-client==0.14.1
|
||||||
@ -233,10 +227,6 @@ wheel==0.37.1
|
|||||||
# via pip-tools
|
# via pip-tools
|
||||||
widgetsnbextension==4.0.3
|
widgetsnbextension==4.0.3
|
||||||
# via ipywidgets
|
# via ipywidgets
|
||||||
zipp==3.10.0
|
|
||||||
# via
|
|
||||||
# importlib-metadata
|
|
||||||
# importlib-resources
|
|
||||||
|
|
||||||
# The following packages are considered to be unsafe in a requirements file:
|
# The following packages are considered to be unsafe in a requirements file:
|
||||||
# pip
|
# pip
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#
|
#
|
||||||
# This file is autogenerated by pip-compile with python 3.8
|
# This file is autogenerated by pip-compile with python 3.10
|
||||||
# To update, run:
|
# To update, run:
|
||||||
#
|
#
|
||||||
# pip-compile --extra=huggingface --output-file=requirements/huggingface.txt
|
# pip-compile --extra=huggingface --output-file=requirements/huggingface.txt
|
||||||
@ -21,6 +21,8 @@ click==8.1.3
|
|||||||
# sacremoses
|
# sacremoses
|
||||||
deprecated==1.2.13
|
deprecated==1.2.13
|
||||||
# via argilla
|
# via argilla
|
||||||
|
et-xmlfile==1.1.0
|
||||||
|
# via openpyxl
|
||||||
filelock==3.8.2
|
filelock==3.8.2
|
||||||
# via
|
# via
|
||||||
# huggingface-hub
|
# huggingface-hub
|
||||||
@ -44,7 +46,9 @@ joblib==1.2.0
|
|||||||
langdetect==1.0.9
|
langdetect==1.0.9
|
||||||
# via unstructured (setup.py)
|
# via unstructured (setup.py)
|
||||||
lxml==4.9.1
|
lxml==4.9.1
|
||||||
# via unstructured (setup.py)
|
# via
|
||||||
|
# python-docx
|
||||||
|
# unstructured (setup.py)
|
||||||
monotonic==1.6
|
monotonic==1.6
|
||||||
# via argilla
|
# via argilla
|
||||||
nltk==3.7
|
nltk==3.7
|
||||||
@ -54,6 +58,8 @@ numpy==1.23.4
|
|||||||
# argilla
|
# argilla
|
||||||
# pandas
|
# pandas
|
||||||
# transformers
|
# transformers
|
||||||
|
openpyxl==3.0.10
|
||||||
|
# via unstructured (setup.py)
|
||||||
packaging==21.3
|
packaging==21.3
|
||||||
# via
|
# via
|
||||||
# argilla
|
# argilla
|
||||||
@ -61,12 +67,16 @@ packaging==21.3
|
|||||||
# transformers
|
# transformers
|
||||||
pandas==1.5.2
|
pandas==1.5.2
|
||||||
# via argilla
|
# via argilla
|
||||||
|
pillow==9.3.0
|
||||||
|
# via unstructured (setup.py)
|
||||||
pydantic==1.10.2
|
pydantic==1.10.2
|
||||||
# via argilla
|
# via argilla
|
||||||
pyparsing==3.0.9
|
pyparsing==3.0.9
|
||||||
# via packaging
|
# via packaging
|
||||||
python-dateutil==2.8.2
|
python-dateutil==2.8.2
|
||||||
# via pandas
|
# via pandas
|
||||||
|
python-docx==0.8.11
|
||||||
|
# via unstructured (setup.py)
|
||||||
pytz==2022.6
|
pytz==2022.6
|
||||||
# via pandas
|
# via pandas
|
||||||
pyyaml==6.0
|
pyyaml==6.0
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#
|
#
|
||||||
# This file is autogenerated by pip-compile with python 3.8
|
# This file is autogenerated by pip-compile with python 3.10
|
||||||
# To update, run:
|
# To update, run:
|
||||||
#
|
#
|
||||||
# pip-compile requirements/test.in
|
# pip-compile requirements/test.in
|
||||||
@ -80,7 +80,6 @@ tomli==2.0.1
|
|||||||
# pytest
|
# pytest
|
||||||
typing-extensions==4.3.0
|
typing-extensions==4.3.0
|
||||||
# via
|
# via
|
||||||
# black
|
|
||||||
# mypy
|
# mypy
|
||||||
# pydantic
|
# pydantic
|
||||||
urllib3==1.26.12
|
urllib3==1.26.12
|
||||||
|
|||||||
5
setup.py
5
setup.py
@ -48,9 +48,12 @@ setup(
|
|||||||
version=__version__,
|
version=__version__,
|
||||||
entry_points={},
|
entry_points={},
|
||||||
install_requires=[
|
install_requires=[
|
||||||
|
"argilla",
|
||||||
"lxml",
|
"lxml",
|
||||||
"nltk",
|
"nltk",
|
||||||
"argilla",
|
"openpyxl",
|
||||||
|
"pillow",
|
||||||
|
"python-docx",
|
||||||
# NOTE(robinson) - The following dependencies are pinned
|
# NOTE(robinson) - The following dependencies are pinned
|
||||||
# to address security scans
|
# to address security scans
|
||||||
"certifi>=2022.12.07",
|
"certifi>=2022.12.07",
|
||||||
|
|||||||
107
test_unstructured/file_utils/test_metadata.py
Normal file
107
test_unstructured/file_utils/test_metadata.py
Normal file
@ -0,0 +1,107 @@
|
|||||||
|
import datetime
|
||||||
|
import os
|
||||||
|
import pathlib
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
import docx
|
||||||
|
import openpyxl
|
||||||
|
|
||||||
|
import unstructured.file_utils.metadata as meta
|
||||||
|
|
||||||
|
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||||
|
EXAMPLE_JPG_FILENAME = os.path.join(DIRECTORY, "..", "..", "example-docs", "example.jpg")
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_docx_metadata_from_filename(tmpdir):
    """The author stored in a saved .docx is returned when reading by path."""
    docx_path = os.path.join(tmpdir, "test-doc.docx")

    # Build a minimal Word document carrying a known author property.
    word_doc = docx.Document()
    word_doc.add_paragraph("Lorem ipsum dolor sit amet.")
    word_doc.core_properties.author = "Mr. Miagi"
    word_doc.save(docx_path)

    extracted = meta.get_docx_metadata(filename=docx_path)
    assert extracted.author == "Mr. Miagi"
    assert extracted.to_dict()["author"] == "Mr. Miagi"
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_docx_metadata_from_file(tmpdir):
    """The author stored in a saved .docx is returned when reading a file object."""
    docx_path = os.path.join(tmpdir, "test-doc.docx")

    # Build a minimal Word document carrying a known author property.
    word_doc = docx.Document()
    word_doc.add_paragraph("Lorem ipsum dolor sit amet.")
    word_doc.core_properties.author = "Mr. Miagi"
    word_doc.save(docx_path)

    with open(docx_path, "rb") as handle:
        extracted = meta.get_docx_metadata(file=handle)

    assert extracted.author == "Mr. Miagi"
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_docx_metadata_raises_without_file_or_filename():
    """Calling the .docx extractor with no source must raise FileNotFoundError."""
    with pytest.raises(FileNotFoundError):
        meta.get_docx_metadata()
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_xlsx_metadata_from_filename(tmpdir):
    """The creator stored in a saved .xlsx is exposed as ``author`` when reading by path."""
    filename = os.path.join(tmpdir, "test-excel.xlsx")

    workbook = openpyxl.Workbook()
    workbook.properties.creator = "Mr. Miagi"
    workbook.save(filename)

    metadata = meta.get_xlsx_metadata(filename=filename)
    # Compare rather than assign: an assignment here would make the test pass
    # no matter what the extractor returned.
    assert metadata.author == "Mr. Miagi"
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_xlsx_metadata_from_file(tmpdir):
    """The creator stored in a saved .xlsx is exposed as ``author`` when reading a file object."""
    filename = os.path.join(tmpdir, "test-excel.xlsx")

    workbook = openpyxl.Workbook()
    workbook.properties.creator = "Mr. Miagi"
    workbook.save(filename)

    with open(filename, "rb") as f:
        metadata = meta.get_xlsx_metadata(file=f)

    # Compare rather than assign: an assignment here would make the test pass
    # no matter what the extractor returned.
    assert metadata.author == "Mr. Miagi"
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_xlsx_metadata_raises_without_file_or_filename():
    """Calling the .xlsx extractor with no source must raise FileNotFoundError."""
    with pytest.raises(FileNotFoundError):
        meta.get_xlsx_metadata()
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_jpg_metadata_from_filename():
    """The example JPG yields its EXIF modification time and camera make by path."""
    expected_modified = datetime.datetime(2003, 12, 14, 12, 1, 44)

    extracted = meta.get_jpg_metadata(filename=EXAMPLE_JPG_FILENAME)

    assert extracted.modified == expected_modified
    assert extracted.exif_data["Make"] == "Canon"
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_jpg_metadata_from_file():
    """The example JPG yields its EXIF modification time and camera make via a file object."""
    expected_modified = datetime.datetime(2003, 12, 14, 12, 1, 44)

    with open(EXAMPLE_JPG_FILENAME, "rb") as handle:
        extracted = meta.get_jpg_metadata(file=handle)

    assert extracted.modified == expected_modified
    assert extracted.exif_data["Make"] == "Canon"
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_jpg_metadata_raises_without_file_or_filename():
    """Calling the JPG extractor with no source must raise FileNotFoundError."""
    with pytest.raises(FileNotFoundError):
        meta.get_jpg_metadata()
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_exif_datetime():
    """A well-formed EXIF timestamp is parsed for the requested key only."""
    tags = {"DateTime": "2022:12:23 15:49:00", "DateTimeOriginal": "2020:12:14 12:00:00"}

    parsed = meta._get_exif_datetime(tags, "DateTime")

    assert parsed == datetime.datetime(2022, 12, 23, 15, 49, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_exif_datetime_ignores_bad_formats():
    """A timestamp that does not follow the EXIF layout yields None instead of raising."""
    tags = {"DateTime": "2022-12-23TZ15:49:00", "DateTimeOriginal": "2020:12:14 12:00:00"}

    parsed = meta._get_exif_datetime(tags, "DateTime")

    assert parsed is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_exif_datetime_ignores_missing_key():
    """Requesting a key that is absent from the EXIF mapping yields None."""
    tags = {"Datetime": "2022-12-23TZ15:49:00", "DateTimeOriginal": "2020:12:14 12:00:00"}

    parsed = meta._get_exif_datetime(tags, "DateTimeDigitized")

    assert parsed is None
|
||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.3.5-dev2" # pragma: no cover
|
__version__ = "0.3.5-dev3" # pragma: no cover
|
||||||
|
|||||||
0
unstructured/file_utils/__init__.py
Normal file
0
unstructured/file_utils/__init__.py
Normal file
152
unstructured/file_utils/metadata.py
Normal file
152
unstructured/file_utils/metadata.py
Normal file
@ -0,0 +1,152 @@
|
|||||||
|
from dataclasses import dataclass, field
|
||||||
|
import datetime
|
||||||
|
import io
|
||||||
|
from typing import Any, Dict, IO, Final, Optional
|
||||||
|
|
||||||
|
import docx
|
||||||
|
import openpyxl
|
||||||
|
from PIL import Image
|
||||||
|
from PIL.ExifTags import TAGS
|
||||||
|
|
||||||
|
# NOTE(robinson) - ref: https://www.media.mit.edu/pia/Research/deepview/exif.html
|
||||||
|
EXIF_DATETIME_FMT: Final[str] = "%Y:%m:%d %H:%M:%S"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Metadata:
    """Document-level metadata returned by the ``get_*_metadata`` extractors.

    Every field has a default, so an extractor fills in only the values its
    file format provides and leaves the rest at their defaults.
    """

    author: str = ""
    category: str = ""
    comments: str = ""
    content_status: str = ""
    created: Optional[datetime.datetime] = None
    identifier: str = ""
    keywords: str = ""
    language: str = ""
    last_modified_by: str = ""
    last_printed: Optional[datetime.datetime] = None
    modified: Optional[datetime.datetime] = None
    revision: Optional[int] = 0
    subject: str = ""
    title: str = ""
    version: str = ""
    description: str = ""
    namespace: str = ""

    # NOTE(robinson) - Metadata for use with image files
    exif_data: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return the fields as a new dictionary keyed by field name.

        The result is a shallow copy, so adding, removing, or reassigning keys
        on it does not alter this instance. Nested values such as ``exif_data``
        are still shared with the instance.
        """
        return dict(self.__dict__)
|
||||||
|
|
||||||
|
|
||||||
|
def get_docx_metadata(
    filename: str = "",
    file: Optional[IO] = None,
) -> Metadata:
    """Reads the core properties of a Microsoft .docx document into a Metadata object.

    Pass either ``filename`` (a path) or ``file`` (a file-like object). When both
    are given, ``filename`` is used. Raises FileNotFoundError when neither is given.
    """
    source = filename or file
    if not source:
        raise FileNotFoundError("No filename nor file were specified")

    props = docx.Document(source).core_properties

    # getattr with a default keeps extraction working if a property is not exposed.
    return Metadata(
        author=getattr(props, "author", ""),
        category=getattr(props, "category", ""),
        comments=getattr(props, "comments", ""),
        content_status=getattr(props, "content_status", ""),
        created=getattr(props, "created", None),
        identifier=getattr(props, "identifier", ""),
        keywords=getattr(props, "keywords", ""),
        language=getattr(props, "language", ""),
        last_modified_by=getattr(props, "last_modified_by", ""),
        last_printed=getattr(props, "last_printed", None),
        modified=getattr(props, "modified", None),
        revision=getattr(props, "revision", None),
        subject=getattr(props, "subject", ""),
        title=getattr(props, "title", ""),
        version=getattr(props, "version", ""),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def get_xlsx_metadata(
    filename: str = "",
    file: Optional[IO] = None,
) -> Metadata:
    """Reads the workbook properties of a Microsoft .xlsx document into a Metadata object.

    Pass either ``filename`` (a path) or ``file`` (a file-like object). When both
    are given, ``filename`` is used. Raises FileNotFoundError when neither is given.
    """
    source = filename or file
    if not source:
        raise FileNotFoundError("No filename nor file were specified")

    props = openpyxl.load_workbook(source).properties

    # openpyxl uses camelCase property names (and ``creator`` for the author);
    # they are mapped onto the snake_case Metadata fields here.
    return Metadata(
        author=getattr(props, "creator", ""),
        category=getattr(props, "category", ""),
        content_status=getattr(props, "contentStatus", ""),
        created=getattr(props, "created", None),
        description=getattr(props, "description", ""),
        identifier=getattr(props, "identifier", ""),
        keywords=getattr(props, "keywords", ""),
        language=getattr(props, "language", ""),
        last_modified_by=getattr(props, "lastModifiedBy", ""),
        last_printed=getattr(props, "lastPrinted", None),
        modified=getattr(props, "modified", None),
        namespace=getattr(props, "namespace", ""),
        revision=getattr(props, "revision", None),
        subject=getattr(props, "subject", ""),
        title=getattr(props, "title", ""),
        version=getattr(props, "version", ""),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def get_jpg_metadata(
    filename: str = "",
    file: Optional[IO] = None,
) -> Metadata:
    """Extracts metadata from a JPG image, including EXIF metadata.

    Pass either ``filename`` (a path) or ``file`` (a file-like object). When both
    are given, ``filename`` is used. Raises FileNotFoundError when neither is given.

    The returned Metadata carries every EXIF tag in ``exif_data``, keyed by the
    tag name from ``PIL.ExifTags.TAGS`` or by the raw tag id when the id is not
    in that table.
    """
    if filename:
        image = Image.open(filename)
    elif file:
        # The image is parsed from an in-memory copy of the stream's contents.
        image = Image.open(io.BytesIO(file.read()))
    else:
        raise FileNotFoundError("No filename nor file were specified")

    # Using the image as a context manager releases the file handle Pillow
    # opened for a path instead of leaving it open until garbage collection.
    # The EXIF values are copied into a plain dict before the image is closed.
    with image:
        exif_data = image.getexif()
        exif_dict: Dict[str, Any] = {
            TAGS.get(tag_id, tag_id): exif_data.get(tag_id) for tag_id in exif_data
        }

    metadata = Metadata(
        author=exif_dict.get("Artist", ""),
        comments=exif_dict.get("UserComment", ""),
        created=_get_exif_datetime(exif_dict, "DateTimeOriginal"),
        # NOTE(robinson) - Per EXIF docs, DateTime is the last modified date
        # ref: https://www.media.mit.edu/pia/Research/deepview/exif.html
        modified=_get_exif_datetime(exif_dict, "DateTime"),
        exif_data=exif_dict,
    )

    return metadata
|
||||||
|
|
||||||
|
|
||||||
|
def _get_exif_datetime(exif_dict: Dict[str, Any], key: str) -> Optional[datetime.datetime]:
|
||||||
|
"""Converts a datetime string from the EXIF data to a Python datetime object."""
|
||||||
|
date = exif_dict.get(key)
|
||||||
|
if not date:
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
return datetime.datetime.strptime(date, EXIF_DATETIME_FMT)
|
||||||
|
# NOTE(robinson) - An exception could occur if the datetime is not formatted
|
||||||
|
# using the standard EXIF datetime format
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
Loading…
x
Reference in New Issue
Block a user