mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-03 19:43:24 +00:00 
			
		
		
		
	feat: extract metadata from .docx, .xlsx, and .jpg (#113)
				
					
				
			* add python-docx dependency * added function for extracting metadata from word documents * add openpyxl * added get_jpg_metadata; fixed typing * bump changelog * added pillow to dependencies
This commit is contained in:
		
							parent
							
								
									e0a76effff
								
							
						
					
					
						commit
						b14f6ac9bd
					
				@ -1,9 +1,10 @@
 | 
			
		||||
## 0.3.5-dev2
 | 
			
		||||
## 0.3.5-dev3
 | 
			
		||||
 | 
			
		||||
* Add new pattern to recognize plain text dash bullets
 | 
			
		||||
* Add test for bullet patterns
 | 
			
		||||
* Fix for `partition_html` that allows for processing `div` tags that have both text and child
 | 
			
		||||
  elements
 | 
			
		||||
* Add ability to extract document metadata from `.docx`, `.xlsx`, and `.jpg` files.
 | 
			
		||||
 | 
			
		||||
## 0.3.4
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -1,5 +1,5 @@
 | 
			
		||||
#
 | 
			
		||||
# This file is autogenerated by pip-compile with python 3.8
 | 
			
		||||
# This file is autogenerated by pip-compile with python 3.10
 | 
			
		||||
# To update, run:
 | 
			
		||||
#
 | 
			
		||||
#    pip-compile requirements/build.in
 | 
			
		||||
@ -22,8 +22,6 @@ idna==3.4
 | 
			
		||||
    # via requests
 | 
			
		||||
imagesize==1.4.1
 | 
			
		||||
    # via sphinx
 | 
			
		||||
importlib-metadata==5.0.0
 | 
			
		||||
    # via sphinx
 | 
			
		||||
jinja2==3.1.2
 | 
			
		||||
    # via sphinx
 | 
			
		||||
markupsafe==2.1.1
 | 
			
		||||
@ -60,5 +58,3 @@ sphinxcontrib-serializinghtml==1.1.5
 | 
			
		||||
    # via sphinx
 | 
			
		||||
urllib3==1.26.12
 | 
			
		||||
    # via requests
 | 
			
		||||
zipp==3.10.0
 | 
			
		||||
    # via importlib-metadata
 | 
			
		||||
 | 
			
		||||
@ -234,3 +234,39 @@ for reading in a document with an XSLT stylesheet is as follows:
 | 
			
		||||
If you read from a stylesheet ``HTMLDocument`` will use the ``etree.XMLParser`` by default
 | 
			
		||||
instead of the ``etree.HTMLParser`` because ``HTMLDocument`` assumes you want to convert
 | 
			
		||||
your raw XML to HTML.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
##################################
 | 
			
		||||
Extracting Metadata from Documents
 | 
			
		||||
##################################
 | 
			
		||||
 | 
			
		||||
The ``unstructured`` library includes utilities for extracting metadata from
 | 
			
		||||
documents. Currently, there is support for extracting metadata from ``.docx``,
 | 
			
		||||
``.xlsx``, and ``.jpg`` documents. When you call these functions, the return type
 | 
			
		||||
is a ``Metadata`` data class that you can convert to a dictionary by calling the
 | 
			
		||||
``to_dict()`` method. If you extract metadata from a ``.jpg`` document, the output
 | 
			
		||||
will include EXIF metadata in the ``exif_data`` attribute, if it is available.
 | 
			
		||||
Here is an example of how to use the metadata extraction functionality:
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
.. code:: python
 | 
			
		||||
 | 
			
		||||
  from unstructured.file_utils.metadata import get_jpg_metadata
 | 
			
		||||
 | 
			
		||||
  filename = "example-docs/example.jpg"
 | 
			
		||||
  metadata = get_jpg_metadata(filename=filename)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
You can also pass in a file-like object with:
 | 
			
		||||
 | 
			
		||||
.. code:: python
 | 
			
		||||
 | 
			
		||||
  from unstructured.file_utils.metadata import get_jpg_metadata
 | 
			
		||||
 | 
			
		||||
  filename = "example-docs/example.jpg"
 | 
			
		||||
  with open(filename, "rb") as f:
 | 
			
		||||
      metadata = get_jpg_metadata(file=f)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
To extract metadata from ``.docx`` or ``.xlsx``, use ``get_docx_metadata`` and
 | 
			
		||||
``get_xlsx_metadata``. The interfaces are the same as ``get_jpg_metadata``.
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										
											BIN
										
									
								
								example-docs/example.jpg
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								example-docs/example.jpg
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| 
		 After Width: | Height: | Size: 32 KiB  | 
							
								
								
									
										
											BIN
										
									
								
								example-docs/fake-excel.xlsx
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								example-docs/fake-excel.xlsx
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								example-docs/fake.docx
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								example-docs/fake.docx
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							@ -1,5 +1,5 @@
 | 
			
		||||
#
 | 
			
		||||
# This file is autogenerated by pip-compile with python 3.8
 | 
			
		||||
# This file is autogenerated by pip-compile with python 3.10
 | 
			
		||||
# To update, run:
 | 
			
		||||
#
 | 
			
		||||
#    pip-compile --output-file=requirements/base.txt
 | 
			
		||||
@ -16,6 +16,8 @@ click==8.1.3
 | 
			
		||||
    # via nltk
 | 
			
		||||
deprecated==1.2.13
 | 
			
		||||
    # via argilla
 | 
			
		||||
et-xmlfile==1.1.0
 | 
			
		||||
    # via openpyxl
 | 
			
		||||
h11==0.9.0
 | 
			
		||||
    # via httpcore
 | 
			
		||||
httpcore==0.11.1
 | 
			
		||||
@ -27,7 +29,9 @@ idna==3.4
 | 
			
		||||
joblib==1.2.0
 | 
			
		||||
    # via nltk
 | 
			
		||||
lxml==4.9.1
 | 
			
		||||
    # via unstructured (setup.py)
 | 
			
		||||
    # via
 | 
			
		||||
    #   python-docx
 | 
			
		||||
    #   unstructured (setup.py)
 | 
			
		||||
monotonic==1.6
 | 
			
		||||
    # via argilla
 | 
			
		||||
nltk==3.7
 | 
			
		||||
@ -36,16 +40,22 @@ numpy==1.23.5
 | 
			
		||||
    # via
 | 
			
		||||
    #   argilla
 | 
			
		||||
    #   pandas
 | 
			
		||||
openpyxl==3.0.10
 | 
			
		||||
    # via unstructured (setup.py)
 | 
			
		||||
packaging==21.3
 | 
			
		||||
    # via argilla
 | 
			
		||||
pandas==1.5.2
 | 
			
		||||
    # via argilla
 | 
			
		||||
pillow==9.3.0
 | 
			
		||||
    # via unstructured (setup.py)
 | 
			
		||||
pydantic==1.10.2
 | 
			
		||||
    # via argilla
 | 
			
		||||
pyparsing==3.0.9
 | 
			
		||||
    # via packaging
 | 
			
		||||
python-dateutil==2.8.2
 | 
			
		||||
    # via pandas
 | 
			
		||||
python-docx==0.8.11
 | 
			
		||||
    # via unstructured (setup.py)
 | 
			
		||||
pytz==2022.6
 | 
			
		||||
    # via pandas
 | 
			
		||||
regex==2022.10.31
 | 
			
		||||
 | 
			
		||||
@ -1,5 +1,5 @@
 | 
			
		||||
#
 | 
			
		||||
# This file is autogenerated by pip-compile with python 3.8
 | 
			
		||||
# This file is autogenerated by pip-compile with python 3.10
 | 
			
		||||
# To update, run:
 | 
			
		||||
#
 | 
			
		||||
#    pip-compile requirements/build.in
 | 
			
		||||
@ -22,8 +22,6 @@ idna==3.4
 | 
			
		||||
    # via requests
 | 
			
		||||
imagesize==1.4.1
 | 
			
		||||
    # via sphinx
 | 
			
		||||
importlib-metadata==5.0.0
 | 
			
		||||
    # via sphinx
 | 
			
		||||
jinja2==3.1.2
 | 
			
		||||
    # via sphinx
 | 
			
		||||
markupsafe==2.1.1
 | 
			
		||||
@ -60,5 +58,3 @@ sphinxcontrib-serializinghtml==1.1.5
 | 
			
		||||
    # via sphinx
 | 
			
		||||
urllib3==1.26.12
 | 
			
		||||
    # via requests
 | 
			
		||||
zipp==3.10.0
 | 
			
		||||
    # via importlib-metadata
 | 
			
		||||
 | 
			
		||||
@ -1,5 +1,5 @@
 | 
			
		||||
#
 | 
			
		||||
# This file is autogenerated by pip-compile with python 3.8
 | 
			
		||||
# This file is autogenerated by pip-compile with python 3.10
 | 
			
		||||
# To update, run:
 | 
			
		||||
#
 | 
			
		||||
#    pip-compile requirements/dev.in
 | 
			
		||||
@ -40,10 +40,6 @@ executing==1.0.0
 | 
			
		||||
    # via stack-data
 | 
			
		||||
fastjsonschema==2.16.2
 | 
			
		||||
    # via nbformat
 | 
			
		||||
importlib-metadata==5.0.0
 | 
			
		||||
    # via nbconvert
 | 
			
		||||
importlib-resources==5.10.0
 | 
			
		||||
    # via jsonschema
 | 
			
		||||
ipykernel==6.15.3
 | 
			
		||||
    # via
 | 
			
		||||
    #   ipywidgets
 | 
			
		||||
@ -143,8 +139,6 @@ pickleshare==0.7.5
 | 
			
		||||
    # via ipython
 | 
			
		||||
pip-tools==6.10.0
 | 
			
		||||
    # via -r requirements/dev.in
 | 
			
		||||
pkgutil-resolve-name==1.3.10
 | 
			
		||||
    # via jsonschema
 | 
			
		||||
platformdirs==2.5.4
 | 
			
		||||
    # via jupyter-core
 | 
			
		||||
prometheus-client==0.14.1
 | 
			
		||||
@ -233,10 +227,6 @@ wheel==0.37.1
 | 
			
		||||
    # via pip-tools
 | 
			
		||||
widgetsnbextension==4.0.3
 | 
			
		||||
    # via ipywidgets
 | 
			
		||||
zipp==3.10.0
 | 
			
		||||
    # via
 | 
			
		||||
    #   importlib-metadata
 | 
			
		||||
    #   importlib-resources
 | 
			
		||||
 | 
			
		||||
# The following packages are considered to be unsafe in a requirements file:
 | 
			
		||||
# pip
 | 
			
		||||
 | 
			
		||||
@ -1,5 +1,5 @@
 | 
			
		||||
#
 | 
			
		||||
# This file is autogenerated by pip-compile with python 3.8
 | 
			
		||||
# This file is autogenerated by pip-compile with python 3.10
 | 
			
		||||
# To update, run:
 | 
			
		||||
#
 | 
			
		||||
#    pip-compile --extra=huggingface --output-file=requirements/huggingface.txt
 | 
			
		||||
@ -21,6 +21,8 @@ click==8.1.3
 | 
			
		||||
    #   sacremoses
 | 
			
		||||
deprecated==1.2.13
 | 
			
		||||
    # via argilla
 | 
			
		||||
et-xmlfile==1.1.0
 | 
			
		||||
    # via openpyxl
 | 
			
		||||
filelock==3.8.2
 | 
			
		||||
    # via
 | 
			
		||||
    #   huggingface-hub
 | 
			
		||||
@ -44,7 +46,9 @@ joblib==1.2.0
 | 
			
		||||
langdetect==1.0.9
 | 
			
		||||
    # via unstructured (setup.py)
 | 
			
		||||
lxml==4.9.1
 | 
			
		||||
    # via unstructured (setup.py)
 | 
			
		||||
    # via
 | 
			
		||||
    #   python-docx
 | 
			
		||||
    #   unstructured (setup.py)
 | 
			
		||||
monotonic==1.6
 | 
			
		||||
    # via argilla
 | 
			
		||||
nltk==3.7
 | 
			
		||||
@ -54,6 +58,8 @@ numpy==1.23.4
 | 
			
		||||
    #   argilla
 | 
			
		||||
    #   pandas
 | 
			
		||||
    #   transformers
 | 
			
		||||
openpyxl==3.0.10
 | 
			
		||||
    # via unstructured (setup.py)
 | 
			
		||||
packaging==21.3
 | 
			
		||||
    # via
 | 
			
		||||
    #   argilla
 | 
			
		||||
@ -61,12 +67,16 @@ packaging==21.3
 | 
			
		||||
    #   transformers
 | 
			
		||||
pandas==1.5.2
 | 
			
		||||
    # via argilla
 | 
			
		||||
pillow==9.3.0
 | 
			
		||||
    # via unstructured (setup.py)
 | 
			
		||||
pydantic==1.10.2
 | 
			
		||||
    # via argilla
 | 
			
		||||
pyparsing==3.0.9
 | 
			
		||||
    # via packaging
 | 
			
		||||
python-dateutil==2.8.2
 | 
			
		||||
    # via pandas
 | 
			
		||||
python-docx==0.8.11
 | 
			
		||||
    # via unstructured (setup.py)
 | 
			
		||||
pytz==2022.6
 | 
			
		||||
    # via pandas
 | 
			
		||||
pyyaml==6.0
 | 
			
		||||
 | 
			
		||||
@ -1,5 +1,5 @@
 | 
			
		||||
#
 | 
			
		||||
# This file is autogenerated by pip-compile with python 3.8
 | 
			
		||||
# This file is autogenerated by pip-compile with python 3.10
 | 
			
		||||
# To update, run:
 | 
			
		||||
#
 | 
			
		||||
#    pip-compile requirements/test.in
 | 
			
		||||
@ -80,7 +80,6 @@ tomli==2.0.1
 | 
			
		||||
    #   pytest
 | 
			
		||||
typing-extensions==4.3.0
 | 
			
		||||
    # via
 | 
			
		||||
    #   black
 | 
			
		||||
    #   mypy
 | 
			
		||||
    #   pydantic
 | 
			
		||||
urllib3==1.26.12
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										5
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										5
									
								
								setup.py
									
									
									
									
									
								
							@ -48,9 +48,12 @@ setup(
 | 
			
		||||
    version=__version__,
 | 
			
		||||
    entry_points={},
 | 
			
		||||
    install_requires=[
 | 
			
		||||
        "argilla",
 | 
			
		||||
        "lxml",
 | 
			
		||||
        "nltk",
 | 
			
		||||
        "argilla",
 | 
			
		||||
        "openpyxl",
 | 
			
		||||
        "pillow",
 | 
			
		||||
        "python-docx",
 | 
			
		||||
        # NOTE(robinson) - The following dependencies are pinned
 | 
			
		||||
        # to address security scans
 | 
			
		||||
        "certifi>=2022.12.07",
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										107
									
								
								test_unstructured/file_utils/test_metadata.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										107
									
								
								test_unstructured/file_utils/test_metadata.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,107 @@
 | 
			
		||||
import datetime
 | 
			
		||||
import os
 | 
			
		||||
import pathlib
 | 
			
		||||
import pytest
 | 
			
		||||
 | 
			
		||||
import docx
 | 
			
		||||
import openpyxl
 | 
			
		||||
 | 
			
		||||
import unstructured.file_utils.metadata as meta
 | 
			
		||||
 | 
			
		||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
 | 
			
		||||
EXAMPLE_JPG_FILENAME = os.path.join(DIRECTORY, "..", "..", "example-docs", "example.jpg")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_get_docx_metadata_from_filename(tmpdir):
    """Metadata read from a saved .docx path exposes the author written to it."""
    expected_author = "Mr. Miagi"

    # Build a minimal Word document that carries a known author.
    word_doc = docx.Document()
    word_doc.add_paragraph("Lorem ipsum dolor sit amet.")
    word_doc.core_properties.author = expected_author
    docx_path = os.path.join(tmpdir, "test-doc.docx")
    word_doc.save(docx_path)

    extracted = meta.get_docx_metadata(filename=docx_path)

    # The author must be visible both as an attribute and in the dict form.
    assert extracted.author == expected_author
    assert extracted.to_dict()["author"] == expected_author
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_get_docx_metadata_from_file(tmpdir):
    """Metadata read from an open binary .docx stream exposes the author."""
    # Build a minimal Word document that carries a known author.
    word_doc = docx.Document()
    word_doc.add_paragraph("Lorem ipsum dolor sit amet.")
    word_doc.core_properties.author = "Mr. Miagi"
    docx_path = os.path.join(tmpdir, "test-doc.docx")
    word_doc.save(docx_path)

    # Exercise the file-object code path instead of the filename one.
    with open(docx_path, "rb") as stream:
        extracted = meta.get_docx_metadata(file=stream)

    assert extracted.author == "Mr. Miagi"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_get_docx_metadata_raises_without_file_or_filename():
    """Calling the .docx extractor with no source raises FileNotFoundError."""
    expected_error = FileNotFoundError
    with pytest.raises(expected_error):
        meta.get_docx_metadata()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_get_xlsx_metadata_from_filename(tmpdir):
    """Metadata read from a saved .xlsx path maps the workbook creator to author."""
    filename = os.path.join(tmpdir, "test-excel.xlsx")

    # Build an empty workbook whose creator property is known.
    workbook = openpyxl.Workbook()
    workbook.properties.creator = "Mr. Miagi"
    workbook.save(filename)

    metadata = meta.get_xlsx_metadata(filename=filename)
    # Compare rather than assign, so a wrong author actually fails the test.
    assert metadata.author == "Mr. Miagi"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_get_xlsx_metadata_from_file(tmpdir):
    """Metadata read from an open binary .xlsx stream maps creator to author."""
    filename = os.path.join(tmpdir, "test-excel.xlsx")

    # Build an empty workbook whose creator property is known.
    workbook = openpyxl.Workbook()
    workbook.properties.creator = "Mr. Miagi"
    workbook.save(filename)

    # Exercise the file-object code path instead of the filename one.
    with open(filename, "rb") as f:
        metadata = meta.get_xlsx_metadata(file=f)
    # Compare rather than assign, so a wrong author actually fails the test.
    assert metadata.author == "Mr. Miagi"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_get_xlsx_metadata_raises_without_file_or_filename():
    """Calling the .xlsx extractor with no source raises FileNotFoundError."""
    expected_error = FileNotFoundError
    with pytest.raises(expected_error):
        meta.get_xlsx_metadata()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_get_jpg_metadata_from_filename():
    """The example JPG, read by path, yields its EXIF modified time and camera make."""
    extracted = meta.get_jpg_metadata(filename=EXAMPLE_JPG_FILENAME)

    expected_modified = datetime.datetime(2003, 12, 14, 12, 1, 44)
    assert extracted.modified == expected_modified
    assert extracted.exif_data["Make"] == "Canon"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_get_jpg_metadata_from_file():
    """The example JPG, read from an open stream, yields the same EXIF values."""
    # Exercise the file-object code path instead of the filename one.
    with open(EXAMPLE_JPG_FILENAME, "rb") as stream:
        extracted = meta.get_jpg_metadata(file=stream)

    expected_modified = datetime.datetime(2003, 12, 14, 12, 1, 44)
    assert extracted.modified == expected_modified
    assert extracted.exif_data["Make"] == "Canon"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_get_jpg_metadata_raises_without_file_or_filename():
    """Calling the .jpg extractor with no source raises FileNotFoundError."""
    expected_error = FileNotFoundError
    with pytest.raises(expected_error):
        meta.get_jpg_metadata()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_get_exif_datetime():
    """A well-formed EXIF timestamp under the requested key is parsed to a datetime."""
    tags = {"DateTime": "2022:12:23 15:49:00", "DateTimeOriginal": "2020:12:14 12:00:00"}

    parsed = meta._get_exif_datetime(tags, "DateTime")

    expected = datetime.datetime(2022, 12, 23, 15, 49, 0)
    assert parsed == expected
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_get_exif_datetime_ignores_bad_formats():
    """A timestamp that does not follow the EXIF layout yields None, not an error."""
    tags = {"DateTime": "2022-12-23TZ15:49:00", "DateTimeOriginal": "2020:12:14 12:00:00"}

    parsed = meta._get_exif_datetime(tags, "DateTime")

    assert parsed is None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_get_exif_datetime_ignores_missing_key():
    """Requesting a key that is absent from the EXIF dict yields None."""
    tags = {"Datetime": "2022-12-23TZ15:49:00", "DateTimeOriginal": "2020:12:14 12:00:00"}

    # "DateTimeDigitized" is not one of the keys supplied above.
    parsed = meta._get_exif_datetime(tags, "DateTimeDigitized")

    assert parsed is None
 | 
			
		||||
@ -1 +1 @@
 | 
			
		||||
__version__ = "0.3.5-dev2"  # pragma: no cover
 | 
			
		||||
__version__ = "0.3.5-dev3"  # pragma: no cover
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										0
									
								
								unstructured/file_utils/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								unstructured/file_utils/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										152
									
								
								unstructured/file_utils/metadata.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										152
									
								
								unstructured/file_utils/metadata.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,152 @@
 | 
			
		||||
from dataclasses import dataclass, field
 | 
			
		||||
import datetime
 | 
			
		||||
import io
 | 
			
		||||
from typing import Any, Dict, IO, Final, Optional
 | 
			
		||||
 | 
			
		||||
import docx
 | 
			
		||||
import openpyxl
 | 
			
		||||
from PIL import Image
 | 
			
		||||
from PIL.ExifTags import TAGS
 | 
			
		||||
 | 
			
		||||
# NOTE(robinson) - ref: https://www.media.mit.edu/pia/Research/deepview/exif.html
 | 
			
		||||
EXIF_DATETIME_FMT: Final[str] = "%Y:%m:%d %H:%M:%S"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
class Metadata:
    """Container for document metadata.

    Every field has a default, so an instance can be built with only the
    fields that are known for a given document.
    """

    author: str = ""
    category: str = ""
    comments: str = ""
    content_status: str = ""
    created: Optional[datetime.datetime] = None
    identifier: str = ""
    keywords: str = ""
    language: str = ""
    last_modified_by: str = ""
    last_printed: Optional[datetime.datetime] = None
    modified: Optional[datetime.datetime] = None
    revision: Optional[int] = 0
    subject: str = ""
    title: str = ""
    version: str = ""
    description: str = ""
    namespace: str = ""

    # NOTE(robinson) - Metadata for use with image files
    exif_data: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self):
        """Return the metadata as a new dictionary keyed by attribute name.

        The dictionary is a shallow copy of the instance attributes, so adding,
        removing, or rebinding keys in the result does not change this
        instance. Nested values (such as ``exif_data``) are the same objects
        the instance holds.
        """
        return dict(self.__dict__)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_docx_metadata(
    filename: str = "",
    file: Optional[IO] = None,
) -> Metadata:
    """Read the core properties of a Microsoft .docx document into a ``Metadata``.

    ``filename`` takes precedence when both arguments are supplied; otherwise
    ``file`` is handed to ``docx.Document``. Raises ``FileNotFoundError`` when
    neither argument is provided.
    """
    source = filename or file
    if not source:
        raise FileNotFoundError("No filename nor file were specified")

    properties = docx.Document(source).core_properties

    # Fields that fall back to an empty string when the property is missing.
    text_fields = (
        "author",
        "category",
        "comments",
        "content_status",
        "identifier",
        "keywords",
        "language",
        "last_modified_by",
        "subject",
        "title",
        "version",
    )
    # Fields that fall back to None when the property is missing.
    optional_fields = ("created", "last_printed", "modified", "revision")

    values = {name: getattr(properties, name, "") for name in text_fields}
    values.update({name: getattr(properties, name, None) for name in optional_fields})
    return Metadata(**values)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_xlsx_metadata(
    filename: str = "",
    file: Optional[IO] = None,
) -> Metadata:
    """Read the workbook properties of a Microsoft .xlsx document into a ``Metadata``.

    ``filename`` takes precedence when both arguments are supplied; otherwise
    ``file`` is handed to ``openpyxl.load_workbook``. Raises
    ``FileNotFoundError`` when neither argument is provided.
    """
    source = filename or file
    if not source:
        raise FileNotFoundError("No filename nor file were specified")

    properties = openpyxl.load_workbook(source).properties

    # Metadata field -> workbook property; these fall back to an empty string.
    text_fields = {
        "author": "creator",
        "category": "category",
        "content_status": "contentStatus",
        "description": "description",
        "identifier": "identifier",
        "keywords": "keywords",
        "language": "language",
        "last_modified_by": "lastModifiedBy",
        "namespace": "namespace",
        "subject": "subject",
        "title": "title",
        "version": "version",
    }
    # Metadata field -> workbook property; these fall back to None.
    optional_fields = {
        "created": "created",
        "last_printed": "lastPrinted",
        "modified": "modified",
        "revision": "revision",
    }

    values = {name: getattr(properties, attr, "") for name, attr in text_fields.items()}
    values.update(
        {name: getattr(properties, attr, None) for name, attr in optional_fields.items()}
    )
    return Metadata(**values)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_jpg_metadata(
    filename: str = "",
    file: Optional[IO] = None,
) -> Metadata:
    """Extracts metadata from a JPG image, including EXIF metadata.

    Args:
        filename: Path of the image to read. Takes precedence over ``file``.
        file: Binary file-like object holding the image. Its contents are read
            in full; the object itself is not closed here.

    Returns:
        A ``Metadata`` whose ``exif_data`` maps each EXIF tag name (or the raw
        tag id when ``TAGS`` has no name for it) to its value, with ``author``,
        ``comments``, ``created`` and ``modified`` taken from the EXIF tags.

    Raises:
        FileNotFoundError: If neither ``filename`` nor ``file`` is given.
    """
    if filename:
        image = Image.open(filename)
    elif file:
        image = Image.open(io.BytesIO(file.read()))
    else:
        raise FileNotFoundError("No filename nor file were specified")

    # The opened image holds on to its underlying file; the context manager
    # releases it once the EXIF tags have been copied into a plain dict.
    with image:
        exif_data = image.getexif()
        exif_dict: Dict[str, Any] = {
            TAGS.get(tag_id, tag_id): exif_data.get(tag_id) for tag_id in exif_data
        }

    metadata = Metadata(
        author=exif_dict.get("Artist", ""),
        comments=exif_dict.get("UserComment", ""),
        created=_get_exif_datetime(exif_dict, "DateTimeOriginal"),
        # NOTE(robinson) - Per EXIF docs, DateTime is the last modified date
        # ref: https://www.media.mit.edu/pia/Research/deepview/exif.html
        modified=_get_exif_datetime(exif_dict, "DateTime"),
        exif_data=exif_dict,
    )

    return metadata
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _get_exif_datetime(exif_dict: Dict[str, Any], key: str) -> Optional[datetime.datetime]:
    """Converts a datetime string from the EXIF data to a Python datetime object.

    Args:
        exif_dict: EXIF values keyed by tag name.
        key: Name of the tag holding the timestamp to convert.

    Returns:
        The parsed datetime, or ``None`` when the key is missing, its value is
        empty, or the value cannot be parsed with ``EXIF_DATETIME_FMT``.
    """
    date = exif_dict.get(key)
    if not date:
        return None

    try:
        return datetime.datetime.strptime(date, EXIF_DATETIME_FMT)
    # NOTE(robinson) - An exception could occur if the datetime is not formatted
    # using the standard EXIF datetime format. strptime raises TypeError rather
    # than ValueError when the value is not a string (e.g. bytes), so both are
    # treated as "no usable timestamp".
    except (ValueError, TypeError):
        return None
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user