Mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-11-02 02:53:31 +00:00
chore(file): remove dead code (#3645)
**Summary**
Remove dead code in `unstructured.file_utils`.

**Additional Context**
These modules were added in 12/2022 and 1/2023 and are not referenced by any other code. Removing them reduces unnecessary complexity; they can of course be recovered from Git history if we decide we want them again in the future.
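For orientation, here is a minimal sketch, assuming a local checkout with some documents on disk, of how the removed `unstructured.file_utils.exploration` helpers could be called before this commit (reconstructed from the deleted module and tests in the diff below; the `example-docs` path is illustrative):

```python
# Illustrative sketch only -- these helpers are deleted by this commit and cannot be
# imported after it. Behavior is reconstructed from the removed module shown below.
from unstructured.file_utils import exploration

# Walk a directory (hypothetical path) and summarize every file as a pandas DataFrame
# with columns: filename, path, filesize, extension, filetype.
file_info = exploration.get_directory_file_info("example-docs")

# Average file size per detected filetype, as exercised in the removed tests.
print(file_info.groupby("filetype").mean(numeric_only=True))
```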
parent 22998354db
commit cd074bb32b
@@ -161,6 +161,7 @@ jobs:
    runs-on: ubuntu-latest
    env:
      UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
      NLTK_DATA: ${{ github.workspace }}/nltk_data
    needs: [setup, lint]
    steps:
      - uses: actions/checkout@v4
@@ -40,7 +40,7 @@ jobs:
      # actions/checkout MUST come before auth
      - uses: 'actions/checkout@v4'
      - name: Set up Python ${{ env.PYTHON_VERSION }}
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Get full Python version
@@ -1,4 +1,4 @@
-## 0.15.13-dev1
+## 0.15.13-dev2

### Enhancements
@@ -3,4 +3,13 @@
set -e

# $1 is the path for chroma to write the contents to. The symbol "&" runs process in background
-chroma run --path "$1" &
+echo "Current venv is:"
+echo "$VIRTUAL_ENV"
+echo "Current path is:"
+echo "$PATH"
+ls -l "$VIRTUAL_ENV/bin/chroma"
+echo "================"
+cat "$VIRTUAL_ENV/bin/chroma"
+echo "================"
+# chroma run --path "$1" &
+python "$VIRTUAL_ENV/bin/chroma" run --path "$1" &
@@ -1,97 +0,0 @@
import os
import pathlib

import pandas as pd
import pytest

from unstructured.file_utils import exploration
from unstructured.file_utils.model import FileType

DIRECTORY = pathlib.Path(__file__).parent.resolve()


is_in_docker = os.path.exists("/.dockerenv")


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_get_directory_file_info(tmpdir):
    file_info_test = os.path.join(tmpdir, "file_info_test")
    if not os.path.exists(file_info_test):
        os.mkdir(file_info_test)

    directory1 = os.path.join(file_info_test, "directory1")
    if not os.path.exists(directory1):
        os.mkdir(directory1)

    filename1 = os.path.join(directory1, "filename1.txt")
    with open(filename1, "w") as f:
        f.write("hello there!")

    directory2 = os.path.join(file_info_test, "directory2")
    if not os.path.exists(directory2):
        os.mkdir(directory2)

    filename2 = os.path.join(directory2, "filename2.txt")
    with open(filename2, "w") as f:
        f.write("hello there!")

    file_info = exploration.get_directory_file_info(file_info_test)
    assert isinstance(file_info, pd.DataFrame)
    assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"}

    means = file_info.groupby("filetype").mean(numeric_only=True)
    assert means.columns.to_list() == ["filesize"]


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_get_file_info(tmpdir):
    file_info_test = os.path.join(tmpdir, "file_info_test")
    if not os.path.exists(file_info_test):
        os.mkdir(file_info_test)

    directory1 = os.path.join(file_info_test, "directory1")
    if not os.path.exists(directory1):
        os.mkdir(directory1)

    filename1 = os.path.join(directory1, "filename1.txt")
    with open(filename1, "w") as f:
        f.write("hello there!")

    directory2 = os.path.join(file_info_test, "directory2")
    if not os.path.exists(directory2):
        os.mkdir(directory2)

    filename2 = os.path.join(directory2, "filename2.txt")
    with open(filename2, "w") as f:
        f.write("hello there!")

    file_info = exploration.get_file_info([filename1, filename2])
    assert isinstance(file_info, pd.DataFrame)
    assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"}

    means = file_info.groupby("filetype").mean(numeric_only=True)
    assert means.columns.to_list() == ["filesize"]


def test_get_file_info_from_file_contents():
    file_contents_filename = os.path.join(DIRECTORY, "test-file-contents.txt")
    with open(file_contents_filename) as f:
        file_contents = [f.read()]

    file_info = exploration.get_file_info_from_file_contents(
        file_contents=file_contents,
        filenames=["test.eml"],
    )
    assert file_info.filetype[0] == FileType.EML


def test_get_file_info_from_file_contents_raises_if_lists_no_equal():
    file_contents_filename = os.path.join(DIRECTORY, "test-file-contents.txt")
    with open(file_contents_filename) as f:
        file_contents = [f.read()]

    with pytest.raises(ValueError):
        exploration.get_file_info_from_file_contents(
            file_contents=file_contents,
            filenames=["test.eml", "test2.eml"],
        )
@@ -1,108 +0,0 @@
import datetime
import os
import pathlib

import docx
import openpyxl
import pytest

import unstructured.file_utils.metadata as meta
from test_unstructured.unit_utils import example_doc_path

DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_JPG_FILENAME = example_doc_path("img/example.jpg")


def test_get_docx_metadata_from_filename(tmpdir):
    filename = os.path.join(tmpdir, "test-doc.docx")

    document = docx.Document()
    document.add_paragraph("Lorem ipsum dolor sit amet.")
    document.core_properties.author = "Mr. Miagi"
    document.save(filename)

    metadata = meta.get_docx_metadata(filename=filename)
    assert metadata.author == "Mr. Miagi"
    assert metadata.to_dict()["author"] == "Mr. Miagi"


def test_get_docx_metadata_from_file(tmpdir):
    filename = os.path.join(tmpdir, "test-doc.docx")

    document = docx.Document()
    document.add_paragraph("Lorem ipsum dolor sit amet.")
    document.core_properties.author = "Mr. Miagi"
    document.save(filename)

    with open(filename, "rb") as f:
        metadata = meta.get_docx_metadata(file=f)
        assert metadata.author == "Mr. Miagi"


def test_get_docx_metadata_raises_without_file_or_filename():
    with pytest.raises(FileNotFoundError):
        meta.get_docx_metadata()


def test_get_xlsx_metadata_from_filename(tmpdir):
    filename = os.path.join(tmpdir, "test-excel.xlsx")

    workbook = openpyxl.Workbook()
    workbook.properties.creator = "Mr. Miagi"
    workbook.save(filename)

    metadata = meta.get_xlsx_metadata(filename=filename)
    metadata.author = "Mr. Miagi"


def test_get_xlsx_metadata_from_file(tmpdir):
    filename = os.path.join(tmpdir, "test-excel.xlsx")

    workbook = openpyxl.Workbook()
    workbook.properties.creator = "Mr. Miagi"
    workbook.save(filename)

    with open(filename, "rb") as f:
        metadata = meta.get_xlsx_metadata(file=f)
        metadata.author = "Mr. Miagi"


def test_get_xlsx_metadata_raises_without_file_or_filename():
    with pytest.raises(FileNotFoundError):
        meta.get_xlsx_metadata()


def test_get_jpg_metadata_from_filename():
    metadata = meta.get_jpg_metadata(filename=EXAMPLE_JPG_FILENAME)
    assert metadata.modified == datetime.datetime(2003, 12, 14, 12, 1, 44)
    assert metadata.exif_data["Make"] == "Canon"


def test_get_jpg_metadata_from_file():
    with open(EXAMPLE_JPG_FILENAME, "rb") as f:
        metadata = meta.get_jpg_metadata(file=f)
        assert metadata.modified == datetime.datetime(2003, 12, 14, 12, 1, 44)
        assert metadata.exif_data["Make"] == "Canon"


def test_get_jpg_metadata_raises_without_file_or_filename():
    with pytest.raises(FileNotFoundError):
        meta.get_jpg_metadata()


def test_get_exif_datetime():
    exif_data = {"DateTime": "2022:12:23 15:49:00", "DateTimeOriginal": "2020:12:14 12:00:00"}
    date = meta._get_exif_datetime(exif_data, "DateTime")
    assert date == datetime.datetime(2022, 12, 23, 15, 49, 0)


def test_get_exif_datetime_ignores_bad_formats():
    exif_data = {"DateTime": "2022-12-23TZ15:49:00", "DateTimeOriginal": "2020:12:14 12:00:00"}
    date = meta._get_exif_datetime(exif_data, "DateTime")
    assert date is None


def test_get_exif_datetime_ignores_missing_key():
    exif_data = {"Datetime": "2022-12-23TZ15:49:00", "DateTimeOriginal": "2020:12:14 12:00:00"}
    date = meta._get_exif_datetime(exif_data, "DateTimeDigitized")
    assert date is None
@@ -38,7 +38,7 @@ trap cleanup EXIT
# which is incompatible with the bson installed from pypi. bson is installed as part of the
# astradb dependencies.
# ref: https://pymongo.readthedocs.io/en/stable/installation.html
-pip uninstall -y bson pymongo
+python -m pip uninstall -y bson pymongo
make install-ingest-mongodb

python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
@@ -24,7 +24,7 @@ fi
# which is incompatible with the bson installed from pypi. bson is installed as part of the
# astradb dependencies.
# ref: https://pymongo.readthedocs.io/en/stable/installation.html
-pip uninstall -y bson pymongo
+python -m pip uninstall -y bson pymongo
make install-ingest-mongodb

PYTHONPATH=. ./unstructured/ingest/main.py \
@@ -1 +1 @@
-__version__ = "0.15.13-dev1" # pragma: no cover
+__version__ = "0.15.13-dev2" # pragma: no cover
@@ -1,76 +0,0 @@
import base64
import io
import os
from typing import Any, Dict, List, Optional

import pandas as pd

from unstructured.file_utils.filetype import detect_filetype


def get_directory_file_info(directory: str) -> pd.DataFrame:
    """Recursively walks a directory and extracts key file information to support initial
    exploration of text data sets. Returns a pandas DataFrame."""
    filenames: List[str] = []
    for path, _, files in os.walk(directory):
        for filename_no_path in files:
            filenames.append(os.path.join(path, filename_no_path))
    return get_file_info(filenames)


def get_file_info(filenames: List[str]) -> pd.DataFrame:
    """Returns a pandas DataFrame summarizing the filetypes for a list of files."""
    data: Dict[str, List[Any]] = {
        "filename": [],
        "path": [],
        "filesize": [],
        "extension": [],
        "filetype": [],
    }

    for filename in filenames:
        path, filename_no_path = os.path.split(os.path.abspath(filename))
        _, extension = os.path.splitext(filename)
        filesize = os.path.getsize(filename)
        filetype = detect_filetype(filename)

        data["filename"].append(filename_no_path)
        data["path"].append(path)
        data["extension"].append(extension)
        data["filesize"].append(filesize)
        data["filetype"].append(filetype)

    return pd.DataFrame(data)


def get_file_info_from_file_contents(
    file_contents: List[str],
    filenames: Optional[List[str]] = None,
) -> pd.DataFrame:
    data: Dict[str, List[Any]] = {
        "filesize": [],
        "filetype": [],
    }

    if filenames:
        if len(filenames) != len(file_contents):
            raise ValueError(
                f"There are {len(filenames)} filenames and {len(file_contents)} "
                "file_contents. Both inputs must be the same length.",
            )
        data["filename"] = []

    for i, file_content in enumerate(file_contents):
        content_string = file_content.split(",")[-1]
        content_bytes = base64.b64decode(content_string)
        f = io.BytesIO(content_bytes)
        filetype = detect_filetype(file=f)
        f.seek(0, os.SEEK_END)
        filesize = f.tell()

        data["filesize"].append(filesize)
        data["filetype"].append(filetype)
        if filenames:
            data["filename"].append(filenames[i])

    return pd.DataFrame(data)
@@ -1,159 +0,0 @@
import datetime
import io
from dataclasses import dataclass, field
from typing import IO, Any, Dict, Final, Optional

from unstructured.utils import requires_dependencies

# NOTE(robison) - ref: https://www.media.mit.edu/pia/Research/deepview/exif.html
EXIF_DATETIME_FMT: Final[str] = "%Y:%m:%d %H:%M:%S"


@dataclass
class Metadata:
    author: str = ""
    category: str = ""
    comments: str = ""
    content_status: str = ""
    created: Optional[datetime.datetime] = None
    identifier: str = ""
    keywords: str = ""
    language: str = ""
    last_modified_by: str = ""
    last_printed: Optional[datetime.datetime] = None
    modified: Optional[datetime.datetime] = None
    revision: Optional[int] = 0
    subject: str = ""
    title: str = ""
    version: str = ""
    description: str = ""
    namespace: str = ""

    # NOTE(robinson) - Metadata for use with image files
    exif_data: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self):
        return self.__dict__


@requires_dependencies("docx")
def get_docx_metadata(
    filename: str = "",
    file: Optional[IO[bytes]] = None,
) -> Metadata:
    """Extracts document metadata from a Microsoft .docx document."""
    import docx

    if filename:
        doc = docx.Document(filename)
    elif file:
        doc = docx.Document(file)
    else:
        raise FileNotFoundError("No filename nor file were specified")

    metadata = Metadata(
        author=getattr(doc.core_properties, "author", ""),
        category=getattr(doc.core_properties, "category", ""),
        comments=getattr(doc.core_properties, "comments", ""),
        content_status=getattr(doc.core_properties, "content_status", ""),
        created=getattr(doc.core_properties, "created", None),
        identifier=getattr(doc.core_properties, "identifier", ""),
        keywords=getattr(doc.core_properties, "keywords", ""),
        language=getattr(doc.core_properties, "language", ""),
        last_modified_by=getattr(doc.core_properties, "last_modified_by", ""),
        last_printed=getattr(doc.core_properties, "last_printed", None),
        modified=getattr(doc.core_properties, "modified", None),
        revision=getattr(doc.core_properties, "revision", None),
        subject=getattr(doc.core_properties, "subject", ""),
        title=getattr(doc.core_properties, "title", ""),
        version=getattr(doc.core_properties, "version", ""),
    )

    return metadata


@requires_dependencies("openpyxl")
def get_xlsx_metadata(
    filename: str = "",
    file: Optional[IO[bytes]] = None,
) -> Metadata:
    """Extracts document metadata from a Microsoft .xlsx document."""
    import openpyxl

    if filename:
        workbook = openpyxl.load_workbook(filename)
    elif file:
        workbook = openpyxl.load_workbook(file)
    else:
        raise FileNotFoundError("No filename nor file were specified")

    metadata = Metadata(
        author=getattr(workbook.properties, "creator", ""),
        category=getattr(workbook.properties, "category", ""),
        content_status=getattr(workbook.properties, "contentStatus", ""),
        created=getattr(workbook.properties, "created", None),
        description=getattr(workbook.properties, "description", ""),
        identifier=getattr(workbook.properties, "identifier", ""),
        keywords=getattr(workbook.properties, "keywords", ""),
        language=getattr(workbook.properties, "language", ""),
        last_modified_by=getattr(workbook.properties, "lastModifiedBy", ""),
        last_printed=getattr(workbook.properties, "lastPrinted", None),
        modified=getattr(workbook.properties, "modified", None),
        namespace=getattr(workbook.properties, "namespace", ""),
        revision=getattr(workbook.properties, "revision", None),
        subject=getattr(workbook.properties, "subject", ""),
        title=getattr(workbook.properties, "title", ""),
        version=getattr(workbook.properties, "version", ""),
    )

    return metadata


@requires_dependencies("PIL")
def get_jpg_metadata(
    filename: str = "",
    file: Optional[IO[bytes]] = None,
) -> Metadata:
    """Extracts metadata from a JPG image, including EXIF metadata."""
    from PIL import Image
    from PIL.ExifTags import TAGS

    if filename:
        image = Image.open(filename)
    elif file:
        image = Image.open(io.BytesIO(file.read()))
    else:
        raise FileNotFoundError("No filename nor file were specified")

    exif_data = image.getexif()
    exif_dict: Dict[str, Any] = {}
    for tag_id in exif_data:
        tag = TAGS.get(tag_id, tag_id)
        data = exif_data.get(tag_id)
        exif_dict[tag] = data

    metadata = Metadata(
        author=exif_dict.get("Artist", ""),
        comments=exif_dict.get("UserComment", ""),
        created=_get_exif_datetime(exif_dict, "DateTimeOriginal"),
        # NOTE(robinson) - Per EXIF docs, DateTime is the last modified data
        # ref: https://www.media.mit.edu/pia/Research/deepview/exif.html
        modified=_get_exif_datetime(exif_dict, "DateTime"),
        exif_data=exif_dict,
    )

    return metadata


def _get_exif_datetime(exif_dict: Dict[str, Any], key: str) -> Optional[datetime.datetime]:
    """Converts a datetime string from the EXIF data to a Python datetime object."""
    date = exif_dict.get(key)
    if not date:
        return None

    try:
        return datetime.datetime.strptime(date, EXIF_DATETIME_FMT)
    # NOTE(robinson) - An exception could occur if the datetime is not formatted
    # using the standard EXIF datetime format
    except ValueError:
        return None
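Similarly, a minimal sketch, assuming a hypothetical `report.docx` on disk, of how the removed `unstructured.file_utils.metadata` helpers were exercised (reconstructed from the deleted module and tests above):

```python
# Illustrative sketch only -- get_docx_metadata is deleted by this commit.
# "report.docx" is a hypothetical file; field names come from the removed Metadata dataclass.
import unstructured.file_utils.metadata as meta

metadata = meta.get_docx_metadata(filename="report.docx")
print(metadata.author, metadata.created, metadata.modified)
print(metadata.to_dict())  # all dataclass fields as a plain dict
```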