chore(file): remove dead code (#3645)

**Summary**
Remove dead code in `unstructured.file_utils`.

**Additional Context**
These modules were added in 12/2022 and 1/2023 and are not referenced by
any code. Removing to reduce unnecessary complexity. These can of course
be recovered from Git history if we decide we want them again in future.
This commit is contained in:
Steve Canny 2024-09-18 23:45:33 -07:00 committed by GitHub
parent 22998354db
commit cd074bb32b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 16 additions and 446 deletions

View File

@ -161,6 +161,7 @@ jobs:
runs-on: ubuntu-latest
env:
UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup, lint]
steps:
- uses: actions/checkout@v4

View File

@ -40,7 +40,7 @@ jobs:
# actions/checkout MUST come before auth
- uses: 'actions/checkout@v4'
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Get full Python version

View File

@ -1,4 +1,4 @@
## 0.15.13-dev1
## 0.15.13-dev2
### Enhancements

View File

@ -3,4 +3,13 @@
set -e
# $1 is the path chroma writes its contents to. The trailing "&" runs the
# server in the background so the calling script can continue.

# Diagnostics: show the active virtualenv and PATH and inspect the installed
# chroma entry-point script, to help debug environment-resolution issues in CI.
echo "Current venv is:"
echo "$VIRTUAL_ENV"
echo "Current path is:"
echo "$PATH"
ls -l "$VIRTUAL_ENV/bin/chroma"
echo "================"
cat "$VIRTUAL_ENV/bin/chroma"
echo "================"
# Launch chroma via the venv's interpreter explicitly so the correct Python is
# used regardless of how PATH resolves `chroma`. NOTE: the earlier bare
# `chroma run --path "$1" &` line was removed — launching the server twice
# would double-bind the port.
python "$VIRTUAL_ENV/bin/chroma" run --path "$1" &

View File

@ -1,97 +0,0 @@
import os
import pathlib
import pandas as pd
import pytest
from unstructured.file_utils import exploration
from unstructured.file_utils.model import FileType
DIRECTORY = pathlib.Path(__file__).parent.resolve()
is_in_docker = os.path.exists("/.dockerenv")
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_get_directory_file_info(tmpdir):
    """get_directory_file_info() walks a tree and reports one row per file."""
    # -- build a two-directory tree, each directory holding one small text file --
    root = os.path.join(tmpdir, "file_info_test")
    if not os.path.exists(root):
        os.mkdir(root)
    for dir_name, file_name in (("directory1", "filename1.txt"), ("directory2", "filename2.txt")):
        subdir = os.path.join(root, dir_name)
        if not os.path.exists(subdir):
            os.mkdir(subdir)
        with open(os.path.join(subdir, file_name), "w") as f:
            f.write("hello there!")

    file_info = exploration.get_directory_file_info(root)

    assert isinstance(file_info, pd.DataFrame)
    assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"}
    means = file_info.groupby("filetype").mean(numeric_only=True)
    assert means.columns.to_list() == ["filesize"]
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_get_file_info(tmpdir):
    """get_file_info() summarizes an explicit list of paths as a DataFrame."""
    # -- build a two-directory tree and collect the created file paths --
    root = os.path.join(tmpdir, "file_info_test")
    if not os.path.exists(root):
        os.mkdir(root)
    created_paths = []
    for dir_name, file_name in (("directory1", "filename1.txt"), ("directory2", "filename2.txt")):
        subdir = os.path.join(root, dir_name)
        if not os.path.exists(subdir):
            os.mkdir(subdir)
        path = os.path.join(subdir, file_name)
        with open(path, "w") as f:
            f.write("hello there!")
        created_paths.append(path)

    file_info = exploration.get_file_info(created_paths)

    assert isinstance(file_info, pd.DataFrame)
    assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"}
    means = file_info.groupby("filetype").mean(numeric_only=True)
    assert means.columns.to_list() == ["filesize"]
def test_get_file_info_from_file_contents():
    """Base64 file contents paired with a filename are typed correctly."""
    contents_path = os.path.join(DIRECTORY, "test-file-contents.txt")
    with open(contents_path) as f:
        contents = [f.read()]

    file_info = exploration.get_file_info_from_file_contents(
        file_contents=contents,
        filenames=["test.eml"],
    )

    assert file_info.filetype[0] == FileType.EML
# NOTE(review): test name has a typo ("no_equal" -> "not_equal"); kept to avoid
# churn in test selection/reporting.
def test_get_file_info_from_file_contents_raises_if_lists_no_equal():
    """Mismatched lengths of file_contents and filenames raise ValueError."""
    contents_path = os.path.join(DIRECTORY, "test-file-contents.txt")
    with open(contents_path) as f:
        contents = [f.read()]

    with pytest.raises(ValueError):
        exploration.get_file_info_from_file_contents(
            file_contents=contents,
            filenames=["test.eml", "test2.eml"],
        )

View File

@ -1,108 +0,0 @@
import datetime
import os
import pathlib
import docx
import openpyxl
import pytest
import unstructured.file_utils.metadata as meta
from test_unstructured.unit_utils import example_doc_path
DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_JPG_FILENAME = example_doc_path("img/example.jpg")
def test_get_docx_metadata_from_filename(tmpdir):
    """get_docx_metadata() reads core properties from a path on disk."""
    path = os.path.join(tmpdir, "test-doc.docx")
    doc = docx.Document()
    doc.add_paragraph("Lorem ipsum dolor sit amet.")
    doc.core_properties.author = "Mr. Miagi"
    doc.save(path)

    metadata = meta.get_docx_metadata(filename=path)

    assert metadata.author == "Mr. Miagi"
    assert metadata.to_dict()["author"] == "Mr. Miagi"
def test_get_docx_metadata_from_file(tmpdir):
    """get_docx_metadata() also accepts an open binary file object."""
    path = os.path.join(tmpdir, "test-doc.docx")
    doc = docx.Document()
    doc.add_paragraph("Lorem ipsum dolor sit amet.")
    doc.core_properties.author = "Mr. Miagi"
    doc.save(path)

    with open(path, "rb") as f:
        metadata = meta.get_docx_metadata(file=f)

    assert metadata.author == "Mr. Miagi"
def test_get_docx_metadata_raises_without_file_or_filename():
    """Calling with neither `filename` nor `file` is an error."""
    pytest.raises(FileNotFoundError, meta.get_docx_metadata)
def test_get_xlsx_metadata_from_filename(tmpdir):
    """get_xlsx_metadata() maps workbook `creator` to Metadata.author."""
    filename = os.path.join(tmpdir, "test-excel.xlsx")
    workbook = openpyxl.Workbook()
    workbook.properties.creator = "Mr. Miagi"
    workbook.save(filename)
    metadata = meta.get_xlsx_metadata(filename=filename)
    # Fix: the original ASSIGNED (`metadata.author = "Mr. Miagi"`) instead of
    # asserting, so the test verified nothing.
    assert metadata.author == "Mr. Miagi"
def test_get_xlsx_metadata_from_file(tmpdir):
    """get_xlsx_metadata() also accepts an open binary file object."""
    filename = os.path.join(tmpdir, "test-excel.xlsx")
    workbook = openpyxl.Workbook()
    workbook.properties.creator = "Mr. Miagi"
    workbook.save(filename)
    with open(filename, "rb") as f:
        metadata = meta.get_xlsx_metadata(file=f)
    # Fix: the original ASSIGNED (`metadata.author = "Mr. Miagi"`) instead of
    # asserting, so the test verified nothing.
    assert metadata.author == "Mr. Miagi"
def test_get_xlsx_metadata_raises_without_file_or_filename():
    """Calling with neither `filename` nor `file` is an error."""
    pytest.raises(FileNotFoundError, meta.get_xlsx_metadata)
def test_get_jpg_metadata_from_filename():
    """EXIF fields are surfaced when reading a JPG by path."""
    metadata = meta.get_jpg_metadata(filename=EXAMPLE_JPG_FILENAME)
    expected_modified = datetime.datetime(2003, 12, 14, 12, 1, 44)
    assert metadata.modified == expected_modified
    assert metadata.exif_data["Make"] == "Canon"
def test_get_jpg_metadata_from_file():
    """EXIF fields are surfaced when reading a JPG from a file object."""
    with open(EXAMPLE_JPG_FILENAME, "rb") as f:
        metadata = meta.get_jpg_metadata(file=f)
    expected_modified = datetime.datetime(2003, 12, 14, 12, 1, 44)
    assert metadata.modified == expected_modified
    assert metadata.exif_data["Make"] == "Canon"
def test_get_jpg_metadata_raises_without_file_or_filename():
    """Calling with neither `filename` nor `file` is an error."""
    pytest.raises(FileNotFoundError, meta.get_jpg_metadata)
def test_get_exif_datetime():
    """A well-formed EXIF timestamp parses to the matching datetime."""
    parsed = meta._get_exif_datetime(
        {"DateTime": "2022:12:23 15:49:00", "DateTimeOriginal": "2020:12:14 12:00:00"},
        "DateTime",
    )
    assert parsed == datetime.datetime(2022, 12, 23, 15, 49, 0)
def test_get_exif_datetime_ignores_bad_formats():
    """A timestamp not in the standard EXIF format yields None, not an error."""
    parsed = meta._get_exif_datetime(
        {"DateTime": "2022-12-23TZ15:49:00", "DateTimeOriginal": "2020:12:14 12:00:00"},
        "DateTime",
    )
    assert parsed is None
def test_get_exif_datetime_ignores_missing_key():
    """A key absent from the EXIF dict yields None rather than raising."""
    parsed = meta._get_exif_datetime(
        {"Datetime": "2022-12-23TZ15:49:00", "DateTimeOriginal": "2020:12:14 12:00:00"},
        "DateTimeDigitized",
    )
    assert parsed is None

View File

@ -38,7 +38,7 @@ trap cleanup EXIT
# which is incompatible with the bson installed from pypi. bson is installed as part of the
# astradb dependencies.
# ref: https://pymongo.readthedocs.io/en/stable/installation.html
pip uninstall -y bson pymongo
python -m pip uninstall -y bson pymongo
make install-ingest-mongodb
python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \

View File

@ -24,7 +24,7 @@ fi
# which is incompatible with the bson installed from pypi. bson is installed as part of the
# astradb dependencies.
# ref: https://pymongo.readthedocs.io/en/stable/installation.html
pip uninstall -y bson pymongo
python -m pip uninstall -y bson pymongo
make install-ingest-mongodb
PYTHONPATH=. ./unstructured/ingest/main.py \

View File

@ -1 +1 @@
__version__ = "0.15.13-dev1" # pragma: no cover
__version__ = "0.15.13-dev2" # pragma: no cover

View File

@ -1,76 +0,0 @@
import base64
import io
import os
from typing import Any, Dict, List, Optional
import pandas as pd
from unstructured.file_utils.filetype import detect_filetype
def get_directory_file_info(directory: str) -> pd.DataFrame:
    """Recursively walks a directory and extracts key file information to support initial
    exploration of text data sets. Returns a pandas DataFrame."""
    all_paths: List[str] = []
    for root, _, basenames in os.walk(directory):
        all_paths.extend(os.path.join(root, basename) for basename in basenames)
    return get_file_info(all_paths)
def get_file_info(filenames: List[str]) -> pd.DataFrame:
    """Returns a pandas DataFrame summarizing the filetypes for a list of files."""
    columns = ["filename", "path", "filesize", "extension", "filetype"]
    rows: List[Dict[str, Any]] = []
    for name in filenames:
        directory, basename = os.path.split(os.path.abspath(name))
        rows.append(
            {
                "filename": basename,
                "path": directory,
                "filesize": os.path.getsize(name),
                "extension": os.path.splitext(name)[1],
                "filetype": detect_filetype(name),
            }
        )
    # Pass `columns` explicitly so an empty input still yields all five columns.
    return pd.DataFrame(rows, columns=columns)
def get_file_info_from_file_contents(
    file_contents: List[str],
    filenames: Optional[List[str]] = None,
) -> pd.DataFrame:
    """Summarize filetype and size for a list of base64-encoded file contents.

    Each item of *file_contents* is a data-URL-style string; only the portion
    after the last comma is treated as base64 payload. When *filenames* is
    provided it must be the same length as *file_contents*, and a "filename"
    column is included in the result.

    Raises:
        ValueError: when *filenames* is provided with a different length than
            *file_contents*.
    """
    data: Dict[str, List[Any]] = {
        "filesize": [],
        "filetype": [],
    }
    if filenames:
        if len(filenames) != len(file_contents):
            raise ValueError(
                f"There are {len(filenames)} filenames and {len(file_contents)} "
                "file_contents. Both inputs must be the same length.",
            )
        data["filename"] = []
    for i, file_content in enumerate(file_contents):
        content_string = file_content.split(",")[-1]
        content_bytes = base64.b64decode(content_string)
        filetype = detect_filetype(file=io.BytesIO(content_bytes))
        # The decoded payload length IS the file size; the original seeked to
        # the end of a fresh BytesIO and called tell(), which is just len().
        data["filesize"].append(len(content_bytes))
        data["filetype"].append(filetype)
        if filenames:
            data["filename"].append(filenames[i])
    return pd.DataFrame(data)

View File

@ -1,159 +0,0 @@
import datetime
import io
from dataclasses import dataclass, field
from typing import IO, Any, Dict, Final, Optional
from unstructured.utils import requires_dependencies
# NOTE(robison) - ref: https://www.media.mit.edu/pia/Research/deepview/exif.html
EXIF_DATETIME_FMT: Final[str] = "%Y:%m:%d %H:%M:%S"
@dataclass
class Metadata:
    """Document metadata extracted from Office documents or JPG images.

    String fields default to "" and date fields to None when the source
    document does not supply a value.
    """

    author: str = ""
    category: str = ""
    comments: str = ""
    content_status: str = ""
    created: Optional[datetime.datetime] = None
    identifier: str = ""
    keywords: str = ""
    language: str = ""
    last_modified_by: str = ""
    last_printed: Optional[datetime.datetime] = None
    modified: Optional[datetime.datetime] = None
    # NOTE(review): typed Optional[int] but defaults to 0, not None — presumably
    # intentional; confirm before normalizing.
    revision: Optional[int] = 0
    subject: str = ""
    title: str = ""
    version: str = ""
    description: str = ""
    namespace: str = ""
    # NOTE(robinson) - Metadata for use with image files
    exif_data: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        # Returns the instance's own __dict__ — a live view, not a copy.
        return self.__dict__
@requires_dependencies("docx")
def get_docx_metadata(
filename: str = "",
file: Optional[IO[bytes]] = None,
) -> Metadata:
"""Extracts document metadata from a Microsoft .docx document."""
import docx
if filename:
doc = docx.Document(filename)
elif file:
doc = docx.Document(file)
else:
raise FileNotFoundError("No filename nor file were specified")
metadata = Metadata(
author=getattr(doc.core_properties, "author", ""),
category=getattr(doc.core_properties, "category", ""),
comments=getattr(doc.core_properties, "comments", ""),
content_status=getattr(doc.core_properties, "content_status", ""),
created=getattr(doc.core_properties, "created", None),
identifier=getattr(doc.core_properties, "identifier", ""),
keywords=getattr(doc.core_properties, "keywords", ""),
language=getattr(doc.core_properties, "language", ""),
last_modified_by=getattr(doc.core_properties, "last_modified_by", ""),
last_printed=getattr(doc.core_properties, "last_printed", None),
modified=getattr(doc.core_properties, "modified", None),
revision=getattr(doc.core_properties, "revision", None),
subject=getattr(doc.core_properties, "subject", ""),
title=getattr(doc.core_properties, "title", ""),
version=getattr(doc.core_properties, "version", ""),
)
return metadata
@requires_dependencies("openpyxl")
def get_xlsx_metadata(
filename: str = "",
file: Optional[IO[bytes]] = None,
) -> Metadata:
"""Extracts document metadata from a Microsoft .xlsx document."""
import openpyxl
if filename:
workbook = openpyxl.load_workbook(filename)
elif file:
workbook = openpyxl.load_workbook(file)
else:
raise FileNotFoundError("No filename nor file were specified")
metadata = Metadata(
author=getattr(workbook.properties, "creator", ""),
category=getattr(workbook.properties, "category", ""),
content_status=getattr(workbook.properties, "contentStatus", ""),
created=getattr(workbook.properties, "created", None),
description=getattr(workbook.properties, "description", ""),
identifier=getattr(workbook.properties, "identifier", ""),
keywords=getattr(workbook.properties, "keywords", ""),
language=getattr(workbook.properties, "language", ""),
last_modified_by=getattr(workbook.properties, "lastModifiedBy", ""),
last_printed=getattr(workbook.properties, "lastPrinted", None),
modified=getattr(workbook.properties, "modified", None),
namespace=getattr(workbook.properties, "namespace", ""),
revision=getattr(workbook.properties, "revision", None),
subject=getattr(workbook.properties, "subject", ""),
title=getattr(workbook.properties, "title", ""),
version=getattr(workbook.properties, "version", ""),
)
return metadata
@requires_dependencies("PIL")
def get_jpg_metadata(
filename: str = "",
file: Optional[IO[bytes]] = None,
) -> Metadata:
"""Extracts metadata from a JPG image, including EXIF metadata."""
from PIL import Image
from PIL.ExifTags import TAGS
if filename:
image = Image.open(filename)
elif file:
image = Image.open(io.BytesIO(file.read()))
else:
raise FileNotFoundError("No filename nor file were specified")
exif_data = image.getexif()
exif_dict: Dict[str, Any] = {}
for tag_id in exif_data:
tag = TAGS.get(tag_id, tag_id)
data = exif_data.get(tag_id)
exif_dict[tag] = data
metadata = Metadata(
author=exif_dict.get("Artist", ""),
comments=exif_dict.get("UserComment", ""),
created=_get_exif_datetime(exif_dict, "DateTimeOriginal"),
# NOTE(robinson) - Per EXIF docs, DateTime is the last modified data
# ref: https://www.media.mit.edu/pia/Research/deepview/exif.html
modified=_get_exif_datetime(exif_dict, "DateTime"),
exif_data=exif_dict,
)
return metadata
def _get_exif_datetime(exif_dict: Dict[str, Any], key: str) -> Optional[datetime.datetime]:
"""Converts a datetime string from the EXIF data to a Python datetime object."""
date = exif_dict.get(key)
if not date:
return None
try:
return datetime.datetime.strptime(date, EXIF_DATETIME_FMT)
# NOTE(robinson) - An exception could occur if the datetime is not formatted
# using the standard EXIF datetime format
except ValueError:
return None