diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cb91257b4..142578885 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -161,6 +161,7 @@ jobs:
     runs-on: ubuntu-latest
     env:
       UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      NLTK_DATA: ${{ github.workspace }}/nltk_data
     needs: [setup, lint]
     steps:
       - uses: actions/checkout@v4
diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml
index 1bb2d8182..d22a5aab9 100644
--- a/.github/workflows/ingest-test-fixtures-update-pr.yml
+++ b/.github/workflows/ingest-test-fixtures-update-pr.yml
@@ -40,7 +40,7 @@ jobs:
       # actions/checkout MUST come before auth
       - uses: 'actions/checkout@v4'
       - name: Set up Python ${{ env.PYTHON_VERSION }}
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Get full Python version
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e9f7bf990..2ab5534c0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.15.13-dev1
+## 0.15.13-dev2
 
 ### Enhancements
 
diff --git a/scripts/chroma-test-helpers/create-and-check-chroma.sh b/scripts/chroma-test-helpers/create-and-check-chroma.sh
index 37435426f..726ee9cae 100755
--- a/scripts/chroma-test-helpers/create-and-check-chroma.sh
+++ b/scripts/chroma-test-helpers/create-and-check-chroma.sh
@@ -3,4 +3,13 @@
 set -e
 
 # $1 is the path for chroma to write the contents to. The symbol "&" runs process in background
-chroma run --path "$1" &
+echo "Current venv is:"
+echo "$VIRTUAL_ENV"
+echo "Current path is:"
+echo "$PATH"
+ls -l "$VIRTUAL_ENV/bin/chroma"
+echo "================"
+cat "$VIRTUAL_ENV/bin/chroma"
+echo "================"
+# chroma run --path "$1" &
+python "$VIRTUAL_ENV/bin/chroma" run --path "$1" &
diff --git a/test_unstructured/file_utils/test_exploration.py b/test_unstructured/file_utils/test_exploration.py
deleted file mode 100644
index 7e38fbfd8..000000000
--- a/test_unstructured/file_utils/test_exploration.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import os
-import pathlib
-
-import pandas as pd
-import pytest
-
-from unstructured.file_utils import exploration
-from unstructured.file_utils.model import FileType
-
-DIRECTORY = pathlib.Path(__file__).parent.resolve()
-
-
-is_in_docker = os.path.exists("/.dockerenv")
-
-
-@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
-def test_get_directory_file_info(tmpdir):
-    file_info_test = os.path.join(tmpdir, "file_info_test")
-    if not os.path.exists(file_info_test):
-        os.mkdir(file_info_test)
-
-    directory1 = os.path.join(file_info_test, "directory1")
-    if not os.path.exists(directory1):
-        os.mkdir(directory1)
-
-    filename1 = os.path.join(directory1, "filename1.txt")
-    with open(filename1, "w") as f:
-        f.write("hello there!")
-
-    directory2 = os.path.join(file_info_test, "directory2")
-    if not os.path.exists(directory2):
-        os.mkdir(directory2)
-
-    filename2 = os.path.join(directory2, "filename2.txt")
-    with open(filename2, "w") as f:
-        f.write("hello there!")
-
-    file_info = exploration.get_directory_file_info(file_info_test)
-    assert isinstance(file_info, pd.DataFrame)
-    assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"}
-
-    means = file_info.groupby("filetype").mean(numeric_only=True)
-    assert means.columns.to_list() == ["filesize"]
-
-
-@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
-def test_get_file_info(tmpdir):
-    file_info_test = os.path.join(tmpdir, "file_info_test")
-    if not os.path.exists(file_info_test):
-        os.mkdir(file_info_test)
-
-    directory1 = os.path.join(file_info_test, "directory1")
-    if not os.path.exists(directory1):
-        os.mkdir(directory1)
-
-    filename1 = os.path.join(directory1, "filename1.txt")
-    with open(filename1, "w") as f:
-        f.write("hello there!")
-
-    directory2 = os.path.join(file_info_test, "directory2")
-    if not os.path.exists(directory2):
-        os.mkdir(directory2)
-
-    filename2 = os.path.join(directory2, "filename2.txt")
-    with open(filename2, "w") as f:
-        f.write("hello there!")
-
-    file_info = exploration.get_file_info([filename1, filename2])
-    assert isinstance(file_info, pd.DataFrame)
-    assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"}
-
-    means = file_info.groupby("filetype").mean(numeric_only=True)
-    assert means.columns.to_list() == ["filesize"]
-
-
-def test_get_file_info_from_file_contents():
-    file_contents_filename = os.path.join(DIRECTORY, "test-file-contents.txt")
-    with open(file_contents_filename) as f:
-        file_contents = [f.read()]
-
-    file_info = exploration.get_file_info_from_file_contents(
-        file_contents=file_contents,
-        filenames=["test.eml"],
-    )
-    assert file_info.filetype[0] == FileType.EML
-
-
-def test_get_file_info_from_file_contents_raises_if_lists_no_equal():
-    file_contents_filename = os.path.join(DIRECTORY, "test-file-contents.txt")
-    with open(file_contents_filename) as f:
-        file_contents = [f.read()]
-
-    with pytest.raises(ValueError):
-        exploration.get_file_info_from_file_contents(
-            file_contents=file_contents,
-            filenames=["test.eml", "test2.eml"],
-        )
diff --git a/test_unstructured/file_utils/test_metadata.py b/test_unstructured/file_utils/test_metadata.py
deleted file mode 100644
index 99ee2356b..000000000
--- a/test_unstructured/file_utils/test_metadata.py
+++ /dev/null
@@ -1,108 +0,0 @@
-import datetime
-import os
-import pathlib
-
-import docx
-import openpyxl
-import pytest
-
-import unstructured.file_utils.metadata as meta
-from test_unstructured.unit_utils import example_doc_path
-
-DIRECTORY = pathlib.Path(__file__).parent.resolve()
-EXAMPLE_JPG_FILENAME = example_doc_path("img/example.jpg")
-
-
-def test_get_docx_metadata_from_filename(tmpdir):
-    filename = os.path.join(tmpdir, "test-doc.docx")
-
-    document = docx.Document()
-    document.add_paragraph("Lorem ipsum dolor sit amet.")
-    document.core_properties.author = "Mr. Miagi"
-    document.save(filename)
-
-    metadata = meta.get_docx_metadata(filename=filename)
-    assert metadata.author == "Mr. Miagi"
-    assert metadata.to_dict()["author"] == "Mr. Miagi"
-
-
-def test_get_docx_metadata_from_file(tmpdir):
-    filename = os.path.join(tmpdir, "test-doc.docx")
-
-    document = docx.Document()
-    document.add_paragraph("Lorem ipsum dolor sit amet.")
-    document.core_properties.author = "Mr. Miagi"
-    document.save(filename)
-
-    with open(filename, "rb") as f:
-        metadata = meta.get_docx_metadata(file=f)
-    assert metadata.author == "Mr. Miagi"
-
-
-def test_get_docx_metadata_raises_without_file_or_filename():
-    with pytest.raises(FileNotFoundError):
-        meta.get_docx_metadata()
-
-
-def test_get_xlsx_metadata_from_filename(tmpdir):
-    filename = os.path.join(tmpdir, "test-excel.xlsx")
-
-    workbook = openpyxl.Workbook()
-    workbook.properties.creator = "Mr. Miagi"
-    workbook.save(filename)
-
-    metadata = meta.get_xlsx_metadata(filename=filename)
-    metadata.author = "Mr. Miagi"
-
-
-def test_get_xlsx_metadata_from_file(tmpdir):
-    filename = os.path.join(tmpdir, "test-excel.xlsx")
-
-    workbook = openpyxl.Workbook()
-    workbook.properties.creator = "Mr. Miagi"
-    workbook.save(filename)
-
-    with open(filename, "rb") as f:
-        metadata = meta.get_xlsx_metadata(file=f)
-    metadata.author = "Mr. Miagi"
-
-
-def test_get_xlsx_metadata_raises_without_file_or_filename():
-    with pytest.raises(FileNotFoundError):
-        meta.get_xlsx_metadata()
-
-
-def test_get_jpg_metadata_from_filename():
-    metadata = meta.get_jpg_metadata(filename=EXAMPLE_JPG_FILENAME)
-    assert metadata.modified == datetime.datetime(2003, 12, 14, 12, 1, 44)
-    assert metadata.exif_data["Make"] == "Canon"
-
-
-def test_get_jpg_metadata_from_file():
-    with open(EXAMPLE_JPG_FILENAME, "rb") as f:
-        metadata = meta.get_jpg_metadata(file=f)
-    assert metadata.modified == datetime.datetime(2003, 12, 14, 12, 1, 44)
-    assert metadata.exif_data["Make"] == "Canon"
-
-
-def test_get_jpg_metadata_raises_without_file_or_filename():
-    with pytest.raises(FileNotFoundError):
-        meta.get_jpg_metadata()
-
-
-def test_get_exif_datetime():
-    exif_data = {"DateTime": "2022:12:23 15:49:00", "DateTimeOriginal": "2020:12:14 12:00:00"}
-    date = meta._get_exif_datetime(exif_data, "DateTime")
-    assert date == datetime.datetime(2022, 12, 23, 15, 49, 0)
-
-
-def test_get_exif_datetime_ignores_bad_formats():
-    exif_data = {"DateTime": "2022-12-23TZ15:49:00", "DateTimeOriginal": "2020:12:14 12:00:00"}
-    date = meta._get_exif_datetime(exif_data, "DateTime")
-    assert date is None
-
-
-def test_get_exif_datetime_ignores_missing_key():
-    exif_data = {"Datetime": "2022-12-23TZ15:49:00", "DateTimeOriginal": "2020:12:14 12:00:00"}
-    date = meta._get_exif_datetime(exif_data, "DateTimeDigitized")
-    assert date is None
diff --git a/test_unstructured_ingest/dest/mongodb.sh b/test_unstructured_ingest/dest/mongodb.sh
index c027035d4..938af0d5f 100755
--- a/test_unstructured_ingest/dest/mongodb.sh
+++ b/test_unstructured_ingest/dest/mongodb.sh
@@ -38,7 +38,7 @@ trap cleanup EXIT
 # which is incompatible with the bson installed from pypi. bson is installed as part of the
 # astradb dependencies.
 # ref: https://pymongo.readthedocs.io/en/stable/installation.html
-pip uninstall -y bson pymongo
+python -m pip uninstall -y bson pymongo
 make install-ingest-mongodb
 
 python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
diff --git a/test_unstructured_ingest/src/mongodb.sh b/test_unstructured_ingest/src/mongodb.sh
index b961ea753..553014266 100755
--- a/test_unstructured_ingest/src/mongodb.sh
+++ b/test_unstructured_ingest/src/mongodb.sh
@@ -24,7 +24,7 @@ fi
 # which is incompatible with the bson installed from pypi. bson is installed as part of the
 # astradb dependencies.
 # ref: https://pymongo.readthedocs.io/en/stable/installation.html
-pip uninstall -y bson pymongo
+python -m pip uninstall -y bson pymongo
 make install-ingest-mongodb
 
 PYTHONPATH=. ./unstructured/ingest/main.py \
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 0653b5d65..de4688b31 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.15.13-dev1"  # pragma: no cover
+__version__ = "0.15.13-dev2"  # pragma: no cover
diff --git a/unstructured/file_utils/exploration.py b/unstructured/file_utils/exploration.py
deleted file mode 100644
index 55d9719b1..000000000
--- a/unstructured/file_utils/exploration.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import base64
-import io
-import os
-from typing import Any, Dict, List, Optional
-
-import pandas as pd
-
-from unstructured.file_utils.filetype import detect_filetype
-
-
-def get_directory_file_info(directory: str) -> pd.DataFrame:
-    """Recursively walks a directory and extracts key file information to support initial
-    exploration of text data sets. Returns a pandas DataFrame."""
-    filenames: List[str] = []
-    for path, _, files in os.walk(directory):
-        for filename_no_path in files:
-            filenames.append(os.path.join(path, filename_no_path))
-    return get_file_info(filenames)
-
-
-def get_file_info(filenames: List[str]) -> pd.DataFrame:
-    """Returns a pandas DataFrame summarizing the filetypes for a list of files."""
-    data: Dict[str, List[Any]] = {
-        "filename": [],
-        "path": [],
-        "filesize": [],
-        "extension": [],
-        "filetype": [],
-    }
-
-    for filename in filenames:
-        path, filename_no_path = os.path.split(os.path.abspath(filename))
-        _, extension = os.path.splitext(filename)
-        filesize = os.path.getsize(filename)
-        filetype = detect_filetype(filename)
-
-        data["filename"].append(filename_no_path)
-        data["path"].append(path)
-        data["extension"].append(extension)
-        data["filesize"].append(filesize)
-        data["filetype"].append(filetype)
-
-    return pd.DataFrame(data)
-
-
-def get_file_info_from_file_contents(
-    file_contents: List[str],
-    filenames: Optional[List[str]] = None,
-) -> pd.DataFrame:
-    data: Dict[str, List[Any]] = {
-        "filesize": [],
-        "filetype": [],
-    }
-
-    if filenames:
-        if len(filenames) != len(file_contents):
-            raise ValueError(
-                f"There are {len(filenames)} filenames and {len(file_contents)} "
-                "file_contents. Both inputs must be the same length.",
-            )
-        data["filename"] = []
-
-    for i, file_content in enumerate(file_contents):
-        content_string = file_content.split(",")[-1]
-        content_bytes = base64.b64decode(content_string)
-        f = io.BytesIO(content_bytes)
-        filetype = detect_filetype(file=f)
-        f.seek(0, os.SEEK_END)
-        filesize = f.tell()
-
-        data["filesize"].append(filesize)
-        data["filetype"].append(filetype)
-        if filenames:
-            data["filename"].append(filenames[i])
-
-    return pd.DataFrame(data)
diff --git a/unstructured/file_utils/metadata.py b/unstructured/file_utils/metadata.py
deleted file mode 100644
index 20e447628..000000000
--- a/unstructured/file_utils/metadata.py
+++ /dev/null
@@ -1,159 +0,0 @@
-import datetime
-import io
-from dataclasses import dataclass, field
-from typing import IO, Any, Dict, Final, Optional
-
-from unstructured.utils import requires_dependencies
-
-# NOTE(robison) - ref: https://www.media.mit.edu/pia/Research/deepview/exif.html
-EXIF_DATETIME_FMT: Final[str] = "%Y:%m:%d %H:%M:%S"
-
-
-@dataclass
-class Metadata:
-    author: str = ""
-    category: str = ""
-    comments: str = ""
-    content_status: str = ""
-    created: Optional[datetime.datetime] = None
-    identifier: str = ""
-    keywords: str = ""
-    language: str = ""
-    last_modified_by: str = ""
-    last_printed: Optional[datetime.datetime] = None
-    modified: Optional[datetime.datetime] = None
-    revision: Optional[int] = 0
-    subject: str = ""
-    title: str = ""
-    version: str = ""
-    description: str = ""
-    namespace: str = ""
-
-    # NOTE(robinson) - Metadata for use with image files
-    exif_data: Dict[str, Any] = field(default_factory=dict)
-
-    def to_dict(self):
-        return self.__dict__
-
-
-@requires_dependencies("docx")
-def get_docx_metadata(
-    filename: str = "",
-    file: Optional[IO[bytes]] = None,
-) -> Metadata:
-    """Extracts document metadata from a Microsoft .docx document."""
-    import docx
-
-    if filename:
-        doc = docx.Document(filename)
-    elif file:
-        doc = docx.Document(file)
-    else:
-        raise FileNotFoundError("No filename nor file were specified")
-
-    metadata = Metadata(
-        author=getattr(doc.core_properties, "author", ""),
-        category=getattr(doc.core_properties, "category", ""),
-        comments=getattr(doc.core_properties, "comments", ""),
-        content_status=getattr(doc.core_properties, "content_status", ""),
-        created=getattr(doc.core_properties, "created", None),
-        identifier=getattr(doc.core_properties, "identifier", ""),
-        keywords=getattr(doc.core_properties, "keywords", ""),
-        language=getattr(doc.core_properties, "language", ""),
-        last_modified_by=getattr(doc.core_properties, "last_modified_by", ""),
-        last_printed=getattr(doc.core_properties, "last_printed", None),
-        modified=getattr(doc.core_properties, "modified", None),
-        revision=getattr(doc.core_properties, "revision", None),
-        subject=getattr(doc.core_properties, "subject", ""),
-        title=getattr(doc.core_properties, "title", ""),
-        version=getattr(doc.core_properties, "version", ""),
-    )
-
-    return metadata
-
-
-@requires_dependencies("openpyxl")
-def get_xlsx_metadata(
-    filename: str = "",
-    file: Optional[IO[bytes]] = None,
-) -> Metadata:
-    """Extracts document metadata from a Microsoft .xlsx document."""
-    import openpyxl
-
-    if filename:
-        workbook = openpyxl.load_workbook(filename)
-    elif file:
-        workbook = openpyxl.load_workbook(file)
-    else:
-        raise FileNotFoundError("No filename nor file were specified")
-
-    metadata = Metadata(
-        author=getattr(workbook.properties, "creator", ""),
-        category=getattr(workbook.properties, "category", ""),
-        content_status=getattr(workbook.properties, "contentStatus", ""),
-        created=getattr(workbook.properties, "created", None),
-        description=getattr(workbook.properties, "description", ""),
-        identifier=getattr(workbook.properties, "identifier", ""),
-        keywords=getattr(workbook.properties, "keywords", ""),
-        language=getattr(workbook.properties, "language", ""),
-        last_modified_by=getattr(workbook.properties, "lastModifiedBy", ""),
-        last_printed=getattr(workbook.properties, "lastPrinted", None),
-        modified=getattr(workbook.properties, "modified", None),
-        namespace=getattr(workbook.properties, "namespace", ""),
-        revision=getattr(workbook.properties, "revision", None),
-        subject=getattr(workbook.properties, "subject", ""),
-        title=getattr(workbook.properties, "title", ""),
-        version=getattr(workbook.properties, "version", ""),
-    )
-
-    return metadata
-
-
-@requires_dependencies("PIL")
-def get_jpg_metadata(
-    filename: str = "",
-    file: Optional[IO[bytes]] = None,
-) -> Metadata:
-    """Extracts metadata from a JPG image, including EXIF metadata."""
-    from PIL import Image
-    from PIL.ExifTags import TAGS
-
-    if filename:
-        image = Image.open(filename)
-    elif file:
-        image = Image.open(io.BytesIO(file.read()))
-    else:
-        raise FileNotFoundError("No filename nor file were specified")
-
-    exif_data = image.getexif()
-    exif_dict: Dict[str, Any] = {}
-    for tag_id in exif_data:
-        tag = TAGS.get(tag_id, tag_id)
-        data = exif_data.get(tag_id)
-        exif_dict[tag] = data
-
-    metadata = Metadata(
-        author=exif_dict.get("Artist", ""),
-        comments=exif_dict.get("UserComment", ""),
-        created=_get_exif_datetime(exif_dict, "DateTimeOriginal"),
-        # NOTE(robinson) - Per EXIF docs, DateTime is the last modified data
-        # ref: https://www.media.mit.edu/pia/Research/deepview/exif.html
-        modified=_get_exif_datetime(exif_dict, "DateTime"),
-        exif_data=exif_dict,
-    )
-
-    return metadata
-
-
-def _get_exif_datetime(exif_dict: Dict[str, Any], key: str) -> Optional[datetime.datetime]:
-    """Converts a datetime string from the EXIF data to a Python datetime object."""
-    date = exif_dict.get(key)
-    if not date:
-        return None
-
-    try:
-        return datetime.datetime.strptime(date, EXIF_DATETIME_FMT)
-    # NOTE(robinson) - An exception could occur if the datetime is not formatted
-    # using the standard EXIF datetime format
-    except ValueError:
-        return None