diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cb91257b4..142578885 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -161,6 +161,7 @@ jobs:
     runs-on: ubuntu-latest
     env:
       UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      NLTK_DATA: ${{ github.workspace }}/nltk_data
     needs: [setup, lint]
     steps:
       - uses: actions/checkout@v4
diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml
index 1bb2d8182..d22a5aab9 100644
--- a/.github/workflows/ingest-test-fixtures-update-pr.yml
+++ b/.github/workflows/ingest-test-fixtures-update-pr.yml
@@ -40,7 +40,7 @@ jobs:
       # actions/checkout MUST come before auth
       - uses: 'actions/checkout@v4'
       - name: Set up Python ${{ env.PYTHON_VERSION }}
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Get full Python version
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e9f7bf990..2ab5534c0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.15.13-dev1
+## 0.15.13-dev2
 
 ### Enhancements
 
diff --git a/scripts/chroma-test-helpers/create-and-check-chroma.sh b/scripts/chroma-test-helpers/create-and-check-chroma.sh
index 37435426f..726ee9cae 100755
--- a/scripts/chroma-test-helpers/create-and-check-chroma.sh
+++ b/scripts/chroma-test-helpers/create-and-check-chroma.sh
@@ -3,4 +3,13 @@
 set -e
 
 # $1 is the path for chroma to write the contents to. The symbol "&" runs process in background
-chroma run --path "$1" &
+echo "Current venv is:"
+echo "$VIRTUAL_ENV"
+echo "Current path is:"
+echo "$PATH"
+ls -l "$VIRTUAL_ENV/bin/chroma"
+echo "================"
+cat "$VIRTUAL_ENV/bin/chroma"
+echo "================"
+# chroma run --path "$1" &
+python "$VIRTUAL_ENV/bin/chroma" run --path "$1" &
diff --git a/test_unstructured/file_utils/test_exploration.py b/test_unstructured/file_utils/test_exploration.py
deleted file mode 100644
index 7e38fbfd8..000000000
--- a/test_unstructured/file_utils/test_exploration.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import os
-import pathlib
-
-import pandas as pd
-import pytest
-
-from unstructured.file_utils import exploration
-from unstructured.file_utils.model import FileType
-
-DIRECTORY = pathlib.Path(__file__).parent.resolve()
-
-
-is_in_docker = os.path.exists("/.dockerenv")
-
-
-@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
-def test_get_directory_file_info(tmpdir):
-    file_info_test = os.path.join(tmpdir, "file_info_test")
-    if not os.path.exists(file_info_test):
-        os.mkdir(file_info_test)
-
-    directory1 = os.path.join(file_info_test, "directory1")
-    if not os.path.exists(directory1):
-        os.mkdir(directory1)
-
-    filename1 = os.path.join(directory1, "filename1.txt")
-    with open(filename1, "w") as f:
-        f.write("hello there!")
-
-    directory2 = os.path.join(file_info_test, "directory2")
-    if not os.path.exists(directory2):
-        os.mkdir(directory2)
-
-    filename2 = os.path.join(directory2, "filename2.txt")
-    with open(filename2, "w") as f:
-        f.write("hello there!")
-
-    file_info = exploration.get_directory_file_info(file_info_test)
-    assert isinstance(file_info, pd.DataFrame)
-    assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"}
-
-    means = file_info.groupby("filetype").mean(numeric_only=True)
-    assert means.columns.to_list() == ["filesize"]
-
-
-@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
-def test_get_file_info(tmpdir):
-    file_info_test = os.path.join(tmpdir, "file_info_test")
-    if not os.path.exists(file_info_test):
-        os.mkdir(file_info_test)
-
-    directory1 = os.path.join(file_info_test, "directory1")
-    if not os.path.exists(directory1):
-        os.mkdir(directory1)
-
-    filename1 = os.path.join(directory1, "filename1.txt")
-    with open(filename1, "w") as f:
-        f.write("hello there!")
-
-    directory2 = os.path.join(file_info_test, "directory2")
-    if not os.path.exists(directory2):
-        os.mkdir(directory2)
-
-    filename2 = os.path.join(directory2, "filename2.txt")
-    with open(filename2, "w") as f:
-        f.write("hello there!")
-
-    file_info = exploration.get_file_info([filename1, filename2])
-    assert isinstance(file_info, pd.DataFrame)
-    assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"}
-
-    means = file_info.groupby("filetype").mean(numeric_only=True)
-    assert means.columns.to_list() == ["filesize"]
-
-
-def test_get_file_info_from_file_contents():
-    file_contents_filename = os.path.join(DIRECTORY, "test-file-contents.txt")
-    with open(file_contents_filename) as f:
-        file_contents = [f.read()]
-
-    file_info = exploration.get_file_info_from_file_contents(
-        file_contents=file_contents,
-        filenames=["test.eml"],
-    )
-    assert file_info.filetype[0] == FileType.EML
-
-
-def test_get_file_info_from_file_contents_raises_if_lists_no_equal():
-    file_contents_filename = os.path.join(DIRECTORY, "test-file-contents.txt")
-    with open(file_contents_filename) as f:
-        file_contents = [f.read()]
-
-    with pytest.raises(ValueError):
-        exploration.get_file_info_from_file_contents(
-            file_contents=file_contents,
-            filenames=["test.eml", "test2.eml"],
-        )
diff --git a/test_unstructured/file_utils/test_metadata.py b/test_unstructured/file_utils/test_metadata.py
deleted file mode 100644
index 99ee2356b..000000000
--- a/test_unstructured/file_utils/test_metadata.py
+++ /dev/null
@@ -1,108 +0,0 @@
-import datetime
-import os
-import pathlib
-
-import docx
-import openpyxl
-import pytest
-
-import unstructured.file_utils.metadata as meta
-from test_unstructured.unit_utils import example_doc_path
-
-DIRECTORY = pathlib.Path(__file__).parent.resolve()
-EXAMPLE_JPG_FILENAME = example_doc_path("img/example.jpg")
-
-
-def test_get_docx_metadata_from_filename(tmpdir):
-    filename = os.path.join(tmpdir, "test-doc.docx")
-
-    document = docx.Document()
-    document.add_paragraph("Lorem ipsum dolor sit amet.")
-    document.core_properties.author = "Mr. Miagi"
-    document.save(filename)
-
-    metadata = meta.get_docx_metadata(filename=filename)
-    assert metadata.author == "Mr. Miagi"
-    assert metadata.to_dict()["author"] == "Mr. Miagi"
-
-
-def test_get_docx_metadata_from_file(tmpdir):
-    filename = os.path.join(tmpdir, "test-doc.docx")
-
-    document = docx.Document()
-    document.add_paragraph("Lorem ipsum dolor sit amet.")
-    document.core_properties.author = "Mr. Miagi"
-    document.save(filename)
-
-    with open(filename, "rb") as f:
-        metadata = meta.get_docx_metadata(file=f)
-    assert metadata.author == "Mr. Miagi"
-
-
-def test_get_docx_metadata_raises_without_file_or_filename():
-    with pytest.raises(FileNotFoundError):
-        meta.get_docx_metadata()
-
-
-def test_get_xlsx_metadata_from_filename(tmpdir):
-    filename = os.path.join(tmpdir, "test-excel.xlsx")
-
-    workbook = openpyxl.Workbook()
-    workbook.properties.creator = "Mr. Miagi"
-    workbook.save(filename)
-
-    metadata = meta.get_xlsx_metadata(filename=filename)
-    metadata.author = "Mr. Miagi"
-
-
-def test_get_xlsx_metadata_from_file(tmpdir):
-    filename = os.path.join(tmpdir, "test-excel.xlsx")
-
-    workbook = openpyxl.Workbook()
-    workbook.properties.creator = "Mr. Miagi"
-    workbook.save(filename)
-
-    with open(filename, "rb") as f:
-        metadata = meta.get_xlsx_metadata(file=f)
-    metadata.author = "Mr. Miagi"
-
-
-def test_get_xlsx_metadata_raises_without_file_or_filename():
-    with pytest.raises(FileNotFoundError):
-        meta.get_xlsx_metadata()
-
-
-def test_get_jpg_metadata_from_filename():
-    metadata = meta.get_jpg_metadata(filename=EXAMPLE_JPG_FILENAME)
-    assert metadata.modified == datetime.datetime(2003, 12, 14, 12, 1, 44)
-    assert metadata.exif_data["Make"] == "Canon"
-
-
-def test_get_jpg_metadata_from_file():
-    with open(EXAMPLE_JPG_FILENAME, "rb") as f:
-        metadata = meta.get_jpg_metadata(file=f)
-    assert metadata.modified == datetime.datetime(2003, 12, 14, 12, 1, 44)
-    assert metadata.exif_data["Make"] == "Canon"
-
-
-def test_get_jpg_metadata_raises_without_file_or_filename():
-    with pytest.raises(FileNotFoundError):
-        meta.get_jpg_metadata()
-
-
-def test_get_exif_datetime():
-    exif_data = {"DateTime": "2022:12:23 15:49:00", "DateTimeOriginal": "2020:12:14 12:00:00"}
-    date = meta._get_exif_datetime(exif_data, "DateTime")
-    assert date == datetime.datetime(2022, 12, 23, 15, 49, 0)
-
-
-def test_get_exif_datetime_ignores_bad_formats():
-    exif_data = {"DateTime": "2022-12-23TZ15:49:00", "DateTimeOriginal": "2020:12:14 12:00:00"}
-    date = meta._get_exif_datetime(exif_data, "DateTime")
-    assert date is None
-
-
-def test_get_exif_datetime_ignores_missing_key():
-    exif_data = {"Datetime": "2022-12-23TZ15:49:00", "DateTimeOriginal": "2020:12:14 12:00:00"}
-    date = meta._get_exif_datetime(exif_data, "DateTimeDigitized")
-    assert date is None
diff --git a/test_unstructured_ingest/dest/mongodb.sh b/test_unstructured_ingest/dest/mongodb.sh
index c027035d4..938af0d5f 100755
--- a/test_unstructured_ingest/dest/mongodb.sh
+++ b/test_unstructured_ingest/dest/mongodb.sh
@@ -38,7 +38,7 @@ trap cleanup EXIT
 # which is incompatible with the bson installed from pypi. bson is installed as part of the
 # astradb dependencies.
 # ref: https://pymongo.readthedocs.io/en/stable/installation.html
-pip uninstall -y bson pymongo
+python -m pip uninstall -y bson pymongo
 make install-ingest-mongodb
 
 python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
diff --git a/test_unstructured_ingest/src/mongodb.sh b/test_unstructured_ingest/src/mongodb.sh
index b961ea753..553014266 100755
--- a/test_unstructured_ingest/src/mongodb.sh
+++ b/test_unstructured_ingest/src/mongodb.sh
@@ -24,7 +24,7 @@ fi
 # which is incompatible with the bson installed from pypi. bson is installed as part of the
 # astradb dependencies.
 # ref: https://pymongo.readthedocs.io/en/stable/installation.html
-pip uninstall -y bson pymongo
+python -m pip uninstall -y bson pymongo
 make install-ingest-mongodb
 
 PYTHONPATH=. ./unstructured/ingest/main.py \
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 0653b5d65..de4688b31 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.15.13-dev1"  # pragma: no cover
+__version__ = "0.15.13-dev2"  # pragma: no cover
diff --git a/unstructured/file_utils/exploration.py b/unstructured/file_utils/exploration.py
deleted file mode 100644
index 55d9719b1..000000000
--- a/unstructured/file_utils/exploration.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import base64
-import io
-import os
-from typing import Any, Dict, List, Optional
-
-import pandas as pd
-
-from unstructured.file_utils.filetype import detect_filetype
-
-
-def get_directory_file_info(directory: str) -> pd.DataFrame:
-    """Recursively walks a directory and extracts key file information to support initial
-    exploration of text data sets. Returns a pandas DataFrame."""
-    filenames: List[str] = []
-    for path, _, files in os.walk(directory):
-        for filename_no_path in files:
-            filenames.append(os.path.join(path, filename_no_path))
-    return get_file_info(filenames)
-
-
-def get_file_info(filenames: List[str]) -> pd.DataFrame:
-    """Returns a pandas DataFrame summarizing the filetypes for a list of files."""
-    data: Dict[str, List[Any]] = {
-        "filename": [],
-        "path": [],
-        "filesize": [],
-        "extension": [],
-        "filetype": [],
-    }
-
-    for filename in filenames:
-        path, filename_no_path = os.path.split(os.path.abspath(filename))
-        _, extension = os.path.splitext(filename)
-        filesize = os.path.getsize(filename)
-        filetype = detect_filetype(filename)
-
-        data["filename"].append(filename_no_path)
-        data["path"].append(path)
-        data["extension"].append(extension)
-        data["filesize"].append(filesize)
-        data["filetype"].append(filetype)
-
-    return pd.DataFrame(data)
-
-
-def get_file_info_from_file_contents(
-    file_contents: List[str],
-    filenames: Optional[List[str]] = None,
-) -> pd.DataFrame:
-    data: Dict[str, List[Any]] = {
-        "filesize": [],
-        "filetype": [],
-    }
-
-    if filenames:
-        if len(filenames) != len(file_contents):
-            raise ValueError(
-                f"There are {len(filenames)} filenames and {len(file_contents)} "
-                "file_contents. Both inputs must be the same length.",
-            )
-        data["filename"] = []
-
-    for i, file_content in enumerate(file_contents):
-        content_string = file_content.split(",")[-1]
-        content_bytes = base64.b64decode(content_string)
-        f = io.BytesIO(content_bytes)
-        filetype = detect_filetype(file=f)
-        f.seek(0, os.SEEK_END)
-        filesize = f.tell()
-
-        data["filesize"].append(filesize)
-        data["filetype"].append(filetype)
-        if filenames:
-            data["filename"].append(filenames[i])
-
-    return pd.DataFrame(data)
diff --git a/unstructured/file_utils/metadata.py b/unstructured/file_utils/metadata.py
deleted file mode 100644
index 20e447628..000000000
--- a/unstructured/file_utils/metadata.py
+++ /dev/null
@@ -1,159 +0,0 @@
-import datetime
-import io
-from dataclasses import dataclass, field
-from typing import IO, Any, Dict, Final, Optional
-
-from unstructured.utils import requires_dependencies
-
-# NOTE(robison) - ref: https://www.media.mit.edu/pia/Research/deepview/exif.html
-EXIF_DATETIME_FMT: Final[str] = "%Y:%m:%d %H:%M:%S"
-
-
-@dataclass
-class Metadata:
-    author: str = ""
-    category: str = ""
-    comments: str = ""
-    content_status: str = ""
-    created: Optional[datetime.datetime] = None
-    identifier: str = ""
-    keywords: str = ""
-    language: str = ""
-    last_modified_by: str = ""
-    last_printed: Optional[datetime.datetime] = None
-    modified: Optional[datetime.datetime] = None
-    revision: Optional[int] = 0
-    subject: str = ""
-    title: str = ""
-    version: str = ""
-    description: str = ""
-    namespace: str = ""
-
-    # NOTE(robinson) - Metadata for use with image files
-    exif_data: Dict[str, Any] = field(default_factory=dict)
-
-    def to_dict(self):
-        return self.__dict__
-
-
-@requires_dependencies("docx")
-def get_docx_metadata(
-    filename: str = "",
-    file: Optional[IO[bytes]] = None,
-) -> Metadata:
-    """Extracts document metadata from a Microsoft .docx document."""
-    import docx
-
-    if filename:
-        doc = docx.Document(filename)
-    elif file:
-        doc = docx.Document(file)
-    else:
-        raise FileNotFoundError("No filename nor file were specified")
-
-    metadata = Metadata(
-        author=getattr(doc.core_properties, "author", ""),
-        category=getattr(doc.core_properties, "category", ""),
-        comments=getattr(doc.core_properties, "comments", ""),
-        content_status=getattr(doc.core_properties, "content_status", ""),
-        created=getattr(doc.core_properties, "created", None),
-        identifier=getattr(doc.core_properties, "identifier", ""),
-        keywords=getattr(doc.core_properties, "keywords", ""),
-        language=getattr(doc.core_properties, "language", ""),
-        last_modified_by=getattr(doc.core_properties, "last_modified_by", ""),
-        last_printed=getattr(doc.core_properties, "last_printed", None),
-        modified=getattr(doc.core_properties, "modified", None),
-        revision=getattr(doc.core_properties, "revision", None),
-        subject=getattr(doc.core_properties, "subject", ""),
-        title=getattr(doc.core_properties, "title", ""),
-        version=getattr(doc.core_properties, "version", ""),
-    )
-
-    return metadata
-
-
-@requires_dependencies("openpyxl")
-def get_xlsx_metadata(
-    filename: str = "",
-    file: Optional[IO[bytes]] = None,
-) -> Metadata:
-    """Extracts document metadata from a Microsoft .xlsx document."""
-    import openpyxl
-
-    if filename:
-        workbook = openpyxl.load_workbook(filename)
-    elif file:
-        workbook = openpyxl.load_workbook(file)
-    else:
-        raise FileNotFoundError("No filename nor file were specified")
-
-    metadata = Metadata(
-        author=getattr(workbook.properties, "creator", ""),
-        category=getattr(workbook.properties, "category", ""),
-        content_status=getattr(workbook.properties, "contentStatus", ""),
-        created=getattr(workbook.properties, "created", None),
-        description=getattr(workbook.properties, "description", ""),
-        identifier=getattr(workbook.properties, "identifier", ""),
-        keywords=getattr(workbook.properties, "keywords", ""),
-        language=getattr(workbook.properties, "language", ""),
-        last_modified_by=getattr(workbook.properties, "lastModifiedBy", ""),
-        last_printed=getattr(workbook.properties, "lastPrinted", None),
-        modified=getattr(workbook.properties, "modified", None),
-        namespace=getattr(workbook.properties, "namespace", ""),
-        revision=getattr(workbook.properties, "revision", None),
-        subject=getattr(workbook.properties, "subject", ""),
-        title=getattr(workbook.properties, "title", ""),
-        version=getattr(workbook.properties, "version", ""),
-    )
-
-    return metadata
-
-
-@requires_dependencies("PIL")
-def get_jpg_metadata(
-    filename: str = "",
-    file: Optional[IO[bytes]] = None,
-) -> Metadata:
-    """Extracts metadata from a JPG image, including EXIF metadata."""
-    from PIL import Image
-    from PIL.ExifTags import TAGS
-
-    if filename:
-        image = Image.open(filename)
-    elif file:
-        image = Image.open(io.BytesIO(file.read()))
-    else:
-        raise FileNotFoundError("No filename nor file were specified")
-
-    exif_data = image.getexif()
-    exif_dict: Dict[str, Any] = {}
-    for tag_id in exif_data:
-        tag = TAGS.get(tag_id, tag_id)
-        data = exif_data.get(tag_id)
-        exif_dict[tag] = data
-
-    metadata = Metadata(
-        author=exif_dict.get("Artist", ""),
-        comments=exif_dict.get("UserComment", ""),
-        created=_get_exif_datetime(exif_dict, "DateTimeOriginal"),
-        # NOTE(robinson) - Per EXIF docs, DateTime is the last modified data
-        # ref: https://www.media.mit.edu/pia/Research/deepview/exif.html
-        modified=_get_exif_datetime(exif_dict, "DateTime"),
-        exif_data=exif_dict,
-    )
-
-    return metadata
-
-
-def _get_exif_datetime(exif_dict: Dict[str, Any], key: str) -> Optional[datetime.datetime]:
-    """Converts a datetime string from the EXIF data to a Python datetime object."""
-    date = exif_dict.get(key)
-    if not date:
-        return None
-
-    try:
-        return datetime.datetime.strptime(date, EXIF_DATETIME_FMT)
-    # NOTE(robinson) - An exception could occur if the datetime is not formatted
-    # using the standard EXIF datetime format
-    except ValueError:
-        return None