diff --git a/CHANGELOG.md b/CHANGELOG.md index 13bb8014c..17e80c7be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,11 @@ -## 0.15.1-dev5 +## 0.15.1-dev6 ### Enhancements ### Features +* **Mark ingest as deprecated** Begin sunset of ingest code in this repo as it's been moved to a dedicated repo. + ### Fixes * **Update `HuggingFaceEmbeddingEncoder` to use `HuggingFaceEmbeddings` from `langchain_huggingface` package instead of the deprecated version from `langchain-community`.** This resolves the deprecation warning and ensures compatibility with future versions of langchain. @@ -20,7 +22,7 @@ ### Enhancements -* **Improve text clearing process in email partitioning.** Updated the email partitioner to remove both `=\n` and `=\r\n` characters during the clearing process. Previously, only `=\n` characters were removed. +* **Improve text clearing process in email partitioning.** Updated the email partitioner to remove both `=\n` and `=\r\n` characters during the clearing process. Previously, only `=\n` characters were removed. * **Bump unstructured.paddleocr to 2.8.0.1.** * **Refine HTML parser to accommodate block element nested in phrasing.** HTML parser no longer raises on a block element (e.g. `<p>`, `<div>`) nested inside a phrasing element (e.g. `<strong>` or `<cite>`). Instead it breaks the phrasing run (and therefore element) at the block-item start and begins a new phrasing run after the block-item. This is consistent with how the browser determines element boundaries in this situation. * **Install rewritten HTML parser to fix 12 existing bugs and provide headroom for refinement and growth.** A rewritten HTML parser resolves a collection of outstanding bugs with HTML partitioning and provides a firm foundation for further elaborating that important partitioner. diff --git a/test_unstructured_ingest/test-ingest-dest.sh b/test_unstructured_ingest/test-ingest-dest.sh index 06ab975db..ab6634cc0 100755 --- a/test_unstructured_ingest/test-ingest-dest.sh +++ b/test_unstructured_ingest/test-ingest-dest.sh @@ -64,6 +64,7 @@ tests_to_ignore=( 'notion.sh' 'dropbox.sh' 'sharepoint.sh' + 'databricks-volumes.sh' ) for test in "${all_tests[@]}"; do diff --git a/unstructured/__version__.py b/unstructured/__version__.py index c3eea2bc9..3b8d04fe6 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.1-dev5" # pragma: no cover +__version__ = "0.15.1-dev6" # pragma: no cover diff --git a/unstructured/ingest/README.md b/unstructured/ingest/README.md index 3072a1d5c..738656a37 100644 --- a/unstructured/ingest/README.md +++ b/unstructured/ingest/README.md @@ -1,3 +1,8 @@ +# Ingest +![Project unmaintained](https://img.shields.io/badge/project-unmaintained-red.svg) + +Project has been moved to: [Unstructured Ingest](https://github.com/Unstructured-IO/unstructured-ingest) + # Batch Processing Documents [DEPRECATED] For the latest approach, go to: [v2](./v2) diff --git a/unstructured/ingest/__init__.py b/unstructured/ingest/__init__.py index 9d48db4f9..cae55db4a 100644 --- a/unstructured/ingest/__init__.py +++ b/unstructured/ingest/__init__.py @@ -1 +1,10 @@ from __future__ import annotations + +import warnings + +warnings.warn( + 
"unstructured.ingest will be removed in a future version. " + "Functionality moved to the unstructured-ingest project.", + DeprecationWarning, + stacklevel=2, +)