From 798dcc096cf4ef71cbc66777cc60391aefccef7d Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Wed, 24 Jul 2024 09:52:34 -0700 Subject: [PATCH] fix: update OpenAIEmbeddingEncoder to use `langchain-openai` instead of `langchain-community` (#3433) Closes https://github.com/Unstructured-IO/unstructured/issues/3378. ### Summary This PR updates `OpenAIEmbeddingEncoder` to use `OpenAIEmbeddings` from the `langchain-openai` package instead of the deprecated version from `langchain-community`. This resolves the deprecation warning and ensures compatibility with future versions of langchain. --- CHANGELOG.md | 3 +- requirements/ingest/embed-openai.in | 5 +- requirements/ingest/embed-openai.txt | 90 +++------------------------- unstructured/__version__.py | 2 +- unstructured/embed/openai.py | 9 +-- 5 files changed, 17 insertions(+), 92 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4999a4975..8e998f8aa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.1-dev2 +## 0.15.1-dev3 ### Enhancements @@ -6,6 +6,7 @@ ### Fixes +* **Update `OpenAIEmbeddingEncoder` to use `OpenAIEmbeddings` from `langchain-openai` package instead of the deprecated version from `langchain-community`.** This resolves the deprecation warning and ensures compatibility with future versions of langchain. * **Update import of Pinecone exception** Adds compatibility for pinecone-client>=5.0.0 * **File-type detection catches non-existent file-path.** `detect_filetype()` no longer silently falls back to detecting a file-type based on the extension when no file exists at the path provided. Instead `FileNotFoundError` is raised. This provides consistent user notification of a mis-typed path rather than an unpredictable exception from a file-type specific partitioner when the file cannot be opened. 
* **EML files specified as a file-path are detected correctly.** Resolved a bug where an EML file submitted to `partition()` as a file-path was identified as TXT and partitioned using `partition_text()`. EML files specified by path are now identified and processed correctly, including processing any attachments. diff --git a/requirements/ingest/embed-openai.in b/requirements/ingest/embed-openai.in index dae91b107..fb130e9cb 100644 --- a/requirements/ingest/embed-openai.in +++ b/requirements/ingest/embed-openai.in @@ -1,5 +1,4 @@ -c ../deps/constraints.txt -c ../base.txt -langchain-community -tiktoken -openai + +langchain-openai diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt index 4f0fa239e..fd4187a05 100644 --- a/requirements/ingest/embed-openai.txt +++ b/requirements/ingest/embed-openai.txt @@ -4,12 +4,6 @@ # # pip-compile ./ingest/embed-openai.in # -aiohttp==3.9.5 - # via - # langchain - # langchain-community -aiosignal==1.3.1 - # via aiohttp annotated-types==0.7.0 # via pydantic anyio==3.7.1 @@ -18,12 +12,6 @@ anyio==3.7.1 # -c ./ingest/../deps/constraints.txt # httpx # openai -async-timeout==4.0.3 - # via - # aiohttp - # langchain -attrs==23.2.0 - # via aiohttp certifi==2024.7.4 # via # -c ./ingest/../base.txt @@ -35,20 +23,12 @@ charset-normalizer==3.3.2 # via # -c ./ingest/../base.txt # requests -dataclasses-json==0.6.7 - # via - # -c ./ingest/../base.txt - # langchain-community distro==1.9.0 # via openai exceptiongroup==1.2.2 # via # -c ./ingest/../base.txt # anyio -frozenlist==1.4.1 - # via - # aiohttp - # aiosignal h11==0.14.0 # via # -c ./ingest/../base.txt @@ -67,48 +47,18 @@ idna==3.7 # anyio # httpx # requests - # yarl jsonpatch==1.33 # via langchain-core jsonpointer==3.0.0 # via jsonpatch -langchain==0.2.11 - # via langchain-community -langchain-community==0.2.10 - # via - # -c ./ingest/../deps/constraints.txt - # -r ./ingest/embed-openai.in langchain-core==0.2.23 - # via - # langchain - # langchain-community 
- # langchain-text-splitters -langchain-text-splitters==0.2.2 - # via langchain -langsmith==0.1.93 - # via - # langchain - # langchain-community - # langchain-core -marshmallow==3.21.3 - # via - # -c ./ingest/../base.txt - # dataclasses-json -multidict==6.0.5 - # via - # aiohttp - # yarl -mypy-extensions==1.0.0 - # via - # -c ./ingest/../base.txt - # typing-inspect -numpy==1.26.4 - # via - # -c ./ingest/../base.txt - # langchain - # langchain-community -openai==1.37.0 + # via langchain-openai +langchain-openai==0.1.17 # via -r ./ingest/embed-openai.in +langsmith==0.1.93 + # via langchain-core +openai==1.37.0 + # via langchain-openai orjson==3.10.6 # via langsmith packaging==23.2 @@ -116,20 +66,15 @@ packaging==23.2 # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt # langchain-core - # marshmallow pydantic==2.8.2 # via - # langchain # langchain-core # langsmith # openai pydantic-core==2.20.1 # via pydantic pyyaml==6.0.1 - # via - # langchain - # langchain-community - # langchain-core + # via langchain-core regex==2024.5.15 # via # -c ./ingest/../base.txt @@ -137,8 +82,6 @@ regex==2024.5.15 requests==2.32.3 # via # -c ./ingest/../base.txt - # langchain - # langchain-community # langsmith # tiktoken sniffio==1.3.1 @@ -147,17 +90,10 @@ sniffio==1.3.1 # anyio # httpx # openai -sqlalchemy==2.0.31 - # via - # langchain - # langchain-community tenacity==8.5.0 - # via - # langchain - # langchain-community - # langchain-core + # via langchain-core tiktoken==0.7.0 - # via -r ./ingest/embed-openai.in + # via langchain-openai tqdm==4.66.4 # via # -c ./ingest/../base.txt @@ -168,16 +104,8 @@ typing-extensions==4.12.2 # openai # pydantic # pydantic-core - # sqlalchemy - # typing-inspect -typing-inspect==0.9.0 - # via - # -c ./ingest/../base.txt - # dataclasses-json urllib3==1.26.19 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt # requests -yarl==1.9.4 - # via aiohttp diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 
d9462c4cf..c9b932d95 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.1-dev2" # pragma: no cover +__version__ = "0.15.1-dev3" # pragma: no cover diff --git a/unstructured/embed/openai.py b/unstructured/embed/openai.py index 272b4f28f..a2f7d6472 100644 --- a/unstructured/embed/openai.py +++ b/unstructured/embed/openai.py @@ -12,7 +12,7 @@ from unstructured.ingest.error import EmbeddingEncoderConnectionError from unstructured.utils import requires_dependencies if TYPE_CHECKING: - from langchain_community.embeddings import OpenAIEmbeddings + from langchain_openai.embeddings import OpenAIEmbeddings @dataclass @@ -65,13 +65,10 @@ class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder): return elements @EmbeddingEncoderConnectionError.wrap - @requires_dependencies( - ["langchain_community", "openai", "tiktoken"], - extras="openai", - ) + @requires_dependencies(["langchain_openai"], extras="openai") def create_client(self) -> "OpenAIEmbeddings": """Creates a langchain OpenAI python client to embed elements.""" - from langchain_community.embeddings import OpenAIEmbeddings + from langchain_openai import OpenAIEmbeddings openai_client = OpenAIEmbeddings( openai_api_key=self.config.api_key,