fix: update OpenAIEmbeddingEncoder to use langchain-openai instead of langchain-community (#3433)

Closes https://github.com/Unstructured-IO/unstructured/issues/3378.

### Summary
This PR updates `OpenAIEmbeddingEncoder` to use
`OpenAIEmbeddings` from the `langchain-openai` package instead of the
deprecated version from `langchain-community`. This resolves the
deprecation warning and ensures compatibility with future versions of
langchain.
This commit is contained in:
Christine Straub 2024-07-24 09:52:34 -07:00 committed by GitHub
parent 3fe5c094fa
commit 798dcc096c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 17 additions and 92 deletions

View File

@ -1,4 +1,4 @@
## 0.15.1-dev2
## 0.15.1-dev3
### Enhancements
@ -6,6 +6,7 @@
### Fixes
* **Update `OpenAIEmbeddingEncoder` to use `OpenAIEmbeddings` from the `langchain-openai` package instead of the deprecated version from `langchain-community`.** This resolves the deprecation warning and ensures compatibility with future versions of langchain.
* **Update import of Pinecone exception.** Adds compatibility for pinecone-client>=5.0.0.
* **File-type detection catches non-existent file-path.** `detect_filetype()` no longer silently falls back to detecting a file-type based on the extension when no file exists at the path provided. Instead `FileNotFoundError` is raised. This provides consistent user notification of a mis-typed path rather than an unpredictable exception from a file-type specific partitioner when the file cannot be opened.
* **EML files specified as a file-path are detected correctly.** Resolved a bug where an EML file submitted to `partition()` as a file-path was identified as TXT and partitioned using `partition_text()`. EML files specified by path are now identified and processed correctly, including processing any attachments.

View File

@ -1,5 +1,4 @@
-c ../deps/constraints.txt
-c ../base.txt
langchain-community
tiktoken
openai
langchain-openai

View File

@ -4,12 +4,6 @@
#
# pip-compile ./ingest/embed-openai.in
#
aiohttp==3.9.5
# via
# langchain
# langchain-community
aiosignal==1.3.1
# via aiohttp
annotated-types==0.7.0
# via pydantic
anyio==3.7.1
@ -18,12 +12,6 @@ anyio==3.7.1
# -c ./ingest/../deps/constraints.txt
# httpx
# openai
async-timeout==4.0.3
# via
# aiohttp
# langchain
attrs==23.2.0
# via aiohttp
certifi==2024.7.4
# via
# -c ./ingest/../base.txt
@ -35,20 +23,12 @@ charset-normalizer==3.3.2
# via
# -c ./ingest/../base.txt
# requests
dataclasses-json==0.6.7
# via
# -c ./ingest/../base.txt
# langchain-community
distro==1.9.0
# via openai
exceptiongroup==1.2.2
# via
# -c ./ingest/../base.txt
# anyio
frozenlist==1.4.1
# via
# aiohttp
# aiosignal
h11==0.14.0
# via
# -c ./ingest/../base.txt
@ -67,48 +47,18 @@ idna==3.7
# anyio
# httpx
# requests
# yarl
jsonpatch==1.33
# via langchain-core
jsonpointer==3.0.0
# via jsonpatch
langchain==0.2.11
# via langchain-community
langchain-community==0.2.10
# via
# -c ./ingest/../deps/constraints.txt
# -r ./ingest/embed-openai.in
langchain-core==0.2.23
# via
# langchain
# langchain-community
# langchain-text-splitters
langchain-text-splitters==0.2.2
# via langchain
langsmith==0.1.93
# via
# langchain
# langchain-community
# langchain-core
marshmallow==3.21.3
# via
# -c ./ingest/../base.txt
# dataclasses-json
multidict==6.0.5
# via
# aiohttp
# yarl
mypy-extensions==1.0.0
# via
# -c ./ingest/../base.txt
# typing-inspect
numpy==1.26.4
# via
# -c ./ingest/../base.txt
# langchain
# langchain-community
openai==1.37.0
# via langchain-openai
langchain-openai==0.1.17
# via -r ./ingest/embed-openai.in
langsmith==0.1.93
# via langchain-core
openai==1.37.0
# via langchain-openai
orjson==3.10.6
# via langsmith
packaging==23.2
@ -116,20 +66,15 @@ packaging==23.2
# -c ./ingest/../base.txt
# -c ./ingest/../deps/constraints.txt
# langchain-core
# marshmallow
pydantic==2.8.2
# via
# langchain
# langchain-core
# langsmith
# openai
pydantic-core==2.20.1
# via pydantic
pyyaml==6.0.1
# via
# langchain
# langchain-community
# langchain-core
# via langchain-core
regex==2024.5.15
# via
# -c ./ingest/../base.txt
@ -137,8 +82,6 @@ regex==2024.5.15
requests==2.32.3
# via
# -c ./ingest/../base.txt
# langchain
# langchain-community
# langsmith
# tiktoken
sniffio==1.3.1
@ -147,17 +90,10 @@ sniffio==1.3.1
# anyio
# httpx
# openai
sqlalchemy==2.0.31
# via
# langchain
# langchain-community
tenacity==8.5.0
# via
# langchain
# langchain-community
# langchain-core
# via langchain-core
tiktoken==0.7.0
# via -r ./ingest/embed-openai.in
# via langchain-openai
tqdm==4.66.4
# via
# -c ./ingest/../base.txt
@ -168,16 +104,8 @@ typing-extensions==4.12.2
# openai
# pydantic
# pydantic-core
# sqlalchemy
# typing-inspect
typing-inspect==0.9.0
# via
# -c ./ingest/../base.txt
# dataclasses-json
urllib3==1.26.19
# via
# -c ./ingest/../base.txt
# -c ./ingest/../deps/constraints.txt
# requests
yarl==1.9.4
# via aiohttp

View File

@ -1 +1 @@
__version__ = "0.15.1-dev2" # pragma: no cover
__version__ = "0.15.1-dev3" # pragma: no cover

View File

@ -12,7 +12,7 @@ from unstructured.ingest.error import EmbeddingEncoderConnectionError
from unstructured.utils import requires_dependencies
if TYPE_CHECKING:
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_openai.embeddings import OpenAIEmbeddings
@dataclass
@ -65,13 +65,10 @@ class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
return elements
@EmbeddingEncoderConnectionError.wrap
@requires_dependencies(
["langchain_community", "openai", "tiktoken"],
extras="openai",
)
@requires_dependencies(["langchain_openai"], extras="openai")
def create_client(self) -> "OpenAIEmbeddings":
"""Creates a langchain OpenAI python client to embed elements."""
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
openai_client = OpenAIEmbeddings(
openai_api_key=self.config.api_key,