chore: remove pins (#3579)

- Remove constraint pins for `Office365-REST-Python-Client`,
`weaviate-client`, and `platformdirs`. Removing the pin for `Office365`
brought to light some bugs in the OneDrive connector, so some changes
were also made to
`unstructured/ingest/v2/processes/connectors/onedrive.py`.
- Also, as part of updating dependencies, `unstructured-client` was
updated to `0.25.8`, which introduced a new default for the `strategy`
param and required updating a test fixture.
- The `hubspot.sh` integration test was failing and is now ignored in CI
with this PR per discussion with @rbiseck3.

May be easiest to review commit-by-commit.
This commit is contained in:
John 2024-09-12 09:48:59 -04:00 committed by GitHub
parent ebf16055d8
commit ab94c6c5d1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
37 changed files with 125 additions and 99 deletions

View File

@ -1,3 +1,11 @@
## 0.15.12-dev0
### Enhancements
### Features
### Fixes
## 0.15.11
### Enhancements

View File

@ -133,7 +133,7 @@ typing-inspect==0.9.0
# via
# dataclasses-json
# unstructured-client
unstructured-client==0.25.7
unstructured-client==0.25.8
# via -r ./base.in
urllib3==1.26.20
# via

View File

@ -6,23 +6,13 @@
# consistency with local-inference-pin
protobuf<4.24
grpcio>=1.65.5
# NOTE(alan) Pinned to avoid error that occurs with 2.4.3:
# AttributeError: 'ResourcePath' object has no attribute 'collection'
Office365-REST-Python-Client<2.4.3
# use the known compatible version of weaviate
weaviate-client>3.25.0
# TODO: Pinned in transformers package, remove when that gets updated
tokenizers>=0.19,<0.20
platformdirs==3.10.0
# TODO: Constraint due to boto, with python before 3.10 not requiring openssl 1.1.1, remove when that gets
# updated or we drop support for 3.9
urllib3<1.27
# TODO: Constraint due to aiobotocore, remove when that gets updated:
botocore<1.34.132
# python 3.12 support
wrapt>=1.14.0
langchain-community>=0.2.5

View File

@ -30,9 +30,8 @@ packaging==24.1
# build
pip-tools==7.4.1
# via -r ./dev.in
platformdirs==3.10.0
platformdirs==4.3.2
# via
# -c ././deps/constraints.txt
# -c ./test.txt
# virtualenv
pre-commit==3.8.0

View File

@ -58,7 +58,7 @@ imageio==2.35.1
# scikit-image
imgaug==0.4.0
# via unstructured-paddleocr
importlib-resources==6.4.4
importlib-resources==6.4.5
# via matplotlib
kiwisolver==1.4.7
# via matplotlib

View File

@ -75,7 +75,7 @@ idna==3.8
# via
# -c ./base.txt
# requests
importlib-resources==6.4.4
importlib-resources==6.4.5
# via matplotlib
iopath==0.1.10
# via layoutparser

View File

@ -8,9 +8,9 @@ anyio==4.4.0
# via
# -c ./ingest/../base.txt
# httpx
astrapy==1.4.1
astrapy==1.4.2
# via -r ./ingest/astradb.in
cassandra-driver==3.29.1
cassandra-driver==3.29.2
# via cassio
cassio==0.1.8
# via astrapy
@ -96,5 +96,5 @@ urllib3==1.26.20
# -c ./ingest/../base.txt
# -c ./ingest/../deps/constraints.txt
# requests
uuid6==2024.1.12
uuid6==2024.7.10
# via astrapy

View File

@ -67,7 +67,7 @@ msal==1.31.0
# msal-extensions
msal-extensions==1.2.0
# via azure-identity
multidict==6.0.5
multidict==6.1.0
# via
# aiohttp
# yarl
@ -94,10 +94,11 @@ typing-extensions==4.12.2
# azure-core
# azure-identity
# azure-storage-blob
# multidict
urllib3==1.26.20
# via
# -c ./ingest/../base.txt
# -c ./ingest/../deps/constraints.txt
# requests
yarl==1.11.0
yarl==1.11.1
# via aiohttp

View File

@ -102,7 +102,7 @@ importlib-metadata==8.4.0
# -r ./ingest/chroma.in
# build
# opentelemetry-api
importlib-resources==6.4.4
importlib-resources==6.4.5
# via chromadb
kubernetes==30.1.0
# via chromadb
@ -172,7 +172,7 @@ packaging==24.1
# build
# huggingface-hub
# onnxruntime
posthog==3.6.4
posthog==3.6.5
# via chromadb
protobuf==4.23.4
# via

View File

@ -60,7 +60,7 @@ requests==2.32.3
# via
# -c ./ingest/../base.txt
# clarifai-grpc
rich==13.8.0
rich==13.8.1
# via clarifai
schema==0.7.5
# via clarifai

View File

@ -14,7 +14,7 @@ charset-normalizer==3.3.2
# via
# -c ./ingest/../base.txt
# requests
databricks-sdk==0.32.0
databricks-sdk==0.32.1
# via -r ./ingest/databricks-volumes.in
google-auth==2.34.0
# via databricks-sdk

View File

@ -24,9 +24,13 @@ idna==3.8
# via
# -c ./ingest/../base.txt
# yarl
multidict==6.0.5
multidict==6.1.0
# via
# aiohttp
# yarl
yarl==1.11.0
typing-extensions==4.12.2
# via
# -c ./ingest/../base.txt
# multidict
yarl==1.11.1
# via aiohttp

View File

@ -20,7 +20,7 @@ certifi==2024.8.30
# elastic-transport
elastic-transport==8.15.0
# via elasticsearch
elasticsearch[async]==8.15.0
elasticsearch[async]==8.15.1
# via -r ./ingest/elasticsearch.in
frozenlist==1.4.1
# via
@ -30,14 +30,18 @@ idna==3.8
# via
# -c ./ingest/../base.txt
# yarl
multidict==6.0.5
multidict==6.1.0
# via
# aiohttp
# yarl
typing-extensions==4.12.2
# via
# -c ./ingest/../base.txt
# multidict
urllib3==1.26.20
# via
# -c ./ingest/../base.txt
# -c ./ingest/../deps/constraints.txt
# elastic-transport
yarl==1.11.0
yarl==1.11.1
# via aiohttp

View File

@ -93,7 +93,7 @@ langchain-core==0.2.38
# langchain-text-splitters
langchain-text-splitters==0.2.4
# via langchain
langsmith==0.1.116
langsmith==0.1.117
# via
# langchain
# langchain-community
@ -102,7 +102,7 @@ marshmallow==3.22.0
# via
# -c ./ingest/../base.txt
# dataclasses-json
multidict==6.0.5
multidict==6.1.0
# via
# aiohttp
# yarl
@ -169,6 +169,7 @@ typing-extensions==4.12.2
# -c ./ingest/../base.txt
# anyio
# langchain-core
# multidict
# pydantic
# pydantic-core
# sqlalchemy
@ -183,5 +184,5 @@ urllib3==1.26.20
# -c ./ingest/../deps/constraints.txt
# botocore
# requests
yarl==1.11.0
yarl==1.11.1
# via aiohttp

View File

@ -71,7 +71,7 @@ langchain-core==0.2.38
# via langchain-huggingface
langchain-huggingface==0.0.3
# via -r ./ingest/embed-huggingface.in
langsmith==0.1.116
langsmith==0.1.117
# via langchain-core
markupsafe==2.1.5
# via jinja2

View File

@ -47,7 +47,7 @@ idna==3.8
# requests
jiter==0.5.0
# via openai
openai==1.44.0
openai==1.44.1
# via -r ./ingest/embed-octoai.in
pydantic==2.9.1
# via openai

View File

@ -56,9 +56,9 @@ langchain-core==0.2.38
# via langchain-openai
langchain-openai==0.1.23
# via -r ./ingest/embed-openai.in
langsmith==0.1.116
langsmith==0.1.117
# via langchain-core
openai==1.44.0
openai==1.44.1
# via langchain-openai
orjson==3.10.7
# via langsmith

View File

@ -147,7 +147,7 @@ langchain-google-vertexai==1.0.10
# via -r ./ingest/embed-vertexai.in
langchain-text-splitters==0.2.4
# via langchain
langsmith==0.1.116
langsmith==0.1.117
# via
# langchain
# langchain-community
@ -156,7 +156,7 @@ marshmallow==3.22.0
# via
# -c ./ingest/../base.txt
# dataclasses-json
multidict==6.0.5
multidict==6.1.0
# via
# aiohttp
# yarl
@ -253,6 +253,7 @@ typing-extensions==4.12.2
# -c ./ingest/../base.txt
# anyio
# langchain-core
# multidict
# pydantic
# pydantic-core
# sqlalchemy
@ -266,5 +267,5 @@ urllib3==1.26.20
# -c ./ingest/../base.txt
# -c ./ingest/../deps/constraints.txt
# requests
yarl==1.11.0
yarl==1.11.1
# via aiohttp

View File

@ -78,11 +78,11 @@ langchain-text-splitters==0.2.4
# via langchain
langchain-voyageai==0.1.1
# via -r ./ingest/embed-voyageai.in
langsmith==0.1.116
langsmith==0.1.117
# via
# langchain
# langchain-core
multidict==6.0.5
multidict==6.1.0
# via
# aiohttp
# yarl
@ -131,6 +131,7 @@ typing-extensions==4.12.2
# -c ./ingest/../base.txt
# anyio
# langchain-core
# multidict
# pydantic
# pydantic-core
# sqlalchemy
@ -141,5 +142,5 @@ urllib3==1.26.20
# requests
voyageai==0.2.3
# via langchain-voyageai
yarl==1.11.0
yarl==1.11.1
# via aiohttp

View File

@ -72,7 +72,7 @@ idna==3.8
# -c ./ingest/../base.txt
# requests
# yarl
multidict==6.0.5
multidict==6.1.0
# via
# aiohttp
# yarl
@ -107,10 +107,14 @@ soupsieve==2.6
# via
# -c ./ingest/../base.txt
# beautifulsoup4
typing-extensions==4.12.2
# via
# -c ./ingest/../base.txt
# multidict
urllib3==1.26.20
# via
# -c ./ingest/../base.txt
# -c ./ingest/../deps/constraints.txt
# requests
yarl==1.11.0
yarl==1.11.1
# via aiohttp

View File

@ -32,10 +32,8 @@ msal==1.31.0
# via
# -r ./ingest/onedrive.in
# office365-rest-python-client
office365-rest-python-client==2.4.2
# via
# -c ./ingest/../deps/constraints.txt
# -r ./ingest/onedrive.in
office365-rest-python-client==2.5.12
# via -r ./ingest/onedrive.in
pycparser==2.22
# via cffi
pyjwt[crypto]==2.9.0
@ -51,6 +49,10 @@ soupsieve==2.6
# via
# -c ./ingest/../base.txt
# beautifulsoup4
typing-extensions==4.12.2
# via
# -c ./ingest/../base.txt
# office365-rest-python-client
urllib3==1.26.20
# via
# -c ./ingest/../base.txt

View File

@ -26,10 +26,8 @@ msal==1.31.0
# via
# -r ./ingest/outlook.in
# office365-rest-python-client
office365-rest-python-client==2.4.2
# via
# -c ./ingest/../deps/constraints.txt
# -r ./ingest/outlook.in
office365-rest-python-client==2.5.12
# via -r ./ingest/outlook.in
pycparser==2.22
# via cffi
pyjwt[crypto]==2.9.0
@ -41,6 +39,10 @@ requests==2.32.3
# -c ./ingest/../base.txt
# msal
# office365-rest-python-client
typing-extensions==4.12.2
# via
# -c ./ingest/../base.txt
# office365-rest-python-client
urllib3==1.26.20
# via
# -c ./ingest/../base.txt

View File

@ -38,7 +38,7 @@ idna==3.8
# yarl
jmespath==1.0.1
# via botocore
multidict==6.0.5
multidict==6.1.0
# via
# aiohttp
# yarl
@ -56,6 +56,7 @@ typing-extensions==4.12.2
# via
# -c ./ingest/../base.txt
# aioitertools
# multidict
urllib3==1.26.20
# via
# -c ./ingest/../base.txt
@ -66,5 +67,5 @@ wrapt==1.16.0
# -c ./ingest/../base.txt
# -c ./ingest/../deps/constraints.txt
# aiobotocore
yarl==1.11.0
yarl==1.11.1
# via aiohttp

View File

@ -30,10 +30,8 @@ lxml==5.3.0
# zeep
more-itertools==10.5.0
# via simple-salesforce
platformdirs==3.10.0
# via
# -c ./ingest/../deps/constraints.txt
# zeep
platformdirs==4.3.2
# via zeep
pycparser==2.22
# via cffi
pyjwt[crypto]==2.9.0

View File

@ -26,10 +26,8 @@ msal==1.31.0
# via
# -r ./ingest/sharepoint.in
# office365-rest-python-client
office365-rest-python-client==2.4.2
# via
# -c ./ingest/../deps/constraints.txt
# -r ./ingest/sharepoint.in
office365-rest-python-client==2.5.12
# via -r ./ingest/sharepoint.in
pycparser==2.22
# via cffi
pyjwt[crypto]==2.9.0
@ -41,6 +39,10 @@ requests==2.32.3
# -c ./ingest/../base.txt
# msal
# office365-rest-python-client
typing-extensions==4.12.2
# via
# -c ./ingest/../base.txt
# office365-rest-python-client
urllib3==1.26.20
# via
# -c ./ingest/../base.txt

View File

@ -36,6 +36,4 @@ urllib3==1.26.20
validators==0.34.0
# via weaviate-client
weaviate-client==3.26.7
# via
# -c ./ingest/../deps/constraints.txt
# -r ./ingest/weaviate.in
# via -r ./ingest/weaviate.in

View File

@ -93,7 +93,7 @@ lxml==5.3.0
# label-studio-sdk
mccabe==0.7.0
# via flake8
multidict==6.0.5
multidict==6.1.0
# via yarl
mypy==1.11.2
# via -r ./test.in
@ -121,10 +121,8 @@ pathspec==0.12.1
# via black
pillow==10.4.0
# via label-studio-sdk
platformdirs==3.10.0
# via
# -c ././deps/constraints.txt
# black
platformdirs==4.3.2
# via black
pluggy==1.5.0
# via pytest
pycodestyle==2.12.1
@ -143,7 +141,7 @@ pyflakes==3.2.0
# flake8
pyrsistent==0.20.0
# via jsonschema
pytest==8.3.2
pytest==8.3.3
# via
# pytest-cov
# pytest-mock
@ -214,6 +212,7 @@ typing-extensions==4.12.2
# anyio
# black
# label-studio-sdk
# multidict
# mypy
# pydantic
# pydantic-core
@ -236,7 +235,7 @@ wrapt==1.16.0
# vcrpy
xmljson==0.2.1
# via label-studio-sdk
yarl==1.11.0
yarl==1.11.1
# via vcrpy
# The following packages are considered to be unsafe in a requirements file:

View File

@ -515,7 +515,7 @@ def expected_call_():
split_pdf_concurrency_level=5,
split_pdf_page=True,
starting_page_number=None,
strategy=shared.Strategy.AUTO,
strategy=shared.Strategy.HI_RES,
unique_element_ids=False,
xml_keep_tags=False,
),

View File

@ -1 +1 @@
__version__ = "0.15.11" # pragma: no cover
__version__ = "0.15.12-dev0" # pragma: no cover

View File

@ -1,6 +1,6 @@
import logging
from dataclasses import dataclass, field
from typing import Optional, Type
from typing import Any, Optional, Type
import click
@ -30,7 +30,7 @@ class SrcCmd(BaseCmd):
]
)
def cmd(self, ctx: click.Context, **options) -> None:
def cmd(self, ctx: click.Context, **options: dict[str, Any]) -> None:
if ctx.invoked_subcommand:
return

View File

@ -13,7 +13,7 @@ from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
from unstructured.ingest.v2.logger import logger
def conform_click_options(options: dict):
def conform_click_options(options: dict[str, Any]) -> None:
# Click sets all multiple fields as tuple, this needs to be updated to list
for k, v in options.items():
if isinstance(v, tuple):

View File

@ -2,7 +2,7 @@ import os
from asyncio import Semaphore
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
from typing import Any, Optional
from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
@ -28,7 +28,7 @@ class ProcessorConfig(EnhancedDataClassJsonMixin):
uncompress: bool = False
# Used to keep track of state in pipeline
status: dict = field(default_factory=dict)
status: dict[str, Any] = field(default_factory=dict)
semaphore: Optional[Semaphore] = field(init=False, default=None)
def __post_init__(self):

View File

@ -1,3 +1,5 @@
from __future__ import annotations
import asyncio
import logging
import multiprocessing as mp
@ -138,7 +140,7 @@ class PipelineStep(ABC):
async def _run_async(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
raise NotImplementedError
def run(self, _fn: Optional[Callable] = None, **kwargs: Any) -> Optional[Any]:
def run(self, _fn: Callable[..., Any] | None = None, **kwargs: Any) -> Optional[Any]:
try:
fn = _fn or self.process.run
return self._run(fn=fn, **kwargs)

View File

@ -1,3 +1,5 @@
from __future__ import annotations
import logging
import multiprocessing as mp
from dataclasses import InitVar, dataclass, field
@ -44,24 +46,24 @@ class Pipeline:
partitioner: InitVar[Partitioner]
partitioner_step: PartitionStep = field(init=False)
chunker: InitVar[Optional[Chunker]] = None
chunker_step: ChunkStep = field(init=False, default=None)
chunker_step: ChunkStep | None = field(init=False, default=None)
embedder: InitVar[Optional[Embedder]] = None
embedder_step: EmbedStep = field(init=False, default=None)
embedder_step: EmbedStep | None = field(init=False, default=None)
stager: InitVar[Optional[UploadStager]] = None
stager_step: UploadStageStep = field(init=False, default=None)
stager_step: UploadStageStep | None = field(init=False, default=None)
uploader: InitVar[Uploader] = field(default=LocalUploader())
uploader_step: UploadStep = field(init=False, default=None)
uncompress_step: UncompressStep = field(init=False, default=None)
uploader_step: UploadStep | None = field(init=False, default=None)
uncompress_step: UncompressStep | None = field(init=False, default=None)
def __post_init__(
self,
indexer: IndexerT,
downloader: DownloaderT,
partitioner: Partitioner,
chunker: Chunker = None,
embedder: Embedder = None,
stager: UploadStager = None,
uploader: Uploader = None,
chunker: Chunker | None = None,
embedder: Embedder | None = None,
stager: UploadStager | None = None,
uploader: Uploader | None = None,
):
make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
self.indexer_step = IndexStep(process=indexer, context=self.context)
@ -117,7 +119,7 @@ class Pipeline:
if self.context.status:
raise PipelineError("Pipeline did not run successfully")
def clean_results(self, results: Optional[list[Union[Any, list[Any]]]]) -> Optional[list[Any]]:
def clean_results(self, results: list[Union[Any, list[Any]]] | None) -> list[Any] | None:
if not results:
return None
results = [r for r in results if r]

View File

@ -1,7 +1,9 @@
from __future__ import annotations
import hashlib
import json
from dataclasses import dataclass
from typing import Generator, Optional, TypeVar
from typing import Any, Callable, Generator, Optional, TypeVar
from unstructured.ingest.v2.interfaces.indexer import Indexer
from unstructured.ingest.v2.logger import logger
@ -37,7 +39,9 @@ class IndexStep(PipelineStep):
f"connection configs: {connection_config}"
)
def run(self) -> Generator[str, None, None]:
def run(
self, _fn: Callable[..., Any] | None = None, **kwargs: Any
) -> Generator[str, None, None]:
for file_data in self.process.run():
logger.debug(f"Generated file data: {file_data}")
try:

View File

@ -1,10 +1,11 @@
import json
from datetime import datetime
from pathlib import Path
from typing import Any
def sterilize_dict(data: dict) -> dict:
def json_serial(obj):
def sterilize_dict(data: dict[str, Any]) -> dict[str, Any]:
def json_serial(obj: Any) -> str:
if isinstance(obj, Path):
return obj.as_posix()
if isinstance(obj, datetime):

View File

@ -1,3 +1,5 @@
from __future__ import annotations
import json
from dataclasses import dataclass, field
from pathlib import Path
@ -87,8 +89,8 @@ class OnedriveIndexer(Indexer):
connection_config: OnedriveConnectionConfig
index_config: OnedriveIndexerConfig
def list_objects(self, folder, recursive) -> list["DriveItem"]:
drive_items = folder.children.get().execute_query()
def list_objects(self, folder: DriveItem, recursive: bool) -> list[DriveItem]:
drive_items: list[DriveItem] = list(folder.children.get().execute_query())
files = [d for d in drive_items if d.is_file]
if not recursive:
return files
@ -123,12 +125,12 @@ class OnedriveIndexer(Indexer):
server_path = file_path + "/" + filename
rel_path = server_path.replace(self.index_config.path, "").lstrip("/")
date_modified_dt = (
parser.parse(drive_item.last_modified_datetime)
parser.parse(str(drive_item.last_modified_datetime))
if drive_item.last_modified_datetime
else None
)
date_created_at = (
parser.parse(drive_item.created_datetime) if drive_item.created_datetime else None
parser.parse(str(drive_item.created_datetime)) if drive_item.created_datetime else None
)
return FileData(
identifier=drive_item.id,
@ -140,7 +142,7 @@ class OnedriveIndexer(Indexer):
url=drive_item.parent_reference.path + "/" + drive_item.name,
version=drive_item.etag,
date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
date_created=str(date_created_at.timestamp()) if date_modified_dt else None,
date_created=str(date_created_at.timestamp()) if date_created_at else None,
date_processed=str(time()),
record_locator={
"user_pname": self.connection_config.user_pname,