mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-26 06:36:06 +00:00
chore: remove pins (#3579)
- Remove constraint pins for `Office365-REST-Python-Client`, `weaviate-client`, and `platformdirs`. Removing the pin for `Office365` brought to light some bugs in the OneDrive connector, so some changes were also made to `unstructured/ingest/v2/processes/connectors/onedrive.py`. - Also, as part of updating dependencies, `unstructured-client` was updated to `0.25.8`, which introduced a new default for the `strategy` param and required updating a test fixture. - The `hubspot.sh` integration test was failing and is now ignored in CI with this PR per discussion with @rbiseck3. May be easiest to review commit-by-commit.
This commit is contained in:
parent
ebf16055d8
commit
ab94c6c5d1
@ -1,3 +1,11 @@
|
||||
## 0.15.12-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
## 0.15.11
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -133,7 +133,7 @@ typing-inspect==0.9.0
|
||||
# via
|
||||
# dataclasses-json
|
||||
# unstructured-client
|
||||
unstructured-client==0.25.7
|
||||
unstructured-client==0.25.8
|
||||
# via -r ./base.in
|
||||
urllib3==1.26.20
|
||||
# via
|
||||
|
||||
@ -6,23 +6,13 @@
|
||||
# consistency with local-inference-pin
|
||||
protobuf<4.24
|
||||
grpcio>=1.65.5
|
||||
# NOTE(alan) Pinned to avoid error that occurs with 2.4.3:
|
||||
# AttributeError: 'ResourcePath' object has no attribute 'collection'
|
||||
Office365-REST-Python-Client<2.4.3
|
||||
# use the known compatible version of weaviate
|
||||
weaviate-client>3.25.0
|
||||
# TODO: Pinned in transformers package, remove when that gets updated
|
||||
tokenizers>=0.19,<0.20
|
||||
platformdirs==3.10.0
|
||||
|
||||
# TODO: Constraint due to boto, with python before 3.10 not requiring openssl 1.1.1, remove when that gets
|
||||
# updated or we drop support for 3.9
|
||||
urllib3<1.27
|
||||
|
||||
# TODO: Constraint due to aiobotocore, remove when that gets updated:
|
||||
botocore<1.34.132
|
||||
|
||||
# python 3.12 support
|
||||
wrapt>=1.14.0
|
||||
|
||||
langchain-community>=0.2.5
|
||||
|
||||
@ -30,9 +30,8 @@ packaging==24.1
|
||||
# build
|
||||
pip-tools==7.4.1
|
||||
# via -r ./dev.in
|
||||
platformdirs==3.10.0
|
||||
platformdirs==4.3.2
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# -c ./test.txt
|
||||
# virtualenv
|
||||
pre-commit==3.8.0
|
||||
|
||||
@ -58,7 +58,7 @@ imageio==2.35.1
|
||||
# scikit-image
|
||||
imgaug==0.4.0
|
||||
# via unstructured-paddleocr
|
||||
importlib-resources==6.4.4
|
||||
importlib-resources==6.4.5
|
||||
# via matplotlib
|
||||
kiwisolver==1.4.7
|
||||
# via matplotlib
|
||||
|
||||
@ -75,7 +75,7 @@ idna==3.8
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
importlib-resources==6.4.4
|
||||
importlib-resources==6.4.5
|
||||
# via matplotlib
|
||||
iopath==0.1.10
|
||||
# via layoutparser
|
||||
|
||||
@ -8,9 +8,9 @@ anyio==4.4.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# httpx
|
||||
astrapy==1.4.1
|
||||
astrapy==1.4.2
|
||||
# via -r ./ingest/astradb.in
|
||||
cassandra-driver==3.29.1
|
||||
cassandra-driver==3.29.2
|
||||
# via cassio
|
||||
cassio==0.1.8
|
||||
# via astrapy
|
||||
@ -96,5 +96,5 @@ urllib3==1.26.20
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# requests
|
||||
uuid6==2024.1.12
|
||||
uuid6==2024.7.10
|
||||
# via astrapy
|
||||
|
||||
@ -67,7 +67,7 @@ msal==1.31.0
|
||||
# msal-extensions
|
||||
msal-extensions==1.2.0
|
||||
# via azure-identity
|
||||
multidict==6.0.5
|
||||
multidict==6.1.0
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
@ -94,10 +94,11 @@ typing-extensions==4.12.2
|
||||
# azure-core
|
||||
# azure-identity
|
||||
# azure-storage-blob
|
||||
# multidict
|
||||
urllib3==1.26.20
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# requests
|
||||
yarl==1.11.0
|
||||
yarl==1.11.1
|
||||
# via aiohttp
|
||||
|
||||
@ -102,7 +102,7 @@ importlib-metadata==8.4.0
|
||||
# -r ./ingest/chroma.in
|
||||
# build
|
||||
# opentelemetry-api
|
||||
importlib-resources==6.4.4
|
||||
importlib-resources==6.4.5
|
||||
# via chromadb
|
||||
kubernetes==30.1.0
|
||||
# via chromadb
|
||||
@ -172,7 +172,7 @@ packaging==24.1
|
||||
# build
|
||||
# huggingface-hub
|
||||
# onnxruntime
|
||||
posthog==3.6.4
|
||||
posthog==3.6.5
|
||||
# via chromadb
|
||||
protobuf==4.23.4
|
||||
# via
|
||||
|
||||
@ -60,7 +60,7 @@ requests==2.32.3
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# clarifai-grpc
|
||||
rich==13.8.0
|
||||
rich==13.8.1
|
||||
# via clarifai
|
||||
schema==0.7.5
|
||||
# via clarifai
|
||||
|
||||
@ -14,7 +14,7 @@ charset-normalizer==3.3.2
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# requests
|
||||
databricks-sdk==0.32.0
|
||||
databricks-sdk==0.32.1
|
||||
# via -r ./ingest/databricks-volumes.in
|
||||
google-auth==2.34.0
|
||||
# via databricks-sdk
|
||||
|
||||
@ -24,9 +24,13 @@ idna==3.8
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# yarl
|
||||
multidict==6.0.5
|
||||
multidict==6.1.0
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
yarl==1.11.0
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# multidict
|
||||
yarl==1.11.1
|
||||
# via aiohttp
|
||||
|
||||
@ -20,7 +20,7 @@ certifi==2024.8.30
|
||||
# elastic-transport
|
||||
elastic-transport==8.15.0
|
||||
# via elasticsearch
|
||||
elasticsearch[async]==8.15.0
|
||||
elasticsearch[async]==8.15.1
|
||||
# via -r ./ingest/elasticsearch.in
|
||||
frozenlist==1.4.1
|
||||
# via
|
||||
@ -30,14 +30,18 @@ idna==3.8
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# yarl
|
||||
multidict==6.0.5
|
||||
multidict==6.1.0
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# multidict
|
||||
urllib3==1.26.20
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# elastic-transport
|
||||
yarl==1.11.0
|
||||
yarl==1.11.1
|
||||
# via aiohttp
|
||||
|
||||
@ -93,7 +93,7 @@ langchain-core==0.2.38
|
||||
# langchain-text-splitters
|
||||
langchain-text-splitters==0.2.4
|
||||
# via langchain
|
||||
langsmith==0.1.116
|
||||
langsmith==0.1.117
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
@ -102,7 +102,7 @@ marshmallow==3.22.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# dataclasses-json
|
||||
multidict==6.0.5
|
||||
multidict==6.1.0
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
@ -169,6 +169,7 @@ typing-extensions==4.12.2
|
||||
# -c ./ingest/../base.txt
|
||||
# anyio
|
||||
# langchain-core
|
||||
# multidict
|
||||
# pydantic
|
||||
# pydantic-core
|
||||
# sqlalchemy
|
||||
@ -183,5 +184,5 @@ urllib3==1.26.20
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# botocore
|
||||
# requests
|
||||
yarl==1.11.0
|
||||
yarl==1.11.1
|
||||
# via aiohttp
|
||||
|
||||
@ -71,7 +71,7 @@ langchain-core==0.2.38
|
||||
# via langchain-huggingface
|
||||
langchain-huggingface==0.0.3
|
||||
# via -r ./ingest/embed-huggingface.in
|
||||
langsmith==0.1.116
|
||||
langsmith==0.1.117
|
||||
# via langchain-core
|
||||
markupsafe==2.1.5
|
||||
# via jinja2
|
||||
|
||||
@ -47,7 +47,7 @@ idna==3.8
|
||||
# requests
|
||||
jiter==0.5.0
|
||||
# via openai
|
||||
openai==1.44.0
|
||||
openai==1.44.1
|
||||
# via -r ./ingest/embed-octoai.in
|
||||
pydantic==2.9.1
|
||||
# via openai
|
||||
|
||||
@ -56,9 +56,9 @@ langchain-core==0.2.38
|
||||
# via langchain-openai
|
||||
langchain-openai==0.1.23
|
||||
# via -r ./ingest/embed-openai.in
|
||||
langsmith==0.1.116
|
||||
langsmith==0.1.117
|
||||
# via langchain-core
|
||||
openai==1.44.0
|
||||
openai==1.44.1
|
||||
# via langchain-openai
|
||||
orjson==3.10.7
|
||||
# via langsmith
|
||||
|
||||
@ -147,7 +147,7 @@ langchain-google-vertexai==1.0.10
|
||||
# via -r ./ingest/embed-vertexai.in
|
||||
langchain-text-splitters==0.2.4
|
||||
# via langchain
|
||||
langsmith==0.1.116
|
||||
langsmith==0.1.117
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
@ -156,7 +156,7 @@ marshmallow==3.22.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# dataclasses-json
|
||||
multidict==6.0.5
|
||||
multidict==6.1.0
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
@ -253,6 +253,7 @@ typing-extensions==4.12.2
|
||||
# -c ./ingest/../base.txt
|
||||
# anyio
|
||||
# langchain-core
|
||||
# multidict
|
||||
# pydantic
|
||||
# pydantic-core
|
||||
# sqlalchemy
|
||||
@ -266,5 +267,5 @@ urllib3==1.26.20
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# requests
|
||||
yarl==1.11.0
|
||||
yarl==1.11.1
|
||||
# via aiohttp
|
||||
|
||||
@ -78,11 +78,11 @@ langchain-text-splitters==0.2.4
|
||||
# via langchain
|
||||
langchain-voyageai==0.1.1
|
||||
# via -r ./ingest/embed-voyageai.in
|
||||
langsmith==0.1.116
|
||||
langsmith==0.1.117
|
||||
# via
|
||||
# langchain
|
||||
# langchain-core
|
||||
multidict==6.0.5
|
||||
multidict==6.1.0
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
@ -131,6 +131,7 @@ typing-extensions==4.12.2
|
||||
# -c ./ingest/../base.txt
|
||||
# anyio
|
||||
# langchain-core
|
||||
# multidict
|
||||
# pydantic
|
||||
# pydantic-core
|
||||
# sqlalchemy
|
||||
@ -141,5 +142,5 @@ urllib3==1.26.20
|
||||
# requests
|
||||
voyageai==0.2.3
|
||||
# via langchain-voyageai
|
||||
yarl==1.11.0
|
||||
yarl==1.11.1
|
||||
# via aiohttp
|
||||
|
||||
@ -72,7 +72,7 @@ idna==3.8
|
||||
# -c ./ingest/../base.txt
|
||||
# requests
|
||||
# yarl
|
||||
multidict==6.0.5
|
||||
multidict==6.1.0
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
@ -107,10 +107,14 @@ soupsieve==2.6
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# beautifulsoup4
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# multidict
|
||||
urllib3==1.26.20
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# requests
|
||||
yarl==1.11.0
|
||||
yarl==1.11.1
|
||||
# via aiohttp
|
||||
|
||||
@ -32,10 +32,8 @@ msal==1.31.0
|
||||
# via
|
||||
# -r ./ingest/onedrive.in
|
||||
# office365-rest-python-client
|
||||
office365-rest-python-client==2.4.2
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# -r ./ingest/onedrive.in
|
||||
office365-rest-python-client==2.5.12
|
||||
# via -r ./ingest/onedrive.in
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
pyjwt[crypto]==2.9.0
|
||||
@ -51,6 +49,10 @@ soupsieve==2.6
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# beautifulsoup4
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# office365-rest-python-client
|
||||
urllib3==1.26.20
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
|
||||
@ -26,10 +26,8 @@ msal==1.31.0
|
||||
# via
|
||||
# -r ./ingest/outlook.in
|
||||
# office365-rest-python-client
|
||||
office365-rest-python-client==2.4.2
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# -r ./ingest/outlook.in
|
||||
office365-rest-python-client==2.5.12
|
||||
# via -r ./ingest/outlook.in
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
pyjwt[crypto]==2.9.0
|
||||
@ -41,6 +39,10 @@ requests==2.32.3
|
||||
# -c ./ingest/../base.txt
|
||||
# msal
|
||||
# office365-rest-python-client
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# office365-rest-python-client
|
||||
urllib3==1.26.20
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
|
||||
@ -38,7 +38,7 @@ idna==3.8
|
||||
# yarl
|
||||
jmespath==1.0.1
|
||||
# via botocore
|
||||
multidict==6.0.5
|
||||
multidict==6.1.0
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
@ -56,6 +56,7 @@ typing-extensions==4.12.2
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# aioitertools
|
||||
# multidict
|
||||
urllib3==1.26.20
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
@ -66,5 +67,5 @@ wrapt==1.16.0
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# aiobotocore
|
||||
yarl==1.11.0
|
||||
yarl==1.11.1
|
||||
# via aiohttp
|
||||
|
||||
@ -30,10 +30,8 @@ lxml==5.3.0
|
||||
# zeep
|
||||
more-itertools==10.5.0
|
||||
# via simple-salesforce
|
||||
platformdirs==3.10.0
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# zeep
|
||||
platformdirs==4.3.2
|
||||
# via zeep
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
pyjwt[crypto]==2.9.0
|
||||
|
||||
@ -26,10 +26,8 @@ msal==1.31.0
|
||||
# via
|
||||
# -r ./ingest/sharepoint.in
|
||||
# office365-rest-python-client
|
||||
office365-rest-python-client==2.4.2
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# -r ./ingest/sharepoint.in
|
||||
office365-rest-python-client==2.5.12
|
||||
# via -r ./ingest/sharepoint.in
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
pyjwt[crypto]==2.9.0
|
||||
@ -41,6 +39,10 @@ requests==2.32.3
|
||||
# -c ./ingest/../base.txt
|
||||
# msal
|
||||
# office365-rest-python-client
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# office365-rest-python-client
|
||||
urllib3==1.26.20
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
|
||||
@ -36,6 +36,4 @@ urllib3==1.26.20
|
||||
validators==0.34.0
|
||||
# via weaviate-client
|
||||
weaviate-client==3.26.7
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# -r ./ingest/weaviate.in
|
||||
# via -r ./ingest/weaviate.in
|
||||
|
||||
@ -93,7 +93,7 @@ lxml==5.3.0
|
||||
# label-studio-sdk
|
||||
mccabe==0.7.0
|
||||
# via flake8
|
||||
multidict==6.0.5
|
||||
multidict==6.1.0
|
||||
# via yarl
|
||||
mypy==1.11.2
|
||||
# via -r ./test.in
|
||||
@ -121,10 +121,8 @@ pathspec==0.12.1
|
||||
# via black
|
||||
pillow==10.4.0
|
||||
# via label-studio-sdk
|
||||
platformdirs==3.10.0
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# black
|
||||
platformdirs==4.3.2
|
||||
# via black
|
||||
pluggy==1.5.0
|
||||
# via pytest
|
||||
pycodestyle==2.12.1
|
||||
@ -143,7 +141,7 @@ pyflakes==3.2.0
|
||||
# flake8
|
||||
pyrsistent==0.20.0
|
||||
# via jsonschema
|
||||
pytest==8.3.2
|
||||
pytest==8.3.3
|
||||
# via
|
||||
# pytest-cov
|
||||
# pytest-mock
|
||||
@ -214,6 +212,7 @@ typing-extensions==4.12.2
|
||||
# anyio
|
||||
# black
|
||||
# label-studio-sdk
|
||||
# multidict
|
||||
# mypy
|
||||
# pydantic
|
||||
# pydantic-core
|
||||
@ -236,7 +235,7 @@ wrapt==1.16.0
|
||||
# vcrpy
|
||||
xmljson==0.2.1
|
||||
# via label-studio-sdk
|
||||
yarl==1.11.0
|
||||
yarl==1.11.1
|
||||
# via vcrpy
|
||||
|
||||
# The following packages are considered to be unsafe in a requirements file:
|
||||
|
||||
@ -515,7 +515,7 @@ def expected_call_():
|
||||
split_pdf_concurrency_level=5,
|
||||
split_pdf_page=True,
|
||||
starting_page_number=None,
|
||||
strategy=shared.Strategy.AUTO,
|
||||
strategy=shared.Strategy.HI_RES,
|
||||
unique_element_ids=False,
|
||||
xml_keep_tags=False,
|
||||
),
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.15.11" # pragma: no cover
|
||||
__version__ = "0.15.12-dev0" # pragma: no cover
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional, Type
|
||||
from typing import Any, Optional, Type
|
||||
|
||||
import click
|
||||
|
||||
@ -30,7 +30,7 @@ class SrcCmd(BaseCmd):
|
||||
]
|
||||
)
|
||||
|
||||
def cmd(self, ctx: click.Context, **options) -> None:
|
||||
def cmd(self, ctx: click.Context, **options: dict[str, Any]) -> None:
|
||||
if ctx.invoked_subcommand:
|
||||
return
|
||||
|
||||
|
||||
@ -13,7 +13,7 @@ from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
|
||||
from unstructured.ingest.v2.logger import logger
|
||||
|
||||
|
||||
def conform_click_options(options: dict):
|
||||
def conform_click_options(options: dict[str, Any]) -> None:
|
||||
# Click sets all multiple fields as tuple, this needs to be updated to list
|
||||
for k, v in options.items():
|
||||
if isinstance(v, tuple):
|
||||
|
||||
@ -2,7 +2,7 @@ import os
|
||||
from asyncio import Semaphore
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from typing import Any, Optional
|
||||
|
||||
from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
|
||||
|
||||
@ -28,7 +28,7 @@ class ProcessorConfig(EnhancedDataClassJsonMixin):
|
||||
uncompress: bool = False
|
||||
|
||||
# Used to keep track of state in pipeline
|
||||
status: dict = field(default_factory=dict)
|
||||
status: dict[str, Any] = field(default_factory=dict)
|
||||
semaphore: Optional[Semaphore] = field(init=False, default=None)
|
||||
|
||||
def __post_init__(self):
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import multiprocessing as mp
|
||||
@ -138,7 +140,7 @@ class PipelineStep(ABC):
|
||||
async def _run_async(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
|
||||
raise NotImplementedError
|
||||
|
||||
def run(self, _fn: Optional[Callable] = None, **kwargs: Any) -> Optional[Any]:
|
||||
def run(self, _fn: Callable[..., Any] | None = None, **kwargs: Any) -> Optional[Any]:
|
||||
try:
|
||||
fn = _fn or self.process.run
|
||||
return self._run(fn=fn, **kwargs)
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import multiprocessing as mp
|
||||
from dataclasses import InitVar, dataclass, field
|
||||
@ -44,24 +46,24 @@ class Pipeline:
|
||||
partitioner: InitVar[Partitioner]
|
||||
partitioner_step: PartitionStep = field(init=False)
|
||||
chunker: InitVar[Optional[Chunker]] = None
|
||||
chunker_step: ChunkStep = field(init=False, default=None)
|
||||
chunker_step: ChunkStep | None = field(init=False, default=None)
|
||||
embedder: InitVar[Optional[Embedder]] = None
|
||||
embedder_step: EmbedStep = field(init=False, default=None)
|
||||
embedder_step: EmbedStep | None = field(init=False, default=None)
|
||||
stager: InitVar[Optional[UploadStager]] = None
|
||||
stager_step: UploadStageStep = field(init=False, default=None)
|
||||
stager_step: UploadStageStep | None = field(init=False, default=None)
|
||||
uploader: InitVar[Uploader] = field(default=LocalUploader())
|
||||
uploader_step: UploadStep = field(init=False, default=None)
|
||||
uncompress_step: UncompressStep = field(init=False, default=None)
|
||||
uploader_step: UploadStep | None = field(init=False, default=None)
|
||||
uncompress_step: UncompressStep | None = field(init=False, default=None)
|
||||
|
||||
def __post_init__(
|
||||
self,
|
||||
indexer: IndexerT,
|
||||
downloader: DownloaderT,
|
||||
partitioner: Partitioner,
|
||||
chunker: Chunker = None,
|
||||
embedder: Embedder = None,
|
||||
stager: UploadStager = None,
|
||||
uploader: Uploader = None,
|
||||
chunker: Chunker | None = None,
|
||||
embedder: Embedder | None = None,
|
||||
stager: UploadStager | None = None,
|
||||
uploader: Uploader | None = None,
|
||||
):
|
||||
make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
|
||||
self.indexer_step = IndexStep(process=indexer, context=self.context)
|
||||
@ -117,7 +119,7 @@ class Pipeline:
|
||||
if self.context.status:
|
||||
raise PipelineError("Pipeline did not run successfully")
|
||||
|
||||
def clean_results(self, results: Optional[list[Union[Any, list[Any]]]]) -> Optional[list[Any]]:
|
||||
def clean_results(self, results: list[Union[Any, list[Any]]] | None) -> list[Any] | None:
|
||||
if not results:
|
||||
return None
|
||||
results = [r for r in results if r]
|
||||
|
||||
@ -1,7 +1,9 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from typing import Generator, Optional, TypeVar
|
||||
from typing import Any, Callable, Generator, Optional, TypeVar
|
||||
|
||||
from unstructured.ingest.v2.interfaces.indexer import Indexer
|
||||
from unstructured.ingest.v2.logger import logger
|
||||
@ -37,7 +39,9 @@ class IndexStep(PipelineStep):
|
||||
f"connection configs: {connection_config}"
|
||||
)
|
||||
|
||||
def run(self) -> Generator[str, None, None]:
|
||||
def run(
|
||||
self, _fn: Callable[..., Any] | None = None, **kwargs: Any
|
||||
) -> Generator[str, None, None]:
|
||||
for file_data in self.process.run():
|
||||
logger.debug(f"Generated file data: {file_data}")
|
||||
try:
|
||||
|
||||
@ -1,10 +1,11 @@
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
def sterilize_dict(data: dict) -> dict:
|
||||
def json_serial(obj):
|
||||
def sterilize_dict(data: dict[str, Any]) -> dict[str, Any]:
|
||||
def json_serial(obj: Any) -> str:
|
||||
if isinstance(obj, Path):
|
||||
return obj.as_posix()
|
||||
if isinstance(obj, datetime):
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
@ -87,8 +89,8 @@ class OnedriveIndexer(Indexer):
|
||||
connection_config: OnedriveConnectionConfig
|
||||
index_config: OnedriveIndexerConfig
|
||||
|
||||
def list_objects(self, folder, recursive) -> list["DriveItem"]:
|
||||
drive_items = folder.children.get().execute_query()
|
||||
def list_objects(self, folder: DriveItem, recursive: bool) -> list[DriveItem]:
|
||||
drive_items: list[DriveItem] = list(folder.children.get().execute_query())
|
||||
files = [d for d in drive_items if d.is_file]
|
||||
if not recursive:
|
||||
return files
|
||||
@ -123,12 +125,12 @@ class OnedriveIndexer(Indexer):
|
||||
server_path = file_path + "/" + filename
|
||||
rel_path = server_path.replace(self.index_config.path, "").lstrip("/")
|
||||
date_modified_dt = (
|
||||
parser.parse(drive_item.last_modified_datetime)
|
||||
parser.parse(str(drive_item.last_modified_datetime))
|
||||
if drive_item.last_modified_datetime
|
||||
else None
|
||||
)
|
||||
date_created_at = (
|
||||
parser.parse(drive_item.created_datetime) if drive_item.created_datetime else None
|
||||
parser.parse(str(drive_item.created_datetime)) if drive_item.created_datetime else None
|
||||
)
|
||||
return FileData(
|
||||
identifier=drive_item.id,
|
||||
@ -140,7 +142,7 @@ class OnedriveIndexer(Indexer):
|
||||
url=drive_item.parent_reference.path + "/" + drive_item.name,
|
||||
version=drive_item.etag,
|
||||
date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
|
||||
date_created=str(date_created_at.timestamp()) if date_modified_dt else None,
|
||||
date_created=str(date_created_at.timestamp()) if date_created_at else None,
|
||||
date_processed=str(time()),
|
||||
record_locator={
|
||||
"user_pname": self.connection_config.user_pname,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user