mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-02 02:53:31 +00:00
support passing credentials from memory for google connectors (#1888)
### Description
### Google Drive
The existing service account parameter was expanded to support either a
file path or a json value to generate the credentials when instantiating
the google drive client.
### GCS
Google Cloud Storage already supports the value being passed in, from
their docstring:
> - you may supply a token generated by the
[gcloud](https://cloud.google.com/sdk/docs/)
utility; this is either a python dictionary, the name of a file
containing the JSON returned by logging in with the gcloud CLI tool,
or a Credentials object.
I tested this locally:
```python
from gcsfs import GCSFileSystem
import json
with open("/Users/romanisecke/.ssh/google-cloud-unstructured-ingest-test-d4fc30286d9d.json") as json_file:
json_data = json.load(json_file)
print(json_data)
fs = GCSFileSystem(token=json_data)
print(fs.ls(path="gs://utic-test-ingest-fixtures/"))
```
`['utic-test-ingest-fixtures/ideas-page.html',
'utic-test-ingest-fixtures/nested-1',
'utic-test-ingest-fixtures/nested-2']`
This commit is contained in:
parent
922bc84cee
commit
123ad20f4c
@ -1,9 +1,10 @@
|
||||
## 0.10.29-dev2
|
||||
## 0.10.29-dev3
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **Add include_header argument for partition_csv and partition_tsv** Now supports retaining header rows in CSV and TSV documents element partitioning.
|
||||
* **Add retry logic for all source connectors** All http calls being made by the ingest source connectors have been isolated and wrapped by the `SourceConnectionNetworkError` custom error, which triggers the retry logic, if enabled, in the ingest pipeline.
|
||||
* **Google Drive source connector supports credentials from memory** Originally, the connector expected a filepath to pull the credentials from when creating the client. This was expanded to support passing that information from memory as a dict if access to the file system might not be available.
|
||||
|
||||
### Features
|
||||
|
||||
|
||||
@ -49,7 +49,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--input-path example-docs/fake-memo.pdf \
|
||||
--work-dir "$WORK_DIR" \
|
||||
gcs \
|
||||
--token "$GCP_INGEST_SERVICE_KEY_FILE" \
|
||||
--service-account-key "$GCP_INGEST_SERVICE_KEY_FILE" \
|
||||
--remote-url "$DESTINATION_GCS"
|
||||
|
||||
# Simply check the number of files uploaded
|
||||
|
||||
@ -40,7 +40,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--reprocess \
|
||||
--output-dir "$OUTPUT_DIR" \
|
||||
--verbose \
|
||||
--token "$GCP_INGEST_SERVICE_KEY_FILE" \
|
||||
--service-account-key "$GCP_INGEST_SERVICE_KEY_FILE" \
|
||||
--recursive \
|
||||
--remote-url gs://utic-test-ingest-fixtures/ \
|
||||
--work-dir "$WORK_DIR"
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.10.29-dev2" # pragma: no cover
|
||||
__version__ = "0.10.29-dev3" # pragma: no cover
|
||||
|
||||
@ -4,9 +4,7 @@ from dataclasses import dataclass
|
||||
import click
|
||||
|
||||
from unstructured.ingest.cli.base.src import BaseSrcCmd
|
||||
from unstructured.ingest.cli.interfaces import (
|
||||
CliMixin,
|
||||
)
|
||||
from unstructured.ingest.cli.interfaces import CliMixin, FileOrJson
|
||||
from unstructured.ingest.interfaces import BaseConfig
|
||||
|
||||
CMD_NAME = "gcs"
|
||||
@ -14,17 +12,17 @@ CMD_NAME = "gcs"
|
||||
|
||||
@dataclass
|
||||
class GcsCliConfig(BaseConfig, CliMixin):
|
||||
token: t.Optional[str] = None
|
||||
service_account_key: t.Optional[t.Union[dict, str]] = None
|
||||
|
||||
@staticmethod
|
||||
def get_cli_options() -> t.List[click.Option]:
|
||||
options = [
|
||||
click.Option(
|
||||
["--token"],
|
||||
["--service-account-key"],
|
||||
default=None,
|
||||
help="Token used to access Google Cloud. GCSFS will attempt to use your "
|
||||
"default gcloud creds or get creds from the google metadata service "
|
||||
"or fall back to anonymous access.",
|
||||
type=FileOrJson(),
|
||||
help="Either the file path of the credentials file to use or a json string of "
|
||||
"those values to use for authentication",
|
||||
),
|
||||
]
|
||||
return options
|
||||
|
||||
@ -4,17 +4,14 @@ from dataclasses import dataclass
|
||||
import click
|
||||
|
||||
from unstructured.ingest.cli.base.src import BaseSrcCmd
|
||||
from unstructured.ingest.cli.interfaces import (
|
||||
CliMixin,
|
||||
CliRecursiveConfig,
|
||||
)
|
||||
from unstructured.ingest.cli.interfaces import CliMixin, CliRecursiveConfig, FileOrJson
|
||||
from unstructured.ingest.interfaces import BaseConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
class GoogleDriveCliConfig(BaseConfig, CliMixin):
|
||||
drive_id: str
|
||||
service_account_key: str
|
||||
service_account_key: t.Union[dict, str]
|
||||
extension: t.Optional[str] = None
|
||||
|
||||
@staticmethod
|
||||
@ -29,8 +26,9 @@ class GoogleDriveCliConfig(BaseConfig, CliMixin):
|
||||
click.Option(
|
||||
["--service-account-key"],
|
||||
required=True,
|
||||
type=str,
|
||||
help="Path to the Google Drive service account json file.",
|
||||
type=FileOrJson(),
|
||||
help="Either the file path of the credentials file to use or a json string of "
|
||||
"those values to use for authentication",
|
||||
),
|
||||
click.Option(
|
||||
["--extension"],
|
||||
|
||||
@ -1,7 +1,9 @@
|
||||
import json
|
||||
import os.path
|
||||
import typing as t
|
||||
from abc import abstractmethod
|
||||
from dataclasses import fields
|
||||
from gettext import ngettext
|
||||
from gettext import gettext, ngettext
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
@ -20,6 +22,33 @@ from unstructured.ingest.interfaces import (
|
||||
)
|
||||
|
||||
|
||||
class FileOrJson(click.ParamType):
|
||||
name = "file-or-json"
|
||||
|
||||
def convert(
|
||||
self,
|
||||
value: t.Any,
|
||||
param: t.Optional[click.Parameter],
|
||||
ctx: t.Optional[click.Context],
|
||||
) -> t.Any:
|
||||
# check if valid file
|
||||
full_path = os.path.abspath(os.path.expanduser(value))
|
||||
if os.path.isfile(full_path):
|
||||
return str(Path(full_path).resolve())
|
||||
if isinstance(value, str):
|
||||
try:
|
||||
return json.loads(value)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
self.fail(
|
||||
gettext(
|
||||
"{value} is not a valid json string nor an existing filepath.",
|
||||
).format(value=value),
|
||||
param,
|
||||
ctx,
|
||||
)
|
||||
|
||||
|
||||
class DelimitedString(click.ParamType):
|
||||
name = "delimited-string"
|
||||
|
||||
|
||||
@ -38,7 +38,7 @@ class GoogleDriveSessionHandle(BaseSessionHandle):
|
||||
|
||||
|
||||
@requires_dependencies(["googleapiclient"], extras="google-drive")
|
||||
def create_service_account_object(key_path, id=None):
|
||||
def create_service_account_object(key_path: t.Union[str, dict], id=None):
|
||||
"""
|
||||
Creates a service object for interacting with Google Drive.
|
||||
|
||||
@ -53,12 +53,21 @@ def create_service_account_object(key_path, id=None):
|
||||
Service account object
|
||||
"""
|
||||
from google.auth import default, exceptions
|
||||
from google.oauth2 import service_account
|
||||
from googleapiclient.discovery import build
|
||||
from googleapiclient.errors import HttpError
|
||||
|
||||
try:
|
||||
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
|
||||
creds, _ = default()
|
||||
if isinstance(key_path, dict):
|
||||
creds = service_account.Credentials.from_service_account_info(key_path)
|
||||
elif isinstance(key_path, str):
|
||||
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
|
||||
creds, _ = default()
|
||||
else:
|
||||
raise ValueError(
|
||||
f"key path not recognized as a dictionary or a file path: "
|
||||
f"[{type(key_path)}] {key_path}",
|
||||
)
|
||||
service = build("drive", "v3", credentials=creds)
|
||||
|
||||
if id:
|
||||
@ -85,7 +94,7 @@ class SimpleGoogleDriveConfig(ConfigSessionHandleMixin, BaseConnectorConfig):
|
||||
|
||||
# Google Drive Specific Options
|
||||
drive_id: str
|
||||
service_account_key: str
|
||||
service_account_key: t.Union[str, dict]
|
||||
extension: t.Optional[str] = None
|
||||
recursive: bool = False
|
||||
|
||||
|
||||
@ -9,7 +9,7 @@ from unstructured.ingest.runner.utils import update_download_dir_remote_url
|
||||
class GCSRunner(FsspecBaseRunner):
|
||||
def run(
|
||||
self,
|
||||
token: t.Optional[str] = None,
|
||||
service_account_key: t.Optional[t.Union[dict, str]] = None,
|
||||
**kwargs,
|
||||
):
|
||||
ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO)
|
||||
@ -24,7 +24,11 @@ class GCSRunner(FsspecBaseRunner):
|
||||
from unstructured.ingest.connector.gcs import GcsSourceConnector, SimpleGcsConfig
|
||||
|
||||
connector_config = SimpleGcsConfig.from_dict(self.fsspec_config.to_dict()) # type: ignore
|
||||
connector_config.access_kwargs = {"token": token}
|
||||
access_kwargs = {}
|
||||
if service_account_key:
|
||||
access_kwargs["token"] = service_account_key
|
||||
|
||||
connector_config.access_kwargs = access_kwargs
|
||||
|
||||
source_doc_connector = GcsSourceConnector( # type: ignore
|
||||
connector_config=connector_config,
|
||||
|
||||
@ -10,7 +10,7 @@ from unstructured.ingest.runner.utils import update_download_dir_hash
|
||||
class GoogleDriveRunner(Runner):
|
||||
def run(
|
||||
self,
|
||||
service_account_key: str,
|
||||
service_account_key: t.Union[str, dict],
|
||||
drive_id: str,
|
||||
recursive: bool = False,
|
||||
extension: t.Optional[str] = None,
|
||||
|
||||
@ -5,7 +5,7 @@ from unstructured.ingest.interfaces import BaseDestinationConnector
|
||||
|
||||
def gcs_writer(
|
||||
remote_url: str,
|
||||
token: t.Optional[str],
|
||||
service_account_key: t.Optional[str],
|
||||
verbose: bool = False,
|
||||
**kwargs,
|
||||
) -> BaseDestinationConnector:
|
||||
@ -19,6 +19,6 @@ def gcs_writer(
|
||||
write_config=FsspecWriteConfig(),
|
||||
connector_config=SimpleGcsConfig(
|
||||
remote_url=remote_url,
|
||||
access_kwargs={"token": token},
|
||||
access_kwargs={"token": service_account_key},
|
||||
),
|
||||
)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user