diff --git a/CHANGELOG.md b/CHANGELOG.md index 46bc51f6c..d33d34b24 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,10 @@ -## 0.10.29-dev2 +## 0.10.29-dev3 ### Enhancements * **Add include_header argument for partition_csv and partition_tsv** Now supports retaining header rows in CSV and TSV documents element partitioning. * **Add retry logic for all source connectors** All http calls being made by the ingest source connectors have been isolated and wrapped by the `SourceConnectionNetworkError` custom error, which triggers the retry logic, if enabled, in the ingest pipeline. +* **Google Drive source connector supports credentials from memory** Originally, the connector required a file path from which to load the credentials when creating the client. It now also accepts the credentials in memory as a dict, for environments where the file system might not be accessible. ### Features diff --git a/test_unstructured_ingest/test-ingest-gcs-dest.sh b/test_unstructured_ingest/test-ingest-gcs-dest.sh index 2437b6805..bae5f2e93 100755 --- a/test_unstructured_ingest/test-ingest-gcs-dest.sh +++ b/test_unstructured_ingest/test-ingest-gcs-dest.sh @@ -49,7 +49,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --input-path example-docs/fake-memo.pdf \ --work-dir "$WORK_DIR" \ gcs \ - --token "$GCP_INGEST_SERVICE_KEY_FILE" \ + --service-account-key "$GCP_INGEST_SERVICE_KEY_FILE" \ --remote-url "$DESTINATION_GCS" # Simply check the number of files uploaded diff --git a/test_unstructured_ingest/test-ingest-gcs.sh b/test_unstructured_ingest/test-ingest-gcs.sh index 2efe7cc25..2c28444aa 100755 --- a/test_unstructured_ingest/test-ingest-gcs.sh +++ b/test_unstructured_ingest/test-ingest-gcs.sh @@ -40,7 +40,7 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ --reprocess \ --output-dir "$OUTPUT_DIR" \ --verbose \ - --token "$GCP_INGEST_SERVICE_KEY_FILE" \ + --service-account-key "$GCP_INGEST_SERVICE_KEY_FILE" \ --recursive \ --remote-url gs://utic-test-ingest-fixtures/ \ --work-dir "$WORK_DIR" diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 13626425c..3edbc66e9 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.29-dev2" # pragma: no cover +__version__ = "0.10.29-dev3" # pragma: no cover diff --git a/unstructured/ingest/cli/cmds/gcs.py b/unstructured/ingest/cli/cmds/gcs.py index cb6ea8043..95105d668 100644 --- a/unstructured/ingest/cli/cmds/gcs.py +++ b/unstructured/ingest/cli/cmds/gcs.py @@ -4,9 +4,7 @@ from dataclasses import dataclass import click from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliMixin, -) +from unstructured.ingest.cli.interfaces import CliMixin, FileOrJson from unstructured.ingest.interfaces import BaseConfig CMD_NAME = "gcs" @@ -14,17 +12,17 @@ CMD_NAME = "gcs" @dataclass class GcsCliConfig(BaseConfig, CliMixin): - token: t.Optional[str] = None + service_account_key: t.Optional[t.Union[dict, str]] = None @staticmethod def get_cli_options() -> t.List[click.Option]: options = [ click.Option( - ["--token"], + ["--service-account-key"], default=None, - help="Token used to access Google Cloud. 
GCSFS will attempt to use your " - "default gcloud creds or get creds from the google metadata service " - "or fall back to anonymous access.", + type=FileOrJson(), + help="Either the file path of the credentials file to use or a json string of " + "those values to use for authentication", ), ] return options diff --git a/unstructured/ingest/cli/cmds/google_drive.py b/unstructured/ingest/cli/cmds/google_drive.py index 758184e4f..5e1158413 100644 --- a/unstructured/ingest/cli/cmds/google_drive.py +++ b/unstructured/ingest/cli/cmds/google_drive.py @@ -4,17 +4,14 @@ from dataclasses import dataclass import click from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliMixin, - CliRecursiveConfig, -) +from unstructured.ingest.cli.interfaces import CliMixin, CliRecursiveConfig, FileOrJson from unstructured.ingest.interfaces import BaseConfig @dataclass class GoogleDriveCliConfig(BaseConfig, CliMixin): drive_id: str - service_account_key: str + service_account_key: t.Union[dict, str] extension: t.Optional[str] = None @staticmethod @@ -29,8 +26,9 @@ class GoogleDriveCliConfig(BaseConfig, CliMixin): click.Option( ["--service-account-key"], required=True, - type=str, - help="Path to the Google Drive service account json file.", + type=FileOrJson(), + help="Either the file path of the credentials file to use or a json string of " + "those values to use for authentication", ), click.Option( ["--extension"], diff --git a/unstructured/ingest/cli/interfaces.py b/unstructured/ingest/cli/interfaces.py index b8d9cdc94..a5a365506 100644 --- a/unstructured/ingest/cli/interfaces.py +++ b/unstructured/ingest/cli/interfaces.py @@ -1,7 +1,9 @@ +import json +import os.path import typing as t from abc import abstractmethod from dataclasses import fields -from gettext import ngettext +from gettext import gettext, ngettext from pathlib import Path import click @@ -20,6 +22,33 @@ from unstructured.ingest.interfaces import ( ) +class 
FileOrJson(click.ParamType): + name = "file-or-json" + + def convert( + self, + value: t.Any, + param: t.Optional[click.Parameter], + ctx: t.Optional[click.Context], + ) -> t.Any: + # check if valid file + full_path = os.path.abspath(os.path.expanduser(value)) + if os.path.isfile(full_path): + return str(Path(full_path).resolve()) + if isinstance(value, str): + try: + return json.loads(value) + except json.JSONDecodeError: + pass + self.fail( + gettext( + "{value} is not a valid json string nor an existing filepath.", + ).format(value=value), + param, + ctx, + ) + + class DelimitedString(click.ParamType): name = "delimited-string" diff --git a/unstructured/ingest/connector/google_drive.py b/unstructured/ingest/connector/google_drive.py index 4608fb53e..a2f53695c 100644 --- a/unstructured/ingest/connector/google_drive.py +++ b/unstructured/ingest/connector/google_drive.py @@ -38,7 +38,7 @@ class GoogleDriveSessionHandle(BaseSessionHandle): @requires_dependencies(["googleapiclient"], extras="google-drive") -def create_service_account_object(key_path, id=None): +def create_service_account_object(key_path: t.Union[str, dict], id=None): """ Creates a service object for interacting with Google Drive. 
@@ -53,12 +53,21 @@ def create_service_account_object(key_path, id=None): Service account object """ from google.auth import default, exceptions + from google.oauth2 import service_account from googleapiclient.discovery import build from googleapiclient.errors import HttpError try: - os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path - creds, _ = default() + if isinstance(key_path, dict): + creds = service_account.Credentials.from_service_account_info(key_path) + elif isinstance(key_path, str): + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path + creds, _ = default() + else: + raise ValueError( + f"key path not recognized as a dictionary or a file path: " + f"[{type(key_path)}] {key_path}", + ) service = build("drive", "v3", credentials=creds) if id: @@ -85,7 +94,7 @@ class SimpleGoogleDriveConfig(ConfigSessionHandleMixin, BaseConnectorConfig): # Google Drive Specific Options drive_id: str - service_account_key: str + service_account_key: t.Union[str, dict] extension: t.Optional[str] = None recursive: bool = False diff --git a/unstructured/ingest/runner/gcs.py b/unstructured/ingest/runner/gcs.py index a4570c713..9211411d7 100644 --- a/unstructured/ingest/runner/gcs.py +++ b/unstructured/ingest/runner/gcs.py @@ -9,7 +9,7 @@ from unstructured.ingest.runner.utils import update_download_dir_remote_url class GCSRunner(FsspecBaseRunner): def run( self, - token: t.Optional[str] = None, + service_account_key: t.Optional[t.Union[dict, str]] = None, **kwargs, ): ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) @@ -24,7 +24,11 @@ class GCSRunner(FsspecBaseRunner): from unstructured.ingest.connector.gcs import GcsSourceConnector, SimpleGcsConfig connector_config = SimpleGcsConfig.from_dict(self.fsspec_config.to_dict()) # type: ignore - connector_config.access_kwargs = {"token": token} + access_kwargs = {} + if service_account_key: + access_kwargs["token"] = service_account_key + + connector_config.access_kwargs = 
access_kwargs source_doc_connector = GcsSourceConnector( # type: ignore connector_config=connector_config, diff --git a/unstructured/ingest/runner/google_drive.py b/unstructured/ingest/runner/google_drive.py index 3635a9b3b..78e5a37e1 100644 --- a/unstructured/ingest/runner/google_drive.py +++ b/unstructured/ingest/runner/google_drive.py @@ -10,7 +10,7 @@ from unstructured.ingest.runner.utils import update_download_dir_hash class GoogleDriveRunner(Runner): def run( self, - service_account_key: str, + service_account_key: t.Union[str, dict], drive_id: str, recursive: bool = False, extension: t.Optional[str] = None, diff --git a/unstructured/ingest/runner/writers/gcs.py b/unstructured/ingest/runner/writers/gcs.py index 3f0000d26..0495a8c49 100644 --- a/unstructured/ingest/runner/writers/gcs.py +++ b/unstructured/ingest/runner/writers/gcs.py @@ -5,7 +5,7 @@ from unstructured.ingest.interfaces import BaseDestinationConnector def gcs_writer( remote_url: str, - token: t.Optional[str], + service_account_key: t.Optional[str], verbose: bool = False, **kwargs, ) -> BaseDestinationConnector: @@ -19,6 +19,6 @@ def gcs_writer( write_config=FsspecWriteConfig(), connector_config=SimpleGcsConfig( remote_url=remote_url, - access_kwargs={"token": token}, + access_kwargs={"token": service_account_key}, ), )