mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-27 15:13:35 +00:00
bug CORE-3971: fix deserialization in google-drive source connector key path (#2586)
The Google Drive service account key can be either a dict or a file path (str). We have been using the path form successfully, but the dict can also end up stored as a string that needs to be deserialized, and that deserialization can fail when the string mixes single and double quotes.
This commit is contained in:
parent
6a4b7a134b
commit
43250d5576
11
CHANGELOG.md
11
CHANGELOG.md
@ -1,4 +1,4 @@
|
||||
## 0.12.6-dev0
|
||||
## 0.12.6-dev1
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -7,6 +7,7 @@
|
||||
### Fixes
|
||||
|
||||
* **Fix SharePoint dates with inconsistent formatting** Adds logic to conditionally support dates returned by office365 that may vary in date formatting or may be a datetime rather than a string.
|
||||
* **Fix Google Drive source key** Allow passing string for source connector key.
|
||||
|
||||
## 0.12.5
|
||||
|
||||
@ -19,17 +20,17 @@
|
||||
* **Add parent_element to overlapping case output** Adds parent_element to the output for `identify_overlapping_or_nesting_case` and `catch_overlapping_and_nested_bboxes` functions.
|
||||
* **Add table structure evaluation** Adds a new function to evaluate the structure of a table and return a metric that represents the quality of the table structure. This function is used to evaluate the quality of the table structure and the table contents.
|
||||
* **Add AstraDB destination connector** Adds support for writing embedded documents into an AstraDB vector database.
|
||||
* **Add OctoAI embedder** Adds support for embeddings via OctoAI.
|
||||
|
||||
### Fixes
|
||||
|
||||
* **Fix passing list type parameters when calling unstructured API via `partition_via_api()`** Update `partition_via_api()` to convert all list type parameters to JSON formatted strings before calling the unstructured client SDK. This will support image block extraction via `partition_via_api()`.
|
||||
* **Add OctoAI embedder** Adds support for embeddings via OctoAI.
|
||||
* **Fix `check_connection` in opensearch, databricks, postgres, azure connectors**
|
||||
* **Fix don't treat plain text files with double quotes as JSON ** If a file can be deserialized as JSON but it deserializes as a string, treat it as plain text even though it's valid JSON.
|
||||
* **Fix `check_connection` in opensearch, databricks, postgres, azure connectors **
|
||||
* **Fix don't treat plain text files with double quotes as JSON** If a file can be deserialized as JSON but it deserializes as a string, treat it as plain text even though it's valid JSON.
|
||||
* **Fix `check_connection` in opensearch, databricks, postgres, azure connectors**
|
||||
* **Fix cluster of bugs in `partition_xlsx()` that dropped content.** Algorithm for detecting "subtables" within a worksheet dropped table elements for certain patterns of populated cells such as when a trailing single-cell row appeared in a contiguous block of populated cells.
|
||||
* **Improved documentation**. Fixed broken links and improved readability on `Key Concepts` page.
|
||||
* **Rename `OpenAiEmbeddingConfig` to `OpenAIEmbeddingConfig`.
|
||||
* **Rename `OpenAiEmbeddingConfig` to `OpenAIEmbeddingConfig`.**
|
||||
* **Fix partition_json() doesn't chunk.** The `@add_chunking_strategy` decorator was missing from `partition_json()` such that pre-partitioned documents serialized to JSON did not chunk when a chunking-strategy was specified.
|
||||
|
||||
## 0.12.4
|
||||
|
||||
@ -8,6 +8,7 @@ in our community `Slack. <https://short.unstructured.io/pzw05l7>`_
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
destination_connectors/astra
|
||||
destination_connectors/azure
|
||||
destination_connectors/azure_cognitive_search
|
||||
destination_connectors/box
|
||||
|
||||
@ -4,6 +4,7 @@ from dataclasses import dataclass, field
|
||||
|
||||
from unstructured.ingest.cli.utils import extract_config
|
||||
from unstructured.ingest.interfaces import BaseConfig
|
||||
from unstructured.ingest.utils.string_utils import json_to_dict
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -99,3 +100,31 @@ def test_extract_config_dict():
|
||||
c = extract_config(flat_data=flat_data, config=C)
|
||||
expected_result = {"c": True, "b": {}}
|
||||
assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
|
||||
|
||||
|
||||
def test_json_to_dict_valid_json():
    """Well-formed JSON text is deserialized into the equivalent dict."""
    raw = '{"key": "value"}'
    wanted = {"key": "value"}

    result = json_to_dict(raw)

    assert result == wanted
    assert isinstance(result, dict)
|
||||
|
||||
|
||||
def test_json_to_dict_malformed_json():
    """Text that is not valid JSON (missing closing brace) is returned as-is."""
    raw = '{"key": "value"'
    wanted = '{"key": "value"'

    result = json_to_dict(raw)

    assert result == wanted
    assert isinstance(result, str)
|
||||
|
||||
|
||||
def test_json_to_dict_single_quotes():
    """A dict serialized with single quotes is still deserialized to a dict."""
    raw = "{'key': 'value'}"
    wanted = {"key": "value"}

    result = json_to_dict(raw)

    assert result == wanted
    assert isinstance(result, dict)
|
||||
|
||||
|
||||
def test_json_to_dict_path():
    """A filesystem path is not JSON and must come back as the same string."""
    raw = "/path/to/file.json"
    wanted = "/path/to/file.json"

    result = json_to_dict(raw)

    assert result == wanted
    assert isinstance(result, str)
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.12.6-dev0" # pragma: no cover
|
||||
__version__ = "0.12.6-dev1" # pragma: no cover
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
import json
|
||||
import typing as t
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
@ -14,6 +13,7 @@ from unstructured.ingest.connector.fsspec.fsspec import (
|
||||
from unstructured.ingest.enhanced_dataclass import enhanced_field
|
||||
from unstructured.ingest.error import SourceConnectionError
|
||||
from unstructured.ingest.interfaces import AccessConfig
|
||||
from unstructured.ingest.utils.string_utils import json_to_dict
|
||||
from unstructured.utils import requires_dependencies
|
||||
|
||||
|
||||
@ -25,6 +25,7 @@ class GcsAccessConfig(AccessConfig):
|
||||
|
||||
def __post_init__(self):
|
||||
ALLOWED_AUTH_VALUES = "google_default", "cache", "anon", "browser", "cloud"
|
||||
|
||||
# Case: null value
|
||||
if not self.token:
|
||||
return
|
||||
@ -32,14 +33,8 @@ class GcsAccessConfig(AccessConfig):
|
||||
if self.token in ALLOWED_AUTH_VALUES:
|
||||
return
|
||||
# Case: token as json
|
||||
try:
|
||||
str_token = self.token.replace("'", '"')
|
||||
str_token = json.loads(str_token)
|
||||
except json.JSONDecodeError:
|
||||
# Not neccessary an error if it is a path
|
||||
pass
|
||||
else:
|
||||
self.token = str_token
|
||||
if isinstance(json_to_dict(self.token), dict):
|
||||
self.token = json_to_dict(self.token)
|
||||
return
|
||||
# Case: path to token
|
||||
if Path(self.token).is_file():
|
||||
|
||||
@ -24,6 +24,7 @@ from unstructured.ingest.interfaces import (
|
||||
SourceMetadata,
|
||||
)
|
||||
from unstructured.ingest.logger import logger
|
||||
from unstructured.ingest.utils.string_utils import json_to_dict
|
||||
from unstructured.utils import requires_dependencies
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
@ -47,7 +48,7 @@ def create_service_account_object(key_path: t.Union[str, dict], id=None):
|
||||
Providing a drive id enforces a key validation process.
|
||||
|
||||
Args:
|
||||
key_path: Path to Google Drive service account json file.
|
||||
key_path: Path to Google Drive service account json file. (or the actual json)
|
||||
id: ID of a file on Google Drive. File has to be either publicly accessible or accessible
|
||||
to the service account.
|
||||
|
||||
@ -59,6 +60,10 @@ def create_service_account_object(key_path: t.Union[str, dict], id=None):
|
||||
from googleapiclient.discovery import build
|
||||
from googleapiclient.errors import HttpError
|
||||
|
||||
# Service account key can be a dict or a file path(str)
|
||||
# But the dict may come in as a string
|
||||
key_path = json_to_dict(key_path)
|
||||
|
||||
try:
|
||||
if isinstance(key_path, dict):
|
||||
creds = service_account.Credentials.from_service_account_info(key_path)
|
||||
|
||||
18
unstructured/ingest/utils/string_utils.py
Normal file
18
unstructured/ingest/utils/string_utils.py
Normal file
@ -0,0 +1,18 @@
|
||||
import json
|
||||
import typing as t
|
||||
|
||||
|
||||
def json_to_dict(json_string: str) -> t.Union[str, t.Dict[str, t.Any]]:
    """Try to deserialize a string into a dictionary, returning the input on failure.

    The value may legitimately be something other than a serialized dictionary
    (for example a path to a key file), so a failed parse is not an error: the
    original value is handed back untouched and the caller decides what to do.

    Parsing is attempted in this order:

    1. strict JSON;
    2. JSON after swapping every single quote for a double quote;
    3. a Python literal (what ``str(some_dict)`` produces), which also copes
       with apostrophes inside values and with ``True``/``False``/``None``.

    Args:
        json_string: Candidate serialized dictionary. A value that is not a
            ``str`` (e.g. an already-deserialized dict) is returned unchanged.

    Returns:
        The deserialized dictionary, or ``json_string`` itself when it does not
        deserialize to a dictionary.
    """
    import ast  # local import: only needed by the last-resort fallback below

    # Callers hand over "dict or str"; json.loads() raises TypeError on a dict,
    # so anything that is not text is passed straight through.
    if not isinstance(json_string, str):
        return json_string

    # Single quotes instead of double quotes are common when a dict was
    # stringified rather than serialized, hence the second candidate.
    for candidate in (json_string, json_string.replace("'", '"')):
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            # Not necessarily an error: it may be a path or malformed JSON.
            continue
        # Valid JSON that is not an object (e.g. "123") is not a dictionary.
        if isinstance(parsed, dict):
            return parsed

    try:
        parsed = ast.literal_eval(json_string)
    except (ValueError, SyntaxError, TypeError):
        # Not necessarily an error if it is a path.
        return json_string

    return parsed if isinstance(parsed, dict) else json_string
|
||||
Loading…
x
Reference in New Issue
Block a user