fix API-297: List parameters incorrectly passed to API requests (#3154)

In two places where parameters are passed to the Python client (the
Ingest workflow and direct calls to the `partition_via_api` function) we
convert parameters with list values to strings, e.g.
```python
extract_image_block_types=["image"] -> extract_image_block_types='["image"]'
```
As of now, these parameters are parsed incorrectly when given as strings
and correctly when given as lists.

This PR removes the list-to-string conversion from `PartitionConfig` and `partition_via_api`.

---------

Co-authored-by: Filip Knefel <filip@unstructured.io>
This commit is contained in:
Filip Knefel 2024-06-11 23:00:41 +02:00 committed by GitHub
parent 2f0400f279
commit c2065db716
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 7 additions and 15 deletions

View File

@ -5,6 +5,7 @@
### Features
### Fixes
* **Fix passing parameters to python-client** - Stop converting list arguments to strings when passing them to python-client in the Ingest workflow and `partition_via_api`
* **table metric bug fix** `get_element_level_alignment()` now finds all the matched indices in predicted table data instead of only returning the first match in the case of multiple matches for the same ground-truth (gt) string.

View File

@ -249,10 +249,12 @@ def test_partition_via_api_pass_list_type_parameters(monkeypatch):
ANY,
data=ANY,
files=[
["extract_image_block_types", [None, '["image", "table"]']],
["extract_image_block_types[]", [None, "image"]],
["extract_image_block_types[]", [None, "table"]],
["files", ANY],
["languages", [None, '["eng"]']],
["skip_infer_table_types", [None, '["pdf", "docx"]']],
["languages[]", [None, "eng"]],
["skip_infer_table_types[]", [None, "pdf"]],
["skip_infer_table_types[]", [None, "docx"]],
["strategy", [None, "hi_res"]],
],
headers=ANY,

View File

@ -574,14 +574,11 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
logger.debug(f"Using remote partition ({endpoint})")
passthrough_partition_kwargs = {
k: str(v) for k, v in partition_kwargs.items() if v is not None
}
elements = partition_via_api(
filename=str(self.filename),
api_key=partition_config.api_key,
api_url=endpoint,
**passthrough_partition_kwargs,
**partition_kwargs,
)
# TODO: add m_data_source_metadata to unstructured-api pipeline_api and then
# pass the stringified json here

View File

@ -1,7 +1,6 @@
from __future__ import annotations
import contextlib
import json
from typing import IO, Optional
import requests
@ -84,13 +83,6 @@ def partition_via_api(
)
files = shared.Files(content=file, file_name=metadata_filename)
# NOTE(christine): Converts all list type parameters to JSON formatted strings
# (e.g. ["image", "table"] -> '["image", "table"]')
# This can be removed if "speakeasy" supports passing list type parameters to FastAPI.
for k, v in request_kwargs.items():
if isinstance(v, list):
request_kwargs[k] = json.dumps(v)
req = shared.PartitionParameters(files=files, **request_kwargs)
response = sdk.general.partition(req)