fix: Fix api_url param to partition_via_api (#2342)

Closes #2340 

We need to make sure the custom URL is passed to our client. The client
constructor takes the base URL, so for backward compatibility we continue
to accept the full URL and strip off the path.

To verify, run the api locally and confirm you can make calls to it.

```
# In unstructured-api
make run-web-app

# In ipython in this repo
from unstructured.partition.api import partition_via_api
filename = "example-docs/layout-parser-paper.pdf"
partition_via_api(filename=filename, api_url="http://localhost:8000")
```
This commit is contained in:
Austin Walker 2024-01-03 15:08:48 -05:00 committed by GitHub
parent 1b70ea86b3
commit 91b892c79d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 43 additions and 9 deletions

View File

@ -1,4 +1,4 @@
## 0.11.7-dev4
## 0.11.7
### Enhancements
@ -14,6 +14,7 @@
* **Fix table structure metric script** Update the call to table agent to now provide OCR tokens as required
* **Fix element extraction not working when using "auto" strategy for pdf and image** If element extraction is specified, the "auto" strategy falls back to the "hi_res" strategy.
* **Fix a bug passing a custom url to `partition_via_api`** Users who self-host the API were unable to pass their custom URL to `partition_via_api`.
## 0.11.6

View File

@ -2,11 +2,11 @@ import contextlib
import json
import os
import pathlib
from unittest.mock import ANY, Mock
import pytest
import requests
from unstructured_client.general import General
from unstructured_client.models.errors.sdkerror import SDKError
from unstructured.documents.elements import NarrativeText
from unstructured.partition.api import partition_multiple_via_api, partition_via_api
@ -45,6 +45,7 @@ class MockResponse:
# layer in the new unstructured-client:
# `elements_from_json(text=response.raw_response.text)`
self.raw_response = MockRawResponse()
self.headers = {"Content-Type": "application/json"}
def json(self):
return json.loads(self.text)
@ -71,6 +72,34 @@ def test_partition_via_api_from_filename(monkeypatch):
assert elements[0].metadata.filetype == "message/rfc822"
def test_partition_via_api_custom_url(monkeypatch):
    """Check that a custom `api_url` decides where the request is sent.

    `requests.Session.request` is replaced with a mock that returns a canned
    200 response. Both the full endpoint url and the bare server url are
    passed as `api_url`; each call is expected to POST to the full endpoint
    url.
    """
    mock_request = Mock(return_value=MockResponse(status_code=200))
    monkeypatch.setattr(requests.Session, "request", mock_request)

    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)
    custom_url = "http://localhost:8000/general/v0/general"
    # The sdk uses the server url, so we should be able to pass that as well
    base_url = "http://localhost:8000"

    for api_url in (custom_url, base_url):
        # `assert_called_with` only inspects the most recent call. Clear the
        # call history first (the configured return value is kept) so the
        # assertion cannot be satisfied by a call from a previous iteration.
        mock_request.reset_mock()

        with open(filename, "rb") as f:
            partition_via_api(file=f, api_url=api_url, metadata_filename=filename)

        mock_request.assert_called_with(
            "POST", custom_url, data=ANY, files=ANY, headers=ANY, params=ANY
        )
def test_partition_via_api_from_file(monkeypatch):
monkeypatch.setattr(
General,
@ -181,10 +210,11 @@ def test_partition_via_api_valid_request_data_kwargs():
assert isinstance(elements, list)
def test_partition_via_api_invalid_request_data_kwargs():
    """An unrecognized `strategy` value is expected to raise `SDKError`."""
    pdf_path = os.path.join(
        DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf"
    )
    invalid_kwargs = {"strategy": "not_a_strategy"}
    with pytest.raises(SDKError):
        partition_via_api(filename=pdf_path, **invalid_kwargs)
# Note(austin) - This test is way too noisy against the hosted api
# def test_partition_via_api_invalid_request_data_kwargs():
# filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf")
# with pytest.raises(SDKError):
# partition_via_api(filename=filename, strategy="not_a_strategy")
class MockMultipleResponse:

View File

@ -1 +1 @@
__version__ = "0.11.7-dev4" # pragma: no cover
__version__ = "0.11.7" # pragma: no cover

View File

@ -65,7 +65,10 @@ def partition_via_api(
"Please use metadata_filename instead.",
)
s = UnstructuredClient(api_key_auth=api_key)
# Note(austin) - the sdk takes the base url, but callers may pass the full api_url.
# For backward compatibility, strip off the endpoint path when it's given.
base_url = api_url[:-19] if "/general/v0/general" in api_url else api_url
sdk = UnstructuredClient(api_key_auth=api_key, server_url=base_url)
if filename is not None:
with open(filename, "rb") as f:
@ -89,7 +92,7 @@ def partition_via_api(
files=files,
**request_kwargs,
)
response = s.general.partition(req)
response = sdk.general.partition(req)
if response.status_code == 200:
return elements_from_json(text=response.raw_response.text)