mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-03 11:34:07 +00:00
fix: partition_via_api reflects actual filetype in metadata (#696)
* fix: `partition_via_api` reflects actual filetype in metadata
* added in list length check
* changelog typo
This commit is contained in:
parent
dabda67c8f
commit
aa4d4329db
@ -8,6 +8,8 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* `partition_via_api` reflects the actual filetype for the file processed in the API.
|
||||
|
||||
## 0.7.2
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -32,11 +32,15 @@ class MockResponse:
|
||||
"Matthew Robinson <mrobinson@unstructured.io>"
|
||||
],
|
||||
"subject": "Test Email",
|
||||
"filename": "fake-email.eml"
|
||||
"filename": "fake-email.eml",
|
||||
"filetype": "message/rfc822"
|
||||
}
|
||||
}
|
||||
]"""
|
||||
|
||||
def json(self):
|
||||
return json.loads(self.text)
|
||||
|
||||
|
||||
def test_partition_via_api_from_filename(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
@ -47,6 +51,7 @@ def test_partition_via_api_from_filename(monkeypatch):
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
|
||||
elements = partition_via_api(filename=filename, api_key="FAKEROO")
|
||||
assert elements[0] == NarrativeText("This is a test email to use for unit tests.")
|
||||
assert elements[0].metadata.filetype == "message/rfc822"
|
||||
|
||||
|
||||
def test_partition_via_api_from_file(monkeypatch):
|
||||
@ -60,6 +65,7 @@ def test_partition_via_api_from_file(monkeypatch):
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_via_api(file=f, file_filename=filename, api_key="FAKEROO")
|
||||
assert elements[0] == NarrativeText("This is a test email to use for unit tests.")
|
||||
assert elements[0].metadata.filetype == "message/rfc822"
|
||||
|
||||
|
||||
def test_partition_via_api_from_file_raises_without_filename(monkeypatch):
|
||||
@ -110,7 +116,8 @@ class MockMultipleResponse:
|
||||
"Matthew Robinson <mrobinson@unstructured.io>"
|
||||
],
|
||||
"subject": "Test Email",
|
||||
"filename": "fake-email.eml"
|
||||
"filename": "fake-email.eml",
|
||||
"filetype": "message/rfc822"
|
||||
}
|
||||
}
|
||||
],
|
||||
@ -128,13 +135,27 @@ class MockMultipleResponse:
|
||||
"Matthew Robinson <mrobinson@unstructured.io>"
|
||||
],
|
||||
"subject": "Test Email",
|
||||
"filename": "fake-email.eml"
|
||||
"filename": "fake-email.eml",
|
||||
"filetype": "message/rfc822"
|
||||
}
|
||||
}
|
||||
]
|
||||
]"""
|
||||
|
||||
|
||||
def test_partition_multiple_via_api_with_single_filename(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
requests,
|
||||
"post",
|
||||
lambda *args, **kwargs: MockResponse(status_code=200),
|
||||
)
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
|
||||
|
||||
elements = partition_multiple_via_api(filenames=[filename], api_key="FAKEROO")
|
||||
assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.")
|
||||
assert elements[0][0].metadata.filetype == "message/rfc822"
|
||||
|
||||
|
||||
def test_partition_multiple_via_api_from_filenames(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
requests,
|
||||
@ -150,6 +171,7 @@ def test_partition_multiple_via_api_from_filenames(monkeypatch):
|
||||
elements = partition_multiple_via_api(filenames=filenames, api_key="FAKEROO")
|
||||
assert len(elements) == 2
|
||||
assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.")
|
||||
assert elements[0][0].metadata.filetype == "message/rfc822"
|
||||
|
||||
|
||||
def test_partition_multiple_via_api_from_files(monkeypatch):
|
||||
@ -173,6 +195,7 @@ def test_partition_multiple_via_api_from_files(monkeypatch):
|
||||
)
|
||||
assert len(elements) == 2
|
||||
assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.")
|
||||
assert elements[0][0].metadata.filetype == "message/rfc822"
|
||||
|
||||
|
||||
def test_partition_multiple_via_api_raises_with_bad_response(monkeypatch):
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
import contextlib
|
||||
import json
|
||||
from typing import (
|
||||
IO,
|
||||
List,
|
||||
@ -10,7 +9,7 @@ import requests
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.partition.common import exactly_one
|
||||
from unstructured.partition.json import partition_json
|
||||
from unstructured.staging.base import dict_to_elements, elements_from_json
|
||||
|
||||
|
||||
def partition_via_api(
|
||||
@ -82,7 +81,7 @@ def partition_via_api(
|
||||
response = requests.post(api_url, headers=headers, data=data, files=files) # type: ignore
|
||||
|
||||
if response.status_code == 200:
|
||||
return partition_json(text=response.text)
|
||||
return elements_from_json(text=response.text)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Receive unexpected status code {response.status_code} from the API.",
|
||||
@ -172,8 +171,14 @@ def partition_multiple_via_api(
|
||||
|
||||
if response.status_code == 200:
|
||||
documents = []
|
||||
for document in response.json():
|
||||
documents.append(partition_json(text=json.dumps(document)))
|
||||
response_list = response.json()
|
||||
# NOTE(robinson) - this check is because if only one filename is passed, the return
|
||||
# type from the API is a list of objects instead of a list of lists
|
||||
if not isinstance(response_list[0], list):
|
||||
response_list = [response_list]
|
||||
|
||||
for document in response_list:
|
||||
documents.append(dict_to_elements(document))
|
||||
return documents
|
||||
else:
|
||||
raise ValueError(
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user