fix: partition_via_api reflects actual filetype in metadata (#696)

* fix: `partition_via_api` reflects actual filetype in metadata

* added in list length check

* changelog typo
This commit is contained in:
Matt Robinson 2023-06-08 09:24:16 -04:00 committed by GitHub
parent dabda67c8f
commit aa4d4329db
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 38 additions and 8 deletions

View File

@ -8,6 +8,8 @@
### Fixes
* `partition_via_api` reflects the actual filetype for the file processed in the API.
## 0.7.2
### Enhancements

View File

@ -32,11 +32,15 @@ class MockResponse:
"Matthew Robinson <mrobinson@unstructured.io>"
],
"subject": "Test Email",
"filename": "fake-email.eml"
"filename": "fake-email.eml",
"filetype": "message/rfc822"
}
}
]"""
def json(self):
    """Parse this mock response's ``text`` attribute as JSON and return the result."""
    raw_body = self.text
    return json.loads(raw_body)
def test_partition_via_api_from_filename(monkeypatch):
monkeypatch.setattr(
@ -47,6 +51,7 @@ def test_partition_via_api_from_filename(monkeypatch):
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
elements = partition_via_api(filename=filename, api_key="FAKEROO")
assert elements[0] == NarrativeText("This is a test email to use for unit tests.")
assert elements[0].metadata.filetype == "message/rfc822"
def test_partition_via_api_from_file(monkeypatch):
@ -60,6 +65,7 @@ def test_partition_via_api_from_file(monkeypatch):
with open(filename, "rb") as f:
elements = partition_via_api(file=f, file_filename=filename, api_key="FAKEROO")
assert elements[0] == NarrativeText("This is a test email to use for unit tests.")
assert elements[0].metadata.filetype == "message/rfc822"
def test_partition_via_api_from_file_raises_without_filename(monkeypatch):
@ -110,7 +116,8 @@ class MockMultipleResponse:
"Matthew Robinson <mrobinson@unstructured.io>"
],
"subject": "Test Email",
"filename": "fake-email.eml"
"filename": "fake-email.eml",
"filetype": "message/rfc822"
}
}
],
@ -128,13 +135,27 @@ class MockMultipleResponse:
"Matthew Robinson <mrobinson@unstructured.io>"
],
"subject": "Test Email",
"filename": "fake-email.eml"
"filename": "fake-email.eml",
"filetype": "message/rfc822"
}
}
]
]"""
def test_partition_multiple_via_api_with_single_filename(monkeypatch):
    """Check `partition_multiple_via_api` when it is given exactly one filename.

    `requests.post` is patched to return a `MockResponse` with status 200, so no
    network call is made. The result is indexed as a list of element lists, and the
    first element's text and `filetype` metadata are verified.
    """

    def fake_post(*args, **kwargs):
        # Stand-in for `requests.post`; ignores its arguments.
        return MockResponse(status_code=200)

    monkeypatch.setattr(requests, "post", fake_post)

    email_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
    documents = partition_multiple_via_api(filenames=[email_path], api_key="FAKEROO")

    first_element = documents[0][0]
    assert first_element == NarrativeText("This is a test email to use for unit tests.")
    assert first_element.metadata.filetype == "message/rfc822"
def test_partition_multiple_via_api_from_filenames(monkeypatch):
monkeypatch.setattr(
requests,
@ -150,6 +171,7 @@ def test_partition_multiple_via_api_from_filenames(monkeypatch):
elements = partition_multiple_via_api(filenames=filenames, api_key="FAKEROO")
assert len(elements) == 2
assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.")
assert elements[0][0].metadata.filetype == "message/rfc822"
def test_partition_multiple_via_api_from_files(monkeypatch):
@ -173,6 +195,7 @@ def test_partition_multiple_via_api_from_files(monkeypatch):
)
assert len(elements) == 2
assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.")
assert elements[0][0].metadata.filetype == "message/rfc822"
def test_partition_multiple_via_api_raises_with_bad_response(monkeypatch):

View File

@ -1,5 +1,4 @@
import contextlib
import json
from typing import (
IO,
List,
@ -10,7 +9,7 @@ import requests
from unstructured.documents.elements import Element
from unstructured.partition.common import exactly_one
from unstructured.partition.json import partition_json
from unstructured.staging.base import dict_to_elements, elements_from_json
def partition_via_api(
@ -82,7 +81,7 @@ def partition_via_api(
response = requests.post(api_url, headers=headers, data=data, files=files) # type: ignore
if response.status_code == 200:
return partition_json(text=response.text)
return elements_from_json(text=response.text)
else:
raise ValueError(
f"Receive unexpected status code {response.status_code} from the API.",
@ -172,8 +171,14 @@ def partition_multiple_via_api(
if response.status_code == 200:
documents = []
for document in response.json():
documents.append(partition_json(text=json.dumps(document)))
response_list = response.json()
# NOTE(robinson) - this check is needed because, when only one filename is passed, the API
# returns a list of objects instead of a list of lists
if not isinstance(response_list[0], list):
response_list = [response_list]
for document in response_list:
documents.append(dict_to_elements(document))
return documents
else:
raise ValueError(