Matt Robinson aa4d4329db
fix: partition_via_api reflects actual filetype in metadata (#696)
* fix: `partition_via_api` reflects actual filetype in metadata

* added in list length check

* changelog typo
2023-06-08 13:24:16 +00:00

187 lines
6.5 KiB
Python

import contextlib
from typing import (
IO,
List,
Optional,
)
import requests
from unstructured.documents.elements import Element
from unstructured.partition.common import exactly_one
from unstructured.staging.base import dict_to_elements, elements_from_json
def partition_via_api(
    filename: Optional[str] = None,
    content_type: Optional[str] = None,
    file: Optional[IO] = None,
    file_filename: Optional[str] = None,
    strategy: str = "hi_res",
    api_url: str = "https://api.unstructured.io/general/v0/general",
    api_key: str = "",
) -> List[Element]:
    """Partition a single document by sending it to the Unstructured REST API.

    The document is uploaded as a multipart form field named ``files`` in one
    POST request, and the JSON body of a successful response is converted back
    into elements with ``elements_from_json``.

    See https://api.unstructured.io/general/docs for the hosted API documentation or
    https://github.com/Unstructured-IO/unstructured-api for instructions on how to run
    the API locally as a container.

    Parameters
    ----------
    filename
        Path of the document to upload. The file is opened in "rb" mode and the
        path is also sent as the upload's filename.
    content_type
        MIME type to attach to the uploaded file, or None to leave it unspecified.
    file
        An already-open file-like object (binary mode) to upload instead of
        ``filename``.
    file_filename
        The filename to send along with ``file``, e.g. "foo.txt". Required whenever
        ``file`` is used.
    strategy
        Partitioning strategy, sent to the API as the ``strategy`` form field.
    api_url
        The URL for the Unstructured API. Defaults to the hosted Unstructured API.
    api_key
        The API key, sent in the ``UNSTRUCTURED-API-KEY`` request header.

    Returns
    -------
    The elements deserialized from the response text.

    Raises
    ------
    ValueError
        If ``file`` is given without ``file_filename``, or if the API answers with
        a status code other than 200. The ``filename``/``file`` pair is additionally
        validated by ``exactly_one``.
    """
    exactly_one(filename=filename, file=file)

    request_headers = {
        "ACCEPT": "application/json",
        "UNSTRUCTURED-API-KEY": api_key,
    }
    form_fields = {"strategy": strategy}

    if filename is not None:
        # The request is issued inside the ``with`` so the handle stays open
        # while the multipart body is being read.
        with open(filename, "rb") as handle:
            response = requests.post(
                api_url,
                headers=request_headers,
                data=form_fields,
                files=[("files", (filename, handle, content_type))],  # type: ignore
            )
    elif file is not None:
        if file_filename is None:
            raise ValueError(
                "If file is specified in partition_via_api, "
                "file_filename must be specified as well.",
            )
        response = requests.post(
            api_url,
            headers=request_headers,
            data=form_fields,
            files=[("files", (file_filename, file, content_type))],  # type: ignore
        )

    if response.status_code != 200:
        raise ValueError(
            f"Receive unexpected status code {response.status_code} from the API.",
        )
    return elements_from_json(text=response.text)
def partition_multiple_via_api(
    filenames: Optional[List[str]] = None,
    content_types: Optional[List[str]] = None,
    files: Optional[List[IO]] = None,
    file_filenames: Optional[List[str]] = None,
    strategy: str = "hi_res",
    api_url: str = "https://api.unstructured.io/general/v0/general",
    api_key: str = "",
) -> List[List[Element]]:
    """Partitions multiple documents using the Unstructured REST API by batching
    the documents into a single HTTP request.

    See https://api.unstructured.io/general/docs for the hosted API documentation or
    https://github.com/Unstructured-IO/unstructured-api for instructions on how to run
    the API locally as a container.

    Parameters
    ----------
    filenames
        A list of strings defining the target filename paths. Each path is opened
        in "rb" mode for the duration of the request. Takes precedence over
        ``files`` when both are given.
    content_types
        A list of strings defining the file contents in MIME types, one per
        document. None or an empty list leaves every content type unspecified.
    files
        A list of file-like objects using "rb" mode --> open(filename, "rb").
    file_filenames
        When files is not None, the filenames (strings) to send along with each
        file object. E.g. ["foo.txt", "bar.txt"]
    strategy
        Partitioning strategy, sent to the API as the ``strategy`` form field.
    api_url
        The URL for the Unstructured API. Defaults to the hosted Unstructured API.
    api_key
        The API key to pass to the Unstructured API.

    Returns
    -------
    One list of elements per document in the API response.

    Raises
    ------
    ValueError
        If neither ``filenames`` nor ``files`` is given, if the lengths of the
        supplied lists disagree, if ``files`` is given without ``file_filenames``,
        or if the API answers with a status code other than 200.
    """
    headers = {
        "ACCEPT": "application/json",
        "UNSTRUCTURED-API-KEY": api_key,
    }
    data = {
        "strategy": strategy,
    }

    if filenames is not None:
        if content_types and len(content_types) != len(filenames):
            raise ValueError("content_types and filenames must have the same length.")
        # An empty content_types list is treated like None, matching the
        # truthiness test used by the length check above.
        mime_types: List[Optional[str]] = (
            list(content_types) if content_types else [None] * len(filenames)
        )
        # ExitStack closes every opened handle once the request has completed,
        # including when opening a later file or the request itself fails.
        with contextlib.ExitStack() as stack:
            handles = [stack.enter_context(open(path, "rb")) for path in filenames]
            _files = [
                ("files", (path, handle, mime_type))
                for path, handle, mime_type in zip(filenames, handles, mime_types)
            ]
            response = requests.post(
                api_url,
                headers=headers,
                data=data,
                files=_files,  # type: ignore
            )
    elif files is not None:
        if content_types and len(content_types) != len(files):
            raise ValueError("content_types and files must have the same length.")
        if not file_filenames:
            raise ValueError("file_filenames must be specified if files are passed")
        if len(file_filenames) != len(files):
            raise ValueError("file_filenames and files must have the same length.")
        mime_types = list(content_types) if content_types else [None] * len(files)
        _files = [
            ("files", (name, file_obj, mime_type))
            for name, file_obj, mime_type in zip(file_filenames, files, mime_types)
        ]
        response = requests.post(
            api_url,
            headers=headers,
            data=data,
            files=_files,  # type: ignore
        )
    else:
        # Without this branch `response` would be unbound below and the caller
        # would see an UnboundLocalError instead of a usable message.
        raise ValueError("Either filenames or files must be specified.")

    if response.status_code != 200:
        raise ValueError(
            f"Receive unexpected status code {response.status_code} from the API.",
        )

    response_list = response.json()
    # NOTE(robinson) - this check is because if only one filename is passed, the return
    # type from the API is a list of objects instead of a list of lists.
    # An empty list is handled the same way (presumably a single document with
    # no elements - confirm against the API) so that indexing [0] cannot fail.
    if not response_list or not isinstance(response_list[0], list):
        response_list = [response_list]
    return [dict_to_elements(document) for document in response_list]