mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-06 04:52:10 +00:00
* fix: `partition_via_api` reflects actual filetype in metadata * added in list length check * changelog typo
187 lines
6.5 KiB
Python
187 lines
6.5 KiB
Python
import contextlib
|
|
from typing import (
|
|
IO,
|
|
List,
|
|
Optional,
|
|
)
|
|
|
|
import requests
|
|
|
|
from unstructured.documents.elements import Element
|
|
from unstructured.partition.common import exactly_one
|
|
from unstructured.staging.base import dict_to_elements, elements_from_json
|
|
|
|
|
|
def partition_via_api(
    filename: Optional[str] = None,
    content_type: Optional[str] = None,
    file: Optional[IO] = None,
    file_filename: Optional[str] = None,
    strategy: str = "hi_res",
    api_url: str = "https://api.unstructured.io/general/v0/general",
    api_key: str = "",
) -> List[Element]:
    """Partition a single document by uploading it to the Unstructured REST API.

    The document is POSTed to ``api_url`` as a ``files`` upload together with the
    ``strategy`` form field, and the JSON body of a successful response is converted
    back into elements with ``elements_from_json``.

    See https://api.unstructured.io/general/docs for the hosted API documentation or
    https://github.com/Unstructured-IO/unstructured-api for instructions on how to run
    the API locally as a container.

    Parameters
    ----------
    filename
        Path of the document to upload. The file is opened in "rb" mode for the
        duration of the request.
    content_type
        MIME type sent along with the uploaded file, or None to omit it.
    file
        An already opened file-like object (binary mode) to upload instead of ``filename``.
    file_filename
        Name to send for the upload when ``file`` is used, e.g. "foo.txt". Required
        whenever ``file`` is given.
    strategy
        Partitioning strategy forwarded to the API in the ``strategy`` form field.
        Defaults to "hi_res".
    api_url
        The URL for the Unstructured API. Defaults to the hosted Unstructured API.
    api_key
        The API key sent in the ``UNSTRUCTURED-API-KEY`` header.

    Raises
    ------
    ValueError
        If ``file`` is given without ``file_filename``, or if the API answers with a
        status code other than 200. ``exactly_one`` additionally validates that only
        one of ``filename`` and ``file`` was supplied.
    """
    exactly_one(filename=filename, file=file)

    request_headers = {
        "ACCEPT": "application/json",
        "UNSTRUCTURED-API-KEY": api_key,
    }
    form_fields = {"strategy": strategy}

    if filename is not None:
        # Keep the handle open until the request has been sent.
        with open(filename, "rb") as handle:
            upload = [("files", (filename, handle, content_type))]
            response = requests.post(
                api_url,
                headers=request_headers,
                data=form_fields,
                files=upload,  # type: ignore
            )
    elif file is not None:
        if file_filename is None:
            raise ValueError(
                "If file is specified in partition_via_api, "
                "file_filename must be specified as well.",
            )
        upload = [("files", (file_filename, file, content_type))]  # type: ignore
        response = requests.post(
            api_url,
            headers=request_headers,
            data=form_fields,
            files=upload,  # type: ignore
        )

    if response.status_code != 200:
        raise ValueError(
            f"Receive unexpected status code {response.status_code} from the API.",
        )
    return elements_from_json(text=response.text)
|
|
|
|
|
|
def partition_multiple_via_api(
    filenames: Optional[List[str]] = None,
    content_types: Optional[List[str]] = None,
    files: Optional[List[IO]] = None,
    file_filenames: Optional[List[str]] = None,
    strategy: str = "hi_res",
    api_url: str = "https://api.unstructured.io/general/v0/general",
    api_key: str = "",
) -> List[List[Element]]:
    """Partitions multiple documents using the Unstructured REST API by batching
    the documents into a single HTTP request.

    See https://api.unstructured.io/general/docs for the hosted API documentation or
    https://github.com/Unstructured-IO/unstructured-api for instructions on how to run
    the API locally as a container.

    Parameters
    ----------
    filenames
        A list of strings defining the target filename paths. Takes precedence over
        ``files`` when both are given.
    content_types
        A list of strings defining the file contents in MIME types. When non-empty it
        must have one entry per document; None or an empty list means no content type
        is sent.
    files
        A list of file-like objects using "rb" mode --> open(filename, "rb").
    file_filenames
        When files is not None, the filenames (strings) to send for each file-like
        object, e.g. ["foo.txt"]. Must have the same length as ``files``.
    strategy
        Partitioning strategy forwarded to the API in the ``strategy`` form field.
        Defaults to "hi_res".
    api_url
        The URL for the Unstructured API. Defaults to the hosted Unstructured API.
    api_key
        The API key to pass to the Unstructured API.

    Returns
    -------
    One list of elements per document in the API response.

    Raises
    ------
    ValueError
        If neither ``filenames`` nor ``files`` is given, if the argument lists have
        mismatched lengths, if ``files`` is given without ``file_filenames``, or if
        the API answers with a status code other than 200.
    """
    # Fail early with a clear message instead of an UnboundLocalError further down.
    if filenames is None and files is None:
        raise ValueError("Either filenames or files must be specified.")

    headers = {
        "ACCEPT": "application/json",
        "UNSTRUCTURED-API-KEY": api_key,
    }
    data = {"strategy": strategy}

    if filenames is not None:
        if content_types and len(content_types) != len(filenames):
            raise ValueError("content_types and filenames must have the same length.")

        # An empty content_types list is treated the same as None, matching the
        # length check above.
        mime_types: List[Optional[str]] = (
            list(content_types) if content_types else [None] * len(filenames)
        )

        # ExitStack keeps every file open until the request has been sent and closes
        # all of them afterwards, even if opening a later file or the request fails.
        with contextlib.ExitStack() as stack:
            handles = [stack.enter_context(open(path, "rb")) for path in filenames]
            _files = [
                ("files", (path, handle, mime))
                for path, handle, mime in zip(filenames, handles, mime_types)
            ]
            response = requests.post(
                api_url,
                headers=headers,
                data=data,
                files=_files,  # type: ignore
            )
    else:
        # `files` cannot be None here because of the guard at the top.
        if content_types and len(content_types) != len(files):  # type: ignore
            raise ValueError("content_types and files must have the same length.")

        if not file_filenames:
            raise ValueError("file_filenames must be specified if files are passed")
        elif len(file_filenames) != len(files):  # type: ignore
            raise ValueError("file_filenames and files must have the same length.")

        mime_types = list(content_types) if content_types else [None] * len(files)  # type: ignore
        _files = [
            ("files", (name, _file, mime))
            for name, _file, mime in zip(file_filenames, files, mime_types)  # type: ignore
        ]
        response = requests.post(api_url, headers=headers, data=data, files=_files)  # type: ignore

    if response.status_code != 200:
        raise ValueError(
            f"Receive unexpected status code {response.status_code} from the API.",
        )

    response_list = response.json()
    # NOTE(robinson) - this check is because if only one filename is passed, the return
    # type from the API is a list of objects instead of a list of lists. An empty
    # payload is treated the same way (a single document with no elements) so that
    # indexing it cannot raise IndexError.
    if not response_list or not isinstance(response_list[0], list):
        response_list = [response_list]

    return [dict_to_elements(document) for document in response_list]
|