diff --git a/CHANGELOG.md b/CHANGELOG.md index 0064e1d89..d59e5adfd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.9.1-dev6 +## 0.9.1-dev7 ### Enhancements @@ -13,12 +13,12 @@ ### Fixes +* Remove unused `_partition_via_api` function * Fixed emoji bug in `partition_xlsx`. -* Pass file_filename metadata when partitioning file object +* Pass `file_filename` metadata when partitioning file object * Skip ingest test on missing Slack token * Add Dropbox variables to CI environments * Adds new element type `EmailAddress` for recognising email address in the  text - * Simplifies `min_partition` logic; makes partitions falling below the `min_partition` less likely. diff --git a/unstructured/__version__.py b/unstructured/__version__.py index e08b09a78..e474e6f79 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.9.1-dev6" # pragma: no cover +__version__ = "0.9.1-dev7" # pragma: no cover diff --git a/unstructured/partition/__init__.py b/unstructured/partition/__init__.py index b3ed9e2fe..e69de29bb 100644 --- a/unstructured/partition/__init__.py +++ b/unstructured/partition/__init__.py @@ -1,56 +0,0 @@ -from typing import BinaryIO, List, Mapping, Optional, Tuple, Union -from urllib.parse import urlsplit - -import requests # type: ignore - -from unstructured.documents.elements import Element - - -def _partition_via_api( - filename: str = "", - file: Optional[Union[BinaryIO, bytes]] = None, - url: str = "https://ml.unstructured.io/layout/pdf", - token: Optional[str] = None, - data: Optional[dict] = None, # NOTE(alan): Remove after different models are handled by routing - include_page_breaks: bool = False, -) -> List[Element]: - """Use API for partitioning.""" - if not filename and not file: - raise FileNotFoundError("No filename nor file were specified") - - split_url = urlsplit(url) - healthcheck_url = f"{split_url.scheme}://{split_url.netloc}/healthcheck" - healthcheck_response = requests.models.Response() - if not token: - healthcheck_response = requests.get(url=healthcheck_url) - - if healthcheck_response.status_code != 200: - raise ValueError("endpoint api healthcheck has failed!") - - file_: Mapping[str, Tuple[str, Union[BinaryIO, bytes]]] = { - "file": ( - filename, - file if file else open(filename, "rb"), # noqa: SIM115 - ), - } - response = requests.post( - url=url, - headers={"Authorization": f"Bearer {token}" if token else ""}, - files=file_, - data=data, # NOTE(alan): Remove after unstructured API is using routing - ) - - if response.status_code == 200: - pages = response.json()["pages"] - num_pages = len(pages) - elements = [] - for i, page in enumerate(pages): - for element in page["elements"]: - elements.append(element) - if include_page_breaks and i < num_pages - 1: - elements.append({"type": "PageBreak"}) - - return elements - - else: - raise ValueError(f"response status code = {response.status_code}")