chore: remove unused _partition_via_api function (#999)

* don't push

* fix: clean up code

* fix: remove unused _partition_via_api

* feat: update changelog

* clean up

* changelog and version

* remove print

* remove print

* revert test file

---------

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
This commit is contained in:
kravetsmic 2023-08-04 22:07:15 +03:00 committed by GitHub
parent bef93aef6e
commit 2888c20a46
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 4 additions and 60 deletions

View File

@ -1,4 +1,4 @@
## 0.9.1-dev6
## 0.9.1-dev7
### Enhancements
@ -13,12 +13,12 @@
### Fixes
* Remove unused `_partition_via_api` function
* Fixed emoji bug in `partition_xlsx`.
* Pass file_filename metadata when partitioning file object
* Pass `file_filename` metadata when partitioning file object
* Skip ingest test on missing Slack token
* Add Dropbox variables to CI environments
* Adds new element type `EmailAddress` for recognising email addresses in the text
* Simplifies `min_partition` logic; makes partitions falling below the `min_partition`
less likely.

View File

@ -1 +1 @@
__version__ = "0.9.1-dev6" # pragma: no cover
__version__ = "0.9.1-dev7" # pragma: no cover

View File

@ -1,56 +0,0 @@
from typing import BinaryIO, List, Mapping, Optional, Tuple, Union
from urllib.parse import urlsplit
import requests # type: ignore
from unstructured.documents.elements import Element
def _partition_via_api(
    filename: str = "",
    file: Optional[Union[BinaryIO, bytes]] = None,
    url: str = "https://ml.unstructured.io/layout/pdf",
    token: Optional[str] = None,
    data: Optional[dict] = None,  # NOTE(alan): Remove after different models are handled by routing
    include_page_breaks: bool = False,
) -> List[Element]:
    """Partition a document by uploading it to a remote partitioning API.

    Args:
        filename: Path of the document. Used as the uploaded file's name and,
            when ``file`` is falsy, opened in binary mode to obtain the content.
        file: Already-open binary file object or raw bytes to upload instead of
            reading ``filename`` from disk.
        url: Endpoint the document is POSTed to.
        token: Bearer token for the ``Authorization`` header. When no token is
            given, the ``/healthcheck`` route on the same host is probed first.
        data: Extra form fields forwarded unchanged with the upload.
        include_page_breaks: When true, a ``{"type": "PageBreak"}`` entry is
            inserted between the elements of consecutive pages.

    Returns:
        The concatenated ``elements`` entries of every page in the JSON response.
        NOTE(review): these are the decoded JSON values as returned by the API,
        not ``Element`` instances, despite the annotation -- confirm with callers.

    Raises:
        FileNotFoundError: If neither ``filename`` nor ``file`` is provided.
        ValueError: If the healthcheck or the partition request does not answer
            with HTTP 200.
    """
    if not filename and not file:
        raise FileNotFoundError("No filename nor file were specified")

    # Only unauthenticated calls probe the healthcheck route. The status check
    # lives inside this branch so that a token-bearing call is not rejected by
    # comparing the status of a response that was never fetched.
    if not token:
        split_url = urlsplit(url)
        healthcheck_url = f"{split_url.scheme}://{split_url.netloc}/healthcheck"
        healthcheck_response = requests.get(url=healthcheck_url)
        if healthcheck_response.status_code != 200:
            raise ValueError("endpoint api healthcheck has failed!")

    # Track a handle we open ourselves so it can be closed after the upload;
    # a caller-supplied ``file`` is left open because the caller owns it.
    opened_file = None
    payload: Union[BinaryIO, bytes]
    if file:
        payload = file
    else:
        opened_file = open(filename, "rb")  # noqa: SIM115
        payload = opened_file

    try:
        file_: Mapping[str, Tuple[str, Union[BinaryIO, bytes]]] = {
            "file": (filename, payload),
        }
        response = requests.post(
            url=url,
            headers={"Authorization": f"Bearer {token}" if token else ""},
            files=file_,
            data=data,  # NOTE(alan): Remove after unstructured API is using routing
        )
    finally:
        if opened_file is not None:
            opened_file.close()

    if response.status_code != 200:
        raise ValueError(f"response status code = {response.status_code}")

    pages = response.json()["pages"]
    last_page_index = len(pages) - 1
    elements = []
    for i, page in enumerate(pages):
        elements.extend(page["elements"])
        # No page break after the final page.
        if include_page_breaks and i < last_page_index:
            elements.append({"type": "PageBreak"})
    return elements