chore: deprecation warning for file_filename (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
This commit is contained in:
Matt Robinson 2023-08-24 03:02:47 -04:00 committed by GitHub
parent 835378aba6
commit cdae53cc29
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 189 additions and 54 deletions

View File

@ -9,6 +9,8 @@
* Fix bug in `partition_pdf_or_image` where two partitions were called if `strategy == "ocr_only"`.
* Bump unstructured-inference
* Fix issue where temporary files were being left behind (0.5.16)
* Adds deprecation warning for the `file_filename` kwarg to `partition`, `partition_via_api`,
and `partition_multiple_via_api`.
* Fix documentation build workflow by pinning dependencies
## 0.10.5

View File

@ -496,7 +496,7 @@ Examples:
with ExitStack() as stack:
files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
documents = partition_multiple_via_api(files=files, file_filenames=filenames)
documents = partition_multiple_via_api(files=files, metadata_filenames=filenames)
For more information about the ``partition_multiple_via_api`` brick, you can check the `source code here <https://github.com/Unstructured-IO/unstructured/blob/a583d47b841bdd426b9058b7c34f6aa3ed8de152/unstructured/partition/api.py>`_.
@ -794,7 +794,7 @@ type for the file. If you do not explicitly pass it, the MIME type will be infer
elements = partition_via_api(filename=filename, api_key="MY_API_KEY", content_type="message/rfc822")
with open(filename, "rb") as f:
elements = partition_via_api(file=f, file_filename=filename, api_key="MY_API_KEY")
elements = partition_via_api(file=f, metadata_filename=filename, api_key="MY_API_KEY")
You can pass additional settings such as ``strategy``, ``ocr_languages`` and ``encoding`` to the

View File

@ -67,11 +67,38 @@ def test_partition_via_api_from_file(monkeypatch):
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)
with open(filename, "rb") as f:
elements = partition_via_api(file=f, file_filename=filename)
elements = partition_via_api(file=f, metadata_filename=filename)
assert elements[0] == NarrativeText("This is a test email to use for unit tests.")
assert elements[0].metadata.filetype == "message/rfc822"
def test_partition_via_api_from_file_warns_with_file_filename(monkeypatch, caplog):
monkeypatch.setattr(
requests,
"post",
lambda *args, **kwargs: MockResponse(status_code=200),
)
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)
with open(filename, "rb") as f:
partition_via_api(file=f, file_filename=filename)
assert "WARNING" in caplog.text
assert "The file_filename kwarg will be deprecated" in caplog.text
def test_partition_via_api_from_file_raises_with_metadata_and_file_filename(monkeypatch):
monkeypatch.setattr(
requests,
"post",
lambda *args, **kwargs: MockResponse(status_code=200),
)
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)
with open(filename, "rb") as f, pytest.raises(ValueError):
partition_via_api(file=f, file_filename=filename, metadata_filename=filename)
def test_partition_via_api_from_file_raises_without_filename(monkeypatch):
monkeypatch.setattr(
requests,
@ -246,13 +273,57 @@ def test_partition_multiple_via_api_from_files(monkeypatch):
files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
elements = partition_multiple_via_api(
files=files,
file_filenames=filenames,
metadata_filenames=filenames,
)
assert len(elements) == 2
assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.")
assert elements[0][0].metadata.filetype == "message/rfc822"
def test_partition_multiple_via_api_warns_with_file_filename(monkeypatch, caplog):
monkeypatch.setattr(
requests,
"post",
lambda *args, **kwargs: MockMultipleResponse(status_code=200),
)
filenames = [
os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE),
os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"),
]
with contextlib.ExitStack() as stack:
files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
partition_multiple_via_api(
files=files,
file_filenames=filenames,
)
assert "WARNING" in caplog.text
assert "The file_filenames kwarg will be deprecated" in caplog.text
def test_partition_multiple_via_api_warns_with_file_and_metadata_filename(monkeypatch):
monkeypatch.setattr(
requests,
"post",
lambda *args, **kwargs: MockMultipleResponse(status_code=200),
)
filenames = [
os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE),
os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"),
]
with contextlib.ExitStack() as stack:
files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
with pytest.raises(ValueError):
partition_multiple_via_api(
files=files,
metadata_filenames=filenames,
file_filenames=filenames,
)
def test_partition_multiple_via_api_raises_with_bad_response(monkeypatch):
monkeypatch.setattr(
requests,
@ -305,7 +376,7 @@ def test_partition_multiple_via_api_from_files_raises_with_size_mismatch(monkeyp
with pytest.raises(ValueError):
partition_multiple_via_api(
files=files,
file_filenames=filenames,
metadata_filenames=filenames,
content_types=["text/plain"],
)

View File

@ -118,24 +118,24 @@ def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_element
@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
("pass_metadata_filename", "content_type"),
[(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
)
def test_auto_partition_doc_with_filename(
mock_docx_document,
expected_docx_elements,
tmpdir,
pass_file_filename,
pass_metadata_filename,
content_type,
):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_docx_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
file_filename = doc_filename if pass_file_filename else None
metadata_filename = doc_filename if pass_metadata_filename else None
elements = partition(
filename=doc_filename,
file_filename=file_filename,
metadata_filename=metadata_filename,
content_type=content_type,
strategy="hi_res",
)
@ -159,15 +159,15 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements
@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
("pass_metadata_filename", "content_type"),
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_filename(pass_file_filename, content_type):
def test_auto_partition_html_from_filename(pass_metadata_filename, content_type):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
file_filename = filename if pass_file_filename else None
metadata_filename = filename if pass_metadata_filename else None
elements = partition(
filename=filename,
file_filename=file_filename,
metadata_filename=metadata_filename,
content_type=content_type,
strategy="hi_res",
)
@ -177,16 +177,16 @@ def test_auto_partition_html_from_filename(pass_file_filename, content_type):
@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
("pass_metadata_filename", "content_type"),
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_file(pass_file_filename, content_type):
def test_auto_partition_html_from_file(pass_metadata_filename, content_type):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
file_filename = filename if pass_file_filename else None
metadata_filename = filename if pass_metadata_filename else None
with open(filename) as f:
elements = partition(
file=f,
file_filename=file_filename,
metadata_filename=metadata_filename,
content_type=content_type,
strategy="hi_res",
)
@ -285,16 +285,16 @@ def test_auto_partition_text_from_file():
@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
("pass_metadata_filename", "content_type"),
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_filename(pass_file_filename, content_type, request):
def test_auto_partition_pdf_from_filename(pass_metadata_filename, content_type, request):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
file_filename = filename if pass_file_filename else None
metadata_filename = filename if pass_metadata_filename else None
elements = partition(
filename=filename,
file_filename=file_filename,
metadata_filename=metadata_filename,
content_type=content_type,
strategy="hi_res",
)
@ -332,6 +332,7 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
mock_partition.assert_called_once_with(
filename=filename,
metadata_filename=None,
file=None,
url=None,
include_page_breaks=False,
@ -342,17 +343,17 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
("pass_metadata_filename", "content_type"),
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_file(pass_file_filename, content_type, request):
def test_auto_partition_pdf_from_file(pass_metadata_filename, content_type, request):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
file_filename = filename if pass_file_filename else None
metadata_filename = filename if pass_metadata_filename else None
with open(filename, "rb") as f:
elements = partition(
file=f,
file_filename=file_filename,
metadata_filename=metadata_filename,
content_type=content_type,
strategy="hi_res",
)
@ -379,15 +380,15 @@ def test_partition_pdf_doesnt_raise_warning():
@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
("pass_metadata_filename", "content_type"),
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_image_default_strategy_hi_res(pass_file_filename, content_type):
def test_auto_partition_image_default_strategy_hi_res(pass_metadata_filename, content_type):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
file_filename = filename if pass_file_filename else None
metadata_filename = filename if pass_metadata_filename else None
elements = partition(
filename=filename,
file_filename=file_filename,
metadata_filename=metadata_filename,
content_type=content_type,
strategy="auto",
)
@ -399,15 +400,15 @@ def test_auto_partition_image_default_strategy_hi_res(pass_file_filename, conten
@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
("pass_metadata_filename", "content_type"),
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpg(pass_file_filename, content_type):
def test_auto_partition_jpg(pass_metadata_filename, content_type):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
file_filename = filename if pass_file_filename else None
metadata_filename = filename if pass_metadata_filename else None
elements = partition(
filename=filename,
file_filename=file_filename,
metadata_filename=metadata_filename,
content_type=content_type,
strategy="auto",
)
@ -415,16 +416,16 @@ def test_auto_partition_jpg(pass_file_filename, content_type):
@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
("pass_metadata_filename", "content_type"),
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpg_from_file(pass_file_filename, content_type):
def test_auto_partition_jpg_from_file(pass_metadata_filename, content_type):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
file_filename = filename if pass_file_filename else None
metadata_filename = filename if pass_metadata_filename else None
with open(filename, "rb") as f:
elements = partition(
file=f,
file_filename=file_filename,
metadata_filename=metadata_filename,
content_type=content_type,
strategy="auto",
)
@ -874,11 +875,26 @@ def test_auto_partition_rst_from_file(filename="example-docs/README.rst"):
assert elements[0].metadata.filetype == "text/x-rst"
def test_auto_partition_metadata_file_filename():
def test_auto_partition_metadata_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
with open(filename) as f:
elements = partition(file=f, metadata_filename=filename)
assert elements[0].metadata.filename == os.path.split(filename)[-1]
def test_auto_partition_warns_about_file_filename_deprecation(caplog):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
with open(filename) as f:
elements = partition(file=f, file_filename=filename)
assert elements[0].metadata.filename == os.path.split(filename)[-1]
assert "WARNING" in caplog.text
assert "The file_filename kwarg will be deprecated" in caplog.text
def test_auto_partition_raises_with_file_and_metadata_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
with open(filename) as f, pytest.raises(ValueError):
partition(file=f, file_filename=filename, metadata_filename=filename)
def test_get_partition_with_extras_prompts_for_install_if_missing():

View File

@ -8,6 +8,7 @@ from typing import (
import requests
from unstructured.documents.elements import Element
from unstructured.logger import logger
from unstructured.partition.common import exactly_one
from unstructured.staging.base import dict_to_elements, elements_from_json
@ -19,6 +20,7 @@ def partition_via_api(
file_filename: Optional[str] = None,
api_url: str = "https://api.unstructured.io/general/v0/general",
api_key: str = "",
metadata_filename: Optional[str] = None,
**request_kwargs,
) -> List[Element]:
"""Partitions a document using the Unstructured REST API. This is equivalent to
@ -36,7 +38,7 @@ def partition_via_api(
A string defining the file content in MIME type
file
A file-like object using "rb" mode --> open(filename, "rb").
file_filename
metadata_filename
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
api_url
The URL for the Unstructured API. Defaults to the hosted Unstructured API.
@ -48,6 +50,19 @@ def partition_via_api(
"""
exactly_one(filename=filename, file=file)
if metadata_filename and file_filename:
raise ValueError(
"Only one of metadata_filename and file_filename is specified. "
"metadata_filename is preferred. file_filename is marked for deprecation.",
)
if file_filename is not None:
metadata_filename = file_filename
logger.warn(
"The file_filename kwarg will be deprecated in a future version of unstructured. "
"Please use metadata_filename instead.",
)
headers = {
"ACCEPT": "application/json",
"UNSTRUCTURED-API-KEY": api_key,
@ -65,13 +80,13 @@ def partition_via_api(
files=files, # type: ignore
)
elif file is not None:
if file_filename is None:
if metadata_filename is None:
raise ValueError(
"If file is specified in partition_via_api, "
"file_filename must be specified as well.",
"metadata_filename must be specified as well.",
)
files = [
("files", (file_filename, file, content_type)), # type: ignore
("files", (metadata_filename, file, content_type)), # type: ignore
]
response = requests.post(
api_url,
@ -95,6 +110,7 @@ def partition_multiple_via_api(
file_filenames: Optional[List[str]] = None,
api_url: str = "https://api.unstructured.io/general/v0/general",
api_key: str = "",
metadata_filenames: Optional[List[str]] = None,
**request_kwargs,
) -> List[List[Element]]:
"""Partitions multiple document using the Unstructured REST API by batching
@ -112,7 +128,7 @@ def partition_multiple_via_api(
A list of strings defining the file contents in MIME types.
files
A list of file-like object using "rb" mode --> open(filename, "rb").
file_filename
metadata_filename
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
api_url
The URL for the Unstructured API. Defaults to the hosted Unstructured API.
@ -127,6 +143,19 @@ def partition_multiple_via_api(
"UNSTRUCTURED-API-KEY": api_key,
}
if metadata_filenames and file_filenames:
raise ValueError(
"Only one of metadata_filenames and file_filenames is specified. "
"metadata_filenames is preferred. file_filenames is marked for deprecation.",
)
if file_filenames is not None:
metadata_filenames = file_filenames
logger.warn(
"The file_filenames kwarg will be deprecated in a future version of unstructured. "
"Please use metadata_filenames instead.",
)
if filenames is not None:
if content_types and len(content_types) != len(filenames):
raise ValueError("content_types and filenames must have the same length.")
@ -151,15 +180,15 @@ def partition_multiple_via_api(
if content_types and len(content_types) != len(files):
raise ValueError("content_types and files must have the same length.")
if not file_filenames:
raise ValueError("file_filenames must be specified if files are passed")
elif len(file_filenames) != len(files):
raise ValueError("file_filenames and files must have the same length.")
if not metadata_filenames:
raise ValueError("metadata_filenames must be specified if files are passed")
elif len(metadata_filenames) != len(files):
raise ValueError("metadata_filenames and files must have the same length.")
_files = []
for i, _file in enumerate(files): # type: ignore
content_type = content_types[i] if content_types is not None else None
filename = file_filenames[i]
filename = metadata_filenames[i]
_files.append(("files", (filename, _file, content_type)))
response = requests.post(

View File

@ -132,6 +132,7 @@ def partition(
pdf_infer_table_structure: bool = False,
xml_keep_tags: bool = False,
data_source_metadata: Optional[DataSourceMetadata] = None,
metadata_filename: Optional[str] = None,
**kwargs,
):
"""Partitions a document into its constituent elements. Will use libmagic to determine
@ -147,7 +148,7 @@ def partition(
A string defining the file content in MIME type
file
A file-like object using "rb" mode --> open(filename, "rb").
file_filename
metadata_filename
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
url
The url for a remote document. Pass in content_type if you want partition to treat
@ -181,6 +182,20 @@ def partition(
"""
exactly_one(file=file, filename=filename, url=url)
if metadata_filename and file_filename:
raise ValueError(
"Only one of metadata_filename and file_filename is specified. "
"metadata_filename is preferred. file_filename is marked for deprecation.",
)
if file_filename is not None:
metadata_filename = file_filename
logger.warn(
"The file_filename kwarg will be deprecated in a future version of unstructured. "
"Please use metadata_filename instead.",
)
kwargs.setdefault("metadata_filename", metadata_filename)
if url is not None:
file, filetype = file_and_type_from_url(
url=url,
@ -197,7 +212,7 @@ def partition(
filetype = detect_filetype(
filename=filename,
file=file,
file_filename=file_filename,
file_filename=metadata_filename,
content_type=content_type,
encoding=encoding,
)
@ -211,9 +226,6 @@ def partition(
pdf_infer_table_structure,
)
if file is not None and file_filename is not None:
kwargs.setdefault("metadata_filename", file_filename)
if filetype == FileType.DOC:
_partition_doc = _get_partition_with_extras("doc")
elements = _partition_doc(filename=filename, file=file, **kwargs)

View File

@ -1,4 +1,5 @@
import tempfile
import warnings
from typing import IO, List, Optional
from ebooklib import epub
@ -52,7 +53,11 @@ def partition_epub(
filename = tmp.name
last_modification_date = get_last_modified_date_from_file(file)
book = epub.read_epub(filename, options={"ignore_ncx": False})
# NOTE(robinson): ignore ebooklib warning about changing the ignore_ncx default
# in the future.
with warnings.catch_warnings():
warnings.simplefilter("ignore")
book = epub.read_epub(filename, options={"ignore_ncx": False})
# book.items also includes EpubLink, EpubImage, EpubNcx (page navigation info)
# and EpubItem (fomatting/css)
html_items = [item for item in book.items if isinstance(item, epub.EpubHtml)]