chore: deprecation warning for file_filename (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
This commit is contained in:
Matt Robinson 2023-08-24 03:02:47 -04:00 committed by GitHub
parent 835378aba6
commit cdae53cc29
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 189 additions and 54 deletions

View File

@ -9,6 +9,8 @@
* Fix bug in `partition_pdf_or_image` where two partitions were called if `strategy == "ocr_only"`. * Fix bug in `partition_pdf_or_image` where two partitions were called if `strategy == "ocr_only"`.
* Bump unstructured-inference * Bump unstructured-inference
* Fix issue where temporary files were being left behind (0.5.16) * Fix issue where temporary files were being left behind (0.5.16)
* Adds deprecation warning for the `file_filename` kwarg to `partition`, `partition_via_api`,
and `partition_multiple_via_api`.
* Fix documentation build workflow by pinning dependencies * Fix documentation build workflow by pinning dependencies
## 0.10.5 ## 0.10.5

View File

@ -496,7 +496,7 @@ Examples:
with ExitStack() as stack: with ExitStack() as stack:
files = [stack.enter_context(open(filename, "rb")) for filename in filenames] files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
documents = partition_multiple_via_api(files=files, file_filenames=filenames) documents = partition_multiple_via_api(files=files, metadata_filenames=filenames)
For more information about the ``partition_multiple_via_api`` brick, you can check the `source code here <https://github.com/Unstructured-IO/unstructured/blob/a583d47b841bdd426b9058b7c34f6aa3ed8de152/unstructured/partition/api.py>`_. For more information about the ``partition_multiple_via_api`` brick, you can check the `source code here <https://github.com/Unstructured-IO/unstructured/blob/a583d47b841bdd426b9058b7c34f6aa3ed8de152/unstructured/partition/api.py>`_.
@ -794,7 +794,7 @@ type for the file. If you do not explicitly pass it, the MIME type will be infer
elements = partition_via_api(filename=filename, api_key="MY_API_KEY", content_type="message/rfc822") elements = partition_via_api(filename=filename, api_key="MY_API_KEY", content_type="message/rfc822")
with open(filename, "rb") as f: with open(filename, "rb") as f:
elements = partition_via_api(file=f, file_filename=filename, api_key="MY_API_KEY") elements = partition_via_api(file=f, metadata_filename=filename, api_key="MY_API_KEY")
You can pass additional settings such as ``strategy``, ``ocr_languages`` and ``encoding`` to the You can pass additional settings such as ``strategy``, ``ocr_languages`` and ``encoding`` to the

View File

@ -67,11 +67,38 @@ def test_partition_via_api_from_file(monkeypatch):
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE) filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)
with open(filename, "rb") as f: with open(filename, "rb") as f:
elements = partition_via_api(file=f, file_filename=filename) elements = partition_via_api(file=f, metadata_filename=filename)
assert elements[0] == NarrativeText("This is a test email to use for unit tests.") assert elements[0] == NarrativeText("This is a test email to use for unit tests.")
assert elements[0].metadata.filetype == "message/rfc822" assert elements[0].metadata.filetype == "message/rfc822"
def test_partition_via_api_from_file_warns_with_file_filename(monkeypatch, caplog):
monkeypatch.setattr(
requests,
"post",
lambda *args, **kwargs: MockResponse(status_code=200),
)
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)
with open(filename, "rb") as f:
partition_via_api(file=f, file_filename=filename)
assert "WARNING" in caplog.text
assert "The file_filename kwarg will be deprecated" in caplog.text
def test_partition_via_api_from_file_raises_with_metadata_and_file_filename(monkeypatch):
monkeypatch.setattr(
requests,
"post",
lambda *args, **kwargs: MockResponse(status_code=200),
)
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)
with open(filename, "rb") as f, pytest.raises(ValueError):
partition_via_api(file=f, file_filename=filename, metadata_filename=filename)
def test_partition_via_api_from_file_raises_without_filename(monkeypatch): def test_partition_via_api_from_file_raises_without_filename(monkeypatch):
monkeypatch.setattr( monkeypatch.setattr(
requests, requests,
@ -246,13 +273,57 @@ def test_partition_multiple_via_api_from_files(monkeypatch):
files = [stack.enter_context(open(filename, "rb")) for filename in filenames] files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
elements = partition_multiple_via_api( elements = partition_multiple_via_api(
files=files, files=files,
file_filenames=filenames, metadata_filenames=filenames,
) )
assert len(elements) == 2 assert len(elements) == 2
assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.") assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.")
assert elements[0][0].metadata.filetype == "message/rfc822" assert elements[0][0].metadata.filetype == "message/rfc822"
def test_partition_multiple_via_api_warns_with_file_filename(monkeypatch, caplog):
monkeypatch.setattr(
requests,
"post",
lambda *args, **kwargs: MockMultipleResponse(status_code=200),
)
filenames = [
os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE),
os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"),
]
with contextlib.ExitStack() as stack:
files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
partition_multiple_via_api(
files=files,
file_filenames=filenames,
)
assert "WARNING" in caplog.text
assert "The file_filenames kwarg will be deprecated" in caplog.text
def test_partition_multiple_via_api_warns_with_file_and_metadata_filename(monkeypatch):
monkeypatch.setattr(
requests,
"post",
lambda *args, **kwargs: MockMultipleResponse(status_code=200),
)
filenames = [
os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE),
os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"),
]
with contextlib.ExitStack() as stack:
files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
with pytest.raises(ValueError):
partition_multiple_via_api(
files=files,
metadata_filenames=filenames,
file_filenames=filenames,
)
def test_partition_multiple_via_api_raises_with_bad_response(monkeypatch): def test_partition_multiple_via_api_raises_with_bad_response(monkeypatch):
monkeypatch.setattr( monkeypatch.setattr(
requests, requests,
@ -305,7 +376,7 @@ def test_partition_multiple_via_api_from_files_raises_with_size_mismatch(monkeyp
with pytest.raises(ValueError): with pytest.raises(ValueError):
partition_multiple_via_api( partition_multiple_via_api(
files=files, files=files,
file_filenames=filenames, metadata_filenames=filenames,
content_types=["text/plain"], content_types=["text/plain"],
) )

View File

@ -118,24 +118,24 @@ def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_element
@pytest.mark.parametrize( @pytest.mark.parametrize(
("pass_file_filename", "content_type"), ("pass_metadata_filename", "content_type"),
[(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)], [(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
) )
def test_auto_partition_doc_with_filename( def test_auto_partition_doc_with_filename(
mock_docx_document, mock_docx_document,
expected_docx_elements, expected_docx_elements,
tmpdir, tmpdir,
pass_file_filename, pass_metadata_filename,
content_type, content_type,
): ):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_docx_document.save(docx_filename) mock_docx_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc") convert_office_doc(docx_filename, tmpdir.dirname, "doc")
file_filename = doc_filename if pass_file_filename else None metadata_filename = doc_filename if pass_metadata_filename else None
elements = partition( elements = partition(
filename=doc_filename, filename=doc_filename,
file_filename=file_filename, metadata_filename=metadata_filename,
content_type=content_type, content_type=content_type,
strategy="hi_res", strategy="hi_res",
) )
@ -159,15 +159,15 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements
@pytest.mark.parametrize( @pytest.mark.parametrize(
("pass_file_filename", "content_type"), ("pass_metadata_filename", "content_type"),
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)], [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
) )
def test_auto_partition_html_from_filename(pass_file_filename, content_type): def test_auto_partition_html_from_filename(pass_metadata_filename, content_type):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html") filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
file_filename = filename if pass_file_filename else None metadata_filename = filename if pass_metadata_filename else None
elements = partition( elements = partition(
filename=filename, filename=filename,
file_filename=file_filename, metadata_filename=metadata_filename,
content_type=content_type, content_type=content_type,
strategy="hi_res", strategy="hi_res",
) )
@ -177,16 +177,16 @@ def test_auto_partition_html_from_filename(pass_file_filename, content_type):
@pytest.mark.parametrize( @pytest.mark.parametrize(
("pass_file_filename", "content_type"), ("pass_metadata_filename", "content_type"),
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)], [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
) )
def test_auto_partition_html_from_file(pass_file_filename, content_type): def test_auto_partition_html_from_file(pass_metadata_filename, content_type):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html") filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
file_filename = filename if pass_file_filename else None metadata_filename = filename if pass_metadata_filename else None
with open(filename) as f: with open(filename) as f:
elements = partition( elements = partition(
file=f, file=f,
file_filename=file_filename, metadata_filename=metadata_filename,
content_type=content_type, content_type=content_type,
strategy="hi_res", strategy="hi_res",
) )
@ -285,16 +285,16 @@ def test_auto_partition_text_from_file():
@pytest.mark.parametrize( @pytest.mark.parametrize(
("pass_file_filename", "content_type"), ("pass_metadata_filename", "content_type"),
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)], [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
) )
def test_auto_partition_pdf_from_filename(pass_file_filename, content_type, request): def test_auto_partition_pdf_from_filename(pass_metadata_filename, content_type, request):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf") filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
file_filename = filename if pass_file_filename else None metadata_filename = filename if pass_metadata_filename else None
elements = partition( elements = partition(
filename=filename, filename=filename,
file_filename=file_filename, metadata_filename=metadata_filename,
content_type=content_type, content_type=content_type,
strategy="hi_res", strategy="hi_res",
) )
@ -332,6 +332,7 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
mock_partition.assert_called_once_with( mock_partition.assert_called_once_with(
filename=filename, filename=filename,
metadata_filename=None,
file=None, file=None,
url=None, url=None,
include_page_breaks=False, include_page_breaks=False,
@ -342,17 +343,17 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
@pytest.mark.parametrize( @pytest.mark.parametrize(
("pass_file_filename", "content_type"), ("pass_metadata_filename", "content_type"),
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)], [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
) )
def test_auto_partition_pdf_from_file(pass_file_filename, content_type, request): def test_auto_partition_pdf_from_file(pass_metadata_filename, content_type, request):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf") filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
file_filename = filename if pass_file_filename else None metadata_filename = filename if pass_metadata_filename else None
with open(filename, "rb") as f: with open(filename, "rb") as f:
elements = partition( elements = partition(
file=f, file=f,
file_filename=file_filename, metadata_filename=metadata_filename,
content_type=content_type, content_type=content_type,
strategy="hi_res", strategy="hi_res",
) )
@ -379,15 +380,15 @@ def test_partition_pdf_doesnt_raise_warning():
@pytest.mark.parametrize( @pytest.mark.parametrize(
("pass_file_filename", "content_type"), ("pass_metadata_filename", "content_type"),
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)], [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
) )
def test_auto_partition_image_default_strategy_hi_res(pass_file_filename, content_type): def test_auto_partition_image_default_strategy_hi_res(pass_metadata_filename, content_type):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg") filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
file_filename = filename if pass_file_filename else None metadata_filename = filename if pass_metadata_filename else None
elements = partition( elements = partition(
filename=filename, filename=filename,
file_filename=file_filename, metadata_filename=metadata_filename,
content_type=content_type, content_type=content_type,
strategy="auto", strategy="auto",
) )
@ -399,15 +400,15 @@ def test_auto_partition_image_default_strategy_hi_res(pass_file_filename, conten
@pytest.mark.parametrize( @pytest.mark.parametrize(
("pass_file_filename", "content_type"), ("pass_metadata_filename", "content_type"),
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)], [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
) )
def test_auto_partition_jpg(pass_file_filename, content_type): def test_auto_partition_jpg(pass_metadata_filename, content_type):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg") filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
file_filename = filename if pass_file_filename else None metadata_filename = filename if pass_metadata_filename else None
elements = partition( elements = partition(
filename=filename, filename=filename,
file_filename=file_filename, metadata_filename=metadata_filename,
content_type=content_type, content_type=content_type,
strategy="auto", strategy="auto",
) )
@ -415,16 +416,16 @@ def test_auto_partition_jpg(pass_file_filename, content_type):
@pytest.mark.parametrize( @pytest.mark.parametrize(
("pass_file_filename", "content_type"), ("pass_metadata_filename", "content_type"),
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)], [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
) )
def test_auto_partition_jpg_from_file(pass_file_filename, content_type): def test_auto_partition_jpg_from_file(pass_metadata_filename, content_type):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg") filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
file_filename = filename if pass_file_filename else None metadata_filename = filename if pass_metadata_filename else None
with open(filename, "rb") as f: with open(filename, "rb") as f:
elements = partition( elements = partition(
file=f, file=f,
file_filename=file_filename, metadata_filename=metadata_filename,
content_type=content_type, content_type=content_type,
strategy="auto", strategy="auto",
) )
@ -874,11 +875,26 @@ def test_auto_partition_rst_from_file(filename="example-docs/README.rst"):
assert elements[0].metadata.filetype == "text/x-rst" assert elements[0].metadata.filetype == "text/x-rst"
def test_auto_partition_metadata_file_filename(): def test_auto_partition_metadata_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
with open(filename) as f:
elements = partition(file=f, metadata_filename=filename)
assert elements[0].metadata.filename == os.path.split(filename)[-1]
def test_auto_partition_warns_about_file_filename_deprecation(caplog):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt") filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
with open(filename) as f: with open(filename) as f:
elements = partition(file=f, file_filename=filename) elements = partition(file=f, file_filename=filename)
assert elements[0].metadata.filename == os.path.split(filename)[-1] assert elements[0].metadata.filename == os.path.split(filename)[-1]
assert "WARNING" in caplog.text
assert "The file_filename kwarg will be deprecated" in caplog.text
def test_auto_partition_raises_with_file_and_metadata_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
with open(filename) as f, pytest.raises(ValueError):
partition(file=f, file_filename=filename, metadata_filename=filename)
def test_get_partition_with_extras_prompts_for_install_if_missing(): def test_get_partition_with_extras_prompts_for_install_if_missing():

View File

@ -8,6 +8,7 @@ from typing import (
import requests import requests
from unstructured.documents.elements import Element from unstructured.documents.elements import Element
from unstructured.logger import logger
from unstructured.partition.common import exactly_one from unstructured.partition.common import exactly_one
from unstructured.staging.base import dict_to_elements, elements_from_json from unstructured.staging.base import dict_to_elements, elements_from_json
@ -19,6 +20,7 @@ def partition_via_api(
file_filename: Optional[str] = None, file_filename: Optional[str] = None,
api_url: str = "https://api.unstructured.io/general/v0/general", api_url: str = "https://api.unstructured.io/general/v0/general",
api_key: str = "", api_key: str = "",
metadata_filename: Optional[str] = None,
**request_kwargs, **request_kwargs,
) -> List[Element]: ) -> List[Element]:
"""Partitions a document using the Unstructured REST API. This is equivalent to """Partitions a document using the Unstructured REST API. This is equivalent to
@ -36,7 +38,7 @@ def partition_via_api(
A string defining the file content in MIME type A string defining the file content in MIME type
file file
A file-like object using "rb" mode --> open(filename, "rb"). A file-like object using "rb" mode --> open(filename, "rb").
file_filename metadata_filename
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt" When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
api_url api_url
The URL for the Unstructured API. Defaults to the hosted Unstructured API. The URL for the Unstructured API. Defaults to the hosted Unstructured API.
@ -48,6 +50,19 @@ def partition_via_api(
""" """
exactly_one(filename=filename, file=file) exactly_one(filename=filename, file=file)
if metadata_filename and file_filename:
raise ValueError(
"Only one of metadata_filename and file_filename is specified. "
"metadata_filename is preferred. file_filename is marked for deprecation.",
)
if file_filename is not None:
metadata_filename = file_filename
logger.warn(
"The file_filename kwarg will be deprecated in a future version of unstructured. "
"Please use metadata_filename instead.",
)
headers = { headers = {
"ACCEPT": "application/json", "ACCEPT": "application/json",
"UNSTRUCTURED-API-KEY": api_key, "UNSTRUCTURED-API-KEY": api_key,
@ -65,13 +80,13 @@ def partition_via_api(
files=files, # type: ignore files=files, # type: ignore
) )
elif file is not None: elif file is not None:
if file_filename is None: if metadata_filename is None:
raise ValueError( raise ValueError(
"If file is specified in partition_via_api, " "If file is specified in partition_via_api, "
"file_filename must be specified as well.", "metadata_filename must be specified as well.",
) )
files = [ files = [
("files", (file_filename, file, content_type)), # type: ignore ("files", (metadata_filename, file, content_type)), # type: ignore
] ]
response = requests.post( response = requests.post(
api_url, api_url,
@ -95,6 +110,7 @@ def partition_multiple_via_api(
file_filenames: Optional[List[str]] = None, file_filenames: Optional[List[str]] = None,
api_url: str = "https://api.unstructured.io/general/v0/general", api_url: str = "https://api.unstructured.io/general/v0/general",
api_key: str = "", api_key: str = "",
metadata_filenames: Optional[List[str]] = None,
**request_kwargs, **request_kwargs,
) -> List[List[Element]]: ) -> List[List[Element]]:
"""Partitions multiple document using the Unstructured REST API by batching """Partitions multiple document using the Unstructured REST API by batching
@ -112,7 +128,7 @@ def partition_multiple_via_api(
A list of strings defining the file contents in MIME types. A list of strings defining the file contents in MIME types.
files files
A list of file-like object using "rb" mode --> open(filename, "rb"). A list of file-like object using "rb" mode --> open(filename, "rb").
file_filename metadata_filename
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt" When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
api_url api_url
The URL for the Unstructured API. Defaults to the hosted Unstructured API. The URL for the Unstructured API. Defaults to the hosted Unstructured API.
@ -127,6 +143,19 @@ def partition_multiple_via_api(
"UNSTRUCTURED-API-KEY": api_key, "UNSTRUCTURED-API-KEY": api_key,
} }
if metadata_filenames and file_filenames:
raise ValueError(
"Only one of metadata_filenames and file_filenames is specified. "
"metadata_filenames is preferred. file_filenames is marked for deprecation.",
)
if file_filenames is not None:
metadata_filenames = file_filenames
logger.warn(
"The file_filenames kwarg will be deprecated in a future version of unstructured. "
"Please use metadata_filenames instead.",
)
if filenames is not None: if filenames is not None:
if content_types and len(content_types) != len(filenames): if content_types and len(content_types) != len(filenames):
raise ValueError("content_types and filenames must have the same length.") raise ValueError("content_types and filenames must have the same length.")
@ -151,15 +180,15 @@ def partition_multiple_via_api(
if content_types and len(content_types) != len(files): if content_types and len(content_types) != len(files):
raise ValueError("content_types and files must have the same length.") raise ValueError("content_types and files must have the same length.")
if not file_filenames: if not metadata_filenames:
raise ValueError("file_filenames must be specified if files are passed") raise ValueError("metadata_filenames must be specified if files are passed")
elif len(file_filenames) != len(files): elif len(metadata_filenames) != len(files):
raise ValueError("file_filenames and files must have the same length.") raise ValueError("metadata_filenames and files must have the same length.")
_files = [] _files = []
for i, _file in enumerate(files): # type: ignore for i, _file in enumerate(files): # type: ignore
content_type = content_types[i] if content_types is not None else None content_type = content_types[i] if content_types is not None else None
filename = file_filenames[i] filename = metadata_filenames[i]
_files.append(("files", (filename, _file, content_type))) _files.append(("files", (filename, _file, content_type)))
response = requests.post( response = requests.post(

View File

@ -132,6 +132,7 @@ def partition(
pdf_infer_table_structure: bool = False, pdf_infer_table_structure: bool = False,
xml_keep_tags: bool = False, xml_keep_tags: bool = False,
data_source_metadata: Optional[DataSourceMetadata] = None, data_source_metadata: Optional[DataSourceMetadata] = None,
metadata_filename: Optional[str] = None,
**kwargs, **kwargs,
): ):
"""Partitions a document into its constituent elements. Will use libmagic to determine """Partitions a document into its constituent elements. Will use libmagic to determine
@ -147,7 +148,7 @@ def partition(
A string defining the file content in MIME type A string defining the file content in MIME type
file file
A file-like object using "rb" mode --> open(filename, "rb"). A file-like object using "rb" mode --> open(filename, "rb").
file_filename metadata_filename
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt" When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
url url
The url for a remote document. Pass in content_type if you want partition to treat The url for a remote document. Pass in content_type if you want partition to treat
@ -181,6 +182,20 @@ def partition(
""" """
exactly_one(file=file, filename=filename, url=url) exactly_one(file=file, filename=filename, url=url)
if metadata_filename and file_filename:
raise ValueError(
"Only one of metadata_filename and file_filename is specified. "
"metadata_filename is preferred. file_filename is marked for deprecation.",
)
if file_filename is not None:
metadata_filename = file_filename
logger.warn(
"The file_filename kwarg will be deprecated in a future version of unstructured. "
"Please use metadata_filename instead.",
)
kwargs.setdefault("metadata_filename", metadata_filename)
if url is not None: if url is not None:
file, filetype = file_and_type_from_url( file, filetype = file_and_type_from_url(
url=url, url=url,
@ -197,7 +212,7 @@ def partition(
filetype = detect_filetype( filetype = detect_filetype(
filename=filename, filename=filename,
file=file, file=file,
file_filename=file_filename, file_filename=metadata_filename,
content_type=content_type, content_type=content_type,
encoding=encoding, encoding=encoding,
) )
@ -211,9 +226,6 @@ def partition(
pdf_infer_table_structure, pdf_infer_table_structure,
) )
if file is not None and file_filename is not None:
kwargs.setdefault("metadata_filename", file_filename)
if filetype == FileType.DOC: if filetype == FileType.DOC:
_partition_doc = _get_partition_with_extras("doc") _partition_doc = _get_partition_with_extras("doc")
elements = _partition_doc(filename=filename, file=file, **kwargs) elements = _partition_doc(filename=filename, file=file, **kwargs)

View File

@ -1,4 +1,5 @@
import tempfile import tempfile
import warnings
from typing import IO, List, Optional from typing import IO, List, Optional
from ebooklib import epub from ebooklib import epub
@ -52,7 +53,11 @@ def partition_epub(
filename = tmp.name filename = tmp.name
last_modification_date = get_last_modified_date_from_file(file) last_modification_date = get_last_modified_date_from_file(file)
book = epub.read_epub(filename, options={"ignore_ncx": False}) # NOTE(robinson): ignore ebooklib warning about changing the ignore_ncx default
# in the future.
with warnings.catch_warnings():
warnings.simplefilter("ignore")
book = epub.read_epub(filename, options={"ignore_ncx": False})
# book.items also includes EpubLink, EpubImage, EpubNcx (page navigation info) # book.items also includes EpubLink, EpubImage, EpubNcx (page navigation info)
# and EpubItem (fomatting/css) # and EpubItem (fomatting/css)
html_items = [item for item in book.items if isinstance(item, epub.EpubHtml)] html_items = [item for item in book.items if isinstance(item, epub.EpubHtml)]