mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-15 20:27:37 +00:00
chore: deprecation warning for file_filename
(#1191)
### Summary Closes #1007. Adds a deprecation warning for the `file_filename` kwarg to `partition`, `partition_via_api`, and `partition_multiple_via_api`. Also catches a warning in `ebooklib` that we do not want to emit in `unstructured`. ### Testing ```python from unstructured.partition.auto import partition filename = "example-docs/winter-sports.epub" # Should not emit a warning with open(filename, "rb") as f: elements = partition(file=f, metadata_filename="test.epub") # Should be test.epub elements[0].metadata.filename # Should emit a warning with open(filename, "rb") as f: elements = partition(file=f, file_filename="test.epub") # Should be test.epub elements[0].metadata.filename # Should raise an error with open(filename, "rb") as f: elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub") ```
This commit is contained in:
parent
835378aba6
commit
cdae53cc29
@ -9,6 +9,8 @@
|
||||
* Fix bug in `partition_pdf_or_image` where two partitions were called if `strategy == "ocr_only"`.
|
||||
* Bump unstructured-inference
|
||||
* Fix issue where temporary files were being left behind (0.5.16)
|
||||
* Adds deprecation warning for the `file_filename` kwarg to `partition`, `partition_via_api`,
|
||||
and `partition_multiple_via_api`.
|
||||
* Fix documentation build workflow by pinning dependencies
|
||||
|
||||
## 0.10.5
|
||||
|
@ -496,7 +496,7 @@ Examples:
|
||||
|
||||
with ExitStack() as stack:
|
||||
files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
|
||||
documents = partition_multiple_via_api(files=files, file_filenames=filenames)
|
||||
documents = partition_multiple_via_api(files=files, metadata_filenames=filenames)
|
||||
|
||||
For more information about the ``partition_multiple_via_api`` brick, you can check the `source code here <https://github.com/Unstructured-IO/unstructured/blob/a583d47b841bdd426b9058b7c34f6aa3ed8de152/unstructured/partition/api.py>`_.
|
||||
|
||||
@ -794,7 +794,7 @@ type for the file. If you do not explicitly pass it, the MIME type will be infer
|
||||
elements = partition_via_api(filename=filename, api_key="MY_API_KEY", content_type="message/rfc822")
|
||||
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_via_api(file=f, file_filename=filename, api_key="MY_API_KEY")
|
||||
elements = partition_via_api(file=f, metadata_filename=filename, api_key="MY_API_KEY")
|
||||
|
||||
|
||||
You can pass additional settings such as ``strategy``, ``ocr_languages`` and ``encoding`` to the
|
||||
|
@ -67,11 +67,38 @@ def test_partition_via_api_from_file(monkeypatch):
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)
|
||||
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_via_api(file=f, file_filename=filename)
|
||||
elements = partition_via_api(file=f, metadata_filename=filename)
|
||||
assert elements[0] == NarrativeText("This is a test email to use for unit tests.")
|
||||
assert elements[0].metadata.filetype == "message/rfc822"
|
||||
|
||||
|
||||
def test_partition_via_api_from_file_warns_with_file_filename(monkeypatch, caplog):
|
||||
monkeypatch.setattr(
|
||||
requests,
|
||||
"post",
|
||||
lambda *args, **kwargs: MockResponse(status_code=200),
|
||||
)
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)
|
||||
|
||||
with open(filename, "rb") as f:
|
||||
partition_via_api(file=f, file_filename=filename)
|
||||
|
||||
assert "WARNING" in caplog.text
|
||||
assert "The file_filename kwarg will be deprecated" in caplog.text
|
||||
|
||||
|
||||
def test_partition_via_api_from_file_raises_with_metadata_and_file_filename(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
requests,
|
||||
"post",
|
||||
lambda *args, **kwargs: MockResponse(status_code=200),
|
||||
)
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)
|
||||
|
||||
with open(filename, "rb") as f, pytest.raises(ValueError):
|
||||
partition_via_api(file=f, file_filename=filename, metadata_filename=filename)
|
||||
|
||||
|
||||
def test_partition_via_api_from_file_raises_without_filename(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
requests,
|
||||
@ -246,13 +273,57 @@ def test_partition_multiple_via_api_from_files(monkeypatch):
|
||||
files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
|
||||
elements = partition_multiple_via_api(
|
||||
files=files,
|
||||
file_filenames=filenames,
|
||||
metadata_filenames=filenames,
|
||||
)
|
||||
assert len(elements) == 2
|
||||
assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.")
|
||||
assert elements[0][0].metadata.filetype == "message/rfc822"
|
||||
|
||||
|
||||
def test_partition_multiple_via_api_warns_with_file_filename(monkeypatch, caplog):
|
||||
monkeypatch.setattr(
|
||||
requests,
|
||||
"post",
|
||||
lambda *args, **kwargs: MockMultipleResponse(status_code=200),
|
||||
)
|
||||
|
||||
filenames = [
|
||||
os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE),
|
||||
os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"),
|
||||
]
|
||||
|
||||
with contextlib.ExitStack() as stack:
|
||||
files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
|
||||
partition_multiple_via_api(
|
||||
files=files,
|
||||
file_filenames=filenames,
|
||||
)
|
||||
assert "WARNING" in caplog.text
|
||||
assert "The file_filenames kwarg will be deprecated" in caplog.text
|
||||
|
||||
|
||||
def test_partition_multiple_via_api_warns_with_file_and_metadata_filename(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
requests,
|
||||
"post",
|
||||
lambda *args, **kwargs: MockMultipleResponse(status_code=200),
|
||||
)
|
||||
|
||||
filenames = [
|
||||
os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE),
|
||||
os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"),
|
||||
]
|
||||
|
||||
with contextlib.ExitStack() as stack:
|
||||
files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
|
||||
with pytest.raises(ValueError):
|
||||
partition_multiple_via_api(
|
||||
files=files,
|
||||
metadata_filenames=filenames,
|
||||
file_filenames=filenames,
|
||||
)
|
||||
|
||||
|
||||
def test_partition_multiple_via_api_raises_with_bad_response(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
requests,
|
||||
@ -305,7 +376,7 @@ def test_partition_multiple_via_api_from_files_raises_with_size_mismatch(monkeyp
|
||||
with pytest.raises(ValueError):
|
||||
partition_multiple_via_api(
|
||||
files=files,
|
||||
file_filenames=filenames,
|
||||
metadata_filenames=filenames,
|
||||
content_types=["text/plain"],
|
||||
)
|
||||
|
||||
|
@ -118,24 +118,24 @@ def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_element
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("pass_file_filename", "content_type"),
|
||||
("pass_metadata_filename", "content_type"),
|
||||
[(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_doc_with_filename(
|
||||
mock_docx_document,
|
||||
expected_docx_elements,
|
||||
tmpdir,
|
||||
pass_file_filename,
|
||||
pass_metadata_filename,
|
||||
content_type,
|
||||
):
|
||||
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
|
||||
mock_docx_document.save(docx_filename)
|
||||
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
|
||||
file_filename = doc_filename if pass_file_filename else None
|
||||
metadata_filename = doc_filename if pass_metadata_filename else None
|
||||
elements = partition(
|
||||
filename=doc_filename,
|
||||
file_filename=file_filename,
|
||||
metadata_filename=metadata_filename,
|
||||
content_type=content_type,
|
||||
strategy="hi_res",
|
||||
)
|
||||
@ -159,15 +159,15 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("pass_file_filename", "content_type"),
|
||||
("pass_metadata_filename", "content_type"),
|
||||
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_html_from_filename(pass_file_filename, content_type):
|
||||
def test_auto_partition_html_from_filename(pass_metadata_filename, content_type):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
|
||||
file_filename = filename if pass_file_filename else None
|
||||
metadata_filename = filename if pass_metadata_filename else None
|
||||
elements = partition(
|
||||
filename=filename,
|
||||
file_filename=file_filename,
|
||||
metadata_filename=metadata_filename,
|
||||
content_type=content_type,
|
||||
strategy="hi_res",
|
||||
)
|
||||
@ -177,16 +177,16 @@ def test_auto_partition_html_from_filename(pass_file_filename, content_type):
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("pass_file_filename", "content_type"),
|
||||
("pass_metadata_filename", "content_type"),
|
||||
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_html_from_file(pass_file_filename, content_type):
|
||||
def test_auto_partition_html_from_file(pass_metadata_filename, content_type):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
|
||||
file_filename = filename if pass_file_filename else None
|
||||
metadata_filename = filename if pass_metadata_filename else None
|
||||
with open(filename) as f:
|
||||
elements = partition(
|
||||
file=f,
|
||||
file_filename=file_filename,
|
||||
metadata_filename=metadata_filename,
|
||||
content_type=content_type,
|
||||
strategy="hi_res",
|
||||
)
|
||||
@ -285,16 +285,16 @@ def test_auto_partition_text_from_file():
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("pass_file_filename", "content_type"),
|
||||
("pass_metadata_filename", "content_type"),
|
||||
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_pdf_from_filename(pass_file_filename, content_type, request):
|
||||
def test_auto_partition_pdf_from_filename(pass_metadata_filename, content_type, request):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
|
||||
file_filename = filename if pass_file_filename else None
|
||||
metadata_filename = filename if pass_metadata_filename else None
|
||||
|
||||
elements = partition(
|
||||
filename=filename,
|
||||
file_filename=file_filename,
|
||||
metadata_filename=metadata_filename,
|
||||
content_type=content_type,
|
||||
strategy="hi_res",
|
||||
)
|
||||
@ -332,6 +332,7 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
|
||||
|
||||
mock_partition.assert_called_once_with(
|
||||
filename=filename,
|
||||
metadata_filename=None,
|
||||
file=None,
|
||||
url=None,
|
||||
include_page_breaks=False,
|
||||
@ -342,17 +343,17 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("pass_file_filename", "content_type"),
|
||||
("pass_metadata_filename", "content_type"),
|
||||
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_pdf_from_file(pass_file_filename, content_type, request):
|
||||
def test_auto_partition_pdf_from_file(pass_metadata_filename, content_type, request):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
|
||||
file_filename = filename if pass_file_filename else None
|
||||
metadata_filename = filename if pass_metadata_filename else None
|
||||
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition(
|
||||
file=f,
|
||||
file_filename=file_filename,
|
||||
metadata_filename=metadata_filename,
|
||||
content_type=content_type,
|
||||
strategy="hi_res",
|
||||
)
|
||||
@ -379,15 +380,15 @@ def test_partition_pdf_doesnt_raise_warning():
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("pass_file_filename", "content_type"),
|
||||
("pass_metadata_filename", "content_type"),
|
||||
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_image_default_strategy_hi_res(pass_file_filename, content_type):
|
||||
def test_auto_partition_image_default_strategy_hi_res(pass_metadata_filename, content_type):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
|
||||
file_filename = filename if pass_file_filename else None
|
||||
metadata_filename = filename if pass_metadata_filename else None
|
||||
elements = partition(
|
||||
filename=filename,
|
||||
file_filename=file_filename,
|
||||
metadata_filename=metadata_filename,
|
||||
content_type=content_type,
|
||||
strategy="auto",
|
||||
)
|
||||
@ -399,15 +400,15 @@ def test_auto_partition_image_default_strategy_hi_res(pass_file_filename, conten
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("pass_file_filename", "content_type"),
|
||||
("pass_metadata_filename", "content_type"),
|
||||
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_jpg(pass_file_filename, content_type):
|
||||
def test_auto_partition_jpg(pass_metadata_filename, content_type):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
|
||||
file_filename = filename if pass_file_filename else None
|
||||
metadata_filename = filename if pass_metadata_filename else None
|
||||
elements = partition(
|
||||
filename=filename,
|
||||
file_filename=file_filename,
|
||||
metadata_filename=metadata_filename,
|
||||
content_type=content_type,
|
||||
strategy="auto",
|
||||
)
|
||||
@ -415,16 +416,16 @@ def test_auto_partition_jpg(pass_file_filename, content_type):
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("pass_file_filename", "content_type"),
|
||||
("pass_metadata_filename", "content_type"),
|
||||
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_jpg_from_file(pass_file_filename, content_type):
|
||||
def test_auto_partition_jpg_from_file(pass_metadata_filename, content_type):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
|
||||
file_filename = filename if pass_file_filename else None
|
||||
metadata_filename = filename if pass_metadata_filename else None
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition(
|
||||
file=f,
|
||||
file_filename=file_filename,
|
||||
metadata_filename=metadata_filename,
|
||||
content_type=content_type,
|
||||
strategy="auto",
|
||||
)
|
||||
@ -874,11 +875,26 @@ def test_auto_partition_rst_from_file(filename="example-docs/README.rst"):
|
||||
assert elements[0].metadata.filetype == "text/x-rst"
|
||||
|
||||
|
||||
def test_auto_partition_metadata_file_filename():
|
||||
def test_auto_partition_metadata_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
|
||||
with open(filename) as f:
|
||||
elements = partition(file=f, metadata_filename=filename)
|
||||
assert elements[0].metadata.filename == os.path.split(filename)[-1]
|
||||
|
||||
|
||||
def test_auto_partition_warns_about_file_filename_deprecation(caplog):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
|
||||
with open(filename) as f:
|
||||
elements = partition(file=f, file_filename=filename)
|
||||
assert elements[0].metadata.filename == os.path.split(filename)[-1]
|
||||
assert "WARNING" in caplog.text
|
||||
assert "The file_filename kwarg will be deprecated" in caplog.text
|
||||
|
||||
|
||||
def test_auto_partition_raises_with_file_and_metadata_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
|
||||
with open(filename) as f, pytest.raises(ValueError):
|
||||
partition(file=f, file_filename=filename, metadata_filename=filename)
|
||||
|
||||
|
||||
def test_get_partition_with_extras_prompts_for_install_if_missing():
|
||||
|
@ -8,6 +8,7 @@ from typing import (
|
||||
import requests
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.logger import logger
|
||||
from unstructured.partition.common import exactly_one
|
||||
from unstructured.staging.base import dict_to_elements, elements_from_json
|
||||
|
||||
@ -19,6 +20,7 @@ def partition_via_api(
|
||||
file_filename: Optional[str] = None,
|
||||
api_url: str = "https://api.unstructured.io/general/v0/general",
|
||||
api_key: str = "",
|
||||
metadata_filename: Optional[str] = None,
|
||||
**request_kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partitions a document using the Unstructured REST API. This is equivalent to
|
||||
@ -36,7 +38,7 @@ def partition_via_api(
|
||||
A string defining the file content in MIME type
|
||||
file
|
||||
A file-like object using "rb" mode --> open(filename, "rb").
|
||||
file_filename
|
||||
metadata_filename
|
||||
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
|
||||
api_url
|
||||
The URL for the Unstructured API. Defaults to the hosted Unstructured API.
|
||||
@ -48,6 +50,19 @@ def partition_via_api(
|
||||
"""
|
||||
exactly_one(filename=filename, file=file)
|
||||
|
||||
if metadata_filename and file_filename:
|
||||
raise ValueError(
|
||||
"Only one of metadata_filename and file_filename is specified. "
|
||||
"metadata_filename is preferred. file_filename is marked for deprecation.",
|
||||
)
|
||||
|
||||
if file_filename is not None:
|
||||
metadata_filename = file_filename
|
||||
logger.warn(
|
||||
"The file_filename kwarg will be deprecated in a future version of unstructured. "
|
||||
"Please use metadata_filename instead.",
|
||||
)
|
||||
|
||||
headers = {
|
||||
"ACCEPT": "application/json",
|
||||
"UNSTRUCTURED-API-KEY": api_key,
|
||||
@ -65,13 +80,13 @@ def partition_via_api(
|
||||
files=files, # type: ignore
|
||||
)
|
||||
elif file is not None:
|
||||
if file_filename is None:
|
||||
if metadata_filename is None:
|
||||
raise ValueError(
|
||||
"If file is specified in partition_via_api, "
|
||||
"file_filename must be specified as well.",
|
||||
"metadata_filename must be specified as well.",
|
||||
)
|
||||
files = [
|
||||
("files", (file_filename, file, content_type)), # type: ignore
|
||||
("files", (metadata_filename, file, content_type)), # type: ignore
|
||||
]
|
||||
response = requests.post(
|
||||
api_url,
|
||||
@ -95,6 +110,7 @@ def partition_multiple_via_api(
|
||||
file_filenames: Optional[List[str]] = None,
|
||||
api_url: str = "https://api.unstructured.io/general/v0/general",
|
||||
api_key: str = "",
|
||||
metadata_filenames: Optional[List[str]] = None,
|
||||
**request_kwargs,
|
||||
) -> List[List[Element]]:
|
||||
"""Partitions multiple document using the Unstructured REST API by batching
|
||||
@ -112,7 +128,7 @@ def partition_multiple_via_api(
|
||||
A list of strings defining the file contents in MIME types.
|
||||
files
|
||||
A list of file-like object using "rb" mode --> open(filename, "rb").
|
||||
file_filename
|
||||
metadata_filename
|
||||
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
|
||||
api_url
|
||||
The URL for the Unstructured API. Defaults to the hosted Unstructured API.
|
||||
@ -127,6 +143,19 @@ def partition_multiple_via_api(
|
||||
"UNSTRUCTURED-API-KEY": api_key,
|
||||
}
|
||||
|
||||
if metadata_filenames and file_filenames:
|
||||
raise ValueError(
|
||||
"Only one of metadata_filenames and file_filenames is specified. "
|
||||
"metadata_filenames is preferred. file_filenames is marked for deprecation.",
|
||||
)
|
||||
|
||||
if file_filenames is not None:
|
||||
metadata_filenames = file_filenames
|
||||
logger.warn(
|
||||
"The file_filenames kwarg will be deprecated in a future version of unstructured. "
|
||||
"Please use metadata_filenames instead.",
|
||||
)
|
||||
|
||||
if filenames is not None:
|
||||
if content_types and len(content_types) != len(filenames):
|
||||
raise ValueError("content_types and filenames must have the same length.")
|
||||
@ -151,15 +180,15 @@ def partition_multiple_via_api(
|
||||
if content_types and len(content_types) != len(files):
|
||||
raise ValueError("content_types and files must have the same length.")
|
||||
|
||||
if not file_filenames:
|
||||
raise ValueError("file_filenames must be specified if files are passed")
|
||||
elif len(file_filenames) != len(files):
|
||||
raise ValueError("file_filenames and files must have the same length.")
|
||||
if not metadata_filenames:
|
||||
raise ValueError("metadata_filenames must be specified if files are passed")
|
||||
elif len(metadata_filenames) != len(files):
|
||||
raise ValueError("metadata_filenames and files must have the same length.")
|
||||
|
||||
_files = []
|
||||
for i, _file in enumerate(files): # type: ignore
|
||||
content_type = content_types[i] if content_types is not None else None
|
||||
filename = file_filenames[i]
|
||||
filename = metadata_filenames[i]
|
||||
_files.append(("files", (filename, _file, content_type)))
|
||||
|
||||
response = requests.post(
|
||||
|
@ -132,6 +132,7 @@ def partition(
|
||||
pdf_infer_table_structure: bool = False,
|
||||
xml_keep_tags: bool = False,
|
||||
data_source_metadata: Optional[DataSourceMetadata] = None,
|
||||
metadata_filename: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Partitions a document into its constituent elements. Will use libmagic to determine
|
||||
@ -147,7 +148,7 @@ def partition(
|
||||
A string defining the file content in MIME type
|
||||
file
|
||||
A file-like object using "rb" mode --> open(filename, "rb").
|
||||
file_filename
|
||||
metadata_filename
|
||||
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
|
||||
url
|
||||
The url for a remote document. Pass in content_type if you want partition to treat
|
||||
@ -181,6 +182,20 @@ def partition(
|
||||
"""
|
||||
exactly_one(file=file, filename=filename, url=url)
|
||||
|
||||
if metadata_filename and file_filename:
|
||||
raise ValueError(
|
||||
"Only one of metadata_filename and file_filename is specified. "
|
||||
"metadata_filename is preferred. file_filename is marked for deprecation.",
|
||||
)
|
||||
|
||||
if file_filename is not None:
|
||||
metadata_filename = file_filename
|
||||
logger.warn(
|
||||
"The file_filename kwarg will be deprecated in a future version of unstructured. "
|
||||
"Please use metadata_filename instead.",
|
||||
)
|
||||
kwargs.setdefault("metadata_filename", metadata_filename)
|
||||
|
||||
if url is not None:
|
||||
file, filetype = file_and_type_from_url(
|
||||
url=url,
|
||||
@ -197,7 +212,7 @@ def partition(
|
||||
filetype = detect_filetype(
|
||||
filename=filename,
|
||||
file=file,
|
||||
file_filename=file_filename,
|
||||
file_filename=metadata_filename,
|
||||
content_type=content_type,
|
||||
encoding=encoding,
|
||||
)
|
||||
@ -211,9 +226,6 @@ def partition(
|
||||
pdf_infer_table_structure,
|
||||
)
|
||||
|
||||
if file is not None and file_filename is not None:
|
||||
kwargs.setdefault("metadata_filename", file_filename)
|
||||
|
||||
if filetype == FileType.DOC:
|
||||
_partition_doc = _get_partition_with_extras("doc")
|
||||
elements = _partition_doc(filename=filename, file=file, **kwargs)
|
||||
|
@ -1,4 +1,5 @@
|
||||
import tempfile
|
||||
import warnings
|
||||
from typing import IO, List, Optional
|
||||
|
||||
from ebooklib import epub
|
||||
@ -52,7 +53,11 @@ def partition_epub(
|
||||
filename = tmp.name
|
||||
last_modification_date = get_last_modified_date_from_file(file)
|
||||
|
||||
book = epub.read_epub(filename, options={"ignore_ncx": False})
|
||||
# NOTE(robinson): ignore ebooklib warning about changing the ignore_ncx default
|
||||
# in the future.
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
book = epub.read_epub(filename, options={"ignore_ncx": False})
|
||||
# book.items also includes EpubLink, EpubImage, EpubNcx (page navigation info)
|
||||
# and EpubItem (fomatting/css)
|
||||
html_items = [item for item in book.items if isinstance(item, epub.EpubHtml)]
|
||||
|
Loading…
x
Reference in New Issue
Block a user