mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-15 20:27:37 +00:00
chore: deprecation warning for file_filename
(#1191)
### Summary Closes #1007. Adds a deprecation warning for the `file_filename` kwarg to `partition`, `partition_via_api`, and `partition_multiple_via_api`. Also catches a warning in `ebooklib` that we do not want to emit in `unstructured`. ### Testing ```python from unstructured.partition.auto import partition filename = "example-docs/winter-sports.epub" # Should not emit a warning with open(filename, "rb") as f: elements = partition(file=f, metadata_filename="test.epub") # Should be test.epub elements[0].metadata.filename # Should emit a warning with open(filename, "rb") as f: elements = partition(file=f, file_filename="test.epub") # Should be test.epub elements[0].metadata.filename # Should raise an error with open(filename, "rb") as f: elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub") ```
This commit is contained in:
parent
835378aba6
commit
cdae53cc29
@ -9,6 +9,8 @@
|
|||||||
* Fix bug in `partition_pdf_or_image` where two partitions were called if `strategy == "ocr_only"`.
|
* Fix bug in `partition_pdf_or_image` where two partitions were called if `strategy == "ocr_only"`.
|
||||||
* Bump unstructured-inference
|
* Bump unstructured-inference
|
||||||
* Fix issue where temporary files were being left behind (0.5.16)
|
* Fix issue where temporary files were being left behind (0.5.16)
|
||||||
|
* Adds deprecation warning for the `file_filename` kwarg to `partition`, `partition_via_api`,
|
||||||
|
and `partition_multiple_via_api`.
|
||||||
* Fix documentation build workflow by pinning dependencies
|
* Fix documentation build workflow by pinning dependencies
|
||||||
|
|
||||||
## 0.10.5
|
## 0.10.5
|
||||||
|
@ -496,7 +496,7 @@ Examples:
|
|||||||
|
|
||||||
with ExitStack() as stack:
|
with ExitStack() as stack:
|
||||||
files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
|
files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
|
||||||
documents = partition_multiple_via_api(files=files, file_filenames=filenames)
|
documents = partition_multiple_via_api(files=files, metadata_filenames=filenames)
|
||||||
|
|
||||||
For more information about the ``partition_multiple_via_api`` brick, you can check the `source code here <https://github.com/Unstructured-IO/unstructured/blob/a583d47b841bdd426b9058b7c34f6aa3ed8de152/unstructured/partition/api.py>`_.
|
For more information about the ``partition_multiple_via_api`` brick, you can check the `source code here <https://github.com/Unstructured-IO/unstructured/blob/a583d47b841bdd426b9058b7c34f6aa3ed8de152/unstructured/partition/api.py>`_.
|
||||||
|
|
||||||
@ -794,7 +794,7 @@ type for the file. If you do not explicitly pass it, the MIME type will be infer
|
|||||||
elements = partition_via_api(filename=filename, api_key="MY_API_KEY", content_type="message/rfc822")
|
elements = partition_via_api(filename=filename, api_key="MY_API_KEY", content_type="message/rfc822")
|
||||||
|
|
||||||
with open(filename, "rb") as f:
|
with open(filename, "rb") as f:
|
||||||
elements = partition_via_api(file=f, file_filename=filename, api_key="MY_API_KEY")
|
elements = partition_via_api(file=f, metadata_filename=filename, api_key="MY_API_KEY")
|
||||||
|
|
||||||
|
|
||||||
You can pass additional settings such as ``strategy``, ``ocr_languages`` and ``encoding`` to the
|
You can pass additional settings such as ``strategy``, ``ocr_languages`` and ``encoding`` to the
|
||||||
|
@ -67,11 +67,38 @@ def test_partition_via_api_from_file(monkeypatch):
|
|||||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)
|
||||||
|
|
||||||
with open(filename, "rb") as f:
|
with open(filename, "rb") as f:
|
||||||
elements = partition_via_api(file=f, file_filename=filename)
|
elements = partition_via_api(file=f, metadata_filename=filename)
|
||||||
assert elements[0] == NarrativeText("This is a test email to use for unit tests.")
|
assert elements[0] == NarrativeText("This is a test email to use for unit tests.")
|
||||||
assert elements[0].metadata.filetype == "message/rfc822"
|
assert elements[0].metadata.filetype == "message/rfc822"
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_via_api_from_file_warns_with_file_filename(monkeypatch, caplog):
|
||||||
|
monkeypatch.setattr(
|
||||||
|
requests,
|
||||||
|
"post",
|
||||||
|
lambda *args, **kwargs: MockResponse(status_code=200),
|
||||||
|
)
|
||||||
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)
|
||||||
|
|
||||||
|
with open(filename, "rb") as f:
|
||||||
|
partition_via_api(file=f, file_filename=filename)
|
||||||
|
|
||||||
|
assert "WARNING" in caplog.text
|
||||||
|
assert "The file_filename kwarg will be deprecated" in caplog.text
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_via_api_from_file_raises_with_metadata_and_file_filename(monkeypatch):
|
||||||
|
monkeypatch.setattr(
|
||||||
|
requests,
|
||||||
|
"post",
|
||||||
|
lambda *args, **kwargs: MockResponse(status_code=200),
|
||||||
|
)
|
||||||
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)
|
||||||
|
|
||||||
|
with open(filename, "rb") as f, pytest.raises(ValueError):
|
||||||
|
partition_via_api(file=f, file_filename=filename, metadata_filename=filename)
|
||||||
|
|
||||||
|
|
||||||
def test_partition_via_api_from_file_raises_without_filename(monkeypatch):
|
def test_partition_via_api_from_file_raises_without_filename(monkeypatch):
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
requests,
|
requests,
|
||||||
@ -246,13 +273,57 @@ def test_partition_multiple_via_api_from_files(monkeypatch):
|
|||||||
files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
|
files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
|
||||||
elements = partition_multiple_via_api(
|
elements = partition_multiple_via_api(
|
||||||
files=files,
|
files=files,
|
||||||
file_filenames=filenames,
|
metadata_filenames=filenames,
|
||||||
)
|
)
|
||||||
assert len(elements) == 2
|
assert len(elements) == 2
|
||||||
assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.")
|
assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.")
|
||||||
assert elements[0][0].metadata.filetype == "message/rfc822"
|
assert elements[0][0].metadata.filetype == "message/rfc822"
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_multiple_via_api_warns_with_file_filename(monkeypatch, caplog):
|
||||||
|
monkeypatch.setattr(
|
||||||
|
requests,
|
||||||
|
"post",
|
||||||
|
lambda *args, **kwargs: MockMultipleResponse(status_code=200),
|
||||||
|
)
|
||||||
|
|
||||||
|
filenames = [
|
||||||
|
os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE),
|
||||||
|
os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"),
|
||||||
|
]
|
||||||
|
|
||||||
|
with contextlib.ExitStack() as stack:
|
||||||
|
files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
|
||||||
|
partition_multiple_via_api(
|
||||||
|
files=files,
|
||||||
|
file_filenames=filenames,
|
||||||
|
)
|
||||||
|
assert "WARNING" in caplog.text
|
||||||
|
assert "The file_filenames kwarg will be deprecated" in caplog.text
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_multiple_via_api_warns_with_file_and_metadata_filename(monkeypatch):
|
||||||
|
monkeypatch.setattr(
|
||||||
|
requests,
|
||||||
|
"post",
|
||||||
|
lambda *args, **kwargs: MockMultipleResponse(status_code=200),
|
||||||
|
)
|
||||||
|
|
||||||
|
filenames = [
|
||||||
|
os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE),
|
||||||
|
os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"),
|
||||||
|
]
|
||||||
|
|
||||||
|
with contextlib.ExitStack() as stack:
|
||||||
|
files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
partition_multiple_via_api(
|
||||||
|
files=files,
|
||||||
|
metadata_filenames=filenames,
|
||||||
|
file_filenames=filenames,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_partition_multiple_via_api_raises_with_bad_response(monkeypatch):
|
def test_partition_multiple_via_api_raises_with_bad_response(monkeypatch):
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
requests,
|
requests,
|
||||||
@ -305,7 +376,7 @@ def test_partition_multiple_via_api_from_files_raises_with_size_mismatch(monkeyp
|
|||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
partition_multiple_via_api(
|
partition_multiple_via_api(
|
||||||
files=files,
|
files=files,
|
||||||
file_filenames=filenames,
|
metadata_filenames=filenames,
|
||||||
content_types=["text/plain"],
|
content_types=["text/plain"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -118,24 +118,24 @@ def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_element
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("pass_file_filename", "content_type"),
|
("pass_metadata_filename", "content_type"),
|
||||||
[(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
|
[(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
|
||||||
)
|
)
|
||||||
def test_auto_partition_doc_with_filename(
|
def test_auto_partition_doc_with_filename(
|
||||||
mock_docx_document,
|
mock_docx_document,
|
||||||
expected_docx_elements,
|
expected_docx_elements,
|
||||||
tmpdir,
|
tmpdir,
|
||||||
pass_file_filename,
|
pass_metadata_filename,
|
||||||
content_type,
|
content_type,
|
||||||
):
|
):
|
||||||
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||||
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
|
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
|
||||||
mock_docx_document.save(docx_filename)
|
mock_docx_document.save(docx_filename)
|
||||||
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
|
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
|
||||||
file_filename = doc_filename if pass_file_filename else None
|
metadata_filename = doc_filename if pass_metadata_filename else None
|
||||||
elements = partition(
|
elements = partition(
|
||||||
filename=doc_filename,
|
filename=doc_filename,
|
||||||
file_filename=file_filename,
|
metadata_filename=metadata_filename,
|
||||||
content_type=content_type,
|
content_type=content_type,
|
||||||
strategy="hi_res",
|
strategy="hi_res",
|
||||||
)
|
)
|
||||||
@ -159,15 +159,15 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("pass_file_filename", "content_type"),
|
("pass_metadata_filename", "content_type"),
|
||||||
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
|
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
|
||||||
)
|
)
|
||||||
def test_auto_partition_html_from_filename(pass_file_filename, content_type):
|
def test_auto_partition_html_from_filename(pass_metadata_filename, content_type):
|
||||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
|
||||||
file_filename = filename if pass_file_filename else None
|
metadata_filename = filename if pass_metadata_filename else None
|
||||||
elements = partition(
|
elements = partition(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file_filename=file_filename,
|
metadata_filename=metadata_filename,
|
||||||
content_type=content_type,
|
content_type=content_type,
|
||||||
strategy="hi_res",
|
strategy="hi_res",
|
||||||
)
|
)
|
||||||
@ -177,16 +177,16 @@ def test_auto_partition_html_from_filename(pass_file_filename, content_type):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("pass_file_filename", "content_type"),
|
("pass_metadata_filename", "content_type"),
|
||||||
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
|
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
|
||||||
)
|
)
|
||||||
def test_auto_partition_html_from_file(pass_file_filename, content_type):
|
def test_auto_partition_html_from_file(pass_metadata_filename, content_type):
|
||||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
|
||||||
file_filename = filename if pass_file_filename else None
|
metadata_filename = filename if pass_metadata_filename else None
|
||||||
with open(filename) as f:
|
with open(filename) as f:
|
||||||
elements = partition(
|
elements = partition(
|
||||||
file=f,
|
file=f,
|
||||||
file_filename=file_filename,
|
metadata_filename=metadata_filename,
|
||||||
content_type=content_type,
|
content_type=content_type,
|
||||||
strategy="hi_res",
|
strategy="hi_res",
|
||||||
)
|
)
|
||||||
@ -285,16 +285,16 @@ def test_auto_partition_text_from_file():
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("pass_file_filename", "content_type"),
|
("pass_metadata_filename", "content_type"),
|
||||||
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
|
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
|
||||||
)
|
)
|
||||||
def test_auto_partition_pdf_from_filename(pass_file_filename, content_type, request):
|
def test_auto_partition_pdf_from_filename(pass_metadata_filename, content_type, request):
|
||||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
|
||||||
file_filename = filename if pass_file_filename else None
|
metadata_filename = filename if pass_metadata_filename else None
|
||||||
|
|
||||||
elements = partition(
|
elements = partition(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file_filename=file_filename,
|
metadata_filename=metadata_filename,
|
||||||
content_type=content_type,
|
content_type=content_type,
|
||||||
strategy="hi_res",
|
strategy="hi_res",
|
||||||
)
|
)
|
||||||
@ -332,6 +332,7 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
|
|||||||
|
|
||||||
mock_partition.assert_called_once_with(
|
mock_partition.assert_called_once_with(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
|
metadata_filename=None,
|
||||||
file=None,
|
file=None,
|
||||||
url=None,
|
url=None,
|
||||||
include_page_breaks=False,
|
include_page_breaks=False,
|
||||||
@ -342,17 +343,17 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("pass_file_filename", "content_type"),
|
("pass_metadata_filename", "content_type"),
|
||||||
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
|
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
|
||||||
)
|
)
|
||||||
def test_auto_partition_pdf_from_file(pass_file_filename, content_type, request):
|
def test_auto_partition_pdf_from_file(pass_metadata_filename, content_type, request):
|
||||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
|
||||||
file_filename = filename if pass_file_filename else None
|
metadata_filename = filename if pass_metadata_filename else None
|
||||||
|
|
||||||
with open(filename, "rb") as f:
|
with open(filename, "rb") as f:
|
||||||
elements = partition(
|
elements = partition(
|
||||||
file=f,
|
file=f,
|
||||||
file_filename=file_filename,
|
metadata_filename=metadata_filename,
|
||||||
content_type=content_type,
|
content_type=content_type,
|
||||||
strategy="hi_res",
|
strategy="hi_res",
|
||||||
)
|
)
|
||||||
@ -379,15 +380,15 @@ def test_partition_pdf_doesnt_raise_warning():
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("pass_file_filename", "content_type"),
|
("pass_metadata_filename", "content_type"),
|
||||||
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
|
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
|
||||||
)
|
)
|
||||||
def test_auto_partition_image_default_strategy_hi_res(pass_file_filename, content_type):
|
def test_auto_partition_image_default_strategy_hi_res(pass_metadata_filename, content_type):
|
||||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
|
||||||
file_filename = filename if pass_file_filename else None
|
metadata_filename = filename if pass_metadata_filename else None
|
||||||
elements = partition(
|
elements = partition(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file_filename=file_filename,
|
metadata_filename=metadata_filename,
|
||||||
content_type=content_type,
|
content_type=content_type,
|
||||||
strategy="auto",
|
strategy="auto",
|
||||||
)
|
)
|
||||||
@ -399,15 +400,15 @@ def test_auto_partition_image_default_strategy_hi_res(pass_file_filename, conten
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("pass_file_filename", "content_type"),
|
("pass_metadata_filename", "content_type"),
|
||||||
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
|
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
|
||||||
)
|
)
|
||||||
def test_auto_partition_jpg(pass_file_filename, content_type):
|
def test_auto_partition_jpg(pass_metadata_filename, content_type):
|
||||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
|
||||||
file_filename = filename if pass_file_filename else None
|
metadata_filename = filename if pass_metadata_filename else None
|
||||||
elements = partition(
|
elements = partition(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file_filename=file_filename,
|
metadata_filename=metadata_filename,
|
||||||
content_type=content_type,
|
content_type=content_type,
|
||||||
strategy="auto",
|
strategy="auto",
|
||||||
)
|
)
|
||||||
@ -415,16 +416,16 @@ def test_auto_partition_jpg(pass_file_filename, content_type):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("pass_file_filename", "content_type"),
|
("pass_metadata_filename", "content_type"),
|
||||||
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
|
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
|
||||||
)
|
)
|
||||||
def test_auto_partition_jpg_from_file(pass_file_filename, content_type):
|
def test_auto_partition_jpg_from_file(pass_metadata_filename, content_type):
|
||||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
|
||||||
file_filename = filename if pass_file_filename else None
|
metadata_filename = filename if pass_metadata_filename else None
|
||||||
with open(filename, "rb") as f:
|
with open(filename, "rb") as f:
|
||||||
elements = partition(
|
elements = partition(
|
||||||
file=f,
|
file=f,
|
||||||
file_filename=file_filename,
|
metadata_filename=metadata_filename,
|
||||||
content_type=content_type,
|
content_type=content_type,
|
||||||
strategy="auto",
|
strategy="auto",
|
||||||
)
|
)
|
||||||
@ -874,11 +875,26 @@ def test_auto_partition_rst_from_file(filename="example-docs/README.rst"):
|
|||||||
assert elements[0].metadata.filetype == "text/x-rst"
|
assert elements[0].metadata.filetype == "text/x-rst"
|
||||||
|
|
||||||
|
|
||||||
def test_auto_partition_metadata_file_filename():
|
def test_auto_partition_metadata_filename():
|
||||||
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
|
||||||
|
with open(filename) as f:
|
||||||
|
elements = partition(file=f, metadata_filename=filename)
|
||||||
|
assert elements[0].metadata.filename == os.path.split(filename)[-1]
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_partition_warns_about_file_filename_deprecation(caplog):
|
||||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
|
||||||
with open(filename) as f:
|
with open(filename) as f:
|
||||||
elements = partition(file=f, file_filename=filename)
|
elements = partition(file=f, file_filename=filename)
|
||||||
assert elements[0].metadata.filename == os.path.split(filename)[-1]
|
assert elements[0].metadata.filename == os.path.split(filename)[-1]
|
||||||
|
assert "WARNING" in caplog.text
|
||||||
|
assert "The file_filename kwarg will be deprecated" in caplog.text
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_partition_raises_with_file_and_metadata_filename():
|
||||||
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
|
||||||
|
with open(filename) as f, pytest.raises(ValueError):
|
||||||
|
partition(file=f, file_filename=filename, metadata_filename=filename)
|
||||||
|
|
||||||
|
|
||||||
def test_get_partition_with_extras_prompts_for_install_if_missing():
|
def test_get_partition_with_extras_prompts_for_install_if_missing():
|
||||||
|
@ -8,6 +8,7 @@ from typing import (
|
|||||||
import requests
|
import requests
|
||||||
|
|
||||||
from unstructured.documents.elements import Element
|
from unstructured.documents.elements import Element
|
||||||
|
from unstructured.logger import logger
|
||||||
from unstructured.partition.common import exactly_one
|
from unstructured.partition.common import exactly_one
|
||||||
from unstructured.staging.base import dict_to_elements, elements_from_json
|
from unstructured.staging.base import dict_to_elements, elements_from_json
|
||||||
|
|
||||||
@ -19,6 +20,7 @@ def partition_via_api(
|
|||||||
file_filename: Optional[str] = None,
|
file_filename: Optional[str] = None,
|
||||||
api_url: str = "https://api.unstructured.io/general/v0/general",
|
api_url: str = "https://api.unstructured.io/general/v0/general",
|
||||||
api_key: str = "",
|
api_key: str = "",
|
||||||
|
metadata_filename: Optional[str] = None,
|
||||||
**request_kwargs,
|
**request_kwargs,
|
||||||
) -> List[Element]:
|
) -> List[Element]:
|
||||||
"""Partitions a document using the Unstructured REST API. This is equivalent to
|
"""Partitions a document using the Unstructured REST API. This is equivalent to
|
||||||
@ -36,7 +38,7 @@ def partition_via_api(
|
|||||||
A string defining the file content in MIME type
|
A string defining the file content in MIME type
|
||||||
file
|
file
|
||||||
A file-like object using "rb" mode --> open(filename, "rb").
|
A file-like object using "rb" mode --> open(filename, "rb").
|
||||||
file_filename
|
metadata_filename
|
||||||
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
|
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
|
||||||
api_url
|
api_url
|
||||||
The URL for the Unstructured API. Defaults to the hosted Unstructured API.
|
The URL for the Unstructured API. Defaults to the hosted Unstructured API.
|
||||||
@ -48,6 +50,19 @@ def partition_via_api(
|
|||||||
"""
|
"""
|
||||||
exactly_one(filename=filename, file=file)
|
exactly_one(filename=filename, file=file)
|
||||||
|
|
||||||
|
if metadata_filename and file_filename:
|
||||||
|
raise ValueError(
|
||||||
|
"Only one of metadata_filename and file_filename is specified. "
|
||||||
|
"metadata_filename is preferred. file_filename is marked for deprecation.",
|
||||||
|
)
|
||||||
|
|
||||||
|
if file_filename is not None:
|
||||||
|
metadata_filename = file_filename
|
||||||
|
logger.warn(
|
||||||
|
"The file_filename kwarg will be deprecated in a future version of unstructured. "
|
||||||
|
"Please use metadata_filename instead.",
|
||||||
|
)
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
"ACCEPT": "application/json",
|
"ACCEPT": "application/json",
|
||||||
"UNSTRUCTURED-API-KEY": api_key,
|
"UNSTRUCTURED-API-KEY": api_key,
|
||||||
@ -65,13 +80,13 @@ def partition_via_api(
|
|||||||
files=files, # type: ignore
|
files=files, # type: ignore
|
||||||
)
|
)
|
||||||
elif file is not None:
|
elif file is not None:
|
||||||
if file_filename is None:
|
if metadata_filename is None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"If file is specified in partition_via_api, "
|
"If file is specified in partition_via_api, "
|
||||||
"file_filename must be specified as well.",
|
"metadata_filename must be specified as well.",
|
||||||
)
|
)
|
||||||
files = [
|
files = [
|
||||||
("files", (file_filename, file, content_type)), # type: ignore
|
("files", (metadata_filename, file, content_type)), # type: ignore
|
||||||
]
|
]
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
api_url,
|
api_url,
|
||||||
@ -95,6 +110,7 @@ def partition_multiple_via_api(
|
|||||||
file_filenames: Optional[List[str]] = None,
|
file_filenames: Optional[List[str]] = None,
|
||||||
api_url: str = "https://api.unstructured.io/general/v0/general",
|
api_url: str = "https://api.unstructured.io/general/v0/general",
|
||||||
api_key: str = "",
|
api_key: str = "",
|
||||||
|
metadata_filenames: Optional[List[str]] = None,
|
||||||
**request_kwargs,
|
**request_kwargs,
|
||||||
) -> List[List[Element]]:
|
) -> List[List[Element]]:
|
||||||
"""Partitions multiple document using the Unstructured REST API by batching
|
"""Partitions multiple document using the Unstructured REST API by batching
|
||||||
@ -112,7 +128,7 @@ def partition_multiple_via_api(
|
|||||||
A list of strings defining the file contents in MIME types.
|
A list of strings defining the file contents in MIME types.
|
||||||
files
|
files
|
||||||
A list of file-like object using "rb" mode --> open(filename, "rb").
|
A list of file-like object using "rb" mode --> open(filename, "rb").
|
||||||
file_filename
|
metadata_filenames
|
||||||
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
|
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
|
||||||
api_url
|
api_url
|
||||||
The URL for the Unstructured API. Defaults to the hosted Unstructured API.
|
The URL for the Unstructured API. Defaults to the hosted Unstructured API.
|
||||||
@ -127,6 +143,19 @@ def partition_multiple_via_api(
|
|||||||
"UNSTRUCTURED-API-KEY": api_key,
|
"UNSTRUCTURED-API-KEY": api_key,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if metadata_filenames and file_filenames:
|
||||||
|
raise ValueError(
|
||||||
|
"Only one of metadata_filenames and file_filenames is specified. "
|
||||||
|
"metadata_filenames is preferred. file_filenames is marked for deprecation.",
|
||||||
|
)
|
||||||
|
|
||||||
|
if file_filenames is not None:
|
||||||
|
metadata_filenames = file_filenames
|
||||||
|
logger.warn(
|
||||||
|
"The file_filenames kwarg will be deprecated in a future version of unstructured. "
|
||||||
|
"Please use metadata_filenames instead.",
|
||||||
|
)
|
||||||
|
|
||||||
if filenames is not None:
|
if filenames is not None:
|
||||||
if content_types and len(content_types) != len(filenames):
|
if content_types and len(content_types) != len(filenames):
|
||||||
raise ValueError("content_types and filenames must have the same length.")
|
raise ValueError("content_types and filenames must have the same length.")
|
||||||
@ -151,15 +180,15 @@ def partition_multiple_via_api(
|
|||||||
if content_types and len(content_types) != len(files):
|
if content_types and len(content_types) != len(files):
|
||||||
raise ValueError("content_types and files must have the same length.")
|
raise ValueError("content_types and files must have the same length.")
|
||||||
|
|
||||||
if not file_filenames:
|
if not metadata_filenames:
|
||||||
raise ValueError("file_filenames must be specified if files are passed")
|
raise ValueError("metadata_filenames must be specified if files are passed")
|
||||||
elif len(file_filenames) != len(files):
|
elif len(metadata_filenames) != len(files):
|
||||||
raise ValueError("file_filenames and files must have the same length.")
|
raise ValueError("metadata_filenames and files must have the same length.")
|
||||||
|
|
||||||
_files = []
|
_files = []
|
||||||
for i, _file in enumerate(files): # type: ignore
|
for i, _file in enumerate(files): # type: ignore
|
||||||
content_type = content_types[i] if content_types is not None else None
|
content_type = content_types[i] if content_types is not None else None
|
||||||
filename = file_filenames[i]
|
filename = metadata_filenames[i]
|
||||||
_files.append(("files", (filename, _file, content_type)))
|
_files.append(("files", (filename, _file, content_type)))
|
||||||
|
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
|
@ -132,6 +132,7 @@ def partition(
|
|||||||
pdf_infer_table_structure: bool = False,
|
pdf_infer_table_structure: bool = False,
|
||||||
xml_keep_tags: bool = False,
|
xml_keep_tags: bool = False,
|
||||||
data_source_metadata: Optional[DataSourceMetadata] = None,
|
data_source_metadata: Optional[DataSourceMetadata] = None,
|
||||||
|
metadata_filename: Optional[str] = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
"""Partitions a document into its constituent elements. Will use libmagic to determine
|
"""Partitions a document into its constituent elements. Will use libmagic to determine
|
||||||
@ -147,7 +148,7 @@ def partition(
|
|||||||
A string defining the file content in MIME type
|
A string defining the file content in MIME type
|
||||||
file
|
file
|
||||||
A file-like object using "rb" mode --> open(filename, "rb").
|
A file-like object using "rb" mode --> open(filename, "rb").
|
||||||
file_filename
|
metadata_filename
|
||||||
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
|
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
|
||||||
url
|
url
|
||||||
The url for a remote document. Pass in content_type if you want partition to treat
|
The url for a remote document. Pass in content_type if you want partition to treat
|
||||||
@ -181,6 +182,20 @@ def partition(
|
|||||||
"""
|
"""
|
||||||
exactly_one(file=file, filename=filename, url=url)
|
exactly_one(file=file, filename=filename, url=url)
|
||||||
|
|
||||||
|
if metadata_filename and file_filename:
|
||||||
|
raise ValueError(
|
||||||
|
"Only one of metadata_filename and file_filename is specified. "
|
||||||
|
"metadata_filename is preferred. file_filename is marked for deprecation.",
|
||||||
|
)
|
||||||
|
|
||||||
|
if file_filename is not None:
|
||||||
|
metadata_filename = file_filename
|
||||||
|
logger.warn(
|
||||||
|
"The file_filename kwarg will be deprecated in a future version of unstructured. "
|
||||||
|
"Please use metadata_filename instead.",
|
||||||
|
)
|
||||||
|
kwargs.setdefault("metadata_filename", metadata_filename)
|
||||||
|
|
||||||
if url is not None:
|
if url is not None:
|
||||||
file, filetype = file_and_type_from_url(
|
file, filetype = file_and_type_from_url(
|
||||||
url=url,
|
url=url,
|
||||||
@ -197,7 +212,7 @@ def partition(
|
|||||||
filetype = detect_filetype(
|
filetype = detect_filetype(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
file_filename=file_filename,
|
file_filename=metadata_filename,
|
||||||
content_type=content_type,
|
content_type=content_type,
|
||||||
encoding=encoding,
|
encoding=encoding,
|
||||||
)
|
)
|
||||||
@ -211,9 +226,6 @@ def partition(
|
|||||||
pdf_infer_table_structure,
|
pdf_infer_table_structure,
|
||||||
)
|
)
|
||||||
|
|
||||||
if file is not None and file_filename is not None:
|
|
||||||
kwargs.setdefault("metadata_filename", file_filename)
|
|
||||||
|
|
||||||
if filetype == FileType.DOC:
|
if filetype == FileType.DOC:
|
||||||
_partition_doc = _get_partition_with_extras("doc")
|
_partition_doc = _get_partition_with_extras("doc")
|
||||||
elements = _partition_doc(filename=filename, file=file, **kwargs)
|
elements = _partition_doc(filename=filename, file=file, **kwargs)
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import tempfile
|
import tempfile
|
||||||
|
import warnings
|
||||||
from typing import IO, List, Optional
|
from typing import IO, List, Optional
|
||||||
|
|
||||||
from ebooklib import epub
|
from ebooklib import epub
|
||||||
@ -52,7 +53,11 @@ def partition_epub(
|
|||||||
filename = tmp.name
|
filename = tmp.name
|
||||||
last_modification_date = get_last_modified_date_from_file(file)
|
last_modification_date = get_last_modified_date_from_file(file)
|
||||||
|
|
||||||
book = epub.read_epub(filename, options={"ignore_ncx": False})
|
# NOTE(robinson): ignore ebooklib warning about changing the ignore_ncx default
|
||||||
|
# in the future.
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter("ignore")
|
||||||
|
book = epub.read_epub(filename, options={"ignore_ncx": False})
|
||||||
# book.items also includes EpubLink, EpubImage, EpubNcx (page navigation info)
|
# book.items also includes EpubLink, EpubImage, EpubNcx (page navigation info)
|
||||||
# and EpubItem (formatting/css)
|
# and EpubItem (formatting/css)
|
||||||
html_items = [item for item in book.items if isinstance(item, epub.EpubHtml)]
|
html_items = [item for item in book.items if isinstance(item, epub.EpubHtml)]
|
||||||
|
Loading…
x
Reference in New Issue
Block a user