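"""Test suite for `unstructured.partition.epub.partition_epub()`."""
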
from __future__ import annotations

from pytest_mock import MockFixture

from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import Table, Text
from unstructured.partition.epub import partition_epub
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA


def test_partition_epub_from_filename():
    elements = partition_epub(example_doc_path("simple.epub"))

    assert len(elements) > 0
    assert isinstance(elements[0], Text)
    assert elements[1].text.startswith("a shared culture")
    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
        assert {element.metadata.detection_origin for element in elements} == {"epub"}


def test_partition_epub_from_filename_returns_table_in_elements():
    elements = partition_epub(example_doc_path("winter-sports.epub"))

    assert elements[12] == Table(
        "Contents. List of Illustrations (In certain versions of this etext [in certain\nbrowsers]"
        " clicking on the image will bring up a larger\nversion.) (etext transcriber's note)"
    )


def test_partition_epub_from_file():
    with open(example_doc_path("winter-sports.epub"), "rb") as f:
        elements = partition_epub(file=f)

    assert len(elements) > 0
    assert elements[2].text.startswith("The Project Gutenberg eBook of Winter Sports")


# -- .metadata.filename --------------------------------------------------------------------------


def test_partition_epub_from_filename_gets_filename_from_filename_arg():
    elements = partition_epub(example_doc_path("simple.epub"))

    assert len(elements) > 0
    assert all(e.metadata.filename == "simple.epub" for e in elements)


def test_partition_epub_from_file_gets_filename_None():
    with open(example_doc_path("simple.epub"), "rb") as f:
        elements = partition_epub(file=f)

    assert len(elements) > 0
    assert all(e.metadata.filename is None for e in elements)


def test_partition_epub_from_filename_prefers_metadata_filename():
    elements = partition_epub(example_doc_path("simple.epub"), metadata_filename="orig-name.epub")

    assert len(elements) > 0
    assert all(element.metadata.filename == "orig-name.epub" for element in elements)


def test_partition_epub_from_file_prefers_metadata_filename():
    with open(example_doc_path("simple.epub"), "rb") as f:
        elements = partition_epub(file=f, metadata_filename="orig-name.epub")

    assert all(e.metadata.filename == "orig-name.epub" for e in elements)


# -- .metadata.filetype --------------------------------------------------------------------------


def test_partition_epub_gets_the_EPUB_MIME_type_in_metadata_filetype():
    EPUB_MIME_TYPE = "application/epub"

    elements = partition_epub(example_doc_path("simple.epub"))

    assert all(e.metadata.filetype == EPUB_MIME_TYPE for e in elements), (
        f"Expected all elements to have '{EPUB_MIME_TYPE}' as their filetype, but got:"
        f" {repr(elements[0].metadata.filetype)}"
    )


# -- .metadata.last_modified ---------------------------------------------------------------------


def test_partition_epub_from_file_path_gets_last_modified_from_filesystem(mocker: MockFixture):
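    # -- patch the filesystem date lookup so the expected value is deterministic --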
    filesystem_last_modified = "2024-06-14T16:01:29"
    mocker.patch(
        "unstructured.partition.epub.get_last_modified_date", return_value=filesystem_last_modified
    )

    elements = partition_epub(example_doc_path("winter-sports.epub"))

    assert all(e.metadata.last_modified == filesystem_last_modified for e in elements)


def test_partition_epub_from_file_gets_last_modified_None():
    with open(example_doc_path("simple.epub"), "rb") as f:
        elements = partition_epub(file=f)

    assert all(e.metadata.last_modified is None for e in elements)


def test_partition_epub_from_file_path_prefers_metadata_last_modified(mocker: MockFixture):
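    # -- an explicit `metadata_last_modified` arg should override the filesystem date --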
    filesystem_last_modified = "2024-06-14T16:01:29"
    metadata_last_modified = "2020-03-08T06:10:23"
    mocker.patch(
        "unstructured.partition.epub.get_last_modified_date", return_value=filesystem_last_modified
    )

    elements = partition_epub(
        example_doc_path("winter-sports.epub"), metadata_last_modified=metadata_last_modified
    )

    assert all(e.metadata.last_modified == metadata_last_modified for e in elements)


def test_partition_epub_from_file_prefers_metadata_last_modified():
    metadata_last_modified = "2020-03-08T06:10:23"
    with open(example_doc_path("simple.epub"), "rb") as f:
        elements = partition_epub(file=f, metadata_last_modified=metadata_last_modified)

    assert all(e.metadata.last_modified is metadata_last_modified for e in elements)


# ------------------------------------------------------------------------------------------------


def test_partition_epub_with_json():
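    # -- elements should survive serialization to JSON and back unchanged --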
    filename = "example-docs/winter-sports.epub"
    elements = partition_epub(filename=filename)

    assert_round_trips_through_JSON(elements)


def test_add_chunking_strategy_on_partition_epub():
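    # -- `chunking_strategy="by_title"` should match a separate `chunk_by_title()` call --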
    file_path = example_doc_path("winter-sports.epub")
    elements = partition_epub(file_path)
    chunk_elements = partition_epub(file_path, chunking_strategy="by_title")
    chunks = chunk_by_title(elements)
    assert chunk_elements != elements
    assert chunk_elements == chunks


def test_add_chunking_strategy_on_partition_epub_non_default():
    file_path = example_doc_path("winter-sports.epub")
    elements = partition_epub(filename=file_path)
    chunk_elements = partition_epub(
        file_path,
        chunking_strategy="by_title",
        max_characters=5,
        new_after_n_chars=5,
        combine_text_under_n_chars=0,
    )
    chunks = chunk_by_title(
        elements,
        max_characters=5,
        new_after_n_chars=5,
        combine_text_under_n_chars=0,
    )
    assert chunk_elements != elements
    assert chunk_elements == chunks


def test_partition_epub_element_metadata_has_languages():
    filename = example_doc_path("winter-sports.epub")
    elements = partition_epub(filename=filename)
    assert elements[0].metadata.languages == ["eng"]


def test_partition_epub_respects_detect_language_per_element():
    filename = "example-docs/language-docs/eng_spa_mult.epub"
    elements = partition_epub(filename=filename, detect_language_per_element=True)
    langs = [element.metadata.languages for element in elements]
    assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]