feat: add chunking to partition_tsv (#2982)

Closes #2980
This commit is contained in:
John 2024-05-07 18:09:27 -05:00 committed by GitHub
parent 668dd0122f
commit ef47d530f6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 23 additions and 3 deletions

View File

@ -1,4 +1,4 @@
## 0.13.7-dev8
## 0.13.7-dev9
### Enhancements
@ -15,6 +15,7 @@
* **`partition_docx()` handles short table rows.** The DOCX format allows a table row to start late and/or end early, meaning cells at the beginning or end of a row can be omitted. While there are legitimate uses for this capability, using it in practice is relatively rare. However, it can happen unintentionally when adjusting cell borders with the mouse. Accommodate this case and generate accurate `.text` and `.metadata.text_as_html` for these tables.
* **Remedy macOS test failure not triggered by CI.** Generalize temp-file detection beyond hard-coded Linux-specific prefix.
* **Remove unnecessary warning log for using default layout model.**
* **Add chunking to `partition_tsv()`.** Even though `partition_tsv()` produces a single `Table` element, chunking is made available because the `Table` element is often larger than the desired chunk size and must be divided into smaller chunks.
## 0.13.6

View File

@ -10,6 +10,7 @@ from test_unstructured.partition.test_constants import (
EXPECTED_TEXT_XLSX,
)
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from unstructured.chunking.title import chunk_by_title
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import Table
from unstructured.partition.tsv import partition_tsv
@ -228,7 +229,7 @@ def test_partition_tsv_element_metadata_has_languages():
assert elements[0].metadata.languages == ["eng"]
def test_partition_csv_header():
def test_partition_tsv_header():
filename = "example-docs/stanley-cups.tsv"
elements = partition_tsv(filename=filename, strategy="fast", include_header=True)
assert (
@ -236,3 +237,19 @@ def test_partition_csv_header():
== "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
)
assert "<thead>" in elements[0].metadata.text_as_html
def test_partition_tsv_supports_chunking_strategy_while_partitioning():
    """Chunking inside `partition_tsv()` equals chunking its output afterwards."""
    tsv_path = example_doc_path("stanley-cups.tsv")
    chunking_opts = {"max_characters": 9, "combine_text_under_n_chars": 0}

    # -- partition first, then chunk the resulting elements as a separate step --
    chunked_afterwards = chunk_by_title(partition_tsv(filename=tsv_path), **chunking_opts)

    # -- chunk as part of partitioning by way of the `chunking_strategy` argument --
    chunked_inline = partition_tsv(
        tsv_path,
        chunking_strategy="by_title",
        include_header=False,
        **chunking_opts,
    )

    # -- both routes must yield the same chunks --
    assert chunked_inline == chunked_afterwards

View File

@ -1 +1 @@
__version__ = "0.13.7-dev8" # pragma: no cover
__version__ = "0.13.7-dev9" # pragma: no cover

View File

@ -5,6 +5,7 @@ from typing import IO, Any, Optional
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import (
Element,
ElementMetadata,
@ -25,6 +26,7 @@ DETECTION_ORIGIN: str = "tsv"
@process_metadata()
@add_metadata_with_filetype(FileType.TSV)
@add_chunking_strategy
def partition_tsv(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,