mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-15 20:27:37 +00:00
parent
668dd0122f
commit
ef47d530f6
@ -1,4 +1,4 @@
|
||||
## 0.13.7-dev8
|
||||
## 0.13.7-dev9
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -15,6 +15,7 @@
|
||||
* **`partition_docx()` handles short table rows.** The DOCX format allows a table row to start late and/or end early, meaning cells at the beginning or end of a row can be omitted. While there are legitimate uses for this capability, using it in practice is relatively rare. However, it can happen unintentionally when adjusting cell borders with the mouse. Accommodate this case and generate accurate `.text` and `.metadata.text_as_html` for these tables.
|
||||
* **Remedy macOS test failure not triggered by CI.** Generalize temp-file detection beyond hard-coded Linux-specific prefix.
|
||||
* **Remove unnecessary warning log for using default layout model.**
|
||||
* **Add chunking to `partition_tsv()`.** Even though `partition_tsv()` produces a single `Table` element, chunking is made available because the `Table` element is often larger than the desired chunk size and must be divided into smaller chunks.
|
||||
|
||||
## 0.13.6
|
||||
|
||||
|
@ -10,6 +10,7 @@ from test_unstructured.partition.test_constants import (
|
||||
EXPECTED_TEXT_XLSX,
|
||||
)
|
||||
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
from unstructured.cleaners.core import clean_extra_whitespace
|
||||
from unstructured.documents.elements import Table
|
||||
from unstructured.partition.tsv import partition_tsv
|
||||
@ -228,7 +229,7 @@ def test_partition_tsv_element_metadata_has_languages():
|
||||
assert elements[0].metadata.languages == ["eng"]
|
||||
|
||||
|
||||
def test_partition_csv_header():
|
||||
def test_partition_tsv_header():
|
||||
filename = "example-docs/stanley-cups.tsv"
|
||||
elements = partition_tsv(filename=filename, strategy="fast", include_header=True)
|
||||
assert (
|
||||
@ -236,3 +237,19 @@ def test_partition_csv_header():
|
||||
== "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
|
||||
)
|
||||
assert "<thead>" in elements[0].metadata.text_as_html
|
||||
|
||||
|
||||
def test_partition_tsv_supports_chunking_strategy_while_partitioning():
|
||||
elements = partition_tsv(filename=example_doc_path("stanley-cups.tsv"))
|
||||
chunks = chunk_by_title(elements, max_characters=9, combine_text_under_n_chars=0)
|
||||
|
||||
chunk_elements = partition_tsv(
|
||||
example_doc_path("stanley-cups.tsv"),
|
||||
chunking_strategy="by_title",
|
||||
max_characters=9,
|
||||
combine_text_under_n_chars=0,
|
||||
include_header=False,
|
||||
)
|
||||
|
||||
# The same chunks are returned if chunking elements or chunking during partitioning.
|
||||
assert chunk_elements == chunks
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.13.7-dev8" # pragma: no cover
|
||||
__version__ = "0.13.7-dev9" # pragma: no cover
|
||||
|
@ -5,6 +5,7 @@ from typing import IO, Any, Optional
|
||||
import pandas as pd
|
||||
from lxml.html.soupparser import fromstring as soupparser_fromstring
|
||||
|
||||
from unstructured.chunking import add_chunking_strategy
|
||||
from unstructured.documents.elements import (
|
||||
Element,
|
||||
ElementMetadata,
|
||||
@ -25,6 +26,7 @@ DETECTION_ORIGIN: str = "tsv"
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.TSV)
|
||||
@add_chunking_strategy
|
||||
def partition_tsv(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
|
Loading…
x
Reference in New Issue
Block a user