mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-02 02:53:31 +00:00
fix: track filename in metadata for docx tables (#597)
* fix: track filename in metadata for docx tables
* bump version
* remove accidental commit
This commit is contained in:
parent
301cef27a4
commit
b6bfbf9108
10
CHANGELOG.md
10
CHANGELOG.md
@ -1,14 +1,9 @@
|
||||
## 0.6.7-dev5
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Add `page_name` to metadata. Currently used for the sheet name in XLSX documents.
|
||||
|
||||
## 0.6.7-dev4
|
||||
## 0.6.7-dev6
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Add `file_directory` to metadata
|
||||
* Add `page_name` to metadata. Currently used for the sheet name in XLSX documents.
|
||||
* Added a `--partition-strategy` parameter to unstructured-ingest so that users can specify
|
||||
partition strategy in the CLI. For example, `--partition-strategy fast`.
|
||||
* Added metadata for filetype.
|
||||
@ -26,6 +21,7 @@
|
||||
* Makes `pytesseract` a function level import in `partition_pdf` so you can use the `"fast"`
|
||||
or `"hi_res"` strategies if `pytesseract` is not installed. Also adds the
|
||||
`required_dependencies` decorator for the `"hi_res"` and `"ocr_only"` strategies.
|
||||
* Fix to ensure `filename` is tracked in metadata for `docx` tables.
|
||||
|
||||
## 0.6.6
|
||||
|
||||
|
||||
@ -7,6 +7,7 @@ from unstructured.documents.elements import (
|
||||
Address,
|
||||
ListItem,
|
||||
NarrativeText,
|
||||
Table,
|
||||
Text,
|
||||
Title,
|
||||
)
|
||||
@ -97,3 +98,21 @@ def test_partition_docx_raises_with_both_specified(mock_document, tmpdir):
|
||||
def test_partition_docx_raises_with_neither():
|
||||
with pytest.raises(ValueError):
|
||||
partition_docx()
|
||||
|
||||
|
||||
def test_partition_docx_processes_table(filename="example-docs/fake_table.docx"):
|
||||
elements = partition_docx(filename=filename)
|
||||
|
||||
assert isinstance(elements[0], Table)
|
||||
assert (
|
||||
elements[0].metadata.text_as_html
|
||||
== """<table>
|
||||
<thead>
|
||||
<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr><td>Lorem ipsum </td><td>A Link example</td></tr>
|
||||
</tbody>
|
||||
</table>"""
|
||||
)
|
||||
assert elements[0].metadata.filename == "fake_table.docx"
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.6.7-dev5" # pragma: no cover
|
||||
__version__ = "0.6.7-dev6" # pragma: no cover
|
||||
|
||||
@ -137,8 +137,10 @@ def partition_docx(
|
||||
text_table = _convert_table_to_text(table, as_html=False)
|
||||
element = Table(text_table)
|
||||
if element is not None:
|
||||
element.metadata = ElementMetadata(filename=metadata_filename)
|
||||
element.metadata = ElementMetadata(text_as_html=html_table)
|
||||
element.metadata = ElementMetadata(
|
||||
text_as_html=html_table,
|
||||
filename=metadata_filename,
|
||||
)
|
||||
elements.append(element)
|
||||
table_index += 1
|
||||
elif element_item.tag.endswith("p"):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user