fix: track filename in metadata for docx tables (#597)

* fix: track filename in metadata for docx tables

* bump version

* remove accidental commit
This commit is contained in:
Matt Robinson 2023-05-18 10:20:38 -04:00 committed by GitHub
parent 301cef27a4
commit b6bfbf9108
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 27 additions and 10 deletions

View File

@ -1,14 +1,9 @@
## 0.6.7-dev5
### Enhancements
* Add `page_name` to metadata. Currently used for the sheet name in XLSX documents.
## 0.6.7-dev4
## 0.6.7-dev6
### Enhancements
* Add `file_directory` to metadata
* Add `page_name` to metadata. Currently used for the sheet name in XLSX documents.
* Added a `--partition-strategy` parameter to unstructured-ingest so that users can specify the
partition strategy from the CLI. For example, `--partition-strategy fast`.
* Added metadata for filetype.
@ -26,6 +21,7 @@
* Makes `pytesseract` a function level import in `partition_pdf` so you can use the `"fast"`
or `"hi_res"` strategies if `pytesseract` is not installed. Also adds the
`required_dependencies` decorator for the `"hi_res"` and `"ocr_only"` strategies.
* Fix to ensure `filename` is tracked in metadata for `docx` tables.
## 0.6.6

View File

@ -7,6 +7,7 @@ from unstructured.documents.elements import (
Address,
ListItem,
NarrativeText,
Table,
Text,
Title,
)
@ -97,3 +98,21 @@ def test_partition_docx_raises_with_both_specified(mock_document, tmpdir):
def test_partition_docx_raises_with_neither():
with pytest.raises(ValueError):
partition_docx()
def test_partition_docx_processes_table(filename="example-docs/fake_table.docx"):
    """Partition a docx containing a table and check the first element's metadata.

    The first element must be a ``Table`` whose metadata carries both the HTML
    rendering of the table and the name of the file it came from.
    """
    # Compared with ``==`` below, so whitespace inside the cells is significant.
    expected_html = """<table>
<thead>
<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>
</thead>
<tbody>
<tr><td>Lorem ipsum </td><td>A Link example</td></tr>
</tbody>
</table>"""

    table_element = partition_docx(filename=filename)[0]

    assert isinstance(table_element, Table)
    assert table_element.metadata.text_as_html == expected_html
    # Only the file name is expected here, not the "example-docs/" prefix.
    assert table_element.metadata.filename == "fake_table.docx"

View File

@ -1 +1 @@
__version__ = "0.6.7-dev5" # pragma: no cover
__version__ = "0.6.7-dev6" # pragma: no cover

View File

@ -137,8 +137,10 @@ def partition_docx(
text_table = _convert_table_to_text(table, as_html=False)
element = Table(text_table)
if element is not None:
element.metadata = ElementMetadata(filename=metadata_filename)
element.metadata = ElementMetadata(text_as_html=html_table)
element.metadata = ElementMetadata(
text_as_html=html_table,
filename=metadata_filename,
)
elements.append(element)
table_index += 1
elif element_item.tag.endswith("p"):