fix: track filename in metadata for docx tables (#597)

* fix: track filename in metadata for docx tables

* bump version

* remove accidental commit
This commit is contained in:
Matt Robinson 2023-05-18 10:20:38 -04:00 committed by GitHub
parent 301cef27a4
commit b6bfbf9108
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 27 additions and 10 deletions

View File

@ -1,14 +1,9 @@
## 0.6.7-dev5 ## 0.6.7-dev6
### Enhancements
* Add `page_name` to metadata. Currently used for the sheet name in XLSX documents.
## 0.6.7-dev4
### Enhancements ### Enhancements
* Add `file_directory` to metadata * Add `file_directory` to metadata
* Add `page_name` to metadata. Currently used for the sheet name in XLSX documents.
* Added a `--partition-strategy` parameter to unstructured-ingest so that users can specify * Added a `--partition-strategy` parameter to unstructured-ingest so that users can specify
partition strategy in CLI. For example, `--partition-strategy fast`. partition strategy in CLI. For example, `--partition-strategy fast`.
* Added metadata for filetype. * Added metadata for filetype.
@ -26,6 +21,7 @@
* Makes `pytesseract` a function level import in `partition_pdf` so you can use the `"fast"` * Makes `pytesseract` a function level import in `partition_pdf` so you can use the `"fast"`
or `"hi_res"` strategies if `pytesseract` is not installed. Also adds the or `"hi_res"` strategies if `pytesseract` is not installed. Also adds the
`required_dependencies` decorator for the `"hi_res"` and `"ocr_only"` strategies. `required_dependencies` decorator for the `"hi_res"` and `"ocr_only"` strategies.
* Fix to ensure `filename` is tracked in metadata for `docx` tables.
## 0.6.6 ## 0.6.6

View File

@ -7,6 +7,7 @@ from unstructured.documents.elements import (
Address, Address,
ListItem, ListItem,
NarrativeText, NarrativeText,
Table,
Text, Text,
Title, Title,
) )
@ -97,3 +98,21 @@ def test_partition_docx_raises_with_both_specified(mock_document, tmpdir):
def test_partition_docx_raises_with_neither(): def test_partition_docx_raises_with_neither():
with pytest.raises(ValueError): with pytest.raises(ValueError):
partition_docx() partition_docx()
def test_partition_docx_processes_table(filename="example-docs/fake_table.docx"):
    """Verify the first element of the table fixture is a ``Table`` with full metadata.

    Partitions the DOCX file at ``filename`` and checks three things on the
    first returned element: it is a ``Table``, its ``text_as_html`` metadata
    equals the expected HTML rendering, and its ``filename`` metadata is the
    fixture's base name.
    """
    # The expected markup is compared verbatim, including trailing spaces in cells.
    expected_html = """<table>
<thead>
<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>
</thead>
<tbody>
<tr><td>Lorem ipsum </td><td>A Link example</td></tr>
</tbody>
</table>"""

    first_element = partition_docx(filename=filename)[0]

    assert isinstance(first_element, Table)
    assert first_element.metadata.text_as_html == expected_html
    assert first_element.metadata.filename == "fake_table.docx"

View File

@ -1 +1 @@
__version__ = "0.6.7-dev5" # pragma: no cover __version__ = "0.6.7-dev6" # pragma: no cover

View File

@ -137,8 +137,10 @@ def partition_docx(
text_table = _convert_table_to_text(table, as_html=False) text_table = _convert_table_to_text(table, as_html=False)
element = Table(text_table) element = Table(text_table)
if element is not None: if element is not None:
element.metadata = ElementMetadata(filename=metadata_filename) element.metadata = ElementMetadata(
element.metadata = ElementMetadata(text_as_html=html_table) text_as_html=html_table,
filename=metadata_filename,
)
elements.append(element) elements.append(element)
table_index += 1 table_index += 1
elif element_item.tag.endswith("p"): elif element_item.tag.endswith("p"):