diff --git a/CHANGELOG.md b/CHANGELOG.md index 0dc30ae3e..8649d7cd0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,14 +1,9 @@ -## 0.6.7-dev5 - -### Enhancements - -* Add `page_name` to metadata. Currently used for the sheet name in XLSX documents. - -## 0.6.7-dev4 +## 0.6.7-dev6 ### Enhancements * Add `file_directory` to metadata +* Add `page_name` to metadata. Currently used for the sheet name in XLSX documents. * Added a `--partition-strategy` parameter to unstructured-ingest so that users can specify partition strategy in CLI. For example, `--partition-strategy fast`. * Added metadata for filetype. @@ -26,6 +21,7 @@ * Makes `pytesseract` a function level import in `partition_pdf` so you can use the `"fast"` or `"hi_res"` strategies if `pytesseract` is not installed. Also adds the `required_dependencies` decorator for the `"hi_res"` and `"ocr_only"` strategies. +* Fix to ensure `filename` is tracked in metadata for `docx` tables. ## 0.6.6 diff --git a/test_unstructured/partition/test_docx.py b/test_unstructured/partition/test_docx.py index e65272ce4..82505ef78 100644 --- a/test_unstructured/partition/test_docx.py +++ b/test_unstructured/partition/test_docx.py @@ -7,6 +7,7 @@ from unstructured.documents.elements import ( Address, ListItem, NarrativeText, + Table, Text, Title, ) @@ -97,3 +98,21 @@ def test_partition_docx_raises_with_both_specified(mock_document, tmpdir): def test_partition_docx_raises_with_neither(): with pytest.raises(ValueError): partition_docx() + + +def test_partition_docx_processes_table(filename="example-docs/fake_table.docx"): + elements = partition_docx(filename=filename) + + assert isinstance(elements[0], Table) + assert ( + elements[0].metadata.text_as_html + == """ + + + + + + +
Header Col 1 Header Col 2
Lorem ipsum A Link example
""" + ) + assert elements[0].metadata.filename == "fake_table.docx" diff --git a/unstructured/__version__.py b/unstructured/__version__.py index aab7a9ba6..a7d1ec7f1 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.6.7-dev5" # pragma: no cover +__version__ = "0.6.7-dev6" # pragma: no cover diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index 4a94e0753..539242450 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -137,8 +137,10 @@ def partition_docx( text_table = _convert_table_to_text(table, as_html=False) element = Table(text_table) if element is not None: - element.metadata = ElementMetadata(filename=metadata_filename) - element.metadata = ElementMetadata(text_as_html=html_table) + element.metadata = ElementMetadata( + text_as_html=html_table, + filename=metadata_filename, + ) elements.append(element) table_index += 1 elif element_item.tag.endswith("p"):