mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-10 15:37:58 +00:00
fix: track filename in metadata for docx tables (#597)
* fix: track filename in metadata for docx tables * bump version * remove accidental commit
This commit is contained in:
parent
301cef27a4
commit
b6bfbf9108
10
CHANGELOG.md
10
CHANGELOG.md
@ -1,14 +1,9 @@
|
|||||||
## 0.6.7-dev5
|
## 0.6.7-dev6
|
||||||
|
|
||||||
### Enhancements
|
|
||||||
|
|
||||||
* Add `page_name` to metadata. Currently used for the sheet name in XLSX documents.
|
|
||||||
|
|
||||||
## 0.6.7-dev4
|
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
* Add `file_directory` to metadata
|
* Add `file_directory` to metadata
|
||||||
|
* Add `page_name` to metadata. Currently used for the sheet name in XLSX documents.
|
||||||
* Added a `--partition-strategy` parameter to unstructured-ingest so that users can specify
|
* Added a `--partition-strategy` parameter to unstructured-ingest so that users can specify
|
||||||
partition strategy in CLI. For example, `--partition-strategy fast`.
|
partition strategy in CLI. For example, `--partition-strategy fast`.
|
||||||
* Added metadata for filetype.
|
* Added metadata for filetype.
|
||||||
@ -26,6 +21,7 @@
|
|||||||
* Makes `pytesseract` a function level import in `partition_pdf` so you can use the `"fast"`
|
* Makes `pytesseract` a function level import in `partition_pdf` so you can use the `"fast"`
|
||||||
or `"hi_res"` strategies if `pytesseract` is not installed. Also adds the
|
or `"hi_res"` strategies if `pytesseract` is not installed. Also adds the
|
||||||
`required_dependencies` decorator for the `"hi_res"` and `"ocr_only"` strategies.
|
`required_dependencies` decorator for the `"hi_res"` and `"ocr_only"` strategies.
|
||||||
|
* Fix to ensure `filename` is tracked in metadata for `docx` tables.
|
||||||
|
|
||||||
## 0.6.6
|
## 0.6.6
|
||||||
|
|
||||||
|
|||||||
@ -7,6 +7,7 @@ from unstructured.documents.elements import (
|
|||||||
Address,
|
Address,
|
||||||
ListItem,
|
ListItem,
|
||||||
NarrativeText,
|
NarrativeText,
|
||||||
|
Table,
|
||||||
Text,
|
Text,
|
||||||
Title,
|
Title,
|
||||||
)
|
)
|
||||||
@ -97,3 +98,21 @@ def test_partition_docx_raises_with_both_specified(mock_document, tmpdir):
|
|||||||
def test_partition_docx_raises_with_neither():
|
def test_partition_docx_raises_with_neither():
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
partition_docx()
|
partition_docx()
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_docx_processes_table(filename="example-docs/fake_table.docx"):
|
||||||
|
elements = partition_docx(filename=filename)
|
||||||
|
|
||||||
|
assert isinstance(elements[0], Table)
|
||||||
|
assert (
|
||||||
|
elements[0].metadata.text_as_html
|
||||||
|
== """<table>
|
||||||
|
<thead>
|
||||||
|
<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>Lorem ipsum </td><td>A Link example</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>"""
|
||||||
|
)
|
||||||
|
assert elements[0].metadata.filename == "fake_table.docx"
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.6.7-dev5" # pragma: no cover
|
__version__ = "0.6.7-dev6" # pragma: no cover
|
||||||
|
|||||||
@ -137,8 +137,10 @@ def partition_docx(
|
|||||||
text_table = _convert_table_to_text(table, as_html=False)
|
text_table = _convert_table_to_text(table, as_html=False)
|
||||||
element = Table(text_table)
|
element = Table(text_table)
|
||||||
if element is not None:
|
if element is not None:
|
||||||
element.metadata = ElementMetadata(filename=metadata_filename)
|
element.metadata = ElementMetadata(
|
||||||
element.metadata = ElementMetadata(text_as_html=html_table)
|
text_as_html=html_table,
|
||||||
|
filename=metadata_filename,
|
||||||
|
)
|
||||||
elements.append(element)
|
elements.append(element)
|
||||||
table_index += 1
|
table_index += 1
|
||||||
elif element_item.tag.endswith("p"):
|
elif element_item.tag.endswith("p"):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user