feat: partition_tsv for tab separated value files (#758)

* first pass at partition_tsv

* working tests

* create constants for tests and debug `make test` failure

* make check and tidy

* undo changes for testing locally

* update changelog and version

* fix bricks.rst

* refactor if statements

* make tidy

* fix README and change try/except to if/else

* update changelog and version

* fix docstring
This commit is contained in:
John 2023-06-15 13:50:53 -05:00 committed by GitHub
parent 075bf0bdba
commit a9b9b873b1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 179 additions and 116 deletions

View File

@ -9,6 +9,7 @@
### Features ### Features
* Updates `partition_docx` to include headers and footers in the output. * Updates `partition_docx` to include headers and footers in the output.
* Create `partition_tsv` and associated tests. Make additional changes to `detect_filetype`.
### Fixes ### Fixes

View File

@ -102,6 +102,9 @@ about the library.
| Power Points (`.pptx`) | `partition_pptx` | N/A | Yes | Include Page Breaks | | Power Points (`.pptx`) | `partition_pptx` | N/A | Yes | Include Page Breaks |
| ReStructured Text (`.rst`) | `partition_rst` | N/A | Yes | Include Page Breaks | | ReStructured Text (`.rst`) | `partition_rst` | N/A | Yes | Include Page Breaks |
| Rich Text Files (`.rtf`) | `partition_rtf` | N/A | Yes | Include Page Breaks | | Rich Text Files (`.rtf`) | `partition_rtf` | N/A | Yes | Include Page Breaks |
| TSV Files (`.tsv`) | `partition_tsv` | N/A | Yes | None |
| Word Documents (`.doc`) | `partition_doc` | N/A | Yes | None |
| Word Documents (`.docx`) | `partition_docx` | N/A | Yes | None |
| Word Documents (`.doc`) | `partition_doc` | N/A | Yes | Include Page Breaks | | Word Documents (`.doc`) | `partition_doc` | N/A | Yes | Include Page Breaks |
| Word Documents (`.docx`) | `partition_docx` | N/A | Yes | Include Page Breaks | | Word Documents (`.docx`) | `partition_docx` | N/A | Yes | Include Page Breaks |
| XML Documents (`.xml`) | `partition_xml` | N/A | No | Encoding; XML Keep Tags | | XML Documents (`.xml`) | `partition_xml` | N/A | No | Encoding; XML Keep Tags |

View File

@ -82,7 +82,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
file type and route it to the appropriate partitioning brick. All partitioning bricks file type and route it to the appropriate partitioning brick. All partitioning bricks
called within ``partition`` are called using the default kwargs. Use the document-type called within ``partition`` are called using the default kwargs. Use the document-type
specific bricks if you need to apply non-default settings. specific bricks if you need to apply non-default settings.
``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.csv``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.xml``, ``.pdf``, ``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.csv``, ``.tsv``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.xml``, ``.pdf``,
``.png``, ``.jpg``, and ``.txt`` files. ``.png``, ``.jpg``, and ``.txt`` files.
If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``, If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
``.png``, and ``.jpg``. ``.png``, and ``.jpg``.
@ -149,6 +149,23 @@ Examples:
print(elements[0].metadata.text_as_html) print(elements[0].metadata.text_as_html)
``partition_tsv``
------------------
The ``partition_tsv`` function pre-processes TSV files. The output is a single
``Table`` element. The ``text_as_html`` attribute in the element metadata will
contain an HTML representation of the table.
Examples:
.. code:: python
from unstructured.partition.tsv import partition_tsv
elements = partition_tsv(filename="example-docs/stanley-cups.tsv")
print(elements[0].metadata.text_as_html)
``partition_doc`` ``partition_doc``
------------------ ------------------

View File

@ -0,0 +1,5 @@
Stanley Cups
Team Location Stanley Cups
Blues STL 1
Flyers PHI 2
Maple Leafs TOR 13
1 Stanley Cups
2 Team Location Stanley Cups
3 Blues STL 1
4 Flyers PHI 2
5 Maple Leafs TOR 13

View File

@ -41,6 +41,7 @@ XLSX_MIME_TYPES = [
# NOTE(robinson) - currently failing in the docker tests because the detected # NOTE(robinson) - currently failing in the docker tests because the detected
# MIME type is text/csv # MIME type is text/csv
# ("stanley-cups.csv", FileType.CSV), # ("stanley-cups.csv", FileType.CSV),
("stanley-cups.tsv", FileType.TSV),
("fake-power-point.pptx", FileType.PPTX), ("fake-power-point.pptx", FileType.PPTX),
("winter-sports.epub", FileType.EPUB), ("winter-sports.epub", FileType.EPUB),
("spring-weather.html.json", FileType.JSON), ("spring-weather.html.json", FileType.JSON),
@ -67,6 +68,7 @@ def test_detect_filetype_from_filename(file, expected):
("fake-html.html", FileType.HTML), ("fake-html.html", FileType.HTML),
("stanley-cups.xlsx", FileType.XLSX), ("stanley-cups.xlsx", FileType.XLSX),
("stanley-cups.csv", FileType.CSV), ("stanley-cups.csv", FileType.CSV),
("stanley-cups.tsv", FileType.TSV),
("fake-power-point.pptx", FileType.PPTX), ("fake-power-point.pptx", FileType.PPTX),
("winter-sports.epub", FileType.EPUB), ("winter-sports.epub", FileType.EPUB),
("fake-doc.rtf", FileType.RTF), ("fake-doc.rtf", FileType.RTF),
@ -105,6 +107,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
# NOTE(robinson) - currently failing in the docker tests because the detected # NOTE(robinson) - currently failing in the docker tests because the detected
# MIME type is text/csv # MIME type is text/csv
# ("stanley-cups.csv", FileType.CSV), # ("stanley-cups.csv", FileType.CSV),
("stanley-cups.tsv", FileType.TSV),
("fake-power-point.pptx", FileType.PPTX), ("fake-power-point.pptx", FileType.PPTX),
("winter-sports.epub", FileType.EPUB), ("winter-sports.epub", FileType.EPUB),
], ],

View File

@ -8,6 +8,7 @@ from unittest.mock import patch
import docx import docx
import pytest import pytest
from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
from unstructured.cleaners.core import clean_extra_whitespace from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import ( from unstructured.documents.elements import (
Address, Address,
@ -630,34 +631,6 @@ def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.
assert elements[5].text == "<name>United States</name>" assert elements[5].text == "<name>United States</name>"
EXPECTED_XLSX_TABLE = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
</tbody>
</table>"""
EXPECTED_XLSX_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
@ -667,8 +640,8 @@ def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.x
assert all(isinstance(element, Table) for element in elements) assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2 assert len(elements) == 2
assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE assert elements[0].metadata.text_as_html == EXPECTED_TABLE
assert elements[0].metadata.page_number == 1 assert elements[0].metadata.page_number == 1
assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE
@ -680,8 +653,8 @@ def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"
assert all(isinstance(element, Table) for element in elements) assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2 assert len(elements) == 2
assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE assert elements[0].metadata.text_as_html == EXPECTED_TABLE
assert elements[0].metadata.page_number == 1 assert elements[0].metadata.page_number == 1
assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE
@ -780,8 +753,8 @@ def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.x
def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"): def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
elements = partition(filename=filename) elements = partition(filename=filename)
assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE assert elements[0].metadata.text_as_html == EXPECTED_TABLE
assert elements[0].metadata.filetype == "text/csv" assert elements[0].metadata.filetype == "text/csv"
@ -790,9 +763,9 @@ def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
with open(filename, "rb") as f: with open(filename, "rb") as f:
elements = partition(file=f) elements = partition(file=f)
assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert isinstance(elements[0], Table) assert isinstance(elements[0], Table)
assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE assert elements[0].metadata.text_as_html == EXPECTED_TABLE
assert elements[0].metadata.filetype == "text/csv" assert elements[0].metadata.filetype == "text/csv"

View File

@ -0,0 +1,27 @@
# HTML table that the partitioner tests compare against
# ``elements[0].metadata.text_as_html`` for the "stanley-cups" example documents.
EXPECTED_TABLE = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
</tbody>
</table>"""
# Plain-text form of the same table; the tests compare it against the element
# text after passing that text through ``clean_extra_whitespace``.
EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"

View File

@ -1,35 +1,8 @@
from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
from unstructured.cleaners.core import clean_extra_whitespace from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import Table from unstructured.documents.elements import Table
from unstructured.partition.csv import partition_csv from unstructured.partition.csv import partition_csv
EXPECTED_TABLE = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
</tbody>
</table>"""
EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
EXPECTED_FILETYPE = "text/csv" EXPECTED_FILETYPE = "text/csv"

View File

@ -0,0 +1,33 @@
from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import Table
from unstructured.partition.tsv import partition_tsv
# Value the tests below expect in ``metadata.filetype`` for elements produced
# by ``partition_tsv``.
EXPECTED_FILETYPE = "text/tsv"
def test_partition_tsv_from_filename(filename="example-docs/stanley-cups.tsv"):
    """Partitioning by path yields the expected text, HTML table and filetype."""
    first_element = partition_tsv(filename=filename)[0]
    element_metadata = first_element.metadata

    assert clean_extra_whitespace(first_element.text) == EXPECTED_TEXT
    assert element_metadata.text_as_html == EXPECTED_TABLE
    assert element_metadata.filetype == EXPECTED_FILETYPE
def test_partition_tsv_from_file(filename="example-docs/stanley-cups.tsv"):
    """Partitioning an open binary file yields a ``Table`` with the expected content."""
    with open(filename, "rb") as tsv_stream:
        parsed = partition_tsv(file=tsv_stream)

    first_element = parsed[0]
    assert clean_extra_whitespace(first_element.text) == EXPECTED_TEXT
    assert isinstance(first_element, Table)
    assert first_element.metadata.text_as_html == EXPECTED_TABLE
    assert first_element.metadata.filetype == EXPECTED_FILETYPE
def test_partition_tsv_can_exclude_metadata(filename="example-docs/stanley-cups.tsv"):
    """With ``include_metadata=False`` the text is kept but HTML and filetype are unset."""
    parsed = partition_tsv(filename=filename, include_metadata=False)
    first_element = parsed[0]

    assert clean_extra_whitespace(first_element.text) == EXPECTED_TEXT
    assert isinstance(first_element, Table)
    assert first_element.metadata.text_as_html is None
    assert first_element.metadata.filetype is None

View File

@ -1,35 +1,8 @@
from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
from unstructured.cleaners.core import clean_extra_whitespace from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import Table from unstructured.documents.elements import Table
from unstructured.partition.xlsx import partition_xlsx from unstructured.partition.xlsx import partition_xlsx
EXPECTED_TABLE = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
</tbody>
</table>"""
EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
EXPECTED_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" EXPECTED_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
EXCEPTED_PAGE_NAME = "Stanley Cups" EXCEPTED_PAGE_NAME = "Stanley Cups"

View File

@ -74,6 +74,7 @@ class FileType(Enum):
TXT = 42 TXT = 42
JSON = 43 JSON = 43
CSV = 44 CSV = 44
TSV = 45
# Markup Types # Markup Types
HTML = 50 HTML = 50
@ -106,6 +107,7 @@ STR_TO_FILETYPE = {
"text/comma-separated-values": FileType.CSV, "text/comma-separated-values": FileType.CSV,
"text/x-comma-separated-values": FileType.CSV, "text/x-comma-separated-values": FileType.CSV,
"text/csv": FileType.CSV, "text/csv": FileType.CSV,
"text/tsv": FileType.TSV,
"text/markdown": FileType.MD, "text/markdown": FileType.MD,
"text/x-markdown": FileType.MD, "text/x-markdown": FileType.MD,
"text/x-rst": FileType.RST, "text/x-rst": FileType.RST,
@ -166,6 +168,7 @@ EXT_TO_FILETYPE = {
".msg": FileType.MSG, ".msg": FileType.MSG,
".odt": FileType.ODT, ".odt": FileType.ODT,
".csv": FileType.CSV, ".csv": FileType.CSV,
".tsv": FileType.TSV,
# NOTE(robinson) - for now we are treating code files as plain text # NOTE(robinson) - for now we are treating code files as plain text
".js": FileType.TXT, ".js": FileType.TXT,
".py": FileType.TXT, ".py": FileType.TXT,
@ -229,7 +232,11 @@ def detect_filetype(
return EXT_TO_FILETYPE.get(extension, FileType.UNK) return EXT_TO_FILETYPE.get(extension, FileType.UNK)
elif file is not None: elif file is not None:
extension = None if hasattr(file, "name"):
_, extension = os.path.splitext(file.name)
else:
extension = ""
extension = extension.lower()
# NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes # NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes
# Increased to 4096 because otherwise .xlsx files get detected as a zip file # Increased to 4096 because otherwise .xlsx files get detected as a zip file
# ref: https://github.com/ahupp/python-magic#usage # ref: https://github.com/ahupp/python-magic#usage
@ -251,43 +258,33 @@ def detect_filetype(
"""Mime type special cases.""" """Mime type special cases."""
# third check (mime_type) # third check (mime_type)
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
# installed on the Unstructured docker image, .json files resolve to "text/plain"
# rather than "application/json". this corrects for that case.
if mime_type == "text/plain" and extension == ".json":
return FileType.JSON
# NOTE(Crag): older magic lib does not differentiate between xls and doc # NOTE(Crag): older magic lib does not differentiate between xls and doc
if mime_type == "application/msword" and extension == ".xls": if mime_type == "application/msword" and extension == ".xls":
return FileType.XLS return FileType.XLS
elif mime_type.endswith("xml"): elif mime_type.endswith("xml"):
if extension and (extension == ".html" or extension == ".htm"): if extension == ".html" or extension == ".htm":
return FileType.HTML return FileType.HTML
else: else:
return FileType.XML return FileType.XML
elif mime_type in TXT_MIME_TYPES or mime_type.startswith("text"): elif mime_type in TXT_MIME_TYPES or mime_type.startswith("text"):
if extension and extension == ".eml": # NOTE(crag): for older versions of the OS libmagic package, such as is currently
return FileType.EML # installed on the Unstructured docker image, .json files resolve to "text/plain"
elif extension and extension == ".md": # rather than "application/json". this corrects for that case.
return FileType.MD
elif extension and extension == ".rst":
return FileType.RST
elif extension and extension == ".rtf":
return FileType.RTF
elif extension and extension == ".html":
return FileType.HTML
if _is_text_file_a_json(file=file, filename=filename, encoding=encoding): if _is_text_file_a_json(file=file, filename=filename, encoding=encoding):
return FileType.JSON return FileType.JSON
if _is_text_file_a_csv(file=file, filename=filename, encoding=encoding): if _is_text_file_a_csv(file=file, filename=filename, encoding=encoding):
return FileType.CSV return FileType.CSV
if file and not extension and _check_eml_from_buffer(file=file) is True: if file and _check_eml_from_buffer(file=file) is True:
return FileType.EML return FileType.EML
if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".tsv", ".json"]:
return EXT_TO_FILETYPE.get(extension)
# Safety catch # Safety catch
if mime_type in STR_TO_FILETYPE: if mime_type in STR_TO_FILETYPE:
return STR_TO_FILETYPE[mime_type] return STR_TO_FILETYPE[mime_type]
@ -295,14 +292,16 @@ def detect_filetype(
return FileType.TXT return FileType.TXT
elif mime_type == "application/octet-stream": elif mime_type == "application/octet-stream":
if file and not extension: if extension == ".docx":
return FileType.DOCX
elif file:
return _detect_filetype_from_octet_stream(file=file) return _detect_filetype_from_octet_stream(file=file)
else: else:
return EXT_TO_FILETYPE.get(extension, FileType.UNK) return EXT_TO_FILETYPE.get(extension, FileType.UNK)
elif mime_type == "application/zip": elif mime_type == "application/zip":
filetype = FileType.UNK filetype = FileType.UNK
if file and not extension: if file:
filetype = _detect_filetype_from_octet_stream(file=file) filetype = _detect_filetype_from_octet_stream(file=file)
elif filename is not None: elif filename is not None:
with open(filename, "rb") as f: with open(filename, "rb") as f:
@ -310,9 +309,9 @@ def detect_filetype(
extension = extension if extension else "" extension = extension if extension else ""
if filetype == FileType.UNK: if filetype == FileType.UNK:
return EXT_TO_FILETYPE.get(extension.lower(), FileType.ZIP) return FileType.ZIP
else: else:
return EXT_TO_FILETYPE.get(extension.lower(), filetype) return EXT_TO_FILETYPE.get(extension, filetype)
elif _is_code_mime_type(mime_type): elif _is_code_mime_type(mime_type):
# NOTE(robinson) - we'll treat all code files as plain text for now. # NOTE(robinson) - we'll treat all code files as plain text for now.

View File

@ -29,6 +29,7 @@ from unstructured.partition.pptx import partition_pptx
from unstructured.partition.rst import partition_rst from unstructured.partition.rst import partition_rst
from unstructured.partition.rtf import partition_rtf from unstructured.partition.rtf import partition_rtf
from unstructured.partition.text import partition_text from unstructured.partition.text import partition_text
from unstructured.partition.tsv import partition_tsv
from unstructured.partition.xlsx import partition_xlsx from unstructured.partition.xlsx import partition_xlsx
from unstructured.partition.xml import partition_xml from unstructured.partition.xml import partition_xml
@ -211,6 +212,8 @@ def partition(
elements = partition_xlsx(filename=filename, file=file) elements = partition_xlsx(filename=filename, file=file)
elif filetype == FileType.CSV: elif filetype == FileType.CSV:
elements = partition_csv(filename=filename, file=file) elements = partition_csv(filename=filename, file=file)
elif filetype == FileType.TSV:
elements = partition_tsv(filename=filename, file=file)
elif filetype == FileType.EMPTY: elif filetype == FileType.EMPTY:
elements = [] elements = []
else: else:

View File

@ -0,0 +1,53 @@
from tempfile import SpooledTemporaryFile
from typing import IO, BinaryIO, List, Optional, Union, cast
import lxml.html
import pandas as pd
from unstructured.documents.elements import Element, ElementMetadata, Table
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
@add_metadata_with_filetype(FileType.TSV)
def partition_tsv(
    filename: Optional[str] = None,
    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
) -> List[Element]:
    """Partition a tab-separated values (TSV) file into a single ``Table`` element.

    Parameters
    ----------
    filename
        Path of the TSV file to read.
    file
        A file-like object opened in binary mode, e.g. ``open(filename, "rb")``.
    metadata_filename
        Filename stored in the element metadata when ``filename`` is not given.
    include_metadata
        When False, the element is returned with an empty ``ElementMetadata``
        instead of one carrying the HTML table and the filename.

    Returns
    -------
    A one-item list holding a ``Table`` whose text is the text content of the
    HTML rendering of the parsed data.
    """
    exactly_one(filename=filename, file=file)

    # Pick what pandas should read from: the path itself, or the file object
    # after passing it through ``spooled_to_bytes_io_if_needed``.
    if filename:
        source = filename
    else:
        source = spooled_to_bytes_io_if_needed(
            cast(Union[BinaryIO, SpooledTemporaryFile], file),
        )
    frame = pd.read_csv(source, sep="\t")

    # NOTE: with pandas' defaults the first line of the input becomes the column
    # labels, and ``header=False`` leaves those labels out of the rendered HTML.
    table_html = frame.to_html(index=False, header=False, na_rep="")
    plain_text = lxml.html.document_fromstring(table_html).text_content()

    metadata = (
        ElementMetadata(
            text_as_html=table_html,
            filename=filename or metadata_filename,
        )
        if include_metadata
        else ElementMetadata()
    )
    return [Table(text=plain_text, metadata=metadata)]