mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
feat: partition_tsv for tab separated value files (#758)
* first pass at partition_tsv * working tests * create constants for tests and debug `make test` failure * make check and tidy * undo changes for testing locally * update changelog and version * fix bricks.rst * refactor if statements * make tidy * fix README and change try/except to if/else * update changelog and version * fix\ docstring
This commit is contained in:
parent
075bf0bdba
commit
a9b9b873b1
@ -9,6 +9,7 @@
|
||||
### Features
|
||||
|
||||
* Updates `partition_docx` to include headers and footers in the output.
|
||||
* Create `partition_tsv` and associated tests. Make additional changes to `detect_filetype`.
|
||||
|
||||
### Fixes
|
||||
|
||||
|
@ -102,6 +102,9 @@ about the library.
|
||||
| Power Points (`.pptx`) | `partition_pptx` | N/A | Yes | Include Page Breaks |
|
||||
| ReStructured Text (`.rst`) | `partition_rst` | N/A | Yes | Include Page Breaks |
|
||||
| Rich Text Files (`.rtf`) | `partition_rtf` | N/A | Yes | Include Page Breaks |
|
||||
| TSV Files (`.tsv`) | `partition_tsv` | N/A | Yes | None |
|
||||
| Word Documents (`.doc`) | `partition_doc` | N/A | Yes | None |
|
||||
| Word Documents (`.docx`) | `partition_docx` | N/A | Yes | None |
|
||||
| Word Documents (`.doc`) | `partition_doc` | N/A | Yes | Include Page Breaks |
|
||||
| Word Documents (`.docx`) | `partition_docx` | N/A | Yes | Include Page Breaks |
|
||||
| XML Documents (`.xml`) | `partition_xml` | N/A | No | Encoding; XML Keep Tags |
|
||||
|
@ -82,7 +82,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
|
||||
file type and route it to the appropriate partitioning brick. All partitioning bricks
|
||||
called within ``partition`` are called using the default kwargs. Use the document-type
|
||||
specific bricks if you need to apply non-default settings.
|
||||
``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.csv``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.xml``, ``.pdf``,
|
||||
``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.csv``, ``.tsv``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.xml``, ``.pdf``,
|
||||
``.png``, ``.jpg``, and ``.txt`` files.
|
||||
If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
|
||||
``.png``, and ``.jpg``.
|
||||
@ -149,6 +149,23 @@ Examples:
|
||||
print(elements[0].metadata.text_as_html)
|
||||
|
||||
|
||||
``partition_tsv``
|
||||
------------------
|
||||
|
||||
The ``partition_tsv`` function pre-processes TSV files. The output is a single
|
||||
``Table`` element. The ``text_as_html`` attribute in the element metadata will
|
||||
contain an HTML representation of the table.
|
||||
|
||||
Examples:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.partition.tsv import partition_tsv
|
||||
|
||||
elements = partition_tsv(filename="example-docs/stanley-cups.tsv")
|
||||
print(elements[0].metadata.text_as_html)
|
||||
|
||||
|
||||
``partition_doc``
|
||||
------------------
|
||||
|
||||
|
5
example-docs/stanley-cups.tsv
Normal file
5
example-docs/stanley-cups.tsv
Normal file
@ -0,0 +1,5 @@
|
||||
Stanley Cups
|
||||
Team Location Stanley Cups
|
||||
Blues STL 1
|
||||
Flyers PHI 2
|
||||
Maple Leafs TOR 13
|
|
@ -41,6 +41,7 @@ XLSX_MIME_TYPES = [
|
||||
# NOTE(robinson) - currently failing in the docker tests because the detected
|
||||
# MIME type is text/csv
|
||||
# ("stanley-cups.csv", FileType.CSV),
|
||||
("stanley-cups.tsv", FileType.TSV),
|
||||
("fake-power-point.pptx", FileType.PPTX),
|
||||
("winter-sports.epub", FileType.EPUB),
|
||||
("spring-weather.html.json", FileType.JSON),
|
||||
@ -67,6 +68,7 @@ def test_detect_filetype_from_filename(file, expected):
|
||||
("fake-html.html", FileType.HTML),
|
||||
("stanley-cups.xlsx", FileType.XLSX),
|
||||
("stanley-cups.csv", FileType.CSV),
|
||||
("stanley-cups.tsv", FileType.TSV),
|
||||
("fake-power-point.pptx", FileType.PPTX),
|
||||
("winter-sports.epub", FileType.EPUB),
|
||||
("fake-doc.rtf", FileType.RTF),
|
||||
@ -105,6 +107,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
|
||||
# NOTE(robinson) - currently failing in the docker tests because the detected
|
||||
# MIME type is text/csv
|
||||
# ("stanley-cups.csv", FileType.CSV),
|
||||
("stanley-cups.tsv", FileType.TSV),
|
||||
("fake-power-point.pptx", FileType.PPTX),
|
||||
("winter-sports.epub", FileType.EPUB),
|
||||
],
|
||||
|
@ -8,6 +8,7 @@ from unittest.mock import patch
|
||||
import docx
|
||||
import pytest
|
||||
|
||||
from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
|
||||
from unstructured.cleaners.core import clean_extra_whitespace
|
||||
from unstructured.documents.elements import (
|
||||
Address,
|
||||
@ -630,34 +631,6 @@ def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.
|
||||
assert elements[5].text == "<name>United States</name>"
|
||||
|
||||
|
||||
EXPECTED_XLSX_TABLE = """<table border="1" class="dataframe">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>Team</td>
|
||||
<td>Location</td>
|
||||
<td>Stanley Cups</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Blues</td>
|
||||
<td>STL</td>
|
||||
<td>1</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Flyers</td>
|
||||
<td>PHI</td>
|
||||
<td>2</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Maple Leafs</td>
|
||||
<td>TOR</td>
|
||||
<td>13</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>"""
|
||||
|
||||
|
||||
EXPECTED_XLSX_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
|
||||
|
||||
EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
|
||||
|
||||
@ -667,8 +640,8 @@ def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.x
|
||||
assert all(isinstance(element, Table) for element in elements)
|
||||
assert len(elements) == 2
|
||||
|
||||
assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT
|
||||
assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
|
||||
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
|
||||
assert elements[0].metadata.text_as_html == EXPECTED_TABLE
|
||||
assert elements[0].metadata.page_number == 1
|
||||
assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE
|
||||
|
||||
@ -680,8 +653,8 @@ def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"
|
||||
assert all(isinstance(element, Table) for element in elements)
|
||||
assert len(elements) == 2
|
||||
|
||||
assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT
|
||||
assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
|
||||
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
|
||||
assert elements[0].metadata.text_as_html == EXPECTED_TABLE
|
||||
assert elements[0].metadata.page_number == 1
|
||||
assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE
|
||||
|
||||
@ -780,8 +753,8 @@ def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.x
|
||||
def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
|
||||
elements = partition(filename=filename)
|
||||
|
||||
assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT
|
||||
assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
|
||||
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
|
||||
assert elements[0].metadata.text_as_html == EXPECTED_TABLE
|
||||
assert elements[0].metadata.filetype == "text/csv"
|
||||
|
||||
|
||||
@ -790,9 +763,9 @@ def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition(file=f)
|
||||
|
||||
assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT
|
||||
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
|
||||
assert isinstance(elements[0], Table)
|
||||
assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
|
||||
assert elements[0].metadata.text_as_html == EXPECTED_TABLE
|
||||
assert elements[0].metadata.filetype == "text/csv"
|
||||
|
||||
|
||||
|
27
test_unstructured/partition/test_constants.py
Normal file
27
test_unstructured/partition/test_constants.py
Normal file
@ -0,0 +1,27 @@
|
||||
EXPECTED_TABLE = """<table border="1" class="dataframe">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>Team</td>
|
||||
<td>Location</td>
|
||||
<td>Stanley Cups</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Blues</td>
|
||||
<td>STL</td>
|
||||
<td>1</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Flyers</td>
|
||||
<td>PHI</td>
|
||||
<td>2</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Maple Leafs</td>
|
||||
<td>TOR</td>
|
||||
<td>13</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>"""
|
||||
|
||||
|
||||
EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
|
@ -1,35 +1,8 @@
|
||||
from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
|
||||
from unstructured.cleaners.core import clean_extra_whitespace
|
||||
from unstructured.documents.elements import Table
|
||||
from unstructured.partition.csv import partition_csv
|
||||
|
||||
EXPECTED_TABLE = """<table border="1" class="dataframe">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>Team</td>
|
||||
<td>Location</td>
|
||||
<td>Stanley Cups</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Blues</td>
|
||||
<td>STL</td>
|
||||
<td>1</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Flyers</td>
|
||||
<td>PHI</td>
|
||||
<td>2</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Maple Leafs</td>
|
||||
<td>TOR</td>
|
||||
<td>13</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>"""
|
||||
|
||||
|
||||
EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
|
||||
|
||||
EXPECTED_FILETYPE = "text/csv"
|
||||
|
||||
|
||||
|
33
test_unstructured/partition/test_tsv.py
Normal file
33
test_unstructured/partition/test_tsv.py
Normal file
@ -0,0 +1,33 @@
|
||||
from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
|
||||
from unstructured.cleaners.core import clean_extra_whitespace
|
||||
from unstructured.documents.elements import Table
|
||||
from unstructured.partition.tsv import partition_tsv
|
||||
|
||||
EXPECTED_FILETYPE = "text/tsv"
|
||||
|
||||
|
||||
def test_partition_tsv_from_filename(filename="example-docs/stanley-cups.tsv"):
    """Partition the sample TSV by path and check text, HTML table and filetype."""
    first_element = partition_tsv(filename=filename)[0]
    element_metadata = first_element.metadata

    assert clean_extra_whitespace(first_element.text) == EXPECTED_TEXT
    assert element_metadata.text_as_html == EXPECTED_TABLE
    assert element_metadata.filetype == EXPECTED_FILETYPE
|
||||
|
||||
|
||||
def test_partition_tsv_from_file(filename="example-docs/stanley-cups.tsv"):
    """Partition the sample TSV from a binary file handle and check the resulting Table."""
    with open(filename, "rb") as tsv_handle:
        partitioned = partition_tsv(file=tsv_handle)

    first_element = partitioned[0]
    assert clean_extra_whitespace(first_element.text) == EXPECTED_TEXT
    assert isinstance(first_element, Table)
    assert first_element.metadata.text_as_html == EXPECTED_TABLE
    assert first_element.metadata.filetype == EXPECTED_FILETYPE
|
||||
|
||||
|
||||
def test_partition_tsv_can_exclude_metadata(filename="example-docs/stanley-cups.tsv"):
    """With include_metadata=False the Table text is kept but metadata fields stay unset."""
    first_element = partition_tsv(filename=filename, include_metadata=False)[0]

    assert clean_extra_whitespace(first_element.text) == EXPECTED_TEXT
    assert isinstance(first_element, Table)

    element_metadata = first_element.metadata
    assert element_metadata.text_as_html is None
    assert element_metadata.filetype is None
|
@ -1,35 +1,8 @@
|
||||
from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
|
||||
from unstructured.cleaners.core import clean_extra_whitespace
|
||||
from unstructured.documents.elements import Table
|
||||
from unstructured.partition.xlsx import partition_xlsx
|
||||
|
||||
EXPECTED_TABLE = """<table border="1" class="dataframe">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>Team</td>
|
||||
<td>Location</td>
|
||||
<td>Stanley Cups</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Blues</td>
|
||||
<td>STL</td>
|
||||
<td>1</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Flyers</td>
|
||||
<td>PHI</td>
|
||||
<td>2</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Maple Leafs</td>
|
||||
<td>TOR</td>
|
||||
<td>13</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>"""
|
||||
|
||||
|
||||
EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
|
||||
|
||||
EXPECTED_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
|
||||
EXCEPTED_PAGE_NAME = "Stanley Cups"
|
||||
|
@ -74,6 +74,7 @@ class FileType(Enum):
|
||||
TXT = 42
|
||||
JSON = 43
|
||||
CSV = 44
|
||||
TSV = 45
|
||||
|
||||
# Markup Types
|
||||
HTML = 50
|
||||
@ -106,6 +107,7 @@ STR_TO_FILETYPE = {
|
||||
"text/comma-separated-values": FileType.CSV,
|
||||
"text/x-comma-separated-values": FileType.CSV,
|
||||
"text/csv": FileType.CSV,
|
||||
"text/tsv": FileType.TSV,
|
||||
"text/markdown": FileType.MD,
|
||||
"text/x-markdown": FileType.MD,
|
||||
"text/x-rst": FileType.RST,
|
||||
@ -166,6 +168,7 @@ EXT_TO_FILETYPE = {
|
||||
".msg": FileType.MSG,
|
||||
".odt": FileType.ODT,
|
||||
".csv": FileType.CSV,
|
||||
".tsv": FileType.TSV,
|
||||
# NOTE(robinson) - for now we are treating code files as plain text
|
||||
".js": FileType.TXT,
|
||||
".py": FileType.TXT,
|
||||
@ -229,7 +232,11 @@ def detect_filetype(
|
||||
return EXT_TO_FILETYPE.get(extension, FileType.UNK)
|
||||
|
||||
elif file is not None:
|
||||
extension = None
|
||||
if hasattr(file, "name"):
|
||||
_, extension = os.path.splitext(file.name)
|
||||
else:
|
||||
extension = ""
|
||||
extension = extension.lower()
|
||||
# NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes
|
||||
# Increased to 4096 because otherwise .xlsx files get detected as a zip file
|
||||
# ref: https://github.com/ahupp/python-magic#usage
|
||||
@ -251,43 +258,33 @@ def detect_filetype(
|
||||
|
||||
"""Mime type special cases."""
|
||||
# third check (mime_type)
|
||||
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
|
||||
# installed on the Unstructured docker image, .json files resolve to "text/plain"
|
||||
# rather than "application/json". this corrects for that case.
|
||||
if mime_type == "text/plain" and extension == ".json":
|
||||
return FileType.JSON
|
||||
|
||||
# NOTE(Crag): older magic lib does not differentiate between xls and doc
|
||||
if mime_type == "application/msword" and extension == ".xls":
|
||||
return FileType.XLS
|
||||
|
||||
elif mime_type.endswith("xml"):
|
||||
if extension and (extension == ".html" or extension == ".htm"):
|
||||
if extension == ".html" or extension == ".htm":
|
||||
return FileType.HTML
|
||||
else:
|
||||
return FileType.XML
|
||||
|
||||
elif mime_type in TXT_MIME_TYPES or mime_type.startswith("text"):
|
||||
if extension and extension == ".eml":
|
||||
return FileType.EML
|
||||
elif extension and extension == ".md":
|
||||
return FileType.MD
|
||||
elif extension and extension == ".rst":
|
||||
return FileType.RST
|
||||
elif extension and extension == ".rtf":
|
||||
return FileType.RTF
|
||||
elif extension and extension == ".html":
|
||||
return FileType.HTML
|
||||
|
||||
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
|
||||
# installed on the Unstructured docker image, .json files resolve to "text/plain"
|
||||
# rather than "application/json". this corrects for that case.
|
||||
if _is_text_file_a_json(file=file, filename=filename, encoding=encoding):
|
||||
return FileType.JSON
|
||||
|
||||
if _is_text_file_a_csv(file=file, filename=filename, encoding=encoding):
|
||||
return FileType.CSV
|
||||
|
||||
if file and not extension and _check_eml_from_buffer(file=file) is True:
|
||||
if file and _check_eml_from_buffer(file=file) is True:
|
||||
return FileType.EML
|
||||
|
||||
if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".tsv", ".json"]:
|
||||
return EXT_TO_FILETYPE.get(extension)
|
||||
|
||||
# Safety catch
|
||||
if mime_type in STR_TO_FILETYPE:
|
||||
return STR_TO_FILETYPE[mime_type]
|
||||
@ -295,14 +292,16 @@ def detect_filetype(
|
||||
return FileType.TXT
|
||||
|
||||
elif mime_type == "application/octet-stream":
|
||||
if file and not extension:
|
||||
if extension == ".docx":
|
||||
return FileType.DOCX
|
||||
elif file:
|
||||
return _detect_filetype_from_octet_stream(file=file)
|
||||
else:
|
||||
return EXT_TO_FILETYPE.get(extension, FileType.UNK)
|
||||
|
||||
elif mime_type == "application/zip":
|
||||
filetype = FileType.UNK
|
||||
if file and not extension:
|
||||
if file:
|
||||
filetype = _detect_filetype_from_octet_stream(file=file)
|
||||
elif filename is not None:
|
||||
with open(filename, "rb") as f:
|
||||
@ -310,9 +309,9 @@ def detect_filetype(
|
||||
|
||||
extension = extension if extension else ""
|
||||
if filetype == FileType.UNK:
|
||||
return EXT_TO_FILETYPE.get(extension.lower(), FileType.ZIP)
|
||||
return FileType.ZIP
|
||||
else:
|
||||
return EXT_TO_FILETYPE.get(extension.lower(), filetype)
|
||||
return EXT_TO_FILETYPE.get(extension, filetype)
|
||||
|
||||
elif _is_code_mime_type(mime_type):
|
||||
# NOTE(robinson) - we'll treat all code files as plain text for now.
|
||||
|
@ -29,6 +29,7 @@ from unstructured.partition.pptx import partition_pptx
|
||||
from unstructured.partition.rst import partition_rst
|
||||
from unstructured.partition.rtf import partition_rtf
|
||||
from unstructured.partition.text import partition_text
|
||||
from unstructured.partition.tsv import partition_tsv
|
||||
from unstructured.partition.xlsx import partition_xlsx
|
||||
from unstructured.partition.xml import partition_xml
|
||||
|
||||
@ -211,6 +212,8 @@ def partition(
|
||||
elements = partition_xlsx(filename=filename, file=file)
|
||||
elif filetype == FileType.CSV:
|
||||
elements = partition_csv(filename=filename, file=file)
|
||||
elif filetype == FileType.TSV:
|
||||
elements = partition_tsv(filename=filename, file=file)
|
||||
elif filetype == FileType.EMPTY:
|
||||
elements = []
|
||||
else:
|
||||
|
53
unstructured/partition/tsv.py
Normal file
53
unstructured/partition/tsv.py
Normal file
@ -0,0 +1,53 @@
|
||||
from tempfile import SpooledTemporaryFile
|
||||
from typing import IO, BinaryIO, List, Optional, Union, cast
|
||||
|
||||
import lxml.html
|
||||
import pandas as pd
|
||||
|
||||
from unstructured.documents.elements import Element, ElementMetadata, Table
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
|
||||
|
||||
|
||||
@add_metadata_with_filetype(FileType.TSV)
def partition_tsv(
    filename: Optional[str] = None,
    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
) -> List[Element]:
    """Partitions TSV files into document elements.

    The input is parsed with pandas using a tab separator, rendered to an HTML table,
    and returned as a single ``Table`` element whose text is the text content of that
    HTML. Exactly one of ``filename`` and ``file`` must be supplied; this is checked
    with ``exactly_one`` before any parsing happens.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    metadata_filename
        The filename to use for the metadata. When given, it takes precedence over
        ``filename``; otherwise ``filename`` is recorded.
    include_metadata
        Determines whether or not metadata is included in the output. When False the
        element carries an empty ``ElementMetadata``.

    Returns
    -------
    A list containing one ``Table`` element.
    """
    exactly_one(filename=filename, file=file)

    if filename:
        table = pd.read_csv(filename, sep="\t")
    else:
        # Spooled temporary files are converted to a buffer pandas can read from.
        f = spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file))
        table = pd.read_csv(f, sep="\t")

    # index=False / header=False: only the data rows are rendered; the column labels
    # pandas derived while reading are not emitted. Missing cells become empty strings.
    html_text = table.to_html(index=False, header=False, na_rep="")
    text = lxml.html.document_fromstring(html_text).text_content()

    if include_metadata:
        metadata = ElementMetadata(
            text_as_html=html_text,
            # An explicitly supplied metadata_filename must win over the path that was
            # read; previously it was ignored whenever filename was also provided.
            filename=metadata_filename or filename,
        )
    else:
        metadata = ElementMetadata()

    return [Table(text=text, metadata=metadata)]
|
Loading…
x
Reference in New Issue
Block a user