mirror of
https://github.com/docling-project/docling.git
synced 2025-06-27 05:20:05 +00:00

* add coverage calculation and push Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * new codecov version and usage of token Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * enable ruff formatter instead of black and isort Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * apply ruff lint fixes Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * apply ruff unsafe fixes Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add removed imports Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * runs 1 on linter issues Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * finalize linter fixes Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Update pyproject.toml Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> --------- Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
52 lines
1.4 KiB
Python
52 lines
1.4 KiB
Python
import logging
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
|
|
from docling.document_converter import DocumentConverter
|
|
|
|
# Module-level logger named after this module; used by main() for progress messages.
_log = logging.getLogger(__name__)
|
|
|
|
|
|
def main():
    """Convert a sample PDF and export every table found in it.

    The document at ``./tests/data/pdf/2206.01062.pdf`` is converted with a
    default ``DocumentConverter``. Each table in the converted document is
    printed to stdout as Markdown and written to the ``scratch`` directory
    (created if missing) as both a CSV file and an HTML file. The elapsed
    wall-clock time for conversion plus export is logged at INFO level.

    Note: the heading printed on stdout uses the zero-based table index,
    while the output file names use a one-based index.
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
    output_dir = Path("scratch")

    doc_converter = DocumentConverter()

    # The timer covers both the conversion and the export loop below.
    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)

    doc_filename = conv_res.input.file.stem

    # Export tables
    for table_ix, table in enumerate(conv_res.document.tables):
        table_df: pd.DataFrame = table.export_to_dataframe()
        print(f"## Table {table_ix}")
        print(table_df.to_markdown())

        # Save the table as csv
        element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.csv"
        _log.info(f"Saving CSV table to {element_csv_filename}")
        table_df.to_csv(element_csv_filename)

        # Save the table as html
        element_html_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.html"
        _log.info(f"Saving HTML table to {element_html_filename}")
        # Write with an explicit encoding: the platform default (e.g. cp1252
        # on Windows) can fail on, or mis-encode, non-ASCII table text.
        with element_html_filename.open("w", encoding="utf-8") as fp:
            fp.write(table.export_to_html(doc=conv_res.document))

    elapsed = time.time() - start_time

    _log.info(f"Document converted and tables exported in {elapsed:.2f} seconds.")
|
|
|
|
|
|
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|