docling/docs/examples/export_tables.py

52 lines
1.4 KiB
Python
Raw Normal View History

import logging
import time
from pathlib import Path
import pandas as pd
from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)
def main():
logging.basicConfig(level=logging.INFO)
fix: Test cases for RTL programmatic PDFs and fixes for the formula model (#903) fix: Support for RTL programmatic documents fix(parser): detect and handle rotated pages fix(parser): fix bug causing duplicated text fix(formula): improve stopping criteria chore: update lock file fix: temporary constrain beautifulsoup * switch to code formula model v1.0.1 and new test pdf Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * switch to code formula model v1.0.1 and new test pdf Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * cleaned up the data folder in the tests Signed-off-by: Peter Staar <taa@zurich.ibm.com> * switch to code formula model v1.0.1 and new test pdf Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * added three test-files for right-to-left Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fix black Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * added new gt for test_e2e_conversion Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * added new gt for test_e2e_conversion Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * Add code to expose text direction of cell Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * new test file Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * update lock Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix mypy reports Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix example filepaths Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add test data results Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * pin wheel of latest docling-parse release Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use latest docling-core Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * remove debugging code Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix path to files in example Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Revert unwanted RTL additions Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix test data paths in examples Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> Signed-off-by: Peter Staar <taa@zurich.ibm.com> Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> Co-authored-by: Peter Staar <taa@zurich.ibm.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
2025-02-07 08:43:31 +01:00
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
output_dir = Path("scratch")
doc_converter = DocumentConverter()
start_time = time.time()
conv_res = doc_converter.convert(input_doc_path)
output_dir.mkdir(parents=True, exist_ok=True)
doc_filename = conv_res.input.file.stem
# Export tables
for table_ix, table in enumerate(conv_res.document.tables):
table_df: pd.DataFrame = table.export_to_dataframe()
print(f"## Table {table_ix}")
print(table_df.to_markdown())
# Save the table as csv
element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
_log.info(f"Saving CSV table to {element_csv_filename}")
table_df.to_csv(element_csv_filename)
# Save the table as html
element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html"
_log.info(f"Saving HTML table to {element_html_filename}")
with element_html_filename.open("w") as fp:
fp.write(table.export_to_html(doc=conv_res.document))
end_time = time.time() - start_time
_log.info(f"Document converted and tables exported in {end_time:.2f} seconds.")
if __name__ == "__main__":
main()