| 
									
										
										
										
											2024-10-14 14:13:13 +02:00
										 |  |  | import logging | 
					
						
							|  |  |  | import time | 
					
						
							|  |  |  | from pathlib import Path | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import pandas as pd | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from docling.document_converter import DocumentConverter | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | _log = logging.getLogger(__name__) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def main(): | 
					
						
							|  |  |  |     logging.basicConfig(level=logging.INFO) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-07 08:43:31 +01:00
										 |  |  |     input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") | 
					
						
							| 
									
										
										
										
											2024-10-16 21:02:03 +02:00
										 |  |  |     output_dir = Path("scratch") | 
					
						
							| 
									
										
										
										
											2024-10-14 14:13:13 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     doc_converter = DocumentConverter() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     start_time = time.time() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-16 21:02:03 +02:00
										 |  |  |     conv_res = doc_converter.convert(input_doc_path) | 
					
						
							| 
									
										
										
										
											2024-10-14 14:13:13 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     output_dir.mkdir(parents=True, exist_ok=True) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-16 21:02:03 +02:00
										 |  |  |     doc_filename = conv_res.input.file.stem | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Export tables | 
					
						
							|  |  |  |     for table_ix, table in enumerate(conv_res.document.tables): | 
					
						
							|  |  |  |         table_df: pd.DataFrame = table.export_to_dataframe() | 
					
						
							|  |  |  |         print(f"## Table {table_ix}") | 
					
						
							|  |  |  |         print(table_df.to_markdown()) | 
					
						
							| 
									
										
										
										
											2024-10-14 14:13:13 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-16 21:02:03 +02:00
										 |  |  |         # Save the table as csv | 
					
						
							| 
									
										
										
										
											2025-04-14 18:01:26 +02:00
										 |  |  |         element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.csv" | 
					
						
							| 
									
										
										
										
											2024-10-16 21:02:03 +02:00
										 |  |  |         _log.info(f"Saving CSV table to {element_csv_filename}") | 
					
						
							|  |  |  |         table_df.to_csv(element_csv_filename) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # Save the table as html | 
					
						
							| 
									
										
										
										
											2025-04-14 18:01:26 +02:00
										 |  |  |         element_html_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.html" | 
					
						
							| 
									
										
										
										
											2024-10-16 21:02:03 +02:00
										 |  |  |         _log.info(f"Saving HTML table to {element_html_filename}") | 
					
						
							|  |  |  |         with element_html_filename.open("w") as fp: | 
					
						
							| 
									
										
										
										
											2025-04-14 08:41:50 +02:00
										 |  |  |             fp.write(table.export_to_html(doc=conv_res.document)) | 
					
						
							| 
									
										
										
										
											2024-10-16 21:02:03 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     end_time = time.time() - start_time | 
					
						
							| 
									
										
										
										
											2024-10-14 14:13:13 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-16 21:02:03 +02:00
										 |  |  |     _log.info(f"Document converted and tables exported in {end_time:.2f} seconds.") | 
					
						
							| 
									
										
										
										
											2024-10-14 14:13:13 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | if __name__ == "__main__": | 
					
						
							|  |  |  |     main() |