| 
									
										
										
										
											2024-10-14 14:13:13 +02:00
										 |  |  | import logging | 
					
						
							|  |  |  | import time | 
					
						
							|  |  |  | from pathlib import Path | 
					
						
							| 
									
										
										
										
											2024-10-16 21:02:03 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-24 20:19:41 +02:00
										 |  |  | from docling_core.types.doc import ImageRefMode, PictureItem, TableItem | 
					
						
							| 
									
										
										
										
											2024-10-16 21:02:03 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | from docling.datamodel.base_models import FigureElement, InputFormat, Table | 
					
						
							|  |  |  | from docling.datamodel.pipeline_options import PdfPipelineOptions | 
					
						
							|  |  |  | from docling.document_converter import DocumentConverter, PdfFormatOption | 
					
						
							| 
									
										
										
										
											2024-10-14 14:13:13 +02:00
										 |  |  | 
 | 
					
						
# Module-level logger, named after this module.
_log = logging.getLogger(__name__)

# Scale factor assigned to PdfPipelineOptions.images_scale in main();
# a scale of 1 corresponds to a standard 72 DPI image.
IMAGE_RESOLUTION_SCALE = 2.0
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def main(): | 
					
						
							|  |  |  |     logging.basicConfig(level=logging.INFO) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-16 21:02:03 +02:00
										 |  |  |     input_doc_path = Path("./tests/data/2206.01062.pdf") | 
					
						
							|  |  |  |     output_dir = Path("scratch") | 
					
						
							| 
									
										
										
										
											2024-10-14 14:13:13 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Important: For operating with page images, we must keep them, otherwise the DocumentConverter | 
					
						
							|  |  |  |     # will destroy them for cleaning up memory. | 
					
						
							| 
									
										
										
										
											2024-10-16 21:02:03 +02:00
										 |  |  |     # This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images. | 
					
						
							| 
									
										
										
										
											2024-10-14 14:13:13 +02:00
										 |  |  |     # scale=1 correspond of a standard 72 DPI image | 
					
						
							| 
									
										
										
										
											2024-10-16 21:02:03 +02:00
										 |  |  |     # The PdfPipelineOptions.generate_* are the selectors for the document elements which will be enriched | 
					
						
							|  |  |  |     # with the image field | 
					
						
							|  |  |  |     pipeline_options = PdfPipelineOptions() | 
					
						
							|  |  |  |     pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE | 
					
						
							|  |  |  |     pipeline_options.generate_page_images = True | 
					
						
							|  |  |  |     pipeline_options.generate_picture_images = True | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     doc_converter = DocumentConverter( | 
					
						
							|  |  |  |         format_options={ | 
					
						
							|  |  |  |             InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2024-10-14 14:13:13 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     start_time = time.time() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-16 21:02:03 +02:00
										 |  |  |     conv_res = doc_converter.convert(input_doc_path) | 
					
						
							| 
									
										
										
										
											2024-10-14 14:13:13 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     output_dir.mkdir(parents=True, exist_ok=True) | 
					
						
							| 
									
										
										
										
											2024-10-16 21:02:03 +02:00
										 |  |  |     doc_filename = conv_res.input.file.stem | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Save page images | 
					
						
							|  |  |  |     for page_no, page in conv_res.document.pages.items(): | 
					
						
							|  |  |  |         page_no = page.page_no | 
					
						
							|  |  |  |         page_image_filename = output_dir / f"{doc_filename}-{page_no}.png" | 
					
						
							|  |  |  |         with page_image_filename.open("wb") as fp: | 
					
						
							|  |  |  |             page.image.pil_image.save(fp, format="PNG") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Save images of figures and tables | 
					
						
							|  |  |  |     table_counter = 0 | 
					
						
							|  |  |  |     picture_counter = 0 | 
					
						
							|  |  |  |     for element, _level in conv_res.document.iterate_items(): | 
					
						
							|  |  |  |         if isinstance(element, TableItem): | 
					
						
							|  |  |  |             table_counter += 1 | 
					
						
							| 
									
										
										
										
											2024-10-14 14:13:13 +02:00
										 |  |  |             element_image_filename = ( | 
					
						
							| 
									
										
										
										
											2024-10-16 21:02:03 +02:00
										 |  |  |                 output_dir / f"{doc_filename}-table-{table_counter}.png" | 
					
						
							| 
									
										
										
										
											2024-10-14 14:13:13 +02:00
										 |  |  |             ) | 
					
						
							|  |  |  |             with element_image_filename.open("wb") as fp: | 
					
						
							| 
									
										
										
										
											2024-11-19 16:28:52 +01:00
										 |  |  |                 element.get_image(conv_res.document).save(fp, "PNG") | 
					
						
							| 
									
										
										
										
											2024-10-14 14:13:13 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-16 21:02:03 +02:00
										 |  |  |         if isinstance(element, PictureItem): | 
					
						
							|  |  |  |             picture_counter += 1 | 
					
						
							|  |  |  |             element_image_filename = ( | 
					
						
							|  |  |  |                 output_dir / f"{doc_filename}-picture-{picture_counter}.png" | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |             with element_image_filename.open("wb") as fp: | 
					
						
							| 
									
										
										
										
											2024-11-19 16:28:52 +01:00
										 |  |  |                 element.get_image(conv_res.document).save(fp, "PNG") | 
					
						
							| 
									
										
										
										
											2024-10-14 14:13:13 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-24 20:19:41 +02:00
										 |  |  |     # Save markdown with embedded pictures | 
					
						
							|  |  |  |     md_filename = output_dir / f"{doc_filename}-with-images.md" | 
					
						
							| 
									
										
										
										
											2024-11-27 13:07:00 +01:00
										 |  |  |     conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Save markdown with externally referenced pictures | 
					
						
							|  |  |  |     md_filename = output_dir / f"{doc_filename}-with-image-refs.md" | 
					
						
							|  |  |  |     conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.REFERENCED) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Save HTML with externally referenced pictures | 
					
						
							|  |  |  |     html_filename = output_dir / f"{doc_filename}-with-image-refs.html" | 
					
						
							|  |  |  |     conv_res.document.save_as_html(html_filename, image_mode=ImageRefMode.REFERENCED) | 
					
						
							| 
									
										
										
										
											2024-10-24 20:19:41 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-14 14:13:13 +02:00
										 |  |  |     end_time = time.time() - start_time | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-16 21:02:03 +02:00
										 |  |  |     _log.info(f"Document converted and figures exported in {end_time:.2f} seconds.") | 
					
						
							| 
									
										
										
										
											2024-10-14 14:13:13 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()