import logging from pathlib import Path from docling_core.types.doc import PictureItem from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import ( PdfPipelineOptions, granite_picture_description, smolvlm_picture_description, ) from docling.document_converter import DocumentConverter, PdfFormatOption def main(): logging.basicConfig(level=logging.INFO) input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") pipeline_options = PdfPipelineOptions() pipeline_options.do_picture_description = True pipeline_options.picture_description_options = smolvlm_picture_description # pipeline_options.picture_description_options = granite_picture_description pipeline_options.picture_description_options.prompt = ( "Describe the image in three sentences. Be consise and accurate." ) doc_converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption( pipeline_options=pipeline_options, ) } ) result = doc_converter.convert(input_doc_path) for element, _level in result.document.iterate_items(): if isinstance(element, PictureItem): print( f"Picture {element.self_ref}\n" f"Caption: {element.caption_text(doc=result.document)}\n" f"Annotations: {element.annotations}" ) if __name__ == "__main__": main()