from pathlib import Path from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import ( PdfPipelineOptions, TesseractCliOcrOptions, ) from docling.document_converter import DocumentConverter, PdfFormatOption def main(): data_folder = Path(__file__).parent / "../../tests/data" input_doc_path = data_folder / "pdf/2206.01062.pdf" pipeline_options = PdfPipelineOptions() pipeline_options.do_ocr = True pipeline_options.do_table_structure = True pipeline_options.table_structure_options.do_cell_matching = True # Any of the OCR options can be used:EasyOcrOptions, TesseractOcrOptions, TesseractCliOcrOptions, OcrMacOptions(Mac only), RapidOcrOptions # ocr_options = EasyOcrOptions(force_full_page_ocr=True) # ocr_options = TesseractOcrOptions(force_full_page_ocr=True) # ocr_options = OcrMacOptions(force_full_page_ocr=True) # ocr_options = RapidOcrOptions(force_full_page_ocr=True) ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True) pipeline_options.ocr_options = ocr_options converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption( pipeline_options=pipeline_options, ) } ) doc = converter.convert(input_doc_path).document md = doc.export_to_markdown() print(md) if __name__ == "__main__": main()