mirror of
https://github.com/docling-project/docling.git
synced 2025-09-09 09:11:03 +00:00

* feat: exploring new version * DCO Remediation Commit for Georg Heiler <georg.kf.heiler@gmail.com> I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 5815c8f81b0e5ce400332597b6795e5a97ecf775 Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> * chore: autoformat DCO Remediation Commit for Georg Heiler <georg.kf.heiler@gmail.com> I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 5815c8f81b0e5ce400332597b6795e5a97ecf775 * feat: enable configurable runtime for rapidocr and handle new result better; Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> * chore: fix linter Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> * chore: use new server model * chore: change default engine type to onnx * chore: tests update for new rapidocr * fix: rebase from main and fix clashes * DCO Remediation Commit for Georg Heiler <georg.kf.heiler@gmail.com> I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 5815c8f81b0e5ce400332597b6795e5a97ecf775 I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 02f9db85f562e5cdfda40c52fee55cfd4030d70a I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: a7bcb205faedb881f94a89b3bbd29cb31ccd54f0 I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: a39482a98cbcff7a825c8321134732af0c65930a I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 63e9d717fa26951566b02761f3fdfc752c31f805 I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: ef12a6ec1ea2846a8a8e2e776eeaa59c2a0c4dfe Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> * DCO Remediation Commit for Georg Heiler <georg.kf.heiler@gmail.com> I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 2222d2340387f8d9d66f3ca9d8e21a0945a44e7a I, Georg Heiler 
<georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: bc6a1dc507d7f146ec4797a2d3840414f46ac64d I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 56e0d67da7c57d4b5caf8eaef8dff7056c3efd32 I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 871ca21271412006c76acf3c19426140efed3d50 I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 7b1b77159da729d483a581a86c7309acba1712a7 I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: a792a714a43e19a91b2b782f54621c1c5efda632 Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> * DCO Remediation Commit for Georg Heiler <georg.kf.heiler@gmail.com> I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: d1fed26323ff829b716bc667fe69532839363e45 I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 346ec1cad943765f886e5d17fb0a54221124689c I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 4d0bbe5bd6e9f7261b97362ff8823af244267089 I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 34a5ad53892a7064a6bf35f890d344d464c78b2f I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 9151959db3ad53535011d1cfdcf9181fdf936bb1 I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 8ef5536f2c098826c6c0a05190f8a80614c3f3cb Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> * DCO Remediation Commit for Georg Heiler <georg.kf.heiler@gmail.com> I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 7e18637a35c6786c90bc41b40607404f4b084b45 I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 63fb8ff599035186aba2d958fbaec32739e92d01 I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by 
to this commit: 0cb9444fb89b978e456dcf607815d7a8416c1ffa I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 38940d9978c5c18bd7fbffb8170f1b1a90680b94 I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: b6d461ac427ebc8b814a7e1d0a452a4ac8a374af I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: ee55eb3408ed5decb5324ec441e166e180512cf4 Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> --------- Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>
59 lines
1.6 KiB
Python
Vendored
59 lines
1.6 KiB
Python
Vendored
import os
|
|
|
|
from modelscope import snapshot_download
|
|
|
|
from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions
|
|
from docling.document_converter import (
|
|
ConversionResult,
|
|
DocumentConverter,
|
|
InputFormat,
|
|
PdfFormatOption,
|
|
)
|
|
|
|
|
|
def main():
    """Convert a sample PDF to Markdown with RapidOCR using explicit ONNX models.

    The function downloads the ``RapidAI/RapidOCR`` model snapshot through
    ModelScope's ``snapshot_download``, builds the paths to the detection,
    recognition and classification ONNX files inside that snapshot, passes
    them to docling's PDF pipeline via ``RapidOcrOptions``, converts the
    source document and prints the resulting Markdown to stdout.
    """
    # Fetch the model snapshot first; the call returns the local directory
    # that the model paths below are resolved against.
    print("Downloading RapidOCR models")
    models_root = snapshot_download(repo_id="RapidAI/RapidOCR")

    # All three models live under the snapshot's "onnx" sub-directory.
    onnx_root = os.path.join(models_root, "onnx")

    # Detection and recognition use the PP-OCRv5 files, while the classifier
    # is taken from the PP-OCRv4 directory.
    rapidocr_options = RapidOcrOptions(
        det_model_path=os.path.join(
            onnx_root, "PP-OCRv5", "det", "ch_PP-OCRv5_server_det.onnx"
        ),
        rec_model_path=os.path.join(
            onnx_root, "PP-OCRv5", "rec", "ch_PP-OCRv5_rec_server_infer.onnx"
        ),
        cls_model_path=os.path.join(
            onnx_root, "PP-OCRv4", "cls", "ch_ppocr_mobile_v2.0_cls_infer.onnx"
        ),
    )

    # Wire the OCR options into a PDF-specific converter configuration.
    pdf_format_option = PdfFormatOption(
        pipeline_options=PdfPipelineOptions(ocr_options=rapidocr_options),
    )
    converter = DocumentConverter(
        format_options={InputFormat.PDF: pdf_format_option},
    )

    # Document to convert.
    pdf_url = "https://arxiv.org/pdf/2408.09869v4"
    result: ConversionResult = converter.convert(source=pdf_url)

    # Emit the converted document as Markdown.
    print(result.document.export_to_markdown())
|
|
|
|
|
|
# Run the example only when this file is executed as a script, so that
# importing the module does not trigger the model download and conversion.
if __name__ == "__main__":
    main()
|