mirror of
https://github.com/docling-project/docling.git
synced 2025-12-05 07:14:03 +00:00
* docs: Added documentation to use SuryaOCR via plugin `docling-surya` Signed-off-by: Harry Ho <kho7@student.umgc.edu> * Add PyPI link for docling-surya package Added a link to the PyPI page for docling-surya. Signed-off-by: Harry Ho <4719770+harrykhh@users.noreply.github.com> * Add licensing note for SuryaOCR integration Added important licensing note regarding SuryaOCR integration. Signed-off-by: Harry Ho <4719770+harrykhh@users.noreply.github.com> * Ran linter to reformat Signed-off-by: Harry Ho <4719770+harrykhh@users.noreply.github.com> --------- Signed-off-by: Harry Ho <kho7@student.umgc.edu> Signed-off-by: Harry Ho <4719770+harrykhh@users.noreply.github.com> Co-authored-by: Harry Ho <kho7@student.umgc.edu>
55 lines
1.9 KiB
Python
Vendored
55 lines
1.9 KiB
Python
Vendored
# Example: Integrating SuryaOCR with Docling for PDF OCR and Markdown Export
|
|
#
|
|
# Overview:
|
|
# - Configures SuryaOCR as the OCR engine via `SuryaOcrOptions`.
|
|
# - Executes PDF pipeline with SuryaOCR integration.
|
|
# - Models auto-download from Hugging Face on first run.
|
|
#
|
|
# Prerequisites:
|
|
# - Install: `pip install docling-surya`
|
|
# - Ensure `docling` imports successfully.
|
|
#
|
|
# Execution:
|
|
# - Run from repo root: `python docs/examples/suryaocr_with_custom_models.py`
|
|
# - Outputs Markdown to stdout.
|
|
#
|
|
# Notes:
|
|
# - Default source: EPA PDF URL; substitute with local path as needed.
|
|
# - Models cached in `~/.cache/huggingface`; override with HF_HOME env var.
|
|
# - On restricted networks, configure a proxy (e.g. the `HTTPS_PROXY` env var) so the models can be downloaded.
|
|
# - **Important Licensing Note**: The `docling-surya` package integrates SuryaOCR, which is licensed under the GNU General Public License (GPL).
|
|
# Using this integration may impose GPL obligations on your project. Review the license terms carefully.
|
|
|
|
# Requires `pip install docling-surya`
|
|
# See https://pypi.org/project/docling-surya/
|
|
from docling_surya import SuryaOcrOptions
|
|
|
|
from docling.datamodel.base_models import InputFormat
|
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
|
|
|
|
|
def main():
    """Convert the sample document with SuryaOCR and print it as Markdown.

    Builds PDF pipeline options with OCR enabled and SuryaOCR options for
    English, registers them for both PDF and image inputs, converts the
    sample URL, and writes the Markdown export to stdout.
    """
    document_source = "https://19january2021snapshot.epa.gov/sites/static/files/2016-02/documents/epa_sample_letter_sent_to_commissioners_dated_february_29_2015.pdf"

    # `docling-surya` is a separately installed plugin package, so external
    # plugins are enabled alongside the SuryaOCR-specific options.
    surya_options = SuryaOcrOptions(lang=["en"])
    ocr_pipeline = PdfPipelineOptions(
        do_ocr=True,
        ocr_model="suryaocr",
        allow_external_plugins=True,
        ocr_options=surya_options,
    )

    # PDF and image inputs each get their own format option, both backed by
    # the same pipeline options object.
    per_format = {
        input_format: PdfFormatOption(pipeline_options=ocr_pipeline)
        for input_format in (InputFormat.PDF, InputFormat.IMAGE)
    }
    doc_converter = DocumentConverter(format_options=per_format)

    conversion = doc_converter.convert(document_source)
    markdown_text = conversion.document.export_to_markdown()
    print(markdown_text)
|
|
|
|
|
|
# Script entry point: run the example only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|