docling/docs/examples/enrich_simple_pipeline.py
Michele Dolfi 2c9123419f
feat: enrichment steps on all convert pipelines (incl docx, html, etc) (#2251)
* allow enrichment on all convert pipelines

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* set options in CLI

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2025-09-11 15:09:00 +02:00

36 lines
916 B
Python
Vendored

import logging
from pathlib import Path
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ConvertPipelineOptions
from docling.document_converter import (
DocumentConverter,
HTMLFormatOption,
WordFormatOption,
)
# Module-level logger named after this module; nothing in the code visible
# here logs through it.
_log = logging.getLogger(__name__)
def main():
    """Convert the sample Word document and print its Markdown export.

    A single ``ConvertPipelineOptions`` instance has its
    ``do_picture_classification`` and ``do_picture_description`` flags
    switched on and is handed to both the DOCX and the HTML format options
    of the converter. The input path is relative, so it is resolved against
    the current working directory.
    """
    # One options object is shared by both format options below.
    enrich_opts = ConvertPipelineOptions()
    enrich_opts.do_picture_classification = True
    enrich_opts.do_picture_description = True

    docx_option = WordFormatOption(pipeline_options=enrich_opts)
    html_option = HTMLFormatOption(pipeline_options=enrich_opts)
    converter = DocumentConverter(
        format_options={
            InputFormat.DOCX: docx_option,
            InputFormat.HTML: html_option,
        },
    )

    source = Path("tests/data/docx/word_sample.docx")
    result = converter.convert(source)
    markdown = result.document.export_to_markdown()
    print(markdown)
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()