docling/docs/examples/enrich_simple_pipeline.py

36 lines
916 B
Python
Raw Normal View History

import logging
from pathlib import Path
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ConvertPipelineOptions
from docling.document_converter import (
DocumentConverter,
HTMLFormatOption,
WordFormatOption,
)
_log = logging.getLogger(__name__)
def main():
input_path = Path("tests/data/docx/word_sample.docx")
pipeline_options = ConvertPipelineOptions()
pipeline_options.do_picture_classification = True
pipeline_options.do_picture_description = True
doc_converter = DocumentConverter(
format_options={
InputFormat.DOCX: WordFormatOption(pipeline_options=pipeline_options),
InputFormat.HTML: HTMLFormatOption(pipeline_options=pipeline_options),
},
)
res = doc_converter.convert(input_path)
print(res.document.export_to_markdown())
if __name__ == "__main__":
main()