# Mirror of https://github.com/docling-project/docling.git
# Synced 2025-09-30 11:35:20 +00:00 · 36 lines · 916 B · Python
import logging
|
||
|
from pathlib import Path
|
||
|
|
||
|
from docling.datamodel.base_models import InputFormat
|
||
|
from docling.datamodel.pipeline_options import ConvertPipelineOptions
|
||
|
from docling.document_converter import (
|
||
|
DocumentConverter,
|
||
|
HTMLFormatOption,
|
||
|
WordFormatOption,
|
||
|
)
|
||
|
|
||
|
# Module-level logger named after this module; nothing in the visible code logs through it yet.
_log = logging.getLogger(__name__)
|
||
|
|
||
|
|
||
|
def main():
    """Convert the sample Word document and print it as Markdown.

    Picture classification and picture description are both switched on in
    a single ``ConvertPipelineOptions`` instance, which is shared by the
    DOCX and HTML format options handed to the converter. The sample file
    ``tests/data/docx/word_sample.docx`` is then converted and the Markdown
    export of the resulting document is written to standard output.
    """
    # One options object, reused for every configured input format.
    enrichment_options = ConvertPipelineOptions()
    enrichment_options.do_picture_classification = True
    enrichment_options.do_picture_description = True

    per_format_options = {
        InputFormat.DOCX: WordFormatOption(pipeline_options=enrichment_options),
        InputFormat.HTML: HTMLFormatOption(pipeline_options=enrichment_options),
    }
    converter = DocumentConverter(format_options=per_format_options)

    sample_docx = Path("tests/data/docx/word_sample.docx")
    conversion = converter.convert(sample_docx)

    markdown_text = conversion.document.export_to_markdown()
    print(markdown_text)
|
||
|
|
||
|
|
||
|
# Run the example only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|