docling/docs/examples/run_with_formats.py
Peter W. J. Staar f542460af3
fix: fix duplicate title and heading + add e2e tests for html and docx (#186)
* add real e2e tests for html and docx

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* updated the output of itxt

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* reformatted the text

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fixed the tests

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fixed the tests (2)

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fixed the examples (1)

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fixed the output of the test

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* updated the tests, moved the ground-truth

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* moved the ground-truth data

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fixed the html tests

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* restructure title fix (#187)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
2024-10-30 13:14:56 +01:00

82 lines
2.6 KiB
Python

import json
import logging
from pathlib import Path
import yaml
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
DocumentConverter,
PdfFormatOption,
WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
_log = logging.getLogger(__name__)
def main():
input_paths = [
Path("README.md"),
Path("tests/data/html/wiki_duck.html"),
Path("tests/data/docx/word_sample.docx"),
Path("tests/data/docx/lorem_ipsum.docx"),
Path("tests/data/pptx/powerpoint_sample.pptx"),
Path("tests/data/2305.03393v1-pg9-img.png"),
Path("tests/data/2206.01062.pdf"),
Path("tests/data/test_01.asciidoc"),
Path("tests/data/test_01.asciidoc"),
]
## for defaults use:
# doc_converter = DocumentConverter()
## to customize use:
doc_converter = (
DocumentConverter( # all of the below is optional, has internal defaults.
allowed_formats=[
InputFormat.PDF,
InputFormat.IMAGE,
InputFormat.DOCX,
InputFormat.HTML,
InputFormat.PPTX,
InputFormat.ASCIIDOC,
InputFormat.MD,
], # whitelist formats, non-matching files are ignored.
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
),
InputFormat.DOCX: WordFormatOption(
pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
),
},
)
)
conv_results = doc_converter.convert_all(input_paths)
for res in conv_results:
out_path = Path("scratch")
print(
f"Document {res.input.file.name} converted."
f"\nSaved markdown output to: {str(out_path)}"
)
_log.debug(res.document._export_to_indented_text(max_text_len=16))
# Export Docling document format to markdowndoc:
with (out_path / f"{res.input.file.stem}.md").open("w") as fp:
fp.write(res.document.export_to_markdown())
with (out_path / f"{res.input.file.stem}.json").open("w") as fp:
fp.write(json.dumps(res.document.export_to_dict()))
with (out_path / f"{res.input.file.stem}.yaml").open("w") as fp:
fp.write(yaml.safe_dump(res.document.export_to_dict()))
if __name__ == "__main__":
main()