mirror of
				https://github.com/docling-project/docling.git
				synced 2025-11-03 20:32:54 +00:00 
			
		
		
		
	feat: add convert_string to document-converter (#2069)
* feat: add convert_string to document-converter Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fix unsupported operand type(s) for |: type and NoneType Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added tests for convert_string Signed-off-by: Peter Staar <taa@zurich.ibm.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
		
							parent
							
								
									e2cca931be
								
							
						
					
					
						commit
						b09033cb73
					
				@ -5,7 +5,9 @@ import threading
 | 
			
		||||
import time
 | 
			
		||||
from collections.abc import Iterable, Iterator
 | 
			
		||||
from concurrent.futures import ThreadPoolExecutor
 | 
			
		||||
from datetime import datetime
 | 
			
		||||
from functools import partial
 | 
			
		||||
from io import BytesIO
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
from typing import Dict, List, Optional, Tuple, Type, Union
 | 
			
		||||
 | 
			
		||||
@ -275,6 +277,34 @@ class DocumentConverter:
 | 
			
		||||
                "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
    @validate_call(config=ConfigDict(strict=True))
    def convert_string(
        self,
        content: str,
        format: InputFormat,
        name: Optional[str] = None,
    ) -> ConversionResult:
        """Convert in-memory text by wrapping it in a named byte stream.

        The text is encoded as UTF-8, wrapped in a ``DocumentStream`` and
        passed to ``self.convert``.

        Args:
            content: The document text to convert.
            format: Format of ``content``. Only ``InputFormat.MD`` and
                ``InputFormat.HTML`` are accepted.
            name: Name given to the stream. When omitted or empty, a
                timestamp of the form ``YYYY-MM-DD_HH-MM-SS`` is used. The
                extension matching ``format`` is appended if the name does
                not already end with it.

        Returns:
            Whatever ``self.convert`` returns for the constructed stream.

        Raises:
            ValueError: If ``format`` is neither Markdown nor HTML.
        """
        # Both supported formats follow the same path and differ only in the
        # extension put on the stream name.
        # NOTE(review): presumably the extension lets the format be detected
        # from the name downstream - confirm.
        suffix_by_format = {
            InputFormat.MD: ".md",
            InputFormat.HTML: ".html",
        }
        suffix = suffix_by_format.get(format)
        if suffix is None:
            raise ValueError(f"format {format} is not supported in `convert_string`")

        name = name or datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        if not name.endswith(suffix):
            name += suffix

        doc_stream = DocumentStream(
            name=name,
            stream=BytesIO(content.encode("utf-8")),
        )
        return self.convert(doc_stream)
 | 
			
		||||
 | 
			
		||||
    def _convert(
 | 
			
		||||
        self, conv_input: _DocumentConversionInput, raises_on_error: bool
 | 
			
		||||
    ) -> Iterator[ConversionResult]:
 | 
			
		||||
 | 
			
		||||
@ -2,10 +2,19 @@ from pathlib import Path
 | 
			
		||||
 | 
			
		||||
from docling.backend.md_backend import MarkdownDocumentBackend
 | 
			
		||||
from docling.datamodel.base_models import InputFormat
 | 
			
		||||
from docling.datamodel.document import DoclingDocument, InputDocument
 | 
			
		||||
from docling.datamodel.document import (
 | 
			
		||||
    ConversionResult,
 | 
			
		||||
    DoclingDocument,
 | 
			
		||||
    InputDocument,
 | 
			
		||||
    SectionHeaderItem,
 | 
			
		||||
)
 | 
			
		||||
from docling.document_converter import DocumentConverter
 | 
			
		||||
from tests.verify_utils import CONFID_PREC, COORD_PREC
 | 
			
		||||
 | 
			
		||||
from .test_data_gen_flag import GEN_TEST_DATA
 | 
			
		||||
from .verify_utils import verify_document, verify_export
 | 
			
		||||
 | 
			
		||||
# Module-local alias of the GEN_TEST_DATA flag imported from test_data_gen_flag.
GENERATE = GEN_TEST_DATA
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_convert_valid():
 | 
			
		||||
@ -54,3 +63,45 @@ def test_convert_valid():
 | 
			
		||||
            if in_path.stem in yaml_filter:
 | 
			
		||||
                exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
 | 
			
		||||
                assert act_doc == exp_doc, f"export to yaml failed on {in_path}"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_md_paths():
    """Return the sorted Markdown files found under the docling_v2 ground-truth tree."""
    groundtruth_root = Path("./tests/groundtruth/docling_v2")
    # rglob also descends into subdirectories; sorting fixes the order.
    return sorted(groundtruth_root.rglob("*.md"))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_converter():
    """Create a ``DocumentConverter`` whose allowed formats contain only Markdown."""
    return DocumentConverter(allowed_formats=[InputFormat.MD])
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_e2e_md_conversions():
    """Round-trip each ground-truth Markdown file through both conversion entry points.

    For every file returned by ``get_md_paths`` the Markdown is converted once
    from its path (``convert``) and once from its text (``convert_string``).
    Exporting either resulting document back to Markdown must reproduce the
    file's text exactly.
    """
    md_paths = get_md_paths()
    # Without this guard the loop below would pass vacuously when no files
    # are found (for example when run from a different working directory).
    assert md_paths, "no markdown ground-truth files found"

    converter = get_converter()

    for md_path in md_paths:
        # Decode explicitly as UTF-8: convert_string encodes the text as
        # UTF-8, so relying on the platform default encoding here could make
        # the comparison fail on non-UTF-8 locales.
        true_md = md_path.read_text(encoding="utf-8")

        # Path-based conversion.
        conv_result: ConversionResult = converter.convert(md_path)
        doc: DoclingDocument = conv_result.document
        pred_md: str = doc.export_to_markdown()
        assert true_md == pred_md, f"markdown round-trip via convert failed on {md_path}"

        # String-based conversion. An explicit name keeps the in-memory
        # stream's name deterministic instead of timestamp-derived.
        conv_result_: ConversionResult = converter.convert_string(
            true_md, format=InputFormat.MD, name=md_path.stem
        )
        doc_: DoclingDocument = conv_result_.document
        pred_md_: str = doc_.export_to_markdown()
        assert true_md == pred_md_, (
            f"markdown round-trip via convert_string failed on {md_path}"
        )
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user