mirror of
				https://github.com/docling-project/docling.git
				synced 2025-11-04 12:53:05 +00:00 
			
		
		
		
	feat: add convert_string to document-converter (#2069)
* feat: add convert_string to document-converter
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* fix unsupported operand type(s) for |: type and NoneType
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* added tests for convert_string
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
		
							parent
							
								
									e2cca931be
								
							
						
					
					
						commit
						b09033cb73
					
				@ -5,7 +5,9 @@ import threading
 | 
				
			|||||||
import time
 | 
					import time
 | 
				
			||||||
from collections.abc import Iterable, Iterator
 | 
					from collections.abc import Iterable, Iterator
 | 
				
			||||||
from concurrent.futures import ThreadPoolExecutor
 | 
					from concurrent.futures import ThreadPoolExecutor
 | 
				
			||||||
 | 
					from datetime import datetime
 | 
				
			||||||
from functools import partial
 | 
					from functools import partial
 | 
				
			||||||
 | 
					from io import BytesIO
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from typing import Dict, List, Optional, Tuple, Type, Union
 | 
					from typing import Dict, List, Optional, Tuple, Type, Union
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -275,6 +277,34 @@ class DocumentConverter:
 | 
				
			|||||||
                "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
 | 
					                "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @validate_call(config=ConfigDict(strict=True))
					    def convert_string(
					        self,
					        content: str,
					        format: InputFormat,
					        name: Optional[str] = None,
					    ) -> ConversionResult:
					        """Convert in-memory Markdown or HTML text.

					        The text is UTF-8 encoded, wrapped in a ``DocumentStream`` and passed
					        to ``self.convert``.

					        Args:
					            content: Source text of the document.
					            format: Format of ``content``. Only ``InputFormat.MD`` and
					                ``InputFormat.HTML`` are accepted.
					            name: Name given to the stream. When omitted or empty, the current
					                local time formatted as ``%Y-%m-%d_%H-%M-%S`` is used. The
					                extension matching ``format`` (``.md`` / ``.html``) is appended
					                unless the name already ends with it.

					        Returns:
					            The result of ``self.convert`` for the generated stream.

					        Raises:
					            ValueError: If ``format`` is neither Markdown nor HTML.
					        """
					        # Reject unsupported formats before doing any other work.
					        if format == InputFormat.MD:
					            suffix = ".md"
					        elif format == InputFormat.HTML:
					            suffix = ".html"
					        else:
					            raise ValueError(f"format {format} is not supported in `convert_string`")

					        name = name or datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
					        # NOTE(review): the suffix presumably lets `convert` recognise the
					        # format from the stream name -- confirm against the format detection.
					        if not name.endswith(suffix):
					            name += suffix

					        buff = BytesIO(content.encode("utf-8"))
					        doc_stream = DocumentStream(name=name, stream=buff)
					        return self.convert(doc_stream)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _convert(
 | 
					    def _convert(
 | 
				
			||||||
        self, conv_input: _DocumentConversionInput, raises_on_error: bool
 | 
					        self, conv_input: _DocumentConversionInput, raises_on_error: bool
 | 
				
			||||||
    ) -> Iterator[ConversionResult]:
 | 
					    ) -> Iterator[ConversionResult]:
 | 
				
			||||||
 | 
				
			|||||||
@ -2,10 +2,19 @@ from pathlib import Path
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
from docling.backend.md_backend import MarkdownDocumentBackend
 | 
					from docling.backend.md_backend import MarkdownDocumentBackend
 | 
				
			||||||
from docling.datamodel.base_models import InputFormat
 | 
					from docling.datamodel.base_models import InputFormat
 | 
				
			||||||
from docling.datamodel.document import DoclingDocument, InputDocument
 | 
					from docling.datamodel.document import (
 | 
				
			||||||
 | 
					    ConversionResult,
 | 
				
			||||||
 | 
					    DoclingDocument,
 | 
				
			||||||
 | 
					    InputDocument,
 | 
				
			||||||
 | 
					    SectionHeaderItem,
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					from docling.document_converter import DocumentConverter
 | 
				
			||||||
from tests.verify_utils import CONFID_PREC, COORD_PREC
 | 
					from tests.verify_utils import CONFID_PREC, COORD_PREC
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .test_data_gen_flag import GEN_TEST_DATA
 | 
					from .test_data_gen_flag import GEN_TEST_DATA
 | 
				
			||||||
 | 
					from .verify_utils import verify_document, verify_export
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					GENERATE = GEN_TEST_DATA
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_convert_valid():
 | 
					def test_convert_valid():
 | 
				
			||||||
@ -54,3 +63,45 @@ def test_convert_valid():
 | 
				
			|||||||
            if in_path.stem in yaml_filter:
 | 
					            if in_path.stem in yaml_filter:
 | 
				
			||||||
                exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
 | 
					                exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
 | 
				
			||||||
                assert act_doc == exp_doc, f"export to yaml failed on {in_path}"
 | 
					                assert act_doc == exp_doc, f"export to yaml failed on {in_path}"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def get_md_paths():
					    """Return the sorted ``*.md`` paths found recursively under the ground-truth directory."""
					    return sorted(Path("./tests/groundtruth/docling_v2").rglob("*.md"))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def get_converter():
					    """Build a ``DocumentConverter`` whose allowed formats are limited to Markdown."""
					    return DocumentConverter(allowed_formats=[InputFormat.MD])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_e2e_md_conversions():
					    """Round-trip each ground-truth Markdown file through the converter.

					    Every file is converted twice, once from its path via ``convert`` and
					    once from its text via ``convert_string``. In both cases the Markdown
					    exported from the resulting document must equal the file content.
					    """
					    md_paths = get_md_paths()
					    # Without this guard the loop below would pass vacuously when the
					    # ground-truth directory cannot be found from the working directory.
					    assert md_paths, "no Markdown ground-truth files found"

					    converter = get_converter()

					    for md_path in md_paths:
					        # Read as UTF-8 explicitly: convert_string encodes the text as UTF-8,
					        # so the platform default encoding must not be used here.
					        true_md = md_path.read_text(encoding="utf-8")

					        conv_result: ConversionResult = converter.convert(md_path)
					        doc: DoclingDocument = conv_result.document
					        pred_md: str = doc.export_to_markdown()
					        assert true_md == pred_md, f"path round-trip mismatch for {md_path}"

					        conv_result_: ConversionResult = converter.convert_string(
					            true_md, format=InputFormat.MD
					        )
					        doc_: DoclingDocument = conv_result_.document
					        pred_md_: str = doc_.export_to_markdown()
					        assert true_md == pred_md_, f"string round-trip mismatch for {md_path}"
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user