docling/tests/test_backend_csv.py

import json
import os
from pathlib import Path

from pytest import warns

from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult, DoclingDocument
from docling.document_converter import DocumentConverter

from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document, verify_export

# Forwarded as ``generate=`` to verify_document in the end-to-end test below.
# NOTE(review): presumably switches the check from comparing against the
# ground-truth files to regenerating them -- confirm in verify_utils.
GENERATE = GEN_TEST_DATA


def get_csv_paths():
    """Collect every CSV fixture below ``./tests/data/csv/``.

    The directory is given relative to the current working directory, so the
    result depends on where the test run is started from.

    Returns:
        A sorted list of ``Path`` objects, one per ``*.csv`` file found in the
        directory or, recursively, in any of its subdirectories.
    """
    # Plain literal: no interpolation is needed for this fixed location.
    directory = Path("./tests/data/csv/")

    # Sorting gives a deterministic processing order for the fixtures.
    return sorted(directory.rglob("*.csv"))


def get_csv_path(name: str) -> Path:
    """Build the path of a single CSV fixture from its base name.

    Args:
        name: Fixture file name without the ``.csv`` extension.

    Returns:
        The path ``./tests/data/csv/<name>.csv``. The file is not checked for
        existence.
    """
    fixture_location = f"./tests/data/csv/{name}.csv"
    return Path(fixture_location)


def get_converter():
    """Create the document converter used by the CSV tests.

    Returns:
        A ``DocumentConverter`` constructed with ``InputFormat.CSV`` as its
        only allowed format.
    """
    csv_only = [InputFormat.CSV]
    return DocumentConverter(allowed_formats=csv_only)


def test_e2e_valid_csv_conversions():
    """Convert every CSV fixture and check its exports against ground truth.

    For each file returned by ``get_csv_paths`` the Markdown export, the
    indented-text export and the document itself are verified against the
    ``.md``, ``.itxt`` and ``.json`` files named after the CSV file inside
    ``groundtruth/docling_v2``, two directory levels above the CSV file.
    """
    csv_files = get_csv_paths()
    csv_converter = get_converter()

    for csv_path in csv_files:
        print(f"converting {csv_path}")

        # Ground-truth files reuse the full CSV file name plus an extra suffix.
        gt_dir = csv_path.parent.parent / "groundtruth" / "docling_v2"
        gt_prefix = str(gt_dir / csv_path.name)

        result: ConversionResult = csv_converter.convert(csv_path)
        document: DoclingDocument = result.document

        markdown: str = document.export_to_markdown()
        assert verify_export(markdown, gt_prefix + ".md"), "export to md"

        indented_text: str = document._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        assert verify_export(
            indented_text, gt_prefix + ".itxt"
        ), "export to indented-text"

        assert verify_document(
            pred_doc=document,
            gtfile=gt_prefix + ".json",
            generate=GENERATE,
        ), "export to json"


def test_e2e_invalid_csv_conversions():
    """Check that malformed CSV fixtures raise a column-length warning.

    Each fixture is converted inside ``pytest.warns`` and must emit a
    ``UserWarning`` whose message matches "Inconsistent column lengths".
    """
    # Resolve all fixture paths up front, before the converter is created.
    malformed_fixtures = [
        get_csv_path(fixture_name)
        for fixture_name in (
            "csv-too-few-columns",
            "csv-too-many-columns",
            "csv-inconsistent-header",
        )
    ]
    converter = get_converter()

    for csv_path in malformed_fixtures:
        print(f"converting {csv_path}")
        with warns(UserWarning, match="Inconsistent column lengths"):
            converter.convert(csv_path)
feat: Add support for CSV input with new backend to transform CSV files to DoclingDocument (#945) * feat: Implement csv backend and format detection Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> * test: Implement csv parsing and format tests Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> * docs: Add example and CSV format documentation Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> * feat: Add support for various CSV dialects and update documentation Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> * feat: Add validation for delimiters and tests for inconsistent csv files Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> --------- Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> 2025-02-14 15:55:09 +08:00			`import json`
			`import os`
			`from pathlib import Path`

			`from pytest import warns`

			`from docling.datamodel.base_models import InputFormat`
			`from docling.datamodel.document import ConversionResult, DoclingDocument`
			`from docling.document_converter import DocumentConverter`

feat: Add DoclingParseV4 backend, using high-level docling-parse API (#905) * Add DoclingParseV3 backend implementation Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Use docling-core with docling-parse types Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes and test updates Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix streams Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix streams Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Reset tests Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * update test cases Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * update test units Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add back DoclingParse v1 backend, pipeline options Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update locks Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: update docling-core to 2.22.0 Update dependency library docling-core to latest release 2.22.0 Fix regression tests and ground truth files Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * Ground-truth files updated Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update tests, use TextCell.from_ocr property Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Text fixes, new test data Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Rename docling backend to v4 Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Test all backends, fixes Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Reset all tests to use docling-parse v1 for now Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes for DPv4 backend init, better test coverage Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * test_input_doc use default backend Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Co-authored-by: Cesar Berrospi Ramis 
<75900930+ceberam@users.noreply.github.com> 2025-03-18 10:38:19 +01:00			`from .test_data_gen_flag import GEN_TEST_DATA`
test: avoid testing exact JSON in CSV backend (#1038) * feat: updated verify_export Moved verify_export to verify_utils Reuse verify_export in tests Signed-off-by: Matheus Abdias <matheusfabdias@gmail.com> * feat: replace verify_export with verify_document in CSV conversion tests Signed-off-by: Matheus Abdias <matheusfabdias@gmail.com> --------- Signed-off-by: Matheus Abdias <matheusfabdias@gmail.com> 2025-02-24 07:10:40 +00:00			`from .verify_utils import verify_document, verify_export`

feat: Add DoclingParseV4 backend, using high-level docling-parse API (#905) * Add DoclingParseV3 backend implementation Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Use docling-core with docling-parse types Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes and test updates Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix streams Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix streams Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Reset tests Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * update test cases Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * update test units Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add back DoclingParse v1 backend, pipeline options Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update locks Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: update docling-core to 2.22.0 Update dependency library docling-core to latest release 2.22.0 Fix regression tests and ground truth files Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * Ground-truth files updated Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update tests, use TextCell.from_ocr property Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Text fixes, new test data Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Rename docling backend to v4 Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Test all backends, fixes Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Reset all tests to use docling-parse v1 for now Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes for DPv4 backend init, better test coverage Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * test_input_doc use default backend Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Co-authored-by: Cesar Berrospi Ramis 
<75900930+ceberam@users.noreply.github.com> 2025-03-18 10:38:19 +01:00			`GENERATE = GEN_TEST_DATA`
feat: Add support for CSV input with new backend to transform CSV files to DoclingDocument (#945) * feat: Implement csv backend and format detection Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> * test: Implement csv parsing and format tests Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> * docs: Add example and CSV format documentation Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> * feat: Add support for various CSV dialects and update documentation Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> * feat: Add validation for delimiters and tests for inconsistent csv files Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> --------- Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> 2025-02-14 15:55:09 +08:00

			`def get_csv_paths():`

			`# Define the directory you want to search`
			`directory = Path(f"./tests/data/csv/")`

			`# List all CSV files in the directory and its subdirectories`
			`return sorted(directory.rglob("*.csv"))`


			`def get_csv_path(name: str):`

			`# Return the matching CSV file path`
			`return Path(f"./tests/data/csv/{name}.csv")`


			`def get_converter():`

			`converter = DocumentConverter(allowed_formats=[InputFormat.CSV])`

			`return converter`


			`def test_e2e_valid_csv_conversions():`
			`valid_csv_paths = get_csv_paths()`
			`converter = get_converter()`

			`for csv_path in valid_csv_paths:`
			`print(f"converting {csv_path}")`

			`gt_path = csv_path.parent.parent / "groundtruth" / "docling_v2" / csv_path.name`

			`conv_result: ConversionResult = converter.convert(csv_path)`

			`doc: DoclingDocument = conv_result.document`

			`pred_md: str = doc.export_to_markdown()`
			`assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"`

			`pred_itxt: str = doc._export_to_indented_text(`
			`max_text_len=70, explicit_tables=False`
			`)`
			`assert verify_export(`
			`pred_itxt, str(gt_path) + ".itxt"`
			`), "export to indented-text"`

fix: use first table row as col headers (#1156) Signed-off-by: Panos Vagenas <pva@zurich.ibm.com> 2025-03-13 15:34:18 +01:00			`assert verify_document(`
			`pred_doc=doc,`
			`gtfile=str(gt_path) + ".json",`
			`generate=GENERATE,`
			`), "export to json"`
feat: Add support for CSV input with new backend to transform CSV files to DoclingDocument (#945) * feat: Implement csv backend and format detection Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> * test: Implement csv parsing and format tests Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> * docs: Add example and CSV format documentation Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> * feat: Add support for various CSV dialects and update documentation Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> * feat: Add validation for delimiters and tests for inconsistent csv files Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> --------- Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> 2025-02-14 15:55:09 +08:00

			`def test_e2e_invalid_csv_conversions():`
			`csv_too_few_columns = get_csv_path("csv-too-few-columns")`
			`csv_too_many_columns = get_csv_path("csv-too-many-columns")`
			`csv_inconsistent_header = get_csv_path("csv-inconsistent-header")`
			`converter = get_converter()`

			`print(f"converting {csv_too_few_columns}")`
			`with warns(UserWarning, match="Inconsistent column lengths"):`
			`converter.convert(csv_too_few_columns)`

			`print(f"converting {csv_too_many_columns}")`
			`with warns(UserWarning, match="Inconsistent column lengths"):`
			`converter.convert(csv_too_many_columns)`

			`print(f"converting {csv_inconsistent_header}")`
			`with warns(UserWarning, match="Inconsistent column lengths"):`
			`converter.convert(csv_inconsistent_header)`