2024-10-23 15:52:30 +02:00
|
|
|
from io import BytesIO
|
|
|
|
from pathlib import Path
|
|
|
|
|
2025-03-18 10:38:19 +01:00
|
|
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|
|
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
|
|
|
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
2024-10-23 15:52:30 +02:00
|
|
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
|
|
|
from docling.datamodel.base_models import DocumentStream, InputFormat
|
2024-12-17 16:35:23 +01:00
|
|
|
from docling.datamodel.document import InputDocument, _DocumentConversionInput
|
2025-01-31 15:23:00 +01:00
|
|
|
from docling.datamodel.settings import DocumentLimits
|
2025-03-18 10:38:19 +01:00
|
|
|
from docling.document_converter import PdfFormatOption
|
2024-10-23 15:52:30 +02:00
|
|
|
|
|
|
|
|
|
|
|
def test_in_doc_from_valid_path():
    """An existing PDF file on disk must yield a valid InputDocument."""
    pdf_path = Path("./tests/data/pdf/2206.01062.pdf")
    doc = _make_input_doc(pdf_path)
    assert doc.valid is True
|
2024-10-23 15:52:30 +02:00
|
|
|
|
|
|
|
|
|
|
|
def test_in_doc_from_invalid_path():
    """A nonexistent path must yield an InputDocument flagged as invalid."""
    missing_path = Path("./tests/does/not/exist.pdf")
    doc = _make_input_doc(missing_path)
    assert doc.valid is False
|
2024-10-23 15:52:30 +02:00
|
|
|
|
|
|
|
|
|
|
|
def test_in_doc_from_valid_buf():
    """A stream carrying real PDF bytes must yield a valid InputDocument."""
    # read_bytes() opens and closes the file itself; the previous
    # Path(...).open("rb").read() left the file handle unclosed.
    buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").read_bytes())
    stream = DocumentStream(name="my_doc.pdf", stream=buf)

    doc = _make_input_doc_from_stream(stream)
    assert doc.valid is True
|
2024-10-23 15:52:30 +02:00
|
|
|
|
|
|
|
|
|
|
|
def test_in_doc_from_invalid_buf():
    """An empty byte stream must yield an InputDocument flagged as invalid."""
    empty_stream = DocumentStream(name="my_doc.pdf", stream=BytesIO(b""))
    doc = _make_input_doc_from_stream(empty_stream)
    assert doc.valid is False
|
2024-10-23 15:52:30 +02:00
|
|
|
|
|
|
|
|
2025-03-18 10:38:19 +01:00
|
|
|
def test_image_in_pdf_backend():
    """Every PDF backend must accept a raster image as IMAGE-format input.

    The original body repeated the identical InputDocument construction four
    times, varying only the backend class; the loop removes the duplication
    and makes adding a future backend a one-line change.
    """
    image_path = Path("tests/data/2305.03393v1-pg9-img.png")
    backends = (
        PyPdfiumDocumentBackend,
        DoclingParseDocumentBackend,
        DoclingParseV2DocumentBackend,
        DoclingParseV4DocumentBackend,
    )
    for backend in backends:
        in_doc = InputDocument(
            path_or_stream=image_path,
            format=InputFormat.IMAGE,
            backend=backend,
        )
        # Name the backend in the failure message so a single failing
        # backend is identifiable from the pytest report.
        assert in_doc.valid, f"{backend.__name__} could not open the image"
|
|
|
|
|
|
|
|
|
2025-01-31 15:23:00 +01:00
|
|
|
def test_in_doc_with_page_range():
    """Documents are valid only when the page_range limit overlaps the document.

    The original body repeated the identical InputDocument construction three
    times, varying only ``limits.page_range``; this is table-driven instead.
    """
    test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
    limits = DocumentLimits()

    # (9, 9) is valid while (11, 12) is not, so the test PDF appears to have
    # between 9 and 10 pages -- TODO confirm against the fixture file.
    cases = [
        ((1, 10), True),
        ((9, 9), True),
        ((11, 12), False),
    ]
    for page_range, expected_valid in cases:
        limits.page_range = page_range
        doc = InputDocument(
            path_or_stream=test_doc_path,
            format=InputFormat.PDF,
            backend=PyPdfiumDocumentBackend,
            limits=limits,
        )
        assert doc.valid is expected_valid, f"page_range={page_range}"
|
2025-01-31 15:23:00 +01:00
|
|
|
|
|
|
|
|
2024-12-17 16:35:23 +01:00
|
|
|
def test_guess_format(tmp_path):
    """Test docling.datamodel.document._DocumentConversionInput.__guess_format

    Exercises format detection for both DocumentStream inputs and on-disk
    paths, across every supported format plus a few negative cases.

    Fixes over the previous version: Path(...).open("rb").read() leaked the
    file handle on every fixture load (replaced by Path.read_bytes(), which
    closes the file itself), and f"{s}".encode() was a redundant f-string
    wrapper around an existing str (replaced by s.encode()).
    """
    dci = _DocumentConversionInput(path_or_stream_iterator=[])
    temp_dir = tmp_path / "test_guess_format"
    temp_dir.mkdir()

    # Valid PDF
    buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").read_bytes())
    stream = DocumentStream(name="my_doc.pdf", stream=buf)
    assert dci._guess_format(stream) == InputFormat.PDF
    doc_path = Path("./tests/data/pdf/2206.01062.pdf")
    assert dci._guess_format(doc_path) == InputFormat.PDF

    # Valid MS Office
    buf = BytesIO(Path("./tests/data/docx/lorem_ipsum.docx").read_bytes())
    stream = DocumentStream(name="lorem_ipsum.docx", stream=buf)
    assert dci._guess_format(stream) == InputFormat.DOCX
    doc_path = Path("./tests/data/docx/lorem_ipsum.docx")
    assert dci._guess_format(doc_path) == InputFormat.DOCX

    # Valid HTML
    buf = BytesIO(Path("./tests/data/html/wiki_duck.html").read_bytes())
    stream = DocumentStream(name="wiki_duck.html", stream=buf)
    assert dci._guess_format(stream) == InputFormat.HTML
    doc_path = Path("./tests/data/html/wiki_duck.html")
    assert dci._guess_format(doc_path) == InputFormat.HTML

    html_str = (  # HTML starting with a script
        "<script>\nconsole.log('foo');\n</script>"
        '<!doctype html>\n<html lang="en-us class="no-js"></html>'
    )
    stream = DocumentStream(name="lorem_ipsum", stream=BytesIO(html_str.encode()))
    assert dci._guess_format(stream) == InputFormat.HTML

    # Valid MD
    buf = BytesIO(Path("./tests/data/md/wiki.md").read_bytes())
    stream = DocumentStream(name="wiki.md", stream=buf)
    assert dci._guess_format(stream) == InputFormat.MD
    doc_path = Path("./tests/data/md/wiki.md")
    assert dci._guess_format(doc_path) == InputFormat.MD

    # Valid CSV
    buf = BytesIO(Path("./tests/data/csv/csv-comma.csv").read_bytes())
    stream = DocumentStream(name="csv-comma.csv", stream=buf)
    assert dci._guess_format(stream) == InputFormat.CSV
    # Detection must also work from content alone (no .csv suffix).
    stream = DocumentStream(name="test-comma", stream=buf)
    assert dci._guess_format(stream) == InputFormat.CSV
    doc_path = Path("./tests/data/csv/csv-comma.csv")
    assert dci._guess_format(doc_path) == InputFormat.CSV

    # Valid XML USPTO patent
    buf = BytesIO(Path("./tests/data/uspto/ipa20110039701.xml").read_bytes())
    stream = DocumentStream(name="ipa20110039701.xml", stream=buf)
    assert dci._guess_format(stream) == InputFormat.XML_USPTO
    doc_path = Path("./tests/data/uspto/ipa20110039701.xml")
    assert dci._guess_format(doc_path) == InputFormat.XML_USPTO

    buf = BytesIO(Path("./tests/data/uspto/pftaps057006474.txt").read_bytes())
    stream = DocumentStream(name="pftaps057006474.txt", stream=buf)
    assert dci._guess_format(stream) == InputFormat.XML_USPTO
    doc_path = Path("./tests/data/uspto/pftaps057006474.txt")
    assert dci._guess_format(doc_path) == InputFormat.XML_USPTO

    # Valid XML JATS
    buf = BytesIO(Path("./tests/data/jats/elife-56337.xml").read_bytes())
    stream = DocumentStream(name="elife-56337.xml", stream=buf)
    assert dci._guess_format(stream) == InputFormat.XML_JATS
    doc_path = Path("./tests/data/jats/elife-56337.xml")
    assert dci._guess_format(doc_path) == InputFormat.XML_JATS

    buf = BytesIO(Path("./tests/data/jats/elife-56337.nxml").read_bytes())
    stream = DocumentStream(name="elife-56337.nxml", stream=buf)
    assert dci._guess_format(stream) == InputFormat.XML_JATS
    doc_path = Path("./tests/data/jats/elife-56337.nxml")
    assert dci._guess_format(doc_path) == InputFormat.XML_JATS

    buf = BytesIO(Path("./tests/data/jats/elife-56337.txt").read_bytes())
    stream = DocumentStream(name="elife-56337.txt", stream=buf)
    assert dci._guess_format(stream) == InputFormat.XML_JATS
    doc_path = Path("./tests/data/jats/elife-56337.txt")
    assert dci._guess_format(doc_path) == InputFormat.XML_JATS

    # Valid XML, non-supported flavor
    xml_content = (
        '<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE docling_test SYSTEM '
        '"test.dtd"><docling>Docling parses documents</docling>'
    )
    doc_path = temp_dir / "docling_test.xml"
    doc_path.write_text(xml_content, encoding="utf-8")
    assert dci._guess_format(doc_path) is None
    buf = BytesIO(doc_path.read_bytes())
    stream = DocumentStream(name="docling_test.xml", stream=buf)
    assert dci._guess_format(stream) is None

    # Invalid USPTO patent (as plain text)
    stream = DocumentStream(name="pftaps057006474.txt", stream=BytesIO(b"xyz"))
    assert dci._guess_format(stream) is None
    doc_path = temp_dir / "pftaps_wrong.txt"
    doc_path.write_text("xyz", encoding="utf-8")
    assert dci._guess_format(doc_path) is None

    # Valid Docling JSON
    test_str = '{"name": ""}'
    stream = DocumentStream(name="test.json", stream=BytesIO(test_str.encode()))
    assert dci._guess_format(stream) == InputFormat.JSON_DOCLING
    doc_path = temp_dir / "test.json"
    doc_path.write_text(test_str, encoding="utf-8")
    assert dci._guess_format(doc_path) == InputFormat.JSON_DOCLING

    # Non-Docling JSON
    # TODO: Docling JSON is currently the single supported JSON flavor and the pipeline
    # will try to validate *any* JSON (based on suffix/MIME) as Docling JSON; proper
    # disambiguation seen as part of https://github.com/docling-project/docling/issues/802
    test_str = "{}"
    stream = DocumentStream(name="test.json", stream=BytesIO(test_str.encode()))
    assert dci._guess_format(stream) == InputFormat.JSON_DOCLING
    doc_path = temp_dir / "test.json"
    doc_path.write_text(test_str, encoding="utf-8")
    assert dci._guess_format(doc_path) == InputFormat.JSON_DOCLING
|
|
|
|
|
2024-12-17 16:35:23 +01:00
|
|
|
|
2024-10-23 15:52:30 +02:00
|
|
|
def _make_input_doc(path):
    """Build a PDF-format InputDocument for *path* with the default backend."""
    return InputDocument(
        path_or_stream=path,
        format=InputFormat.PDF,
        backend=PdfFormatOption().backend,  # use default
    )
|
|
|
|
|
|
|
|
|
|
|
|
def _make_input_doc_from_stream(doc_stream):
    """Build a PDF-format InputDocument from a DocumentStream, default backend."""
    return InputDocument(
        path_or_stream=doc_stream.stream,
        format=InputFormat.PDF,
        filename=doc_stream.name,
        backend=PdfFormatOption().backend,  # use default
    )
|
fix: multi-page image support (tiff) (#1928)
* Initial plan
* Fix multi-page TIFF image support
Co-authored-by: cau-git <60343111+cau-git@users.noreply.github.com>
* add RGB conversion
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Remove pointless test
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add multi-page TIFF test data and verification tests
Co-authored-by: cau-git <60343111+cau-git@users.noreply.github.com>
* Revert "Add multi-page TIFF test data and verification tests"
This reverts commit 130a10e2d97d4bd27e2fea0e93180904047dd087.
* Proper test for 2 page tiff file
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* DCO Remediation Commit for copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
I, copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>, hereby add my Signed-off-by to this commit: 420df478f302ea4aab1762be467dc4a90a7d7903
I, copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>, hereby add my Signed-off-by to this commit: c1d722725f23bf59316fcf394a5b9434bbdc1e61
I, Christoph Auer <cau@zurich.ibm.com>, hereby add my Signed-off-by to this commit: 6aa85cc933888c4c31546984ebf8760c8dca4977
I, copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>, hereby add my Signed-off-by to this commit: 130a10e2d97d4bd27e2fea0e93180904047dd087
I, Christoph Auer <cau@zurich.ibm.com>, hereby add my Signed-off-by to this commit: d571f362991b2546a3274be843b40a76b8bd9fb6
I, Christoph Auer <cau@zurich.ibm.com>, hereby add my Signed-off-by to this commit: 2aab66288b620a7aa038fdff45715d1811e486a7
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Proper test for 2 page tiff file (2)
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---------
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: cau-git <60343111+cau-git@users.noreply.github.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-23 09:55:40 +02:00
|
|
|
|
|
|
|
|
|
|
|
def test_tiff_two_pages():
    """A two-page TIFF opens as a valid IMAGE document with one full-page
    bitmap rect per page (US-Letter at 72 dpi: 612 x 792 points)."""
    doc = InputDocument(
        path_or_stream=Path("./tests/data/tiff/2206.01062.tif"),
        format=InputFormat.IMAGE,
        backend=PdfFormatOption().backend,  # use default backend
    )
    assert doc.valid is True
    assert doc.page_count == 2

    # Expect two full-page rectangles, one per page.
    first_rect = next(doc._backend.load_page(0).get_bitmap_rects())
    second_rect = next(doc._backend.load_page(1).get_bitmap_rects())

    for rect in (first_rect, second_rect):
        assert rect.t == 0
        assert rect.l == 0
        assert rect.r == 612.0
        assert rect.b == 792.0
|