mirror of
https://github.com/docling-project/docling.git
synced 2025-06-27 05:20:05 +00:00
59 lines
1.4 KiB
Python
59 lines
1.4 KiB
Python
![]() |
from io import BytesIO
|
||
|
from pathlib import Path
|
||
|
|
||
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||
|
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||
|
from docling.datamodel.document import InputDocument
|
||
|
|
||
|
|
||
|
def test_in_doc_from_valid_path():
    """Check that an InputDocument built from the sample PDF path is valid.

    The path is relative, so it is resolved against the current working
    directory of the test run.
    """
    test_doc_path = Path("./tests/data/2206.01062.pdf")
    doc = _make_input_doc(test_doc_path)
    # Assert the flag's truthiness directly instead of comparing to True.
    assert doc.valid
|
||
|
|
||
|
|
||
|
def test_in_doc_from_invalid_path():
    """Check that an InputDocument built from a bogus path is not valid.

    The path used here is not expected to exist on disk.
    """
    test_doc_path = Path("./tests/does/not/exist.pdf")

    doc = _make_input_doc(test_doc_path)

    # Assert the flag's falsiness directly instead of comparing to False.
    assert not doc.valid
|
||
|
|
||
|
|
||
|
def test_in_doc_from_valid_buf():
    """Check that an InputDocument built from an in-memory PDF is valid.

    The sample PDF is loaded into a BytesIO buffer, wrapped in a
    DocumentStream, and handed to the stream-based helper.
    """
    # read_bytes() opens, reads and closes the file in one call, so no
    # file handle is left open after the buffer has been filled.
    buf = BytesIO(Path("./tests/data/2206.01062.pdf").read_bytes())
    stream = DocumentStream(name="my_doc.pdf", stream=buf)

    doc = _make_input_doc_from_stream(stream)
    # Assert the flag's truthiness directly instead of comparing to True.
    assert doc.valid
|
||
|
|
||
|
|
||
|
def test_in_doc_from_invalid_buf():
    """Check that an InputDocument built from an empty buffer is not valid.

    The stream is named like a PDF but contains zero bytes.
    """
    buf = BytesIO(b"")
    stream = DocumentStream(name="my_doc.pdf", stream=buf)

    doc = _make_input_doc_from_stream(stream)
    # Assert the flag's falsiness directly instead of comparing to False.
    assert not doc.valid
|
||
|
|
||
|
|
||
|
def _make_input_doc(path):
    """Build a PDF InputDocument for ``path`` with the pypdfium2 backend.

    Args:
        path: Value forwarded unchanged as ``path_or_stream``.

    Returns:
        The newly constructed InputDocument.
    """
    return InputDocument(
        path_or_stream=path,
        format=InputFormat.PDF,
        backend=PyPdfiumDocumentBackend,
    )
|
||
|
|
||
|
|
||
|
def _make_input_doc_from_stream(doc_stream):
    """Build a PDF InputDocument from a stream object with the pypdfium2 backend.

    Args:
        doc_stream: Object exposing ``stream`` (passed as ``path_or_stream``)
            and ``name`` (passed as ``filename``).

    Returns:
        The newly constructed InputDocument.
    """
    raw_stream = doc_stream.stream
    stream_name = doc_stream.name
    return InputDocument(
        path_or_stream=raw_stream,
        format=InputFormat.PDF,
        filename=stream_name,
        backend=PyPdfiumDocumentBackend,
    )
|