"""Tests for the ``valid`` flag of ``InputDocument`` built from paths and streams."""

from io import BytesIO
from pathlib import Path

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import InputDocument


def test_in_doc_from_valid_path() -> None:
    """An input document built from an existing PDF path is expected to be valid."""
    test_doc_path = Path("./tests/data/2206.01062.pdf")
    doc = _make_input_doc(test_doc_path)
    assert doc.valid


def test_in_doc_from_invalid_path() -> None:
    """An input document built from a non-existent path is expected to be invalid."""
    test_doc_path = Path("./tests/does/not/exist.pdf")
    doc = _make_input_doc(test_doc_path)
    assert not doc.valid


def test_in_doc_from_valid_buf() -> None:
    """An input document built from an in-memory copy of a PDF is expected to be valid."""
    # read_bytes() opens, reads and closes the file; a bare open().read()
    # would leave the handle open until garbage collection.
    buf = BytesIO(Path("./tests/data/2206.01062.pdf").read_bytes())
    stream = DocumentStream(name="my_doc.pdf", stream=buf)
    doc = _make_input_doc_from_stream(stream)
    assert doc.valid


def test_in_doc_from_invalid_buf() -> None:
    """An input document built from an empty byte buffer is expected to be invalid."""
    buf = BytesIO(b"")
    stream = DocumentStream(name="my_doc.pdf", stream=buf)
    doc = _make_input_doc_from_stream(stream)
    assert not doc.valid


def _make_input_doc(path: Path) -> InputDocument:
    """Build a PDF ``InputDocument`` for ``path`` using the pypdfium2 backend.

    Args:
        path: Location of the document, passed through as ``path_or_stream``.

    Returns:
        The constructed ``InputDocument``.
    """
    return InputDocument(
        path_or_stream=path,
        format=InputFormat.PDF,
        backend=PyPdfiumDocumentBackend,
    )


def _make_input_doc_from_stream(doc_stream: DocumentStream) -> InputDocument:
    """Build a PDF ``InputDocument`` from a ``DocumentStream`` using the pypdfium2 backend.

    Args:
        doc_stream: Source whose ``stream`` is passed as ``path_or_stream`` and
            whose ``name`` is passed as ``filename``.

    Returns:
        The constructed ``InputDocument``.
    """
    return InputDocument(
        path_or_stream=doc_stream.stream,
        format=InputFormat.PDF,
        filename=doc_stream.name,
        backend=PyPdfiumDocumentBackend,
    )