diff --git a/pdelfin/beakerpipeline.py b/pdelfin/beakerpipeline.py index f1facf3..ec1f08d 100644 --- a/pdelfin/beakerpipeline.py +++ b/pdelfin/beakerpipeline.py @@ -811,9 +811,7 @@ if __name__ == "__main__": asyncio.run(main()) # TODO - # - Fix loading of the model checkpoints, it's so flakey now, maybe use datasets # - Add logging of failed pages and have the stats function read them - # - Fallback to different method if < 2% of pages are failed, make that configurable # - Sglang commit a fix for the context length issue # - pypdf fix for the 'v' error # - Get a solid benchmark on the stream vs non stream approach diff --git a/pdelfin/prompts/prompts.py b/pdelfin/prompts/prompts.py index 84439e8..c4b5dce 100644 --- a/pdelfin/prompts/prompts.py +++ b/pdelfin/prompts/prompts.py @@ -25,10 +25,24 @@ class PageResponse: natural_text: Optional[str] def __post_init__(self): - # Validate that rotation_correction is one of the allowed values + # Validate rotation_correction is one of the allowed values if self.rotation_correction not in {0, 90, 180, 270}: raise ValueError("rotation_correction must be one of [0, 90, 180, 270].") + # Type checks + if not isinstance(self.primary_language, (str, type(None))): + raise TypeError("primary_language must be of type Optional[str].") + if not isinstance(self.is_rotation_valid, bool): + raise TypeError("is_rotation_valid must be of type bool.") + if not isinstance(self.rotation_correction, int): + raise TypeError("rotation_correction must be of type int.") + if not isinstance(self.is_table, bool): + raise TypeError("is_table must be of type bool.") + if not isinstance(self.is_diagram, bool): + raise TypeError("is_diagram must be of type bool.") + if not isinstance(self.natural_text, (str, type(None))): + raise TypeError("natural_text must be of type Optional[str].") + def openai_response_format_schema() -> dict: return { "type": "json_schema", diff --git a/pdelfin/version.py b/pdelfin/version.py index 507440b..cb74083 100644 --- a/pdelfin/version.py +++ b/pdelfin/version.py @@ -2,7 +2,7 @@ _MAJOR = "0" _MINOR = "1" # On main and in a nightly release the patch should be one ahead of the last # released build. -_PATCH = "30" +_PATCH = "32" # This is mainly for nightly builds which have the suffix ".dev$DATE". See # https://semver.org/#is-v123-a-semantic-version for the semantics. _SUFFIX = "" diff --git a/scripts/beaker/Dockerfile-inference b/scripts/beaker/Dockerfile-inference index 7e2452e..0e7038b 100644 --- a/scripts/beaker/Dockerfile-inference +++ b/scripts/beaker/Dockerfile-inference @@ -48,6 +48,9 @@ WORKDIR sglang/python RUN git checkout eff468dd5a3d24646560eb044276585f7a11ac3c RUN /root/.local/bin/uv pip install --system --no-cache -e .[all] +# TODO You can remove this once pypdf > 5.10 comes out +RUN /root/.local/bin/uv pip install --system --no-cache git+https://github.com/py-pdf/pypdf.git@c6e43374ab002d76811ec85333fdc2c82c268251 + WORKDIR /root COPY pdelfin pdelfin