More fixes

This commit is contained in:
Jake Poznanski 2024-11-18 15:04:50 -08:00
parent 8793fc7d99
commit 9e2e09bd06
4 changed files with 19 additions and 4 deletions

View File

@ -811,9 +811,7 @@ if __name__ == "__main__":
asyncio.run(main())
# TODO
# - Fix loading of the model checkpoints, it's so flaky now; maybe use datasets
# - Add logging of failed pages and have the stats function read them
# - Fall back to a different method if < 2% of pages have failed; make that configurable
# - SGLang: commit a fix for the context length issue
# - pypdf fix for the 'v' error
# - Get a solid benchmark on the stream vs non stream approach

View File

@ -25,10 +25,24 @@ class PageResponse:
natural_text: Optional[str]
def __post_init__(self):
    """Validate this instance's field values and types.

    Checks run in a fixed order and stop at the first failure: the
    rotation_correction value check comes first, followed by one type
    check per field.

    Raises:
        ValueError: If rotation_correction is not one of 0, 90, 180 or 270.
        TypeError: If a field's value does not match its expected type.
    """
    # Validate rotation_correction is one of the allowed values
    if self.rotation_correction not in {0, 90, 180, 270}:
        raise ValueError("rotation_correction must be one of [0, 90, 180, 270].")

    # Type checks, listed in the order their errors are reported.
    expected_types = (
        ("primary_language", (str, type(None)), "Optional[str]"),
        ("is_rotation_valid", (bool,), "bool"),
        ("rotation_correction", (int,), "int"),
        ("is_table", (bool,), "bool"),
        ("is_diagram", (bool,), "bool"),
        ("natural_text", (str, type(None)), "Optional[str]"),
    )
    for field_name, allowed, label in expected_types:
        value = getattr(self, field_name)
        # bool is a subclass of int, so isinstance(False, int) is True and
        # False (== 0) also passes the membership test above. Reject bools
        # explicitly wherever bool is not itself an allowed type.
        is_stray_bool = isinstance(value, bool) and bool not in allowed
        if is_stray_bool or not isinstance(value, allowed):
            raise TypeError(f"{field_name} must be of type {label}.")
def openai_response_format_schema() -> dict:
return {
"type": "json_schema",

View File

@ -2,7 +2,7 @@ _MAJOR = "0"
_MINOR = "1"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "30"
_PATCH = "32"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""

View File

@ -48,6 +48,9 @@ WORKDIR sglang/python
RUN git checkout eff468dd5a3d24646560eb044276585f7a11ac3c
RUN /root/.local/bin/uv pip install --system --no-cache -e .[all]
# TODO: Remove this pinned pypdf commit once a pypdf release > 5.10 comes out
RUN /root/.local/bin/uv pip install --system --no-cache git+https://github.com/py-pdf/pypdf.git@c6e43374ab002d76811ec85333fdc2c82c268251
WORKDIR /root
COPY pdelfin pdelfin