mirror of
https://github.com/allenai/olmocr.git
synced 2025-12-12 07:41:41 +00:00
More fixes
This commit is contained in:
parent
8793fc7d99
commit
9e2e09bd06
@@ -811,9 +811,7 @@ if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
|
||||
# TODO
|
||||
# - Fix loading of the model checkpoints, it's so flakey now, maybe use datasets
|
||||
# - Add logging of failed pages and have the stats function read them
|
||||
# - Fallback to different method if < 2% of pages are failed, make that configurable
|
||||
# - Sglang commit a fix for the context length issue
|
||||
# - pypdf fix for the 'v' error
|
||||
# - Get a solid benchmark on the stream vs non stream approach
|
||||
|
||||
@@ -25,10 +25,24 @@ class PageResponse:
|
||||
natural_text: Optional[str]
|
||||
|
||||
def __post_init__(self):
|
||||
# Validate that rotation_correction is one of the allowed values
|
||||
# Validate rotation_correction is one of the allowed values
|
||||
if self.rotation_correction not in {0, 90, 180, 270}:
|
||||
raise ValueError("rotation_correction must be one of [0, 90, 180, 270].")
|
||||
|
||||
# Type checks
|
||||
if not isinstance(self.primary_language, (str, type(None))):
|
||||
raise TypeError("primary_language must be of type Optional[str].")
|
||||
if not isinstance(self.is_rotation_valid, bool):
|
||||
raise TypeError("is_rotation_valid must be of type bool.")
|
||||
if not isinstance(self.rotation_correction, int):
|
||||
raise TypeError("rotation_correction must be of type int.")
|
||||
if not isinstance(self.is_table, bool):
|
||||
raise TypeError("is_table must be of type bool.")
|
||||
if not isinstance(self.is_diagram, bool):
|
||||
raise TypeError("is_diagram must be of type bool.")
|
||||
if not isinstance(self.natural_text, (str, type(None))):
|
||||
raise TypeError("natural_text must be of type Optional[str].")
|
||||
|
||||
def openai_response_format_schema() -> dict:
|
||||
return {
|
||||
"type": "json_schema",
|
||||
|
||||
@@ -2,7 +2,7 @@ _MAJOR = "0"
|
||||
_MINOR = "1"
|
||||
# On main and in a nightly release the patch should be one ahead of the last
|
||||
# released build.
|
||||
_PATCH = "30"
|
||||
_PATCH = "32"
|
||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||
_SUFFIX = ""
|
||||
|
||||
@@ -48,6 +48,9 @@ WORKDIR sglang/python
|
||||
RUN git checkout eff468dd5a3d24646560eb044276585f7a11ac3c
|
||||
RUN /root/.local/bin/uv pip install --system --no-cache -e .[all]
|
||||
|
||||
# TODO You can remove this once pypdf > 5.10 comes out
|
||||
RUN /root/.local/bin/uv pip install --system --no-cache git+https://github.com/py-pdf/pypdf.git@c6e43374ab002d76811ec85333fdc2c82c268251
|
||||
|
||||
WORKDIR /root
|
||||
COPY pdelfin pdelfin
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user