mirror of
https://github.com/allenai/olmocr.git
synced 2025-12-13 16:21:16 +00:00
More fixes
This commit is contained in:
parent
8793fc7d99
commit
9e2e09bd06
@ -811,9 +811,7 @@ if __name__ == "__main__":
|
|||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
|
|
||||||
# TODO
|
# TODO
|
||||||
# - Fix loading of the model checkpoints, it's so flakey now, maybe use datasets
|
|
||||||
# - Add logging of failed pages and have the stats function read them
|
# - Add logging of failed pages and have the stats function read them
|
||||||
# - Fallback to different method if < 2% of pages are failed, make that configurable
|
|
||||||
# - Sglang commit a fix for the context length issue
|
# - Sglang commit a fix for the context length issue
|
||||||
# - pypdf fix for the 'v' error
|
# - pypdf fix for the 'v' error
|
||||||
# - Get a solid benchmark on the stream vs non stream approach
|
# - Get a solid benchmark on the stream vs non stream approach
|
||||||
|
|||||||
@ -25,10 +25,24 @@ class PageResponse:
|
|||||||
natural_text: Optional[str]
|
natural_text: Optional[str]
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
# Validate that rotation_correction is one of the allowed values
|
# Validate rotation_correction is one of the allowed values
|
||||||
if self.rotation_correction not in {0, 90, 180, 270}:
|
if self.rotation_correction not in {0, 90, 180, 270}:
|
||||||
raise ValueError("rotation_correction must be one of [0, 90, 180, 270].")
|
raise ValueError("rotation_correction must be one of [0, 90, 180, 270].")
|
||||||
|
|
||||||
|
# Type checks
|
||||||
|
if not isinstance(self.primary_language, (str, type(None))):
|
||||||
|
raise TypeError("primary_language must be of type Optional[str].")
|
||||||
|
if not isinstance(self.is_rotation_valid, bool):
|
||||||
|
raise TypeError("is_rotation_valid must be of type bool.")
|
||||||
|
if not isinstance(self.rotation_correction, int):
|
||||||
|
raise TypeError("rotation_correction must be of type int.")
|
||||||
|
if not isinstance(self.is_table, bool):
|
||||||
|
raise TypeError("is_table must be of type bool.")
|
||||||
|
if not isinstance(self.is_diagram, bool):
|
||||||
|
raise TypeError("is_diagram must be of type bool.")
|
||||||
|
if not isinstance(self.natural_text, (str, type(None))):
|
||||||
|
raise TypeError("natural_text must be of type Optional[str].")
|
||||||
|
|
||||||
def openai_response_format_schema() -> dict:
|
def openai_response_format_schema() -> dict:
|
||||||
return {
|
return {
|
||||||
"type": "json_schema",
|
"type": "json_schema",
|
||||||
|
|||||||
@ -2,7 +2,7 @@ _MAJOR = "0"
|
|||||||
_MINOR = "1"
|
_MINOR = "1"
|
||||||
# On main and in a nightly release the patch should be one ahead of the last
|
# On main and in a nightly release the patch should be one ahead of the last
|
||||||
# released build.
|
# released build.
|
||||||
_PATCH = "30"
|
_PATCH = "32"
|
||||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||||
_SUFFIX = ""
|
_SUFFIX = ""
|
||||||
|
|||||||
@ -48,6 +48,9 @@ WORKDIR sglang/python
|
|||||||
RUN git checkout eff468dd5a3d24646560eb044276585f7a11ac3c
|
RUN git checkout eff468dd5a3d24646560eb044276585f7a11ac3c
|
||||||
RUN /root/.local/bin/uv pip install --system --no-cache -e .[all]
|
RUN /root/.local/bin/uv pip install --system --no-cache -e .[all]
|
||||||
|
|
||||||
|
# TODO You can remove this once pypdf > 5.10 comes out
|
||||||
|
RUN /root/.local/bin/uv pip install --system --no-cache git+https://github.com/py-pdf/pypdf.git@c6e43374ab002d76811ec85333fdc2c82c268251
|
||||||
|
|
||||||
WORKDIR /root
|
WORKDIR /root
|
||||||
COPY pdelfin pdelfin
|
COPY pdelfin pdelfin
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user