mirror of
https://github.com/allenai/olmocr.git
synced 2025-08-23 16:23:36 +00:00
Logging fallback pages
This commit is contained in:
parent
b0acfa870e
commit
273a8b0d0a
@@ -289,7 +289,8 @@ def build_dolma_document(pdf_s3_path, page_results):
|
|||||||
"Source-File": pdf_s3_path,
|
"Source-File": pdf_s3_path,
|
||||||
"pdf-total-pages": len(page_results),
|
"pdf-total-pages": len(page_results),
|
||||||
"total-input-tokens": sum(page.input_tokens for page in page_results),
|
"total-input-tokens": sum(page.input_tokens for page in page_results),
|
||||||
"total-output-tokens": sum(page.output_tokens for page in page_results)
|
"total-output-tokens": sum(page.output_tokens for page in page_results),
|
||||||
|
"total-fallback-pages": sum(page.is_fallback for page in page_results),
|
||||||
}
|
}
|
||||||
|
|
||||||
id_ = hashlib.sha1(document_text.encode()).hexdigest()
|
id_ = hashlib.sha1(document_text.encode()).hexdigest()
|
||||||
|
@@ -2,7 +2,7 @@ _MAJOR = "0"
|
|||||||
_MINOR = "1"
|
_MINOR = "1"
|
||||||
# On main and in a nightly release the patch should be one ahead of the last
|
# On main and in a nightly release the patch should be one ahead of the last
|
||||||
# released build.
|
# released build.
|
||||||
_PATCH = "35"
|
_PATCH = "36"
|
||||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||||
_SUFFIX = ""
|
_SUFFIX = ""
|
||||||
|
Loading…
x
Reference in New Issue
Block a user