Logging fallback pages

This commit is contained in:
Jake Poznanski 2024-11-19 15:11:02 -08:00
parent b0acfa870e
commit 273a8b0d0a
2 changed files with 3 additions and 2 deletions

View File

@@ -289,7 +289,8 @@ def build_dolma_document(pdf_s3_path, page_results):
"Source-File": pdf_s3_path,
"pdf-total-pages": len(page_results),
"total-input-tokens": sum(page.input_tokens for page in page_results),
"total-output-tokens": sum(page.output_tokens for page in page_results)
"total-output-tokens": sum(page.output_tokens for page in page_results),
"total-fallback-pages": sum(page.is_fallback for page in page_results),
}
id_ = hashlib.sha1(document_text.encode()).hexdigest()

View File

@@ -2,7 +2,7 @@ _MAJOR = "0"
_MINOR = "1"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "35"
_PATCH = "36"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""