mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-02 19:13:53 +00:00
Merge branch 'main' of https://github.com/allenai/pdelfin into main
This commit is contained in:
commit
a56ce71771
@@ -455,12 +455,10 @@ async def worker(args, work_queue: S3WorkQueue, semaphore, worker_id):
|
||||
metrics.add_metrics(finished_input_tokens=sum(doc["metadata"]["total-input-tokens"] for doc in dolma_docs),
|
||||
finished_output_tokens=sum(doc["metadata"]["total-output-tokens"] for doc in dolma_docs))
|
||||
|
||||
# Update last batch time
|
||||
last_batch_time = time.perf_counter()
|
||||
await work_queue.mark_done(work_item)
|
||||
except Exception as e:
|
||||
logger.exception(f"Exception occurred while processing work_hash {work_item.hash}: {e}")
|
||||
finally:
|
||||
await work_queue.mark_done(work_item)
|
||||
semaphore.release()
|
||||
|
||||
|
||||
|
||||
@@ -72,15 +72,15 @@ class PdfFilter:
|
||||
try:
|
||||
# Attempt to read the PDF at the beginning
|
||||
pdf_reader = PdfReader(local_pdf_path)
|
||||
|
||||
# Form check
|
||||
if self.apply_form_check and self._is_form(pdf_reader):
|
||||
logger.info(f"Filtering out {local_pdf_path} because it's a form")
|
||||
return True # Filter out
|
||||
except Exception as e:
|
||||
logger.warning(f"Error reading PDF {local_pdf_path}: {e}")
|
||||
return True # Filter out the PDF if an exception occurs
|
||||
|
||||
# Form check
|
||||
if self.apply_form_check and self._is_form(pdf_reader):
|
||||
logger.info(f"Filtering out {local_pdf_path} because it's a form")
|
||||
return True # Filter out
|
||||
|
||||
# Read the first five pages of text for language calculation
|
||||
pdftotext_result = subprocess.run(
|
||||
["pdftotext", "-f", "1", "-l", "5", local_pdf_path, "-"],
|
||||
|
||||
@@ -2,7 +2,7 @@ _MAJOR = "0"
|
||||
_MINOR = "1"
|
||||
# On main and in a nightly release the patch should be one ahead of the last
|
||||
# released build.
|
||||
_PATCH = "51"
|
||||
_PATCH = "53"
|
||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||
_SUFFIX = ""
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user