Merge branch 'main' of https://github.com/allenai/pdelfin into main

This commit is contained in:
Jake Poznanski 2024-12-04 13:20:14 -08:00
commit a56ce71771
3 changed files with 7 additions and 9 deletions

View File

@@ -455,12 +455,10 @@ async def worker(args, work_queue: S3WorkQueue, semaphore, worker_id):
metrics.add_metrics(finished_input_tokens=sum(doc["metadata"]["total-input-tokens"] for doc in dolma_docs),
finished_output_tokens=sum(doc["metadata"]["total-output-tokens"] for doc in dolma_docs))
# Update last batch time
last_batch_time = time.perf_counter()
await work_queue.mark_done(work_item)
except Exception as e:
logger.exception(f"Exception occurred while processing work_hash {work_item.hash}: {e}")
finally:
await work_queue.mark_done(work_item)
semaphore.release()

View File

@@ -72,15 +72,15 @@ class PdfFilter:
try:
# Attempt to read the PDF at the beginning
pdf_reader = PdfReader(local_pdf_path)
# Form check
if self.apply_form_check and self._is_form(pdf_reader):
logger.info(f"Filtering out {local_pdf_path} because it's a form")
return True # Filter out
except Exception as e:
logger.warning(f"Error reading PDF {local_pdf_path}: {e}")
return True # Filter out the PDF if an exception occurs
# Form check
if self.apply_form_check and self._is_form(pdf_reader):
logger.info(f"Filtering out {local_pdf_path} because it's a form")
return True # Filter out
# Read the first five pages of text for language calculation
pdftotext_result = subprocess.run(
["pdftotext", "-f", "1", "-l", "5", local_pdf_path, "-"],

View File

@@ -2,7 +2,7 @@ _MAJOR = "0"
_MINOR = "1"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "51"
_PATCH = "53"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""