diff --git a/pdelfin/beakerpipeline.py b/pdelfin/beakerpipeline.py index 56abd90..fdff656 100644 --- a/pdelfin/beakerpipeline.py +++ b/pdelfin/beakerpipeline.py @@ -455,12 +455,10 @@ async def worker(args, work_queue: S3WorkQueue, semaphore, worker_id): metrics.add_metrics(finished_input_tokens=sum(doc["metadata"]["total-input-tokens"] for doc in dolma_docs), finished_output_tokens=sum(doc["metadata"]["total-output-tokens"] for doc in dolma_docs)) - # Update last batch time - last_batch_time = time.perf_counter() + await work_queue.mark_done(work_item) except Exception as e: logger.exception(f"Exception occurred while processing work_hash {work_item.hash}: {e}") finally: - await work_queue.mark_done(work_item) semaphore.release() diff --git a/pdelfin/filter/filter.py b/pdelfin/filter/filter.py index 9c0f4f5..b3f3ac3 100644 --- a/pdelfin/filter/filter.py +++ b/pdelfin/filter/filter.py @@ -72,15 +72,15 @@ class PdfFilter: try: # Attempt to read the PDF at the beginning pdf_reader = PdfReader(local_pdf_path) + + # Form check + if self.apply_form_check and self._is_form(pdf_reader): + logger.info(f"Filtering out {local_pdf_path} because it's a form") + return True # Filter out except Exception as e: logger.warning(f"Error reading PDF {local_pdf_path}: {e}") return True # Filter out the PDF if an exception occurs - # Form check - if self.apply_form_check and self._is_form(pdf_reader): - logger.info(f"Filtering out {local_pdf_path} because it's a form") - return True # Filter out - # Read the first five pages of text for language calculation pdftotext_result = subprocess.run( ["pdftotext", "-f", "1", "-l", "5", local_pdf_path, "-"], diff --git a/pdelfin/version.py b/pdelfin/version.py index 9a08606..c139745 100644 --- a/pdelfin/version.py +++ b/pdelfin/version.py @@ -2,7 +2,7 @@ _MAJOR = "0" _MINOR = "1" # On main and in a nightly release the patch should be one ahead of the last # released build. -_PATCH = "51" +_PATCH = "53" # This is mainly for nightly builds which have the suffix ".dev$DATE". See # https://semver.org/#is-v123-a-semantic-version for the semantics. _SUFFIX = ""