From fa318dac7c3063a0b484bb37248779d9b610fe58 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Wed, 4 Dec 2024 18:46:39 +0000 Subject: [PATCH 1/2] New version with s3 fix in it --- pdelfin/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdelfin/version.py b/pdelfin/version.py index 9a08606..fce2114 100644 --- a/pdelfin/version.py +++ b/pdelfin/version.py @@ -2,7 +2,7 @@ _MAJOR = "0" _MINOR = "1" # On main and in a nightly release the patch should be one ahead of the last # released build. -_PATCH = "51" +_PATCH = "52" # This is mainly for nightly builds which have the suffix ".dev$DATE". See # https://semver.org/#is-v123-a-semantic-version for the semantics. _SUFFIX = "" From 0b72eda79410e5234b56c6c3b014b83d8868365d Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Wed, 4 Dec 2024 19:08:21 +0000 Subject: [PATCH 2/2] Move form check into exception handler, don't mark the work item as done if it had an exception on it --- pdelfin/beakerpipeline.py | 4 +--- pdelfin/filter/filter.py | 10 +++++----- pdelfin/version.py | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pdelfin/beakerpipeline.py b/pdelfin/beakerpipeline.py index 56abd90..fdff656 100644 --- a/pdelfin/beakerpipeline.py +++ b/pdelfin/beakerpipeline.py @@ -455,12 +455,10 @@ async def worker(args, work_queue: S3WorkQueue, semaphore, worker_id): metrics.add_metrics(finished_input_tokens=sum(doc["metadata"]["total-input-tokens"] for doc in dolma_docs), finished_output_tokens=sum(doc["metadata"]["total-output-tokens"] for doc in dolma_docs)) - # Update last batch time - last_batch_time = time.perf_counter() + await work_queue.mark_done(work_item) except Exception as e: logger.exception(f"Exception occurred while processing work_hash {work_item.hash}: {e}") finally: - await work_queue.mark_done(work_item) semaphore.release() diff --git a/pdelfin/filter/filter.py b/pdelfin/filter/filter.py index 9c0f4f5..b3f3ac3 100644 --- a/pdelfin/filter/filter.py +++ b/pdelfin/filter/filter.py @@ -72,15 +72,15 @@ class PdfFilter: try: # Attempt to read the PDF at the beginning pdf_reader = PdfReader(local_pdf_path) + + # Form check + if self.apply_form_check and self._is_form(pdf_reader): + logger.info(f"Filtering out {local_pdf_path} because it's a form") + return True # Filter out except Exception as e: logger.warning(f"Error reading PDF {local_pdf_path}: {e}") return True # Filter out the PDF if an exception occurs - # Form check - if self.apply_form_check and self._is_form(pdf_reader): - logger.info(f"Filtering out {local_pdf_path} because it's a form") - return True # Filter out - # Read the first five pages of text for language calculation pdftotext_result = subprocess.run( ["pdftotext", "-f", "1", "-l", "5", local_pdf_path, "-"], diff --git a/pdelfin/version.py b/pdelfin/version.py index fce2114..c139745 100644 --- a/pdelfin/version.py +++ b/pdelfin/version.py @@ -2,7 +2,7 @@ _MAJOR = "0" _MINOR = "1" # On main and in a nightly release the patch should be one ahead of the last # released build. -_PATCH = "52" +_PATCH = "53" # This is mainly for nightly builds which have the suffix ".dev$DATE". See # https://semver.org/#is-v123-a-semantic-version for the semantics. _SUFFIX = ""