Refactoring

2025-12-24 05:36:12 +00:00 · 2024-10-09 18:11:18 +00:00 · 2024-10-09 18:11:18 +00:00 · 4bf6e7a430
commit 4bf6e7a430
parent 0c56dec704
4 changed files with 3 additions and 61 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 # ml stuff
 wandb/
+*histogram.png

 /*.html

--- a/pdelfin/filter/imagedetect.py
+++ b/pdelfin/filter/imagedetect.py
@@ -1,51 +0,0 @@
-import numpy as np
-from pypdf import PdfReader
-from pypdf.generic import ContentStream, NameObject, NumberObject
-
-
-def process_content(content_stream, resources):
-    total_image_area = 0
-    graphics_state_stack = []
-    current_matrix = np.eye(3)
-
-    for operands, operator in content_stream.operations:
-        if operator == b"q":  # Save graphics state
-            graphics_state_stack.append(current_matrix.copy())
-        elif operator == b"Q":  # Restore graphics state
-            current_matrix = graphics_state_stack.pop()
-        elif operator == b"cm":  # Concatenate matrix to CTM
-            a, b, c, d, e, f = operands  # [a, b, c, d, e, f]
-            cm_matrix = np.array([[a, b, 0], [c, d, 0], [e, f, 1]])
-            current_matrix = np.matmul(current_matrix, cm_matrix)
-        elif operator == b"Do":  # Paint external object
-            xObjectName = operands[0]
-            if "/XObject" in resources and xObjectName in resources["/XObject"]:
-                xObject = resources["/XObject"][xObjectName]
-                if xObject["/Subtype"] == "/Image":
-                    width = xObject["/Width"]
-                    height = xObject["/Height"]
-
-                    # Calculate the area scaling factor using the absolute value of the determinant
-
-                    image_area = float(width) * float(height) * np.linalg.det(current_matrix)
-                    total_image_area += image_area
-    return total_image_area
-
-
-def pdf_page_image_area(reader: PdfReader, page_num: int) -> float:
-    page = reader.pages[page_num - 1]
-
-    page_width = float(page.mediabox.width)
-    page_height = float(page.mediabox.height)
-    page_area = page_width * page_height
-
-    content = page.get_contents()
-    if content is None:
-        return float("nan")
-
-    content_stream = ContentStream(content, reader)
-    resources = page["/Resources"]
-
-    image_area = process_content(content_stream, resources)
-
-    return image_area / page_area
--- a/tests/test_dataloader.py
+++ b/tests/test_dataloader.py
@@ -117,8 +117,8 @@ class TestBatchQueryResponseDataset(unittest.TestCase):
        print(response_data[0])

    def testPyArrowDirectJson(self):
-        query_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_train/*.jsonl"
-        response_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_train/*.json"
+        query_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_eval/*.jsonl"
+        response_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json"
        
        all_files = list_dataset_files(query_glob_path)

--- a/tests/test_filter.py
+++ b/tests/test_filter.py
@@ -4,7 +4,6 @@ import unittest
 from pypdf import PdfReader

 from pdelfin.filter import PdfFilter
-from pdelfin.filter.imagedetect import pdf_page_image_area


 class PdfFilterTest(unittest.TestCase):
@@ -16,10 +15,3 @@ class PdfFilterTest(unittest.TestCase):
            self.filter._is_form(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "form_on_later_pages.pdf"))
        )

-
-class ImageDetectionTest(unittest.TestCase):
-    def testSlideshowMostlyImages(self):
-        self.pdf = PdfReader(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "slideshow_mostly_images.pdf"))
-
-        for page in range(self.pdf.get_num_pages()):
-            print(page, pdf_page_image_area(self.pdf, page + 1))