Refactoring

This commit is contained in:
Jake Poznanski 2024-10-09 18:11:18 +00:00
parent 0c56dec704
commit 4bf6e7a430
4 changed files with 3 additions and 61 deletions

1
.gitignore vendored
View File

@ -1,5 +1,6 @@
# ml stuff
wandb/
*histogram.png
/*.html

View File

@ -1,51 +0,0 @@
import numpy as np
from pypdf import PdfReader
from pypdf.generic import ContentStream, NameObject, NumberObject
def process_content(content_stream, resources):
    """Sum the scaled areas of all image XObjects painted by a content stream.

    The stream's operations are replayed while tracking the current
    transformation matrix (CTM): ``q`` pushes a copy of the CTM, ``Q``
    restores the most recently pushed one, and ``cm`` concatenates a new
    matrix onto it. For every ``Do`` operator that paints an XObject whose
    ``/Subtype`` is ``/Image``, the image's ``/Width`` times ``/Height`` is
    scaled by the absolute value of the CTM determinant and added to the total.

    Args:
        content_stream: Object exposing ``operations``, an iterable of
            ``(operands, operator)`` pairs where ``operator`` is a bytes token.
        resources: Mapping that may contain an ``"/XObject"`` mapping from
            XObject names to objects with ``/Subtype``, ``/Width`` and
            ``/Height`` entries.

    Returns:
        The accumulated area, or ``0`` when no image is painted.
    """
    total_image_area = 0
    graphics_state_stack = []
    current_matrix = np.eye(3)

    for operands, operator in content_stream.operations:
        if operator == b"q":  # Save graphics state
            graphics_state_stack.append(current_matrix.copy())
        elif operator == b"Q":  # Restore graphics state
            # A stream with more Q than q operators would otherwise raise
            # IndexError here; keep the current matrix for a stray Q.
            if graphics_state_stack:
                current_matrix = graphics_state_stack.pop()
        elif operator == b"cm":  # Concatenate matrix to CTM
            a, b, c, d, e, f = operands  # [a, b, c, d, e, f]
            cm_matrix = np.array([[a, b, 0], [c, d, 0], [e, f, 1]], dtype=float)
            # Only the determinant of the product is used below, and
            # det(AB) == det(BA), so the multiplication order does not
            # affect the computed area.
            current_matrix = np.matmul(current_matrix, cm_matrix)
        elif operator == b"Do":  # Paint external object
            xobject_name = operands[0]
            if "/XObject" in resources and xobject_name in resources["/XObject"]:
                xobject = resources["/XObject"][xobject_name]
                if xobject["/Subtype"] == "/Image":
                    width = xobject["/Width"]
                    height = xobject["/Height"]
                    # Use the absolute value of the determinant: a mirrored
                    # image has a negative determinant but still covers a
                    # positive area.
                    # NOTE(review): /Width and /Height look like sample
                    # counts; the PDF spec appears to map an image onto the
                    # unit square, in which case the area would be
                    # abs(det) alone - confirm before relying on the scale.
                    area_scale = abs(np.linalg.det(current_matrix))
                    total_image_area += float(width) * float(height) * area_scale

    return total_image_area
def pdf_page_image_area(reader: PdfReader, page_num: int) -> float:
    """Return the ratio of painted image area to media box area for one page.

    Args:
        reader: Reader for the document being inspected.
        page_num: Page number; the page at index ``page_num - 1`` of
            ``reader.pages`` is examined.

    Returns:
        The total from ``process_content`` divided by the media box
        width times height, or NaN when the page has no contents.
    """
    target_page = reader.pages[page_num - 1]

    media_box_width = float(target_page.mediabox.width)
    media_box_height = float(target_page.mediabox.height)
    total_page_area = media_box_width * media_box_height

    raw_contents = target_page.get_contents()
    if raw_contents is None:
        # No content stream on this page: the ratio is reported as NaN.
        return float("nan")

    painted_area = process_content(
        ContentStream(raw_contents, reader),
        target_page["/Resources"],
    )
    return painted_area / total_page_area

View File

@ -117,8 +117,8 @@ class TestBatchQueryResponseDataset(unittest.TestCase):
print(response_data[0])
def testPyArrowDirectJson(self):
query_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_train/*.jsonl"
response_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_train/*.json"
query_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_eval/*.jsonl"
response_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json"
all_files = list_dataset_files(query_glob_path)

View File

@ -4,7 +4,6 @@ import unittest
from pypdf import PdfReader
from pdelfin.filter import PdfFilter
from pdelfin.filter.imagedetect import pdf_page_image_area
class PdfFilterTest(unittest.TestCase):
@ -16,10 +15,3 @@ class PdfFilterTest(unittest.TestCase):
self.filter._is_form(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "form_on_later_pages.pdf"))
)
class ImageDetectionTest(unittest.TestCase):
    """Exercise ``pdf_page_image_area`` on a bundled slideshow PDF."""

    def testSlideshowMostlyImages(self):
        """Print the image-area ratio of each page; nothing is asserted."""
        fixture_path = os.path.join(
            os.path.dirname(__file__), "gnarly_pdfs", "slideshow_mostly_images.pdf"
        )
        self.pdf = PdfReader(fixture_path)

        page_count = self.pdf.get_num_pages()
        for page_index in range(page_count):
            # pdf_page_image_area is called with page_index + 1.
            area_ratio = pdf_page_image_area(self.pdf, page_index + 1)
            print(page_index, area_ratio)