mirror of
https://github.com/allenai/olmocr.git
synced 2025-12-12 15:51:26 +00:00
Refactoring
This commit is contained in:
parent
0c56dec704
commit
4bf6e7a430
1
.gitignore
vendored
1
.gitignore
vendored
@ -1,5 +1,6 @@
|
||||
# ml stuff
|
||||
wandb/
|
||||
*histogram.png
|
||||
|
||||
/*.html
|
||||
|
||||
|
||||
@ -1,51 +0,0 @@
|
||||
import numpy as np
|
||||
from pypdf import PdfReader
|
||||
from pypdf.generic import ContentStream, NameObject, NumberObject
|
||||
|
||||
|
||||
def process_content(content_stream, resources):
    """Sum the painted area of all image XObjects drawn by a content stream.

    Walks the stream's operations while tracking the current transformation
    matrix (CTM) through ``q`` / ``Q`` / ``cm`` operators. For every ``Do``
    operator that references an XObject whose ``/Subtype`` is ``/Image``, the
    image's ``/Width`` * ``/Height`` scaled by ``|det(CTM)|`` is added to the
    running total.

    Only XObjects found directly in ``resources["/XObject"]`` are considered;
    other XObject subtypes are ignored and not descended into.

    Args:
        content_stream: Object exposing ``operations``, an iterable of
            ``(operands, operator)`` pairs where ``operator`` is ``bytes``.
        resources: Mapping that may contain an ``"/XObject"`` mapping from
            XObject names to XObject dictionaries.

    Returns:
        The accumulated image area as a ``float`` (``0.0`` if no image is
        painted).
    """
    total_image_area = 0.0
    graphics_state_stack = []
    current_matrix = np.eye(3)

    for operands, operator in content_stream.operations:
        if operator == b"q":  # Save graphics state
            graphics_state_stack.append(current_matrix.copy())
        elif operator == b"Q":  # Restore graphics state
            # Malformed streams can contain an unbalanced Q; keep the current
            # matrix instead of crashing on an empty stack.
            if graphics_state_stack:
                current_matrix = graphics_state_stack.pop()
        elif operator == b"cm":  # Concatenate matrix to CTM
            # Coerce to plain floats so numpy always builds a numeric array,
            # whatever number type the parser produced for the operands.
            a, b, c, d, e, f = (float(value) for value in operands)
            cm_matrix = np.array([[a, b, 0.0], [c, d, 0.0], [e, f, 1.0]])
            current_matrix = np.matmul(current_matrix, cm_matrix)
        elif operator == b"Do":  # Paint external object
            x_object_name = operands[0]
            if "/XObject" not in resources or x_object_name not in resources["/XObject"]:
                continue

            x_object = resources["/XObject"][x_object_name]
            if x_object["/Subtype"] != "/Image":
                continue

            width = float(x_object["/Width"])
            height = float(x_object["/Height"])

            # Use the absolute value of the determinant as the area scaling
            # factor: a flip (negative scale) yields a negative determinant
            # and must not subtract from the total.
            # NOTE(review): the PDF imaging model paints an image into the
            # unit square of user space, so |det(CTM)| alone may already be
            # the painted area; the pixel-dimension factor is kept from the
            # original implementation -- confirm the intended units.
            area_scale = abs(float(np.linalg.det(current_matrix)))
            total_image_area += width * height * area_scale

    return total_image_area
|
||||
|
||||
|
||||
def pdf_page_image_area(reader: PdfReader, page_num: int) -> float:
    """Return the ratio of summed image area to page area for one page.

    Args:
        reader: Open ``PdfReader`` for the document.
        page_num: 1-based page number; the page at index ``page_num - 1`` of
            ``reader.pages`` is inspected.

    Returns:
        The value of ``process_content`` for the page divided by the media
        box area, or ``nan`` when the page has no content stream or its
        media box has zero area.
    """
    page = reader.pages[page_num - 1]

    page_width = float(page.mediabox.width)
    page_height = float(page.mediabox.height)
    page_area = page_width * page_height

    content = page.get_contents()
    if content is None:
        return float("nan")

    # A degenerate media box has no meaningful ratio; report it the same way
    # as a page without contents rather than dividing by zero.
    if page_area == 0:
        return float("nan")

    content_stream = ContentStream(content, reader)
    # Pages without a resources dictionary cannot reference any XObject, so
    # fall back to an empty mapping instead of raising KeyError.
    resources = page["/Resources"] if "/Resources" in page else {}

    image_area = process_content(content_stream, resources)

    return image_area / page_area
|
||||
@ -117,8 +117,8 @@ class TestBatchQueryResponseDataset(unittest.TestCase):
|
||||
print(response_data[0])
|
||||
|
||||
def testPyArrowDirectJson(self):
|
||||
query_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_train/*.jsonl"
|
||||
response_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_train/*.json"
|
||||
query_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_eval/*.jsonl"
|
||||
response_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json"
|
||||
|
||||
all_files = list_dataset_files(query_glob_path)
|
||||
|
||||
|
||||
@ -4,7 +4,6 @@ import unittest
|
||||
from pypdf import PdfReader
|
||||
|
||||
from pdelfin.filter import PdfFilter
|
||||
from pdelfin.filter.imagedetect import pdf_page_image_area
|
||||
|
||||
|
||||
class PdfFilterTest(unittest.TestCase):
|
||||
@ -16,10 +15,3 @@ class PdfFilterTest(unittest.TestCase):
|
||||
self.filter._is_form(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "form_on_later_pages.pdf"))
|
||||
)
|
||||
|
||||
|
||||
class ImageDetectionTest(unittest.TestCase):
    """Smoke test for per-page image-area detection on a sample PDF."""

    def testSlideshowMostlyImages(self):
        """Print the image-area value of every page in the sample slideshow."""
        fixture_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "slideshow_mostly_images.pdf")
        self.pdf = PdfReader(fixture_path)

        page_count = self.pdf.get_num_pages()
        for page_index in range(page_count):
            # pdf_page_image_area is called with page_index + 1.
            page_number = page_index + 1
            print(page_index, pdf_page_image_area(self.pdf, page_number))
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user