mirror of
https://github.com/allenai/olmocr.git
synced 2025-08-21 07:12:25 +00:00
Trying to make it faster
This commit is contained in:
parent
278422b8ff
commit
fd17652d55
@ -121,13 +121,8 @@ class PageReport:
|
|||||||
text_elements: List[TextElement]
|
text_elements: List[TextElement]
|
||||||
image_elements: List[ImageElement]
|
image_elements: List[ImageElement]
|
||||||
|
|
||||||
@lru_cache(maxsize=5)
|
|
||||||
def _get_cached_pdf_reader(local_pdf_path: str) -> PdfReader:
|
|
||||||
# Cached, because you are going to often iterate through a whole pdf, so this will make it a lot faster on subsequent iterations
|
|
||||||
return PdfReader(local_pdf_path)
|
|
||||||
|
|
||||||
def _pdf_report(local_pdf_path: str, page_num: int) -> PageReport:
|
def _pdf_report(local_pdf_path: str, page_num: int) -> PageReport:
|
||||||
reader = _get_cached_pdf_reader(local_pdf_path)
|
reader = PdfReader(local_pdf_path)
|
||||||
page = reader.pages[page_num - 1]
|
page = reader.pages[page_num - 1]
|
||||||
resources = page.get("/Resources", {})
|
resources = page.get("/Resources", {})
|
||||||
xobjects = resources.get("/XObject", {})
|
xobjects = resources.get("/XObject", {})
|
||||||
|
Loading…
x
Reference in New Issue
Block a user