mirror of
https://github.com/allenai/olmocr.git
synced 2025-08-31 12:25:54 +00:00
bugfixes
This commit is contained in:
parent
c2909f314e
commit
a90feda42f
@ -17,6 +17,10 @@ from pdelfin.prompts import build_finetuning_prompt
|
||||
from pdelfin.prompts.anchor import get_anchor_text
|
||||
from pdelfin.filter import PdfFilter
|
||||
|
||||
import logging
|
||||
|
||||
logging.getLogger("pypdf").setLevel(logging.ERROR)
|
||||
|
||||
pdf_filter = PdfFilter()
|
||||
|
||||
def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> dict:
|
||||
@ -212,8 +216,7 @@ def main():
|
||||
cur_file.write("\n")
|
||||
cur_file_size += request_size
|
||||
|
||||
pb.update(1)
|
||||
|
||||
pb.update(1)
|
||||
except Exception as e:
|
||||
print(f"Error processing a PDF: {str(e)}")
|
||||
|
||||
|
@ -7,7 +7,7 @@ from jinja2 import Template
|
||||
from urllib.parse import urlparse
|
||||
from difflib import SequenceMatcher
|
||||
from tqdm import tqdm
|
||||
from pdelfin.silver_data.renderpdf import render_pdf_to_base64png
|
||||
from pdelfin.data.renderpdf import render_pdf_to_base64png
|
||||
|
||||
session = boto3.Session(profile_name='s2')
|
||||
s3_client = session.client('s3')
|
||||
|
@ -109,7 +109,7 @@ class BuildSilverTest(unittest.TestCase):
|
||||
def testSmallPage(self):
|
||||
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")
|
||||
|
||||
from pdelfin.silver_data.buildsilver import build_page_query
|
||||
from pdelfin.data.buildsilver import build_page_query
|
||||
|
||||
result = build_page_query(local_pdf_path, "s3://test.pdf", 1)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user