mirror of
https://github.com/allenai/olmocr.git
synced 2025-09-03 05:41:22 +00:00
bugfixes
This commit is contained in:
parent
c2909f314e
commit
a90feda42f
@ -17,6 +17,10 @@ from pdelfin.prompts import build_finetuning_prompt
|
|||||||
from pdelfin.prompts.anchor import get_anchor_text
|
from pdelfin.prompts.anchor import get_anchor_text
|
||||||
from pdelfin.filter import PdfFilter
|
from pdelfin.filter import PdfFilter
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logging.getLogger("pypdf").setLevel(logging.ERROR)
|
||||||
|
|
||||||
pdf_filter = PdfFilter()
|
pdf_filter = PdfFilter()
|
||||||
|
|
||||||
def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> dict:
|
def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> dict:
|
||||||
@ -212,8 +216,7 @@ def main():
|
|||||||
cur_file.write("\n")
|
cur_file.write("\n")
|
||||||
cur_file_size += request_size
|
cur_file_size += request_size
|
||||||
|
|
||||||
pb.update(1)
|
pb.update(1)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error processing a PDF: {str(e)}")
|
print(f"Error processing a PDF: {str(e)}")
|
||||||
|
|
||||||
|
@ -7,7 +7,7 @@ from jinja2 import Template
|
|||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from difflib import SequenceMatcher
|
from difflib import SequenceMatcher
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from pdelfin.silver_data.renderpdf import render_pdf_to_base64png
|
from pdelfin.data.renderpdf import render_pdf_to_base64png
|
||||||
|
|
||||||
session = boto3.Session(profile_name='s2')
|
session = boto3.Session(profile_name='s2')
|
||||||
s3_client = session.client('s3')
|
s3_client = session.client('s3')
|
||||||
|
@ -109,7 +109,7 @@ class BuildSilverTest(unittest.TestCase):
|
|||||||
def testSmallPage(self):
|
def testSmallPage(self):
|
||||||
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")
|
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")
|
||||||
|
|
||||||
from pdelfin.silver_data.buildsilver import build_page_query
|
from pdelfin.data.buildsilver import build_page_query
|
||||||
|
|
||||||
result = build_page_query(local_pdf_path, "s3://test.pdf", 1)
|
result = build_page_query(local_pdf_path, "s3://test.pdf", 1)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user