This commit is contained in:
Jake Poznanski 2024-10-09 20:20:06 +00:00
parent c2909f314e
commit a90feda42f
3 changed files with 7 additions and 4 deletions

View File

@@ -17,6 +17,10 @@ from pdelfin.prompts import build_finetuning_prompt
from pdelfin.prompts.anchor import get_anchor_text
from pdelfin.filter import PdfFilter
import logging
logging.getLogger("pypdf").setLevel(logging.ERROR)
pdf_filter = PdfFilter()
def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> dict:
@@ -212,8 +216,7 @@ def main():
cur_file.write("\n")
cur_file_size += request_size
pb.update(1)
pb.update(1)
except Exception as e:
print(f"Error processing a PDF: {str(e)}")

View File

@@ -7,7 +7,7 @@ from jinja2 import Template
from urllib.parse import urlparse
from difflib import SequenceMatcher
from tqdm import tqdm
-from pdelfin.silver_data.renderpdf import render_pdf_to_base64png
+from pdelfin.data.renderpdf import render_pdf_to_base64png
session = boto3.Session(profile_name='s2')
s3_client = session.client('s3')

View File

@@ -109,7 +109,7 @@ class BuildSilverTest(unittest.TestCase):
def testSmallPage(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")
-from pdelfin.silver_data.buildsilver import build_page_query
+from pdelfin.data.buildsilver import build_page_query
result = build_page_query(local_pdf_path, "s3://test.pdf", 1)