diff --git a/pdelfin/data/runpipeline.py b/pdelfin/data/runpipeline.py index 0ab9f03..99cc09f 100644 --- a/pdelfin/data/runpipeline.py +++ b/pdelfin/data/runpipeline.py @@ -17,6 +17,10 @@ from pdelfin.prompts import build_finetuning_prompt from pdelfin.prompts.anchor import get_anchor_text from pdelfin.filter import PdfFilter +import logging + +logging.getLogger("pypdf").setLevel(logging.ERROR) + pdf_filter = PdfFilter() def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> dict: @@ -212,8 +216,7 @@ def main(): cur_file.write("\n") cur_file_size += request_size - pb.update(1) - + pb.update(1) except Exception as e: print(f"Error processing a PDF: {str(e)}") diff --git a/pdelfin/eval/evalhtml.py b/pdelfin/eval/evalhtml.py index 6dcc0c3..8ab4b35 100644 --- a/pdelfin/eval/evalhtml.py +++ b/pdelfin/eval/evalhtml.py @@ -7,7 +7,7 @@ from jinja2 import Template from urllib.parse import urlparse from difflib import SequenceMatcher from tqdm import tqdm -from pdelfin.silver_data.renderpdf import render_pdf_to_base64png +from pdelfin.data.renderpdf import render_pdf_to_base64png session = boto3.Session(profile_name='s2') s3_client = session.client('s3') diff --git a/tests/test_anchor.py b/tests/test_anchor.py index 2c7fd02..225ea27 100644 --- a/tests/test_anchor.py +++ b/tests/test_anchor.py @@ -109,7 +109,7 @@ class BuildSilverTest(unittest.TestCase): def testSmallPage(self): local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf") - from pdelfin.silver_data.buildsilver import build_page_query + from pdelfin.data.buildsilver import build_page_query result = build_page_query(local_pdf_path, "s3://test.pdf", 1)