diff --git a/pdelfin/prompts/anchor.py b/pdelfin/prompts/anchor.py index fa44787..c6d9104 100644 --- a/pdelfin/prompts/anchor.py +++ b/pdelfin/prompts/anchor.py @@ -39,8 +39,8 @@ def get_anchor_text(local_pdf_path: str, page: int, pdf_engine: Literal["pdftote scores = [get_document_coherency(text) for text in options] - # return option with the lowest score - return options[scores.index(min(scores))] + # return option with the best (highest) score (higher is more likely, as these are logprobs) + return options[scores.index(max(scores))] def _get_pdftotext(local_pdf_path: str, page: int) -> str: @@ -66,6 +66,5 @@ def _get_pypdf_raw(local_pdf_path: str, page: int) -> str: def _get_pdfium(local_pdf_path: str, page: int) -> str: pdf = pdfium.PdfDocument(local_pdf_path) textpage = pdf[page - 1].get_textpage() - - return textpage + return textpage.get_text_range() diff --git a/pdelfin/prompts/prompts.py b/pdelfin/prompts/prompts.py index 479ae68..dbde7ff 100644 --- a/pdelfin/prompts/prompts.py +++ b/pdelfin/prompts/prompts.py @@ -7,7 +7,7 @@ def build_openai_silver_data_prompt(base_text: str) -> str: f"Just return the plain text representation of this document as if you were reading it naturally.\n" f"Turn equations into a LaTeX representation, and tables into markdown format. 
Remove the headers and footers, but keep references and footnotes.\n" f"Read any natural handwriting.\n" - f"Strive to output the text as it appears on the page, without making any corrections\n" + f"This is likely one page out of several in the document, so be sure to preserve any sentences that come from the previous page, or continue onto the next page, exactly as they are.\n" f"If there is no text at all that you think you should read, just output [NO TEXT].\n" f"If the page has no English text on it at all, just output [NO ENGLISH TEXT].\n" f"Do not hallucinate.\n" @@ -25,6 +25,7 @@ def build_finetuning_prompt(base_text: str) -> str: f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END" ) + def extract_raw_text(prompt: str) -> str: pattern = r"RAW_TEXT_START\s*\n(.*?)\nRAW_TEXT_END" @@ -33,5 +34,5 @@ def extract_raw_text(prompt: str) -> str: if match: return match.group(1).strip() - else + else: raise ValueError("Prompt does not contain raw text") diff --git a/pdelfin/silver_data/convertsilver_birr.py b/pdelfin/silver_data/convertsilver_birr.py index 30edaef..170508e 100644 --- a/pdelfin/silver_data/convertsilver_birr.py +++ b/pdelfin/silver_data/convertsilver_birr.py @@ -63,7 +63,7 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool): try: with smart_open.open(input_file, 'r', encoding='utf-8') as infile, \ - smart_open.open(output_file, 'w', encoding='utf-8') as outfile: + smart_open.open(output_file, 'w', encoding='utf-8') as outfile: for line_number, line in enumerate(infile, 1): line = line.strip() @@ -96,6 +96,7 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool): logging.info(f"Processed '{input_file}': {processed_count} records transformed, {error_count} errors.") except Exception as e: + logging.exception(e) logging.error(f"Failed to process file {input_file}: {e}") diff --git a/tests/test_coherency.py b/tests/test_coherency.py index 4dc98a6..95fe363 100644 --- a/tests/test_coherency.py +++ 
b/tests/test_coherency.py @@ -7,6 +7,7 @@ import unittest from pdelfin.extract_text import get_document_text, get_page_text from pdelfin.filter.coherency import get_document_coherency +from pdelfin.prompts.anchor import get_anchor_text class TestCoherencyScores(unittest.TestCase): def testBadOcr1(self): @@ -63,3 +64,8 @@ class TestCoherencyScores(unittest.TestCase): print("pdfium_text", pdfium_score := get_document_coherency(pdfium_text)) self.assertLess(pdftotext_score, pymupdf_score) + self.assertLess(pdfium_score, pymupdf_score) + + anchor_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), 2, pdf_engine="topcoherency") + + self.assertEqual(anchor_text, pymupdf_text)