coherency based anchor text

2025-12-01 17:50:33 +00:00 · 2024-10-01 20:19:03 +00:00 · 2024-10-01 20:19:03 +00:00 · 09e8840c56
commit 09e8840c56
parent 28fe314539
4 changed files with 14 additions and 7 deletions
--- a/pdelfin/prompts/anchor.py
+++ b/pdelfin/prompts/anchor.py
@ -39,8 +39,8 @@ def get_anchor_text(local_pdf_path: str, page: int, pdf_engine: Literal["pdftote

        scores = [get_document_coherency(text) for text in options]

-        # return option with the lowest score
-        return options[scores.index(min(scores))]
+        # return option with the best (highest) score (higher is more likely, as these are logprobs)
+        return options[scores.index(max(scores))]


 def _get_pdftotext(local_pdf_path: str, page: int) -> str:
@ -66,6 +66,5 @@ def _get_pypdf_raw(local_pdf_path: str, page: int) -> str:
 def _get_pdfium(local_pdf_path: str, page: int) -> str:
    pdf = pdfium.PdfDocument(local_pdf_path)
    textpage = pdf[page - 1].get_textpage()
-
-    return textpage
+    return textpage.get_text_range()

--- a/pdelfin/prompts/prompts.py
+++ b/pdelfin/prompts/prompts.py
@ -7,7 +7,7 @@ def build_openai_silver_data_prompt(base_text: str) -> str:
        f"Just return the plain text representation of this document as if you were reading it naturally.\n"
        f"Turn equations into a LaTeX representation, and tables into markdown format. Remove the headers and footers, but keep references and footnotes.\n"
        f"Read any natural handwriting.\n"
-        f"Strive to output the text as it appears on the page, without making any corrections\n"
+        f"This is likely one page out of several in the document, so be sure to preserve any sentences that come from the previous page, or continue onto the next page, exactly as they are.\n"
        f"If there is no text at all that you think you should read, just output [NO TEXT].\n"
        f"If the page has no English text on it at all, just output [NO ENGLISH TEXT].\n"
        f"Do not hallucinate.\n"
@ -25,6 +25,7 @@ def build_finetuning_prompt(base_text: str) -> str:
        f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
    )

+
 def extract_raw_text(prompt: str) -> str:
    pattern = r"RAW_TEXT_START\s*\n(.*?)\nRAW_TEXT_END"

@ -33,5 +34,5 @@ def extract_raw_text(prompt: str) -> str:

    if match:
        return match.group(1).strip()
-    else
+    else:
        raise ValueError("Prompt does not contain raw text")
--- a/pdelfin/silver_data/convertsilver_birr.py
+++ b/pdelfin/silver_data/convertsilver_birr.py
@ -63,7 +63,7 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):

    try:
        with smart_open.open(input_file, 'r', encoding='utf-8') as infile, \
-                smart_open.open(output_file, 'w', encoding='utf-8') as outfile:
+             smart_open.open(output_file, 'w', encoding='utf-8') as outfile:

            for line_number, line in enumerate(infile, 1):
                line = line.strip()
@ -96,6 +96,7 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):

        logging.info(f"Processed '{input_file}': {processed_count} records transformed, {error_count} errors.")
    except Exception as e:
+        logging.exception(e)
        logging.error(f"Failed to process file {input_file}: {e}")


--- a/tests/test_coherency.py
+++ b/tests/test_coherency.py
@ -7,6 +7,7 @@ import unittest
 from pdelfin.extract_text import get_document_text, get_page_text
 from pdelfin.filter.coherency import get_document_coherency

+from pdelfin.prompts.anchor import get_anchor_text

 class TestCoherencyScores(unittest.TestCase):
    def testBadOcr1(self):
@ -63,3 +64,8 @@ class TestCoherencyScores(unittest.TestCase):
        print("pdfium_text", pdfium_score := get_document_coherency(pdfium_text))

        self.assertLess(pdftotext_score, pymupdf_score)
+        self.assertLess(pdfium_score, pymupdf_score)
+
+        anchor_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), 2, pdf_engine="topcoherency")
+
+        self.assertEqual(anchor_text, pymupdf_text)