coherency based anchor text

This commit is contained in:
Jake Poznanski 2024-10-01 20:19:03 +00:00
parent 28fe314539
commit 09e8840c56
4 changed files with 14 additions and 7 deletions

View File

@ -39,8 +39,8 @@ def get_anchor_text(local_pdf_path: str, page: int, pdf_engine: Literal["pdftote
scores = [get_document_coherency(text) for text in options]
# return option with the lowest score
return options[scores.index(min(scores))]
# return option with the best (highest) score (higher is more likely, as these are logprobs)
return options[scores.index(max(scores))]
def _get_pdftotext(local_pdf_path: str, page: int) -> str:
@ -66,6 +66,5 @@ def _get_pypdf_raw(local_pdf_path: str, page: int) -> str:
def _get_pdfium(local_pdf_path: str, page: int) -> str:
pdf = pdfium.PdfDocument(local_pdf_path)
textpage = pdf[page - 1].get_textpage()
return textpage
return textpage.get_text_range()

View File

@ -7,7 +7,7 @@ def build_openai_silver_data_prompt(base_text: str) -> str:
f"Just return the plain text representation of this document as if you were reading it naturally.\n"
f"Turn equations into a LaTeX representation, and tables into markdown format. Remove the headers and footers, but keep references and footnotes.\n"
f"Read any natural handwriting.\n"
f"Strive to output the text as it appears on the page, without making any corrections\n"
f"This is likely one page out of several in the document, so be sure to preserve any sentences that come from the previous page, or continue onto the next page, exactly as they are.\n"
f"If there is no text at all that you think you should read, just output [NO TEXT].\n"
f"If the page has no English text on it at all, just output [NO ENGLISH TEXT].\n"
f"Do not hallucinate.\n"
@ -25,6 +25,7 @@ def build_finetuning_prompt(base_text: str) -> str:
f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
)
def extract_raw_text(prompt: str) -> str:
pattern = r"RAW_TEXT_START\s*\n(.*?)\nRAW_TEXT_END"
@ -33,5 +34,5 @@ def extract_raw_text(prompt: str) -> str:
if match:
return match.group(1).strip()
else
else:
raise ValueError("Prompt does not contain raw text")

View File

@ -63,7 +63,7 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):
try:
with smart_open.open(input_file, 'r', encoding='utf-8') as infile, \
smart_open.open(output_file, 'w', encoding='utf-8') as outfile:
smart_open.open(output_file, 'w', encoding='utf-8') as outfile:
for line_number, line in enumerate(infile, 1):
line = line.strip()
@ -96,6 +96,7 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):
logging.info(f"Processed '{input_file}': {processed_count} records transformed, {error_count} errors.")
except Exception as e:
logging.exception(e)
logging.error(f"Failed to process file {input_file}: {e}")

View File

@ -7,6 +7,7 @@ import unittest
from pdelfin.extract_text import get_document_text, get_page_text
from pdelfin.filter.coherency import get_document_coherency
from pdelfin.prompts.anchor import get_anchor_text
class TestCoherencyScores(unittest.TestCase):
def testBadOcr1(self):
@ -63,3 +64,8 @@ class TestCoherencyScores(unittest.TestCase):
print("pdfium_text", pdfium_score := get_document_coherency(pdfium_text))
self.assertLess(pdftotext_score, pymupdf_score)
self.assertLess(pdfium_score, pymupdf_score)
anchor_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), 2, pdf_engine="topcoherency")
self.assertEqual(anchor_text, pymupdf_text)