mirror of https://github.com/allenai/olmocr.git (synced 2025-10-02 20:09:08 +00:00)
coherency based anchor text
This commit is contained in:
parent 28fe314539
commit 09e8840c56
@@ -39,8 +39,8 @@ def get_anchor_text(local_pdf_path: str, page: int, pdf_engine: Literal["pdftote

     scores = [get_document_coherency(text) for text in options]

-    # return option with the lowest score
-    return options[scores.index(min(scores))]
+    # return option with the best (highest) score (higher is more likely, as these are logprobs)
+    return options[scores.index(max(scores))]


 def _get_pdftotext(local_pdf_path: str, page: int) -> str:
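Note: as context for the min-to-max change above, here is a minimal sketch of the selection step, assuming options holds one candidate anchor text per extraction engine and that get_document_coherency returns a log-probability-style score where higher means more coherent; the helper name pick_most_coherent is illustrative, not from the repo.

    from pdelfin.filter.coherency import get_document_coherency

    def pick_most_coherent(options: list[str]) -> str:
        # Score each candidate extraction; scores are logprob-based, so the
        # highest-scoring option is the most natural-looking text.
        scores = [get_document_coherency(text) for text in options]
        return options[scores.index(max(scores))]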
@@ -66,6 +66,5 @@ def _get_pypdf_raw(local_pdf_path: str, page: int) -> str:
 def _get_pdfium(local_pdf_path: str, page: int) -> str:
     pdf = pdfium.PdfDocument(local_pdf_path)
     textpage = pdf[page - 1].get_textpage()
-
-    return textpage
+    return textpage.get_text_range()

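Note: a standalone sketch of the pypdfium2 calls used in _get_pdfium, assuming the usual import pypdfium2 as pdfium alias; the file path is a placeholder.

    import pypdfium2 as pdfium

    pdf = pdfium.PdfDocument("example.pdf")   # placeholder path
    textpage = pdf[0].get_textpage()          # PdfDocument indexing is 0-based
    text = textpage.get_text_range()          # plain text of the page, as returned by the fixed function
    print(text)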
@@ -7,7 +7,7 @@ def build_openai_silver_data_prompt(base_text: str) -> str:
         f"Just return the plain text representation of this document as if you were reading it naturally.\n"
         f"Turn equations into a LaTeX representation, and tables into markdown format. Remove the headers and footers, but keep references and footnotes.\n"
         f"Read any natural handwriting.\n"
-        f"Strive to output the text as it appears on the page, without making any corrections\n"
+        f"This is likely one page out of several in the document, so be sure to preserve any sentences that come from the previous page, or continue onto the next page, exactly as they are.\n"
         f"If there is no text at all that you think you should read, just output [NO TEXT].\n"
         f"If the page has no English text on it at all, just output [NO ENGLISH TEXT].\n"
         f"Do not hallucinate.\n"
@@ -25,6 +25,7 @@ def build_finetuning_prompt(base_text: str) -> str:
         f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
     )

+
 def extract_raw_text(prompt: str) -> str:
     pattern = r"RAW_TEXT_START\s*\n(.*?)\nRAW_TEXT_END"

@@ -33,5 +34,5 @@ def extract_raw_text(prompt: str) -> str:

     if match:
         return match.group(1).strip()
-    else
+    else:
         raise ValueError("Prompt does not contain raw text")
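Note: a small sketch of the round trip implied by build_finetuning_prompt and extract_raw_text, assuming the search uses a DOTALL-style match so the capture can span multiple lines; the sample prompt string is illustrative.

    import re

    pattern = r"RAW_TEXT_START\s*\n(.*?)\nRAW_TEXT_END"
    prompt = "RAW_TEXT_START\nSome previously extracted page text\nRAW_TEXT_END"

    match = re.search(pattern, prompt, re.DOTALL)  # DOTALL lets (.*?) cross newlines
    if match:
        print(match.group(1).strip())  # -> "Some previously extracted page text"
    else:
        raise ValueError("Prompt does not contain raw text")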
@@ -63,7 +63,7 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):

     try:
         with smart_open.open(input_file, 'r', encoding='utf-8') as infile, \
              smart_open.open(output_file, 'w', encoding='utf-8') as outfile:

             for line_number, line in enumerate(infile, 1):
                 line = line.strip()
@@ -96,6 +96,7 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):

         logging.info(f"Processed '{input_file}': {processed_count} records transformed, {error_count} errors.")
     except Exception as e:
+        logging.exception(e)
         logging.error(f"Failed to process file {input_file}: {e}")


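Note: a stripped-down sketch of the error-handling pattern in process_file after this change; only smart_open.open and the logging calls come from the diff, the function name process_file_sketch is illustrative, and the per-line write is a stand-in for the real record transform.

    import logging
    import smart_open

    def process_file_sketch(input_file: str, output_file: str) -> None:
        processed_count = error_count = 0
        try:
            with smart_open.open(input_file, 'r', encoding='utf-8') as infile, \
                 smart_open.open(output_file, 'w', encoding='utf-8') as outfile:
                for line_number, line in enumerate(infile, 1):
                    line = line.strip()
                    if not line:
                        continue
                    outfile.write(line + "\n")  # stand-in for the real record rewrite
                    processed_count += 1
            logging.info(f"Processed '{input_file}': {processed_count} records transformed, {error_count} errors.")
        except Exception as e:
            logging.exception(e)  # full traceback, newly added in this commit
            logging.error(f"Failed to process file {input_file}: {e}")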
@@ -7,6 +7,7 @@ import unittest
 from pdelfin.extract_text import get_document_text, get_page_text
 from pdelfin.filter.coherency import get_document_coherency

+from pdelfin.prompts.anchor import get_anchor_text

 class TestCoherencyScores(unittest.TestCase):
     def testBadOcr1(self):
@@ -63,3 +64,8 @@ class TestCoherencyScores(unittest.TestCase):
         print("pdfium_text", pdfium_score := get_document_coherency(pdfium_text))

         self.assertLess(pdftotext_score, pymupdf_score)
+        self.assertLess(pdfium_score, pymupdf_score)
+
+        anchor_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), 2, pdf_engine="topcoherency")
+
+        self.assertEqual(anchor_text, pymupdf_text)