mirror of
https://github.com/allenai/olmocr.git
synced 2025-06-27 04:00:02 +00:00
Fix up some tests but I don't see why this isn't working
This commit is contained in:
parent
3245990216
commit
b7c80cd17f
@ -4,25 +4,25 @@ import os
|
||||
import time
|
||||
import unittest
|
||||
|
||||
from pdelfin.extract_text import get_document_text, get_page_text
|
||||
|
||||
from pdelfin.filter.coherency import get_document_coherency
|
||||
|
||||
from pdelfin.prompts.anchor import get_anchor_text
|
||||
|
||||
class TestCoherencyScores(unittest.TestCase):
|
||||
def testBadOcr1(self):
|
||||
good_text = get_document_text(
|
||||
os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "instructions_and_schematics.pdf")
|
||||
good_text = get_anchor_text(
|
||||
os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "instructions_and_schematics.pdf"), 1, pdf_engine="pdftotext"
|
||||
)
|
||||
ocr1_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "handwriting_bad_ocr.pdf"))
|
||||
ocr2_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf"))
|
||||
ocr1_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "handwriting_bad_ocr.pdf"), 1, pdf_engine="pdftotext")
|
||||
ocr2_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf"), 1, pdf_engine="pdftotext")
|
||||
|
||||
print("Good", get_document_coherency(good_text))
|
||||
print("Bad1", get_document_coherency(ocr1_text))
|
||||
print("Bad2", get_document_coherency(ocr2_text))
|
||||
|
||||
def testHugeBookCoherencySpeed(self):
|
||||
base_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "ti89_guidebook.pdf"))
|
||||
base_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "ti89_guidebook.pdf"), 1, pdf_engine="pdftotext")
|
||||
print(f"ti89 book length: {len(base_text):,}")
|
||||
|
||||
warmup = get_document_coherency(base_text[:1000])
|
||||
@ -40,19 +40,19 @@ class TestCoherencyScores(unittest.TestCase):
|
||||
print(f"{char_per_sec:.2f} chars per second per core")
|
||||
|
||||
def testTwoColumnMisparse(self):
|
||||
pdftotext_text = get_page_text(
|
||||
pdftotext_text = get_anchor_text(
|
||||
os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),
|
||||
page_num=2,
|
||||
page=2,
|
||||
pdf_engine="pdftotext",
|
||||
)
|
||||
pymupdf_text = get_page_text(
|
||||
pymupdf_text = get_anchor_text(
|
||||
os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),
|
||||
page_num=2,
|
||||
page=2,
|
||||
pdf_engine="pymupdf",
|
||||
)
|
||||
pdfium_text = get_page_text(
|
||||
pdfium_text = get_anchor_text(
|
||||
os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),
|
||||
page_num=2,
|
||||
page=2,
|
||||
pdf_engine="pdfium",
|
||||
)
|
||||
|
||||
|
@ -12,12 +12,37 @@ from pdelfin.train.dataprep import (
|
||||
prepare_data_for_qwen2_training, build_finetuning_prompt
|
||||
)
|
||||
|
||||
from tqdm import tqdm
|
||||
from torch.utils.data import DataLoader
|
||||
from pdelfin.train.utils import make_dataset
|
||||
from pdelfin.train.core.config import TrainConfig, DataConfig, SourceConfig
|
||||
|
||||
class TestDataprep(unittest.TestCase):
|
||||
def testFullDataloader(self):
|
||||
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
|
||||
config = TrainConfig(
|
||||
train_data=DataConfig(seed=42,
|
||||
sources=[SourceConfig(name="eval_test",
|
||||
query_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_eval/*.jsonl",
|
||||
response_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json")]),
|
||||
|
||||
valid_data=DataConfig(seed=42,
|
||||
sources=[SourceConfig(name="eval_test",
|
||||
query_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_eval/*.jsonl",
|
||||
response_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json")])
|
||||
)
|
||||
train_dataset, valid_dataset = make_dataset(config, processor)
|
||||
|
||||
#train_dataloader = DataLoader(train_dataset, batch_size=1, num_workers=4, shuffle=False)
|
||||
for entry in train_dataset:
|
||||
print({x: y.shape for (x,y) in entry.items()})
|
||||
|
||||
|
||||
|
||||
def testTokenizationMatches(self):
|
||||
ds = build_batch_query_response_vision_dataset(
|
||||
query_glob_path="s3://ai2-oe-data/jakep/openai_batch_data_v2_mini/*.jsonl",
|
||||
response_glob_path="s3://ai2-oe-data/jakep/openai_batch_done_v2_mini/*.json",
|
||||
query_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_eval/*.jsonl",
|
||||
response_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json",
|
||||
)
|
||||
|
||||
example = ds[0]
|
||||
|
Loading…
x
Reference in New Issue
Block a user