Fix up some tests, but I don't see why this isn't working

This commit is contained in:
Jake Poznanski 2024-10-10 16:41:19 +00:00
parent 3245990216
commit b7c80cd17f
2 changed files with 39 additions and 14 deletions

View File

@@ -4,25 +4,25 @@ import os
import time
import unittest
from pdelfin.extract_text import get_document_text, get_page_text
from pdelfin.filter.coherency import get_document_coherency
from pdelfin.prompts.anchor import get_anchor_text
class TestCoherencyScores(unittest.TestCase):
def testBadOcr1(self):
good_text = get_document_text(
os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "instructions_and_schematics.pdf")
good_text = get_anchor_text(
os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "instructions_and_schematics.pdf"), 1, pdf_engine="pdftotext"
)
ocr1_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "handwriting_bad_ocr.pdf"))
ocr2_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf"))
ocr1_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "handwriting_bad_ocr.pdf"), 1, pdf_engine="pdftotext")
ocr2_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf"), 1, pdf_engine="pdftotext")
print("Good", get_document_coherency(good_text))
print("Bad1", get_document_coherency(ocr1_text))
print("Bad2", get_document_coherency(ocr2_text))
def testHugeBookCoherencySpeed(self):
base_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "ti89_guidebook.pdf"))
base_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "ti89_guidebook.pdf"), 1, pdf_engine="pdftotext")
print(f"ti89 book length: {len(base_text):,}")
warmup = get_document_coherency(base_text[:1000])
@@ -40,19 +40,19 @@ class TestCoherencyScores(unittest.TestCase):
print(f"{char_per_sec:.2f} chars per second per core")
def testTwoColumnMisparse(self):
pdftotext_text = get_page_text(
pdftotext_text = get_anchor_text(
os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),
page_num=2,
page=2,
pdf_engine="pdftotext",
)
pymupdf_text = get_page_text(
pymupdf_text = get_anchor_text(
os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),
page_num=2,
page=2,
pdf_engine="pymupdf",
)
pdfium_text = get_page_text(
pdfium_text = get_anchor_text(
os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),
page_num=2,
page=2,
pdf_engine="pdfium",
)

View File

@@ -12,12 +12,37 @@ from pdelfin.train.dataprep import (
prepare_data_for_qwen2_training, build_finetuning_prompt
)
from tqdm import tqdm
from torch.utils.data import DataLoader
from pdelfin.train.utils import make_dataset
from pdelfin.train.core.config import TrainConfig, DataConfig, SourceConfig
class TestDataprep(unittest.TestCase):
def testFullDataloader(self):
    """Build the train/valid datasets and print the shape of every field of each training entry.

    Both splits are configured with the same "eval_test" source (identical
    query and response globs, seed 42). The test makes no assertions; it
    passes as long as dataset construction and iteration do not raise.

    NOTE(review): presumably requires access to the S3 bucket and to the
    Qwen2-VL processor files -- confirm before running in CI.
    """
    query_glob = "s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_eval/*.jsonl"
    response_glob = "s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json"

    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

    def eval_split():
        # Each split gets its own DataConfig/SourceConfig instances so the
        # train and valid configs never share objects.
        source = SourceConfig(
            name="eval_test",
            query_glob_path=query_glob,
            response_glob_path=response_glob,
        )
        return DataConfig(seed=42, sources=[source])

    config = TrainConfig(train_data=eval_split(), valid_data=eval_split())
    train_dataset, _valid_dataset = make_dataset(config, processor)

    # The dataset is iterated directly here, without a DataLoader wrapper.
    for entry in train_dataset:
        shapes = {}
        for field, value in entry.items():
            shapes[field] = value.shape
        print(shapes)
def testTokenizationMatches(self):
ds = build_batch_query_response_vision_dataset(
query_glob_path="s3://ai2-oe-data/jakep/openai_batch_data_v2_mini/*.jsonl",
response_glob_path="s3://ai2-oe-data/jakep/openai_batch_done_v2_mini/*.json",
query_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_eval/*.jsonl",
response_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json",
)
example = ds[0]