Mirror of https://github.com/allenai/olmocr.git (synced 2025-12-28 07:34:13 +00:00)

Commit 28d52602e9 (parent 606e81bfea): More test code
@@ -9,6 +9,7 @@ from unittest.mock import patch, AsyncMock
import os
import json
import tempfile
import math
import base64
import torch
from io import BytesIO
@@ -18,7 +19,7 @@ from pathlib import Path
from pdelfin.beakerpipeline import sglang_server_task, sglang_server_ready, build_page_query, SGLANG_SERVER_PORT, render_pdf_to_base64png, get_anchor_text, download_directory
from pdelfin.prompts import PageResponse
from httpx import AsyncClient

import torch.nn.functional as F
MODEL_FINETUNED_PATH = "s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/checkpoint-9500/bf16/"

EDGAR_TEXT = (
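For context, MODEL_FINETUNED_PATH points at an S3 prefix, and the test module imports download_directory to fetch such checkpoints. As a rough sketch of what a fetch like that involves (a generic boto3 loop; download_prefix is a hypothetical stand-in, since download_directory's actual signature is not shown in this diff):

import os

import boto3

def download_prefix(s3_uri: str, local_dir: str) -> None:
    # Split "s3://bucket/prefix/..." into bucket name and key prefix
    bucket, _, prefix = s3_uri.removeprefix("s3://").partition("/")
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            if obj["Key"].endswith("/"):  # skip directory markers
                continue
            rel = obj["Key"][len(prefix):].lstrip("/")
            dest = os.path.join(local_dir, rel)
            os.makedirs(os.path.dirname(dest), exist_ok=True)
            s3.download_file(bucket, obj["Key"], dest)

# e.g. download_prefix(MODEL_FINETUNED_PATH, tempfile.mkdtemp())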
@@ -52,7 +53,7 @@ class TestSglangServer(unittest.IsolatedAsyncioTestCase):

        # Set up a semaphore for server tasks
        self.semaphore = asyncio.Semaphore(1)

        self.maxDiff = None

        # Start the sglang server
        self.my_server_task = asyncio.create_task(sglang_server_task(self.args, self.semaphore))
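The setUp above uses a standard asyncio pattern: the sglang server runs as a background task created with asyncio.create_task, and a Semaphore(1) serializes access to it. A minimal self-contained sketch of that pattern, where start_dummy_server, wait_until_ready, and PORT are illustrative stand-ins rather than pdelfin's sglang_server_task, sglang_server_ready, and SGLANG_SERVER_PORT:

import asyncio

PORT = 8765  # stand-in for SGLANG_SERVER_PORT

async def start_dummy_server() -> None:
    # Stand-in for sglang_server_task: serve until the task is cancelled
    server = await asyncio.start_server(lambda r, w: w.close(), "127.0.0.1", PORT)
    async with server:
        await server.serve_forever()

async def wait_until_ready(timeout: float = 5.0) -> None:
    # Stand-in for sglang_server_ready: retry until a TCP connect succeeds
    async def poll() -> None:
        while True:
            try:
                _, writer = await asyncio.open_connection("127.0.0.1", PORT)
                writer.close()
                return
            except OSError:
                await asyncio.sleep(0.1)
    await asyncio.wait_for(poll(), timeout)

async def main() -> None:
    semaphore = asyncio.Semaphore(1)  # at most one request in flight
    server_task = asyncio.create_task(start_dummy_server())
    await wait_until_ready()
    async with semaphore:
        pass  # issue requests against the server here
    server_task.cancel()

asyncio.run(main())

Waiting for readiness before the first request avoids racing the server's startup, which is presumably what the test uses sglang_server_ready for.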
@@ -135,6 +136,7 @@ class TestHuggingFaceModel(unittest.IsolatedAsyncioTestCase):

        # Path to the test PDF
        self.test_pdf_path = Path(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "edgar.pdf"))
        self.maxDiff = None

    async def test_hugging_face_generation(self):
        query = await build_page_query(
@@ -176,31 +178,49 @@ class TestHuggingFaceModel(unittest.IsolatedAsyncioTestCase):
        generation_output = self.model.generate(
            **inputs,
            temperature=0.0,
-            max_new_tokens=3000,
+            max_new_tokens=1,
            max_length=8192,
            num_return_sequences=1,
            do_sample=False,
            output_scores=True,
            return_dict_in_generate=True,
        )

        # Decode the output
        decoded_output = self.tokenizer.decode(generation_output[0], skip_special_tokens=True)
        print(generation_output.scores)

        print(decoded_output)
        # Extract the generated token's log probabilities
        scores = generation_output.scores  # Tuple of length 1
        logits = scores[0]  # Tensor of shape (batch_size, vocab_size)
        log_probs = F.log_softmax(logits, dim=-1)  # Apply log softmax to get log probabilities

        # Convert the decoded output into the expected PageResponse structure
        input_length = inputs["input_ids"].shape[1]
        # Get top 5 tokens and their log probabilities
        topk_log_probs, topk_indices = torch.topk(log_probs[0], k=5)
        topk_tokens = self.tokenizer.convert_ids_to_tokens(topk_indices.tolist())

        # Decode the output and extract only the new part
        decoded_output = self.tokenizer.decode(generation_output[0], skip_special_tokens=True)
        new_part = self.tokenizer.decode(generation_output[0][input_length:], skip_special_tokens=True)
        print("Top 5 tokens and their log probabilities:")
        for token, log_prob in zip(topk_tokens, topk_log_probs.tolist()):
            print(f"Token: {token}, Log Prob: {log_prob:.2f}, Prob {math.exp(log_prob) * 100:.2f}%")

        print(new_part)

        # Convert the new part into the expected PageResponse structure
        generated_response = PageResponse(**json.loads(new_part))
        # # Decode the output
        # decoded_output = self.tokenizer.decode(generation_output[0], skip_special_tokens=True)

        # Assert the output matches the expected text
        self.assertEqual(generated_response.natural_text, EDGAR_TEXT)
        # print(decoded_output)

        # # Convert the decoded output into the expected PageResponse structure
        # input_length = inputs["input_ids"].shape[1]

        # # Decode the output and extract only the new part
        # decoded_output = self.tokenizer.decode(generation_output[0], skip_special_tokens=True)
        # new_part = self.tokenizer.decode(generation_output[0][input_length:], skip_special_tokens=True)

        # print(new_part)

        # # Convert the new part into the expected PageResponse structure
        # generated_response = PageResponse(**json.loads(new_part))

        # # Assert the output matches the expected text
        # self.assertEqual(generated_response.natural_text, EDGAR_TEXT)

    async def asyncTearDown(self):
        # Clean up the model and tokenizer
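The new test code hinges on one transformers behavior: calling generate() with output_scores=True and return_dict_in_generate=True makes it return the logits for each generated step in output.scores, one tensor of shape (batch_size, vocab_size) per new token, which a log-softmax converts into log probabilities. A minimal sketch of the same top-5 inspection outside the test harness; the model name and prompt are placeholders, and any causal LM (including the Qwen2-VL checkpoint through its processor) behaves the same way:

import math

import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder model
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("The capital of France is", return_tensors="pt")
input_length = inputs["input_ids"].shape[1]

output = model.generate(
    **inputs,
    max_new_tokens=1,             # single step, so output.scores is a tuple of length 1
    do_sample=False,              # greedy decoding, as in the test
    output_scores=True,           # return the logits for each generated step
    return_dict_in_generate=True,
)

logits = output.scores[0]                  # (batch_size, vocab_size)
log_probs = F.log_softmax(logits, dim=-1)  # normalize logits into log probabilities
topk_log_probs, topk_indices = torch.topk(log_probs[0], k=5)

for token, log_prob in zip(tokenizer.convert_ids_to_tokens(topk_indices.tolist()), topk_log_probs.tolist()):
    print(f"Token: {token}, Log Prob: {log_prob:.2f}, Prob: {math.exp(log_prob) * 100:.2f}%")

# Decode only the newly generated tokens, with the echoed prompt sliced off
new_part = tokenizer.decode(output.sequences[0][input_length:], skip_special_tokens=True)
print(new_part)

Slicing at input_length is what isolates the newly generated text from the echoed prompt, which is why the test parses new_part rather than decoded_output into PageResponse.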