mirror of
https://github.com/allenai/olmocr.git
synced 2025-12-11 15:21:37 +00:00
Birr tokenization test
This commit is contained in:
parent
77f0b9fa84
commit
9d35d3ca8f
@ -128,4 +128,47 @@ class TestBirrTokenization(unittest.TestCase):
|
|||||||
print(end_token_len - start_token_len + 1, " max image tokens")
|
print(end_token_len - start_token_len + 1, " max image tokens")
|
||||||
|
|
||||||
print(compute_number_of_image_tokens(1024, 1024))
|
print(compute_number_of_image_tokens(1024, 1024))
|
||||||
|
|
||||||
|
def testBirrChatTemplate(self):
|
||||||
|
import yaml
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
from birr.tokenization import ModelTokenizer
|
||||||
|
from birr.core.config import FormatConfig, LLMModelConfig
|
||||||
|
from birr.batch_inference.data_models import RawInputItem
|
||||||
|
|
||||||
|
from pdelfin.birrpipeline import build_page_query
|
||||||
|
|
||||||
|
TEST_INSTANCES = [json.dumps(build_page_query(os.path.join(
|
||||||
|
os.path.dirname(__file__),
|
||||||
|
"gnarly_pdfs",
|
||||||
|
"edgar.pdf"
|
||||||
|
), "test.pdf", 1, 1024, 4096)),
|
||||||
|
]
|
||||||
|
|
||||||
|
TEST_INSTANCES = [
|
||||||
|
RawInputItem.from_message_dicts(i, json.loads(json_string)["chat_messages"]).messages
|
||||||
|
for i, json_string in enumerate(TEST_INSTANCES)
|
||||||
|
]
|
||||||
|
|
||||||
|
MODEL_NAME = "Qwen/Qwen2-0.5B"
|
||||||
|
|
||||||
|
CONFIG_FILE = "/home/ubuntu/mise/birr/configs/inference/qwen2-vl-test.yaml"
|
||||||
|
|
||||||
|
|
||||||
|
with open(CONFIG_FILE, "r") as f:
|
||||||
|
file_contents = f.read()
|
||||||
|
config_dict = yaml.safe_load(file_contents)
|
||||||
|
|
||||||
|
model_config = LLMModelConfig(name_or_path=MODEL_NAME)
|
||||||
|
format_config = FormatConfig(**config_dict.get("format", {}))
|
||||||
|
|
||||||
|
tokenizer = ModelTokenizer(model_config, format_config)
|
||||||
|
|
||||||
|
formatted = tokenizer.batch_format(TEST_INSTANCES)
|
||||||
|
|
||||||
|
print(formatted)
|
||||||
|
|
||||||
|
for formatted_instance in formatted:
|
||||||
|
print(f"========================================\n{formatted_instance}\n========================================")
|
||||||
Loading…
x
Reference in New Issue
Block a user