From bce84577c683558e5ae50f38c726b59881f80fe8 Mon Sep 17 00:00:00 2001
From: bogdankostic
Date: Fri, 6 May 2022 16:57:13 +0200
Subject: [PATCH] Upgrade transformers version to 4.18.0 (#2514)

* Upgrade transformers version to 4.18.0

* Adapt tokenization test to upgrade

* Adapt tokenization test to upgrade
---
 setup.cfg                 | 2 +-
 test/test_tokenization.py | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index b8a03ad33..7c7e73f45 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -56,7 +56,7 @@ install_requires =
     torch>1.9,<1.11
     requests
     pydantic
-    transformers==4.13.0
+    transformers==4.18.0
     nltk
     pandas
 
diff --git a/test/test_tokenization.py b/test/test_tokenization.py
index 0fdb2e084..486b338f7 100644
--- a/test/test_tokenization.py
+++ b/test/test_tokenization.py
@@ -266,7 +266,7 @@ def test_all_tokenizer_on_special_cases(caplog):
         "This is a sentence with multiple tabs",
     ]
 
-    expected_to_fail = [(1, 1), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (2, 1), (2, 5)]
+    expected_to_fail = {(2, 1), (2, 5)}
 
     for i_tok, tokenizer in enumerate(tokenizers):
         for i_text, text in enumerate(texts):
@@ -299,8 +299,6 @@ def test_all_tokenizer_on_special_cases(caplog):
             for ((start, end), w_index) in zip(encoded.offsets, encoded.words):
                 word_start_ch = word_spans[w_index][0]
                 token_offsets.append((start + word_start_ch, end + word_start_ch))
-            if getattr(tokenizer, "add_prefix_space", None):
-                token_offsets = [(start - 1, end) for start, end in token_offsets]
 
             # verify that offsets align back to original text
             if text == "力加勝北区ᴵᴺᵀᵃছজটডণত":
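
Note (illustration, not part of the patch): the second hunk in test_tokenization.py removes the manual "start - 1" offset correction that was applied for tokenizers configured with add_prefix_space (e.g. RoBERTa). Below is a minimal sketch of the kind of offset-to-text check the test performs, assuming transformers>=4.18; the model name "roberta-base" and the call pattern are chosen for illustration and are not the test's actual setup.

    # Sketch only: model name, kwargs, and the exact check are illustrative,
    # not taken from test_all_tokenizer_on_special_cases.
    from transformers import AutoTokenizer

    text = "This is a sentence with multiple tabs"
    tokenizer = AutoTokenizer.from_pretrained(
        "roberta-base", use_fast=True, add_prefix_space=True
    )

    encoded = tokenizer(text, return_offsets_mapping=True, add_special_tokens=False)
    for token_id, (start, end) in zip(encoded["input_ids"], encoded["offset_mapping"]):
        # Each (start, end) pair is expected to slice straight back into the
        # original string, without the manual adjustment the patch removes.
        print(repr(text[start:end]), "->", tokenizer.convert_ids_to_tokens(token_id))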