import re
from typing import Tuple
from unittest.mock import MagicMock

import numpy as np
import pytest
from tokenizers.pre_tokenizers import WhitespaceSplit

import haystack
from haystack.modeling.model.feature_extraction import FeatureExtractor


BERT = "bert-base-cased"
ROBERTA = "roberta-base"
XLNET = "xlnet-base-cased"

TOKENIZERS_TO_TEST = [BERT, ROBERTA, XLNET]
TOKENIZERS_TO_TEST_WITH_TOKEN_MARKER = [(BERT, "##"), (ROBERTA, "Ġ"), (XLNET, "▁")]
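# The markers above are the word-boundary symbols each tokenizer family uses: WordPiece
# marks continuation pieces with "##" (BERT), byte-level BPE marks word starts with "Ġ"
# (RoBERTa), and SentencePiece marks word starts with "▁" (XLNet). The span-verification
# test below strips them when mapping tokens back to the original text.

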
REGULAR_SENTENCE = "This is a sentence"
GERMAN_SENTENCE = "Der entscheidende Pass"
OTHER_ALPHABETS = "力加勝北区ᴵᴺᵀᵃছজটডণত"
GIBBERISH_SENTENCE = "Thiso text is included tolod makelio sure Unicodeel is handled properly:"
SENTENCE_WITH_ELLIPSIS = "This is a sentence..."
SENTENCE_WITH_LINEBREAK_1 = "and another one\n\n\nwithout space"
SENTENCE_WITH_LINEBREAK_2 = """This is a sentence.
    With linebreak"""
SENTENCE_WITH_LINEBREAKS = """Sentence
    with
    multiple
    newlines
    """
SENTENCE_WITH_EXCESS_WHITESPACE = "This      is a sentence with multiple spaces"
SENTENCE_WITH_TABS = "This is a sentence			with multiple tabs"
SENTENCE_WITH_CUSTOM_TOKEN = "Let's see all on this text and. !23# neverseenwordspossible"


class AutoTokenizer:
    mocker: MagicMock = MagicMock()

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        cls.mocker.from_pretrained(*args, **kwargs)
        return cls()

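
# The class above records every call to AutoTokenizer.from_pretrained on a shared MagicMock,
# so unit tests can assert on the exact kwargs passed without downloading any model files.
# Illustrative sketch (not part of the test suite; "some-model" is a placeholder name):
#
#     AutoTokenizer.from_pretrained("some-model", use_fast=True)
#     AutoTokenizer.mocker.from_pretrained.assert_called_with("some-model", use_fast=True)

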
@pytest.fixture(autouse=True)
def mock_autotokenizer(request, monkeypatch):
    # Do not patch integration tests
    if "integration" in request.keywords:
        return
    monkeypatch.setattr(haystack.modeling.model.tokenization, "AutoTokenizer", AutoTokenizer)


def convert_offset_from_word_reference_to_text_reference(offsets, words, word_spans):
    """
    Token offsets are originally relative to the beginning of the word.
    This makes them relative to the beginning of the sentence.

    Not a fixture, just a utility.
    """
    token_offsets = []
    for (start, end), word_index in zip(offsets, words):
        word_start = word_spans[word_index][0]
        token_offsets.append((start + word_start, end + word_start))
    return token_offsets
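
# Worked example for the helper above (illustrative values only): a word spanning text
# positions (10, 14) whose tokens have word-relative offsets [(0, 2), (2, 4)] yields
# text-relative offsets [(10, 12), (12, 14)]:
#
#     convert_offset_from_word_reference_to_text_reference(
#         offsets=[(0, 2), (2, 4)], words=[0, 0], word_spans=[(10, 14)]
#     ) == [(10, 12), (12, 14)]

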
#
# Unit tests
#


def test_get_tokenizer_str():
    tokenizer = FeatureExtractor(pretrained_model_name_or_path="test-model-name")
    tokenizer.mocker.from_pretrained.assert_called_with(
        pretrained_model_name_or_path="test-model-name", revision=None, use_fast=True, use_auth_token=None
    )


def test_get_tokenizer_path(tmp_path):
    tokenizer = FeatureExtractor(pretrained_model_name_or_path=tmp_path / "test-path")
    tokenizer.mocker.from_pretrained.assert_called_with(
        pretrained_model_name_or_path=str(tmp_path / "test-path"), revision=None, use_fast=True, use_auth_token=None
    )


def test_get_tokenizer_keep_accents():
    tokenizer = FeatureExtractor(pretrained_model_name_or_path="test-model-name-albert")
    tokenizer.mocker.from_pretrained.assert_called_with(
        pretrained_model_name_or_path="test-model-name-albert",
        revision=None,
        use_fast=True,
        use_auth_token=None,
        keep_accents=True,
    )


def test_get_tokenizer_mlm_warning(caplog):
    tokenizer = FeatureExtractor(pretrained_model_name_or_path="test-model-name-mlm")
    tokenizer.mocker.from_pretrained.assert_called_with(
        pretrained_model_name_or_path="test-model-name-mlm", revision=None, use_fast=True, use_auth_token=None
    )
    assert "MLM part of codebert is currently not supported in Haystack".lower() in caplog.text.lower()


#
# Integration tests
#


@pytest.mark.integration
@pytest.mark.parametrize("model_name", TOKENIZERS_TO_TEST)
def test_save_load(tmp_path, model_name: str):
    tokenizer = FeatureExtractor(pretrained_model_name_or_path=model_name, do_lower_case=False)
    text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

    tokenizer.add_tokens(new_tokens=["neverseentokens"])
    original_encoding = tokenizer(text)

    save_dir = tmp_path / "saved_tokenizer"
    tokenizer.save_pretrained(save_dir)

    tokenizer_loaded = FeatureExtractor(pretrained_model_name_or_path=save_dir)
    new_encoding = tokenizer_loaded(text)

    assert original_encoding == new_encoding


@pytest.mark.integration
def test_tokenize_custom_vocab_bert():
    tokenizer = FeatureExtractor(pretrained_model_name_or_path=BERT, do_lower_case=False)
    tokenizer.add_tokens(new_tokens=["neverseentokens"])
    text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

    tokenized = tokenizer.tokenize(text)
    assert (
        tokenized == "Some Text with neverseentokens plus ! 215 ? # . and a combined - token _ with / ch ##ars".split()
    )


@pytest.mark.integration
@pytest.mark.parametrize(
    "edge_case",
    [
        REGULAR_SENTENCE,
        OTHER_ALPHABETS,
        GIBBERISH_SENTENCE,
        SENTENCE_WITH_ELLIPSIS,
        SENTENCE_WITH_LINEBREAK_1,
        SENTENCE_WITH_LINEBREAK_2,
        SENTENCE_WITH_LINEBREAKS,
        SENTENCE_WITH_EXCESS_WHITESPACE,
        SENTENCE_WITH_TABS,
    ],
)
@pytest.mark.parametrize("model_name", TOKENIZERS_TO_TEST)
def test_tokenization_on_edge_cases_full_sequence_tokenization(model_name: str, edge_case: str):
    """
    Verify that tokenizing the full sequence gives the same result as tokenizing the whitespace-split words.
    """
    tokenizer = FeatureExtractor(pretrained_model_name_or_path=model_name, do_lower_case=False, add_prefix_space=True)

    pre_tokenizer = WhitespaceSplit()
    words_and_spans = pre_tokenizer.pre_tokenize_str(edge_case)
    words = [x[0] for x in words_and_spans]

    encoded = tokenizer(words, is_split_into_words=True, add_special_tokens=False).encodings[0]
    expected_tokenization = tokenizer.tokenize(" ".join(edge_case.split()))  # collapse multiple whitespaces

    assert encoded.tokens == expected_tokenization


@pytest.mark.integration
@pytest.mark.parametrize("edge_case", [SENTENCE_WITH_CUSTOM_TOKEN, GERMAN_SENTENCE])
@pytest.mark.parametrize("model_name", [t for t in TOKENIZERS_TO_TEST if t != ROBERTA])
def test_tokenization_on_edge_cases_full_sequence_tokenization_roberta_exceptions(model_name: str, edge_case: str):
    """
    Verify that tokenizing the full sequence gives the same result as tokenizing the whitespace-split words.
    These test cases work for all tokenizers under test except for RoBERTa.
    """
    tokenizer = FeatureExtractor(pretrained_model_name_or_path=model_name, do_lower_case=False, add_prefix_space=True)

    pre_tokenizer = WhitespaceSplit()
    words_and_spans = pre_tokenizer.pre_tokenize_str(edge_case)
    words = [x[0] for x in words_and_spans]

    encoded = tokenizer(words, is_split_into_words=True, add_special_tokens=False).encodings[0]
    expected_tokenization = tokenizer.tokenize(" ".join(edge_case.split()))  # collapse multiple whitespaces

    assert encoded.tokens == expected_tokenization


@pytest.mark.integration
@pytest.mark.parametrize(
    "edge_case",
    [
        REGULAR_SENTENCE,
        # OTHER_ALPHABETS,  # contains [UNK] tokens that cannot be matched back to the original text
        GIBBERISH_SENTENCE,
        SENTENCE_WITH_ELLIPSIS,
        SENTENCE_WITH_LINEBREAK_1,
        SENTENCE_WITH_LINEBREAK_2,
        SENTENCE_WITH_LINEBREAKS,
        SENTENCE_WITH_EXCESS_WHITESPACE,
        SENTENCE_WITH_TABS,
    ],
)
@pytest.mark.parametrize("model_name,marker", TOKENIZERS_TO_TEST_WITH_TOKEN_MARKER)
def test_tokenization_on_edge_cases_full_sequence_verify_spans(model_name: str, marker: str, edge_case: str):
    tokenizer = FeatureExtractor(pretrained_model_name_or_path=model_name, do_lower_case=False, add_prefix_space=True)

    pre_tokenizer = WhitespaceSplit()
    words_and_spans = pre_tokenizer.pre_tokenize_str(edge_case)
    words = [x[0] for x in words_and_spans]
    word_spans = [x[1] for x in words_and_spans]

    encoded = tokenizer(words, is_split_into_words=True, add_special_tokens=False).encodings[0]

    # Subword tokens carry model-specific marker characters; strip them to align the tokens with the original text
    tokens = [token.replace(marker, "") for token in encoded.tokens]
    token_offsets = convert_offset_from_word_reference_to_text_reference(encoded.offsets, encoded.words, word_spans)

    for token, (start, end) in zip(tokens, token_offsets):
        assert token == edge_case[start:end]


@pytest.mark.integration
@pytest.mark.parametrize(
    "edge_case",
    [
        REGULAR_SENTENCE,
        GERMAN_SENTENCE,
        SENTENCE_WITH_EXCESS_WHITESPACE,
        OTHER_ALPHABETS,
        GIBBERISH_SENTENCE,
        SENTENCE_WITH_ELLIPSIS,
        SENTENCE_WITH_CUSTOM_TOKEN,
        SENTENCE_WITH_LINEBREAK_1,
        SENTENCE_WITH_LINEBREAK_2,
        SENTENCE_WITH_LINEBREAKS,
        SENTENCE_WITH_TABS,
    ],
)
def test_detokenization_for_bert(edge_case):
    tokenizer = FeatureExtractor(pretrained_model_name_or_path=BERT, do_lower_case=False)

    encoded = tokenizer(edge_case, add_special_tokens=False).encodings[0]

    # Re-join the tokens and drop the WordPiece continuation marker, so that
    # e.g. "never ##see ##nto ##ken ##s" becomes "neverseentokens"
    detokenized = " ".join(encoded.tokens)
    detokenized = re.sub(r"(^|\s+)(##)", "", detokenized)

    detokenized_ids = tokenizer(detokenized, add_special_tokens=False)["input_ids"]
    detokenized_tokens = [tokenizer.decode([tok_id]).strip() for tok_id in detokenized_ids]

    assert encoded.tokens == detokenized_tokens


@pytest.mark.integration
def test_encode_plus_for_bert():
    tokenizer = FeatureExtractor(pretrained_model_name_or_path=BERT, do_lower_case=False)
    text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

    encoded_batch = tokenizer(text)
    encoded = encoded_batch.encodings[0]

    # Special tokens ([CLS]/[SEP]) have no word id; set them to -1 so the diff below can be computed
    words = np.array(encoded.words)
    words[0] = -1
    words[-1] = -1

    tokens = encoded.tokens
    offsets = [x[0] for x in encoded.offsets]
    start_of_word = [False] + list(np.ediff1d(words) > 0)

    assert list(zip(tokens, offsets, start_of_word)) == [
        ("[CLS]", 0, False),
        ("Some", 0, True),
        ("Text", 5, True),
        ("with", 10, True),
        ("never", 15, True),
        ("##see", 20, False),
        ("##nto", 23, False),
        ("##ken", 26, False),
        ("##s", 29, False),
        ("plus", 31, True),
        ("!", 36, True),
        ("215", 37, True),
        ("?", 40, True),
        ("#", 41, True),
        (".", 42, True),
        ("and", 44, True),
        ("a", 48, True),
        ("combined", 50, True),
        ("-", 58, True),
        ("token", 59, True),
        ("_", 64, True),
        ("with", 65, True),
        ("/", 69, True),
        ("ch", 70, True),
        ("##ars", 72, False),
        ("[SEP]", 0, False),
    ]
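
# How the start_of_word computation above behaves on a small, hypothetical word-id sequence:
#
#     words = [-1, 0, 1, 1, 2, -1]           # [CLS], word 0, word 1 (two pieces), word 2, [SEP]
#     np.ediff1d(words)                      # -> [1, 1, 0, 1, -3]
#     [False] + list(np.ediff1d(words) > 0)  # -> [False, True, True, False, True, False]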
					
						
@pytest.mark.integration
def test_tokenize_custom_vocab_bert_offsets():
    tokenizer = FeatureExtractor(pretrained_model_name_or_path=BERT, do_lower_case=False)

    tokenizer.add_tokens(new_tokens=["neverseentokens"])
    text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

    tokenized = tokenizer.tokenize(text)

    encoded = tokenizer(text, add_special_tokens=False).encodings[0]
    offsets = [x[0] for x in encoded.offsets]
    start_of_word_single = [True] + list(np.ediff1d(encoded.words) > 0)

    assert encoded.tokens == tokenized
    assert offsets == [0, 5, 10, 15, 31, 36, 37, 40, 41, 42, 44, 48, 50, 58, 59, 64, 65, 69, 70, 72]
    assert start_of_word_single == [True] * 19 + [False]