Mirror of https://github.com/FlagOpen/FlagEmbedding.git (synced 2025-06-27 02:39:58 +00:00)

Commit 2bdd0f0542 (parent 1374b98da4): update tutorials
@@ -71,7 +71,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Run the following cell to check the model of bge-base-en-v1.5. It has the exactly same structure of BERT-base, 12 encoder layers and hidden dimension of 768.\n",
+    "Run the following cell to check the model of bge-base-en-v1.5. It uses BERT-base as its base model, with 12 encoder layers and a hidden dimension of 768.\n",
     "\n",
     "Note that the corresponding models of BGE and BGE-v1.5 have same structures. For example, bge-base-en and bge-base-en-v1.5 have the same structure."
    ]
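For reference, a minimal sketch of the structure check this cell describes, assuming the Hugging Face transformers package and the BAAI/bge-base-en-v1.5 checkpoint used in the tutorial:

    from transformers import AutoModel

    model = AutoModel.from_pretrained("BAAI/bge-base-en-v1.5")
    print(model.config.num_hidden_layers)  # expected: 12 (BERT-base)
    print(model.config.hidden_size)        # expected: 768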
@@ -391,7 +391,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "As we expect, the two encoding functions return exactly the same results. The full implementation in FlagEmbedding handles large datasets by batching and contains GPU support and parallelization. Feel free to check the [source code](https://github.com/FlagOpen/FlagEmbedding/blob/master/FlagEmbedding/flag_models.py#L370) for more details."
+    "As expected, the two encoding functions return exactly the same results. The full implementation in FlagEmbedding handles large datasets by batching, and includes GPU support and parallelization. Feel free to check the [source code](https://github.com/FlagOpen/FlagEmbedding/blob/master/FlagEmbedding/inference/embedder/encoder_only/base.py) for more details."
    ]
   }
  ],
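As a rough illustration of the batching mentioned above: a simplified sketch, not FlagEmbedding's actual implementation. Here model and tokenizer are assumed to be a BERT-style encoder and its tokenizer from transformers, matching the tutorial's encode functions (CLS pooling plus L2 normalization):

    import torch

    def encode_in_batches(sentences, tokenizer, model, batch_size=256, device="cuda"):
        model = model.to(device).eval()
        chunks = []
        with torch.no_grad():
            for i in range(0, len(sentences), batch_size):
                batch = sentences[i:i + batch_size]
                inputs = tokenizer(batch, padding=True, truncation=True,
                                   return_tensors="pt").to(device)
                cls = model(**inputs).last_hidden_state[:, 0]  # CLS token embedding
                chunks.append(torch.nn.functional.normalize(cls, dim=-1).cpu())
        return torch.cat(chunks)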
@@ -568,7 +568,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Evaluate using FlagEmbedding"
+    "## 3. Evaluate using FlagEmbedding"
    ]
   },
   {
@@ -34,7 +34,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "% pip install FlagEmbedding"
+    "% pip install FlagEmbedding pytrec_eval"
    ]
   },
   {
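The newly added pytrec_eval dependency is a Python interface to TREC-style retrieval evaluation. A minimal sketch of typical usage, with made-up toy relevance judgments (qrels) and retrieval scores (run):

    import pytrec_eval

    qrels = {"q1": {"doc1": 1, "doc2": 0}}   # toy gold relevance labels
    run = {"q1": {"doc1": 0.9, "doc2": 0.4}}  # toy retrieval scores

    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {"ndcg_cut.10", "recall.10"})
    print(evaluator.evaluate(run))  # per-query metric values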
@@ -318,7 +318,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Use the Faiss index to search for each query."
+    "Use the Faiss index to search for answers to each query."
    ]
   },
   {
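For reference, the search step with a flat inner-product index typically looks like the following sketch (random vectors stand in for the real embeddings):

    import numpy as np
    import faiss  # assumes faiss-cpu or faiss-gpu is installed

    dim = 768
    corpus_embeddings = np.random.rand(1000, dim).astype("float32")  # stand-in for passage embeddings
    query_embeddings = np.random.rand(5, dim).astype("float32")      # stand-in for encoded queries

    index = faiss.IndexFlatIP(dim)  # inner product; equals cosine when vectors are L2-normalized
    index.add(corpus_embeddings)

    scores, ids = index.search(query_embeddings, 10)  # ids[i] lists the top-10 corpus rows for query i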
@@ -456,7 +456,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Evaluate using FlagEmbedding"
+    "## 3. Evaluate using FlagEmbedding"
    ]
   },
   {
@@ -496,15 +496,6 @@
     "sys.argv = arguments.split()"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "os.environ[\"SETUPTOOLS_USE_DISTUTILS\"] = \"\""
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 4,
Tutorials/4_Evaluation/utils/compute_metrics.py (new file, 95 lines)
@@ -0,0 +1,95 @@
"""
|
||||
Ref: https://github.com/facebookresearch/contriever
|
||||
"""
|
||||
import regex
|
||||
import unicodedata
|
||||
from functools import partial
|
||||
from typing import List, Union
|
||||
|
||||
|
||||
class SimpleTokenizer:
|
||||
ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+'
|
||||
NON_WS = r'[^\p{Z}\p{C}]'
|
||||
|
||||
def __init__(self):
|
||||
"""
|
||||
Args:
|
||||
annotators: None or empty set (only tokenizes).
|
||||
"""
|
||||
self._regexp = regex.compile(
|
||||
'(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
|
||||
flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
|
||||
)
|
||||
|
||||
def tokenize(self, text, uncased=False):
|
||||
matches = [m for m in self._regexp.finditer(text)]
|
||||
if uncased:
|
||||
tokens = [m.group().lower() for m in matches]
|
||||
else:
|
||||
tokens = [m.group() for m in matches]
|
||||
return tokens
|
||||
|
||||
|
||||
def _normalize(text):
|
||||
return unicodedata.normalize('NFD', text)
|
||||
|
||||
|
||||
def has_answer(answers, text, tokenizer) -> bool:
|
||||
"""Check if a document contains an answer string."""
|
||||
text = _normalize(text)
|
||||
text = tokenizer.tokenize(text, uncased=True)
|
||||
|
||||
for answer in answers:
|
||||
answer = _normalize(answer)
|
||||
answer = tokenizer.tokenize(answer, uncased=True)
|
||||
for i in range(0, len(text) - len(answer) + 1):
|
||||
if answer == text[i: i + len(answer)]:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def check_answer(example, tokenizer) -> List[bool]:
|
||||
"""Search through all the top docs to see if they have any of the answers."""
|
||||
answers = example['answers']
|
||||
ctxs = example['ctxs']
|
||||
|
||||
hits = []
|
||||
for i, text in enumerate(ctxs):
|
||||
if text is None: # cannot find the document for some reason
|
||||
hits.append(False)
|
||||
continue
|
||||
hits.append(has_answer(answers, text, tokenizer))
|
||||
return hits
|
||||
|
||||
|
||||
def evaluate_qa_recall(ctxs, answers, k_values: Union[int, List[int]]=100):
|
||||
# compute Recall@k for QA task
|
||||
data = []
|
||||
assert len(ctxs) == len(answers)
|
||||
for i in range(len(ctxs)):
|
||||
_ctxs, _answers = ctxs[i], answers[i]
|
||||
data.append({
|
||||
'answers': _answers,
|
||||
'ctxs': _ctxs,
|
||||
})
|
||||
tokenizer = SimpleTokenizer()
|
||||
get_score_partial = partial(check_answer, tokenizer=tokenizer)
|
||||
|
||||
scores = map(get_score_partial, data)
|
||||
|
||||
n_docs = len(data[0]['ctxs'])
|
||||
top_k_hits = [0] * n_docs
|
||||
for question_hits in scores:
|
||||
best_hit = next((i for i, x in enumerate(question_hits) if x), None)
|
||||
if best_hit is not None:
|
||||
top_k_hits[best_hit:] = [v + 1 for v in top_k_hits[best_hit:]]
|
||||
|
||||
if isinstance(k_values, int):
|
||||
k = min(k_values, len(top_k_hits))
|
||||
return top_k_hits[k - 1] / len(data)
|
||||
else:
|
||||
scores = []
|
||||
for k in k_values:
|
||||
k = min(k, len(top_k_hits))
|
||||
scores.append(top_k_hits[k - 1] / len(data))
|
||||
return scores
|
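A quick usage sketch of evaluate_qa_recall on made-up toy data: for each question, ctxs holds the ranked retrieved passage texts and answers the gold answer strings.

    ctxs = [
        ["Paris is the capital of France.", "Berlin is in Germany.", "Rome is in Italy."],
        ["The Nile is in Africa.", "The Amazon is in South America.", "The Danube is in Europe."],
    ]
    answers = [["Paris"], ["Mississippi"]]  # the second question has no hit anywhere

    print(evaluate_qa_recall(ctxs, answers, k_values=[1, 3]))  # -> [0.5, 0.5]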
Tutorials/4_Evaluation/utils/normalize_text.py (new file, 162 lines)
@@ -0,0 +1,162 @@
"""
|
||||
adapted from chemdataextractor.text.normalize
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Tools for normalizing text.
|
||||
https://github.com/mcs07/ChemDataExtractor
|
||||
:copyright: Copyright 2016 by Matt Swain.
|
||||
:license: MIT
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
'Software'), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
"""
|
||||
|
||||
#: Control characters.
|
||||
CONTROLS = {
|
||||
'\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008', '\u000e', '\u000f', '\u0011',
|
||||
'\u0012', '\u0013', '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', '\u001a', '\u001b',
|
||||
}
|
||||
# There are further control characters, but they are instead replaced with a space by unicode normalization
|
||||
# '\u0009', '\u000a', '\u000b', '\u000c', '\u000d', '\u001c', '\u001d', '\u001e', '\u001f'
|
||||
|
||||
|
||||
#: Hyphen and dash characters.
|
||||
HYPHENS = {
|
||||
'-', # \u002d Hyphen-minus
|
||||
'‐', # \u2010 Hyphen
|
||||
'‑', # \u2011 Non-breaking hyphen
|
||||
'⁃', # \u2043 Hyphen bullet
|
||||
'‒', # \u2012 figure dash
|
||||
'–', # \u2013 en dash
|
||||
'—', # \u2014 em dash
|
||||
'―', # \u2015 horizontal bar
|
||||
}
|
||||
|
||||
#: Minus characters.
|
||||
MINUSES = {
|
||||
'-', # \u002d Hyphen-minus
|
||||
'−', # \u2212 Minus
|
||||
'-', # \uff0d Full-width Hyphen-minus
|
||||
'⁻', # \u207b Superscript minus
|
||||
}
|
||||
|
||||
#: Plus characters.
|
||||
PLUSES = {
|
||||
'+', # \u002b Plus
|
||||
'+', # \uff0b Full-width Plus
|
||||
'⁺', # \u207a Superscript plus
|
||||
}
|
||||
|
||||
#: Slash characters.
|
||||
SLASHES = {
|
||||
'/', # \u002f Solidus
|
||||
'⁄', # \u2044 Fraction slash
|
||||
'∕', # \u2215 Division slash
|
||||
}
|
||||
|
||||
#: Tilde characters.
|
||||
TILDES = {
|
||||
'~', # \u007e Tilde
|
||||
'˜', # \u02dc Small tilde
|
||||
'⁓', # \u2053 Swung dash
|
||||
'∼', # \u223c Tilde operator #in mbert vocab
|
||||
'∽', # \u223d Reversed tilde
|
||||
'∿', # \u223f Sine wave
|
||||
'〜', # \u301c Wave dash #in mbert vocab
|
||||
'~', # \uff5e Full-width tilde #in mbert vocab
|
||||
}
|
||||
|
||||
#: Apostrophe characters.
|
||||
APOSTROPHES = {
|
||||
"'", # \u0027
|
||||
'’', # \u2019
|
||||
'՚', # \u055a
|
||||
'Ꞌ', # \ua78b
|
||||
'ꞌ', # \ua78c
|
||||
''', # \uff07
|
||||
}
|
||||
|
||||
#: Single quote characters.
|
||||
SINGLE_QUOTES = {
|
||||
"'", # \u0027
|
||||
'‘', # \u2018
|
||||
'’', # \u2019
|
||||
'‚', # \u201a
|
||||
'‛', # \u201b
|
||||
|
||||
}
|
||||
|
||||
#: Double quote characters.
|
||||
DOUBLE_QUOTES = {
|
||||
'"', # \u0022
|
||||
'“', # \u201c
|
||||
'”', # \u201d
|
||||
'„', # \u201e
|
||||
'‟', # \u201f
|
||||
}
|
||||
|
||||
#: Accent characters.
|
||||
ACCENTS = {
|
||||
'`', # \u0060
|
||||
'´', # \u00b4
|
||||
}
|
||||
|
||||
#: Prime characters.
|
||||
PRIMES = {
|
||||
'′', # \u2032
|
||||
'″', # \u2033
|
||||
'‴', # \u2034
|
||||
'‵', # \u2035
|
||||
'‶', # \u2036
|
||||
'‷', # \u2037
|
||||
'⁗', # \u2057
|
||||
}
|
||||
|
||||
#: Quote characters, including apostrophes, single quotes, double quotes, accents and primes.
|
||||
QUOTES = APOSTROPHES | SINGLE_QUOTES | DOUBLE_QUOTES | ACCENTS | PRIMES
|
||||
|
||||
def normalize_text(text: str):
|
||||
for control in CONTROLS:
|
||||
text = text.replace(control, '')
|
||||
text = text.replace('\u000b', ' ').replace('\u000c', ' ').replace(u'\u0085', ' ')
|
||||
|
||||
for hyphen in HYPHENS | MINUSES:
|
||||
text = text.replace(hyphen, '-')
|
||||
text = text.replace('\u00ad', '')
|
||||
|
||||
for double_quote in DOUBLE_QUOTES:
|
||||
text = text.replace(double_quote, '"') # \u0022
|
||||
for single_quote in (SINGLE_QUOTES | APOSTROPHES | ACCENTS):
|
||||
text = text.replace(single_quote, "'") # \u0027
|
||||
text = text.replace('′', "'") # \u2032 prime
|
||||
text = text.replace('‵', "'") # \u2035 reversed prime
|
||||
text = text.replace('″', "''") # \u2033 double prime
|
||||
text = text.replace('‶', "''") # \u2036 reversed double prime
|
||||
text = text.replace('‴', "'''") # \u2034 triple prime
|
||||
text = text.replace('‷', "'''") # \u2037 reversed triple prime
|
||||
text = text.replace('⁗', "''''") # \u2057 quadruple prime
|
||||
|
||||
text = text.replace('…', '...').replace(' . . . ', ' ... ') # \u2026
|
||||
|
||||
for slash in SLASHES:
|
||||
text = text.replace(slash, '/')
|
||||
|
||||
#for tilde in TILDES:
|
||||
# text = text.replace(tilde, '~')
|
||||
|
||||
return text
|
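A quick check of what the helper does, on a made-up string:

    s = '“Smart” ‘quotes’, an – en dash, and an ellipsis…'
    print(normalize_text(s))
    # "Smart" 'quotes', an - en dash, and an ellipsis...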