Mirror of https://github.com/FlagOpen/FlagEmbedding.git (synced 2025-06-27 02:39:58 +00:00)

Commit 2bdd0f0542 (parent 1374b98da4): update tutorials
@@ -71,7 +71,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Run the following cell to check the model of bge-base-en-v1.5. It has the exactly same structure of BERT-base, 12 encoder layers and hidden dimension of 768.\n",
+    "Run the following cell to check the model of bge-base-en-v1.5. It uses BERT-base as its base model, with 12 encoder layers and a hidden dimension of 768.\n",
     "\n",
     "Note that the corresponding models of BGE and BGE-v1.5 have same structures. For example, bge-base-en and bge-base-en-v1.5 have the same structure."
    ]
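For reference, a minimal sketch of the structure check this cell describes, assuming the Hugging Face transformers package and the BAAI/bge-base-en-v1.5 checkpoint used in the tutorial:

    from transformers import AutoModel

    model = AutoModel.from_pretrained("BAAI/bge-base-en-v1.5")
    print(model.config.num_hidden_layers)  # expected: 12 (BERT-base)
    print(model.config.hidden_size)        # expected: 768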
@@ -391,7 +391,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "As we expect, the two encoding functions return exactly the same results. The full implementation in FlagEmbedding handles large datasets by batching and contains GPU support and parallelization. Feel free to check the [source code](https://github.com/FlagOpen/FlagEmbedding/blob/master/FlagEmbedding/flag_models.py#L370) for more details."
+    "As expected, the two encoding functions return exactly the same results. The full implementation in FlagEmbedding handles large datasets by batching, and includes GPU support and parallelization. Feel free to check the [source code](https://github.com/FlagOpen/FlagEmbedding/blob/master/FlagEmbedding/inference/embedder/encoder_only/base.py) for more details."
    ]
   }
  ],
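As a rough illustration of the batching mentioned above: a simplified sketch, not FlagEmbedding's actual implementation. Here model and tokenizer are assumed to be a BERT-style encoder and its tokenizer from transformers, matching the tutorial's encode functions (CLS pooling plus L2 normalization):

    import torch

    def encode_in_batches(sentences, tokenizer, model, batch_size=256, device="cuda"):
        model = model.to(device).eval()
        chunks = []
        with torch.no_grad():
            for i in range(0, len(sentences), batch_size):
                batch = sentences[i:i + batch_size]
                inputs = tokenizer(batch, padding=True, truncation=True,
                                   return_tensors="pt").to(device)
                cls = model(**inputs).last_hidden_state[:, 0]  # CLS token embedding
                chunks.append(torch.nn.functional.normalize(cls, dim=-1).cpu())
        return torch.cat(chunks)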
@@ -568,7 +568,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Evaluate using FlagEmbedding"
+    "## 3. Evaluate using FlagEmbedding"
    ]
   },
   {
@@ -34,7 +34,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "% pip install FlagEmbedding"
+    "% pip install FlagEmbedding pytrec_eval"
    ]
   },
   {
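The newly added pytrec_eval dependency is a Python interface to TREC-style retrieval evaluation. A minimal sketch of typical usage, with made-up toy relevance judgments (qrels) and retrieval scores (run):

    import pytrec_eval

    qrels = {"q1": {"doc1": 1, "doc2": 0}}   # toy gold relevance labels
    run = {"q1": {"doc1": 0.9, "doc2": 0.4}}  # toy retrieval scores

    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {"ndcg_cut.10", "recall.10"})
    print(evaluator.evaluate(run))  # per-query metric values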
@@ -318,7 +318,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Use the Faiss index to search for each query."
+    "Use the Faiss index to search for answers to each query."
    ]
   },
   {
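For reference, the search step with a flat inner-product index typically looks like the following sketch (random vectors stand in for the real embeddings):

    import numpy as np
    import faiss  # assumes faiss-cpu or faiss-gpu is installed

    dim = 768
    corpus_embeddings = np.random.rand(1000, dim).astype("float32")  # stand-in for passage embeddings
    query_embeddings = np.random.rand(5, dim).astype("float32")      # stand-in for encoded queries

    index = faiss.IndexFlatIP(dim)  # inner product; equals cosine when vectors are L2-normalized
    index.add(corpus_embeddings)

    scores, ids = index.search(query_embeddings, 10)  # ids[i] lists the top-10 corpus rows for query i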
@@ -456,7 +456,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Evaluate using FlagEmbedding"
+    "## 3. Evaluate using FlagEmbedding"
    ]
   },
   {
@@ -496,15 +496,6 @@
     "sys.argv = arguments.split()"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "os.environ[\"SETUPTOOLS_USE_DISTUTILS\"] = \"\""
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 4,
Tutorials/4_Evaluation/utils/compute_metrics.py (new file, 95 lines)
@@ -0,0 +1,95 @@
"""
|
||||
Ref: https://github.com/facebookresearch/contriever
|
||||
"""
|
||||
import regex
|
||||
import unicodedata
|
||||
from functools import partial
|
||||
from typing import List, Union
|
||||
|
||||
|
||||
class SimpleTokenizer:
|
||||
ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+'
|
||||
NON_WS = r'[^\p{Z}\p{C}]'
|
||||
|
||||
def __init__(self):
|
||||
"""
|
||||
Args:
|
||||
annotators: None or empty set (only tokenizes).
|
||||
"""
|
||||
self._regexp = regex.compile(
|
||||
'(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
|
||||
flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
|
||||
)
|
||||
|
||||
def tokenize(self, text, uncased=False):
|
||||
matches = [m for m in self._regexp.finditer(text)]
|
||||
if uncased:
|
||||
tokens = [m.group().lower() for m in matches]
|
||||
else:
|
||||
tokens = [m.group() for m in matches]
|
||||
return tokens
|
||||
|
||||
|
||||
def _normalize(text):
|
||||
return unicodedata.normalize('NFD', text)
|
||||
|
||||
|
||||
def has_answer(answers, text, tokenizer) -> bool:
|
||||
"""Check if a document contains an answer string."""
|
||||
text = _normalize(text)
|
||||
text = tokenizer.tokenize(text, uncased=True)
|
||||
|
||||
for answer in answers:
|
||||
answer = _normalize(answer)
|
||||
answer = tokenizer.tokenize(answer, uncased=True)
|
||||
for i in range(0, len(text) - len(answer) + 1):
|
||||
if answer == text[i: i + len(answer)]:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def check_answer(example, tokenizer) -> List[bool]:
|
||||
"""Search through all the top docs to see if they have any of the answers."""
|
||||
answers = example['answers']
|
||||
ctxs = example['ctxs']
|
||||
|
||||
hits = []
|
||||
for i, text in enumerate(ctxs):
|
||||
if text is None: # cannot find the document for some reason
|
||||
hits.append(False)
|
||||
continue
|
||||
hits.append(has_answer(answers, text, tokenizer))
|
||||
return hits
|
||||
|
||||
|
||||
def evaluate_qa_recall(ctxs, answers, k_values: Union[int, List[int]]=100):
|
||||
# compute Recall@k for QA task
|
||||
data = []
|
||||
assert len(ctxs) == len(answers)
|
||||
for i in range(len(ctxs)):
|
||||
_ctxs, _answers = ctxs[i], answers[i]
|
||||
data.append({
|
||||
'answers': _answers,
|
||||
'ctxs': _ctxs,
|
||||
})
|
||||
tokenizer = SimpleTokenizer()
|
||||
get_score_partial = partial(check_answer, tokenizer=tokenizer)
|
||||
|
||||
scores = map(get_score_partial, data)
|
||||
|
||||
n_docs = len(data[0]['ctxs'])
|
||||
top_k_hits = [0] * n_docs
|
||||
for question_hits in scores:
|
||||
best_hit = next((i for i, x in enumerate(question_hits) if x), None)
|
||||
if best_hit is not None:
|
||||
top_k_hits[best_hit:] = [v + 1 for v in top_k_hits[best_hit:]]
|
||||
|
||||
if isinstance(k_values, int):
|
||||
k = min(k_values, len(top_k_hits))
|
||||
return top_k_hits[k - 1] / len(data)
|
||||
else:
|
||||
scores = []
|
||||
for k in k_values:
|
||||
k = min(k, len(top_k_hits))
|
||||
scores.append(top_k_hits[k - 1] / len(data))
|
||||
return scores
|
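A quick usage sketch of evaluate_qa_recall on made-up toy data: for each question, ctxs holds the ranked retrieved passage texts and answers the gold answer strings.

    ctxs = [
        ["Paris is the capital of France.", "Berlin is in Germany.", "Rome is in Italy."],
        ["The Nile is in Africa.", "The Amazon is in South America.", "The Danube is in Europe."],
    ]
    answers = [["Paris"], ["Mississippi"]]  # the second question has no hit anywhere

    print(evaluate_qa_recall(ctxs, answers, k_values=[1, 3]))  # -> [0.5, 0.5]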
Tutorials/4_Evaluation/utils/normalize_text.py (new file, 162 lines)
@@ -0,0 +1,162 @@
"""
|
||||
adapted from chemdataextractor.text.normalize
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Tools for normalizing text.
|
||||
https://github.com/mcs07/ChemDataExtractor
|
||||
:copyright: Copyright 2016 by Matt Swain.
|
||||
:license: MIT
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
'Software'), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
"""
|
||||
|
||||
#: Control characters.
|
||||
CONTROLS = {
|
||||
'\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008', '\u000e', '\u000f', '\u0011',
|
||||
'\u0012', '\u0013', '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', '\u001a', '\u001b',
|
||||
}
|
||||
# There are further control characters, but they are instead replaced with a space by unicode normalization
|
||||
# '\u0009', '\u000a', '\u000b', '\u000c', '\u000d', '\u001c', '\u001d', '\u001e', '\u001f'
|
||||
|
||||
|
||||
#: Hyphen and dash characters.
|
||||
HYPHENS = {
|
||||
'-', # \u002d Hyphen-minus
|
||||
'‐', # \u2010 Hyphen
|
||||
'‑', # \u2011 Non-breaking hyphen
|
||||
'⁃', # \u2043 Hyphen bullet
|
||||
'‒', # \u2012 figure dash
|
||||
'–', # \u2013 en dash
|
||||
'—', # \u2014 em dash
|
||||
'―', # \u2015 horizontal bar
|
||||
}
|
||||
|
||||
#: Minus characters.
|
||||
MINUSES = {
|
||||
'-', # \u002d Hyphen-minus
|
||||
'−', # \u2212 Minus
|
||||
'-', # \uff0d Full-width Hyphen-minus
|
||||
'⁻', # \u207b Superscript minus
|
||||
}
|
||||
|
||||
#: Plus characters.
|
||||
PLUSES = {
|
||||
'+', # \u002b Plus
|
||||
'+', # \uff0b Full-width Plus
|
||||
'⁺', # \u207a Superscript plus
|
||||
}
|
||||
|
||||
#: Slash characters.
|
||||
SLASHES = {
|
||||
'/', # \u002f Solidus
|
||||
'⁄', # \u2044 Fraction slash
|
||||
'∕', # \u2215 Division slash
|
||||
}
|
||||
|
||||
#: Tilde characters.
|
||||
TILDES = {
|
||||
'~', # \u007e Tilde
|
||||
'˜', # \u02dc Small tilde
|
||||
'⁓', # \u2053 Swung dash
|
||||
'∼', # \u223c Tilde operator #in mbert vocab
|
||||
'∽', # \u223d Reversed tilde
|
||||
'∿', # \u223f Sine wave
|
||||
'〜', # \u301c Wave dash #in mbert vocab
|
||||
'~', # \uff5e Full-width tilde #in mbert vocab
|
||||
}
|
||||
|
||||
#: Apostrophe characters.
|
||||
APOSTROPHES = {
|
||||
"'", # \u0027
|
||||
'’', # \u2019
|
||||
'՚', # \u055a
|
||||
'Ꞌ', # \ua78b
|
||||
'ꞌ', # \ua78c
|
||||
''', # \uff07
|
||||
}
|
||||
|
||||
#: Single quote characters.
|
||||
SINGLE_QUOTES = {
|
||||
"'", # \u0027
|
||||
'‘', # \u2018
|
||||
'’', # \u2019
|
||||
'‚', # \u201a
|
||||
'‛', # \u201b
|
||||
|
||||
}
|
||||
|
||||
#: Double quote characters.
|
||||
DOUBLE_QUOTES = {
|
||||
'"', # \u0022
|
||||
'“', # \u201c
|
||||
'”', # \u201d
|
||||
'„', # \u201e
|
||||
'‟', # \u201f
|
||||
}
|
||||
|
||||
#: Accent characters.
|
||||
ACCENTS = {
|
||||
'`', # \u0060
|
||||
'´', # \u00b4
|
||||
}
|
||||
|
||||
#: Prime characters.
|
||||
PRIMES = {
|
||||
'′', # \u2032
|
||||
'″', # \u2033
|
||||
'‴', # \u2034
|
||||
'‵', # \u2035
|
||||
'‶', # \u2036
|
||||
'‷', # \u2037
|
||||
'⁗', # \u2057
|
||||
}
|
||||
|
||||
#: Quote characters, including apostrophes, single quotes, double quotes, accents and primes.
|
||||
QUOTES = APOSTROPHES | SINGLE_QUOTES | DOUBLE_QUOTES | ACCENTS | PRIMES
|
||||
|
||||
def normalize_text(text: str):
|
||||
for control in CONTROLS:
|
||||
text = text.replace(control, '')
|
||||
text = text.replace('\u000b', ' ').replace('\u000c', ' ').replace(u'\u0085', ' ')
|
||||
|
||||
for hyphen in HYPHENS | MINUSES:
|
||||
text = text.replace(hyphen, '-')
|
||||
text = text.replace('\u00ad', '')
|
||||
|
||||
for double_quote in DOUBLE_QUOTES:
|
||||
text = text.replace(double_quote, '"') # \u0022
|
||||
for single_quote in (SINGLE_QUOTES | APOSTROPHES | ACCENTS):
|
||||
text = text.replace(single_quote, "'") # \u0027
|
||||
text = text.replace('′', "'") # \u2032 prime
|
||||
text = text.replace('‵', "'") # \u2035 reversed prime
|
||||
text = text.replace('″', "''") # \u2033 double prime
|
||||
text = text.replace('‶', "''") # \u2036 reversed double prime
|
||||
text = text.replace('‴', "'''") # \u2034 triple prime
|
||||
text = text.replace('‷', "'''") # \u2037 reversed triple prime
|
||||
text = text.replace('⁗', "''''") # \u2057 quadruple prime
|
||||
|
||||
text = text.replace('…', '...').replace(' . . . ', ' ... ') # \u2026
|
||||
|
||||
for slash in SLASHES:
|
||||
text = text.replace(slash, '/')
|
||||
|
||||
#for tilde in TILDES:
|
||||
# text = text.replace(tilde, '~')
|
||||
|
||||
return text
|
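A quick check of what the helper does, on a made-up string:

    s = '“Smart” ‘quotes’, an – en dash, and an ellipsis…'
    print(normalize_text(s))
    # "Smart" 'quotes', an - en dash, and an ellipsis...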