From a2fc545f20ccae6bc7a5e072491aba533e7d4f5a Mon Sep 17 00:00:00 2001
From: shitao
Date: Fri, 4 Aug 2023 11:20:20 +0800
Subject: [PATCH] update hf_repo name

---
 README.md                         | 28 +++++++++++++++-------------
 README_zh.md                      | 20 ++++++++++----------
 benchmark/README.md               |  6 +++---
 benchmark/eval_C-MTEB.py          |  6 +++---
 benchmark/eval_MTEB.py            |  6 +++---
 examples/finetune/README.md       |  2 +-
 examples/pretrain/README.md       |  2 +-
 examples/search_demo/arguments.py |  2 +-
 examples/search_demo/readme.md    |  2 +-
 flag_embedding/README.md          |  2 +-
 10 files changed, 39 insertions(+), 37 deletions(-)

diff --git a/README.md b/README.md
index fab27e0..ec5a7e9 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@
 Build
-Build
+Build
@@ -39,9 +39,9 @@ It can also be used in vector databases for LLMs.
 ## Model List
 | Model | Language | Description | query instruction for retrieval |
 |:-------------------------------|:--------:| :--------:| :--------:|
-| [BAAI/baai-general-embedding-large-en-instruction](https://huggingface.co/BAAI/baai-general-embedding-large-en-instruction) | English | :trophy: rank **1st** in [MTEB](https://huggingface.co/spaces/mteb/leaderboard) leaderboard | `Represent this sentence for searching relevant passages: ` |
-| [BAAI/baai-general-embedding-large-zh-instruction](https://huggingface.co/BAAI/baai-general-embedding-large-zh-instruction) | Chinese | :trophy: rank **1st** in [C-MTEB](https://github.com/FlagOpen/FlagEmbedding/tree/master/benchmark) benchmark | `为这个句子生成表示以用于检索相关文章:` |
-| [BAAI/baai-general-embedding-large-zh](https://huggingface.co/BAAI/baai-general-embedding-large-zh) | Chinese | rank **2nd** in [C-MTEB](https://github.com/FlagOpen/FlagEmbedding/tree/master/benchmark) benchmark | |
+| [BAAI/bge-large-en](https://huggingface.co/BAAI/bge-large-en) | English | :trophy: rank **1st** in [MTEB](https://huggingface.co/spaces/mteb/leaderboard) leaderboard | `Represent this sentence for searching relevant passages: ` |
+| [BAAI/bge-large-zh](https://huggingface.co/BAAI/bge-large-zh) | Chinese | :trophy: rank **1st** in [C-MTEB](https://github.com/FlagOpen/FlagEmbedding/tree/master/benchmark) benchmark | `为这个句子生成表示以用于检索相关文章:` |
+| [BAAI/bge-large-zh-noinstruct](https://huggingface.co/BAAI/bge-large-zh-noinstruct) | Chinese | rank **2nd** in [C-MTEB](https://github.com/FlagOpen/FlagEmbedding/tree/master/benchmark) benchmark | |
 
 ## Usage
 
@@ -59,17 +59,19 @@ Then you can use the model like this:
 ```python
 from sentence_transformers import SentenceTransformer
 sentences = ["样例数据-1", "样例数据-2"]
-model = SentenceTransformer('BAAI/baai-general-embedding-large-zh-instruction')
+model = SentenceTransformer('BAAI/bge-large-zh')
 embeddings = model.encode(sentences, normalize_embeddings=True)
 print(embeddings)
 ```
-For retrieval task, when you use the model whose name ends with `-instruction`,
+For retrieval task, each query should start with an instruction (see the [Model List](https://github.com/FlagOpen/FlagEmbedding/tree/master#model-list) for instructions).
 ```python
+from sentence_transformers import SentenceTransformer
 queries = ["手机开不了机怎么办?"]
 passages = ["样例段落-1", "样例段落-2"]
 instruction = "为这个句子生成表示以用于检索相关文章:"
-model = SentenceTransformer('BAAI/baai-general-embedding-large-zh-instruction')
+
+model = SentenceTransformer('BAAI/bge-large-zh')
 q_embeddings = model.encode([instruction+q for q in queries], normalize_embeddings=True)
 p_embeddings = model.encode(passages, normalize_embeddings=True)
 scores = q_embeddings @ p_embeddings.T
@@ -86,8 +88,8 @@
 import torch
 sentences = ["样例数据-1", "样例数据-2"]
 
 # Load model from HuggingFace Hub
-tokenizer = AutoTokenizer.from_pretrained('BAAI/baai-general-embedding-large-zh-instruction')
-model = AutoModel.from_pretrained('BAAI/baai-general-embedding-large-zh-instruction')
+tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-zh')
+model = AutoModel.from_pretrained('BAAI/bge-large-zh')
 
 # Tokenize sentences
 encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
@@ -113,7 +115,7 @@ For more details and evaluation scripts, see [benchmark](benchmark/README.md).
 | Model Name | Model Size (GB) | Dimension | Sequence Length | Average (56) | Retrieval (15) |Clustering (11) | Pair Classification (3) | Reranking (4) | STS (10) | Summarization (1) | Classification (12) |
 |:----:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
-| [**baai-general-embedding-large-en-instruction**](https://huggingface.co/BAAI/baai-general-embedding-large-en-instruction) | 0.67 | 1024 | 512 | **63.34** | **53.23** | 48.47 | 86.34 | 59.87 | 81.89 | 30.55 | 72.28 |
+| [bge-large-en](https://huggingface.co/BAAI/bge-large-en) | 0.67 | 1024 | 512 | **63.98** | **53.9** | **46.98** | 85.8 | **59.48** | 81.56 | 32.06 | **76.21** |
 | [gte-large](https://huggingface.co/thenlper/gte-large) | 0.67 | 1024 | 512 | 63.13 | 52.22 | 46.84 | 85.00 | 59.13 | 83.35 | 31.66 | 73.33 |
 | [gte-base](https://huggingface.co/thenlper/gte-base) | 0.22 | 768 | 512 | 62.39 | 51.14 | 46.2 | 84.57 | 58.61 | 82.3 | 31.17 | 73.01 |
 | [e5-large-v2](https://huggingface.co/intfloat/e5-large-v2) | 1.34 | 1024| 512 | 62.25 | 50.56 | 44.49 | 86.03 | 56.61 | 82.05 | 30.19 | 75.24 |
@@ -138,8 +140,8 @@ Please refer to [benchmark](benchmark/README.md) for a detailed introduction.
 | Model | Embedding dimension | Avg | Retrieval | STS | PairClassification | Classification | Reranking | Clustering |
 |:-------------------------------|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|
-| [**baai-general-embedding-large-zh-instruction**](https://huggingface.co/BAAI/baai-general-embedding-large-zh-instruction) | 1024 | **64.20** | **71.53** | **53.23** | **78.94** | 72.26 | **65.11** | 48.39 |
-| [baai-general-embedding-large-zh](https://huggingface.co/BAAI/baai-general-embedding-large-zh) | 1024 | 63.53 | 70.55 | 50.98 | 76.77 | **72.49** | 64.91 | **50.01** |
+| [**bge-large-zh**](https://huggingface.co/BAAI/bge-large-zh) | 1024 | **64.20** | **71.53** | **53.23** | **78.94** | 72.26 | **65.11** | 48.39 |
+| [bge-large-zh-noinstruct](https://huggingface.co/BAAI/bge-large-zh-noinstruct) | 1024 | 63.53 | 70.55 | 50.98 | 76.77 | **72.49** | 64.91 | **50.01** |
 | [m3e-base](https://huggingface.co/moka-ai/m3e-base) | 768 | 57.10 |56.91 | 48.15 | 63.99 | 70.28 | 59.34 | 47.68 |
 | [m3e-large](https://huggingface.co/moka-ai/m3e-large) | 1024 | 57.05 |54.75 | 48.64 | 64.3 | 71.22 | 59.66 | 48.88 |
 | [text-embedding-ada-002(OpenAI)](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings) | 1536 | 53.02 | 52.0 | 40.61 | 69.56 | 67.38 | 54.28 | 45.68 |
@@ -177,7 +179,7 @@ We used the AdamW optimizer with a learning rate of 2e-5.
 We fine-tune the model using a contrastive objective. The format of input data is a triple `(query, positive, negative)`.
 Besides the negative in the triple, we also adopt the in-batch negatives strategy.
-We employ the [cross-device negatives sharing method](https://github.com/microsoft/MoPQ) to sharing negatives among different GPUs,
+We employ the cross-device negatives sharing method to share negatives among different GPUs,
 which can dramatically **increase the number of negatives**.
 We trained our model on 48 A100(40G) GPUs with a large batch size of 32,768 (so there are **65,535** negatives for each query in a batch).
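As an illustration of the cross-device negatives sharing described in the hunk above (not part of the patch): a minimal sketch of an in-batch, all-gather-based contrastive loss, assuming `torch.distributed` is initialized with one process per GPU. The helper names `gather_across_gpus` and `contrastive_loss` and the temperature value are hypothetical, and the explicit hard negative from each triple is omitted; only the in-batch / cross-device part is shown.

```python
# Rough sketch (not the repository's training code) of contrastive training
# with in-batch negatives shared across GPUs via all_gather.
import torch
import torch.distributed as dist
import torch.nn.functional as F

def gather_across_gpus(t: torch.Tensor) -> torch.Tensor:
    """All-gather a (batch, dim) tensor so every rank sees every other rank's rows."""
    gathered = [torch.zeros_like(t) for _ in range(dist.get_world_size())]
    dist.all_gather(gathered, t)
    gathered[dist.get_rank()] = t  # keep the local tensor so its gradients survive
    return torch.cat(gathered, dim=0)

def contrastive_loss(q_emb: torch.Tensor, p_emb: torch.Tensor, temperature: float = 0.05):
    """q_emb, p_emb: L2-normalized (batch, dim) query / passage embeddings on this rank."""
    all_p = gather_across_gpus(p_emb)              # (batch * world_size, dim)
    logits = q_emb @ all_p.T / temperature         # every gathered passage is a candidate
    offset = dist.get_rank() * q_emb.size(0)       # position of this rank's block of positives
    labels = torch.arange(q_emb.size(0), device=q_emb.device) + offset
    return F.cross_entropy(logits, labels)         # InfoNCE over the shared negatives
```

Re-inserting the local tensor after `all_gather` is the usual trick to keep gradients flowing for the local shard, since `all_gather` itself does not backpropagate.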
diff --git a/README_zh.md b/README_zh.md
index 1e7e5f1..473950d 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -38,9 +38,9 @@
 ## Model List
 | Model | Language | Description | query instruction for retrieval |
 |:-------------------------------|:--------:| :--------:| :--------:|
-| [BAAI/baai-general-embedding-large-en-instruction](https://huggingface.co/BAAI/baai-general-embedding-large-en-instruction) | English | :trophy: 在 [MTEB](https://huggingface.co/spaces/mteb/leaderboard) 榜单上排名第一 | `Represent this sentence for searching relevant passages: ` |
-| [BAAI/baai-general-embedding-large-zh-instruction](https://huggingface.co/BAAI/baai-general-embedding-large-zh-instruction) | Chinese | :trophy: 在 [C-MTEB](https://github.com/FlagOpen/FlagEmbedding/tree/master/benchmark) 榜单上排名第一 | `为这个句子生成表示以用于检索相关文章:` |
-| [BAAI/baai-general-embedding-large-zh](https://huggingface.co/BAAI/baai-general-embedding-large-zh) | Chinese | 在 [C-MTEB](https://github.com/FlagOpen/FlagEmbedding/tree/master/benchmark) 榜单上排名第二 | -- |
+| [BAAI/bge-large-en](https://huggingface.co/BAAI/bge-large-en) | English | :trophy: 在 [MTEB](https://huggingface.co/spaces/mteb/leaderboard) 榜单上排名第一 | `Represent this sentence for searching relevant passages: ` |
+| [BAAI/bge-large-zh](https://huggingface.co/BAAI/bge-large-zh) | Chinese | :trophy: 在 [C-MTEB](https://github.com/FlagOpen/FlagEmbedding/tree/master/benchmark) 榜单上排名第一 | `为这个句子生成表示以用于检索相关文章:` |
+| [BAAI/bge-large-zh-noinstruct](https://huggingface.co/BAAI/bge-large-zh-noinstruct) | Chinese | 在 [C-MTEB](https://github.com/FlagOpen/FlagEmbedding/tree/master/benchmark) 榜单上排名第二 | -- |
 
 ## Usage
 
@@ -58,7 +58,7 @@ pip install -U sentence-transformers
 ```python
 from sentence_transformers import SentenceTransformer
 sentences = ["样例数据-1", "样例数据-2"]
-model = SentenceTransformer('BAAI/baai-general-embedding-large-zh-instruction')
+model = SentenceTransformer('BAAI/bge-large-zh')
 embeddings = model.encode(sentences, normalize_embeddings=True)
 print(embeddings)
 ```
@@ -68,7 +68,7 @@ print(embeddings)
 queries = ["手机开不了机怎么办?"]
 passages = ["样例段落-1", "样例段落-2"]
 instruction = "为这个句子生成表示以用于检索相关文章:"
-model = SentenceTransformer('BAAI/baai-general-embedding-large-zh-instruction')
+model = SentenceTransformer('BAAI/bge-large-zh')
 q_embeddings = model.encode([instruction+q for q in queries], normalize_embeddings=True)
 p_embeddings = model.encode(passages, normalize_embeddings=True)
 scores = q_embeddings @ p_embeddings.T
@@ -84,8 +84,8 @@
 import torch
 sentences = ["样例数据-1", "样例数据-2"]
 
 # Load model from HuggingFace Hub
-tokenizer = AutoTokenizer.from_pretrained('BAAI/baai-general-embedding-large-zh-instruction')
-model = AutoModel.from_pretrained('BAAI/baai-general-embedding-large-zh-instruction')
+tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-zh')
+model = AutoModel.from_pretrained('BAAI/bge-large-zh')
 
 # Tokenize sentences
 encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
@@ -111,7 +111,7 @@ print("Sentence embeddings:", sentence_embeddings)
 | Model Name | Model Size (GB) | Dimension | Sequence Length | Average (56) | Retrieval (15) |Clustering (11) | Pair Classification (3) | Reranking (4) | STS (10) | Summarization (1) | Classification (12) |
 |:----:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
-| [**baai-general-embedding-large-en-instruction**](https://huggingface.co/BAAI/baai-general-embedding-large-en-instruction) | 0.67 | 1024 | 512 | **63.34** | **53.23** | 48.47 | 86.34 | 59.87 | 81.89 | 30.55 | 72.28 |
+| [**baai-general-embedding-large-en-instruction**](https://huggingface.co/BAAI/bge-large-en) | 0.67 | 1024 | 512 | **63.34** | **53.23** | 48.47 | 86.34 | 59.87 | 81.89 | 30.55 | 72.28 |
 | [gte-large](https://huggingface.co/thenlper/gte-large) | 0.67 | 1024 | 512 | 63.13 | 52.22 | 46.84 | 85.00 | 59.13 | 83.35 | 31.66 | 73.33 |
 | [gte-base](https://huggingface.co/thenlper/gte-base) | 0.22 | 768 | 512 | 62.39 | 51.14 | 46.2 | 84.57 | 58.61 | 82.3 | 31.17 | 73.01 |
 | [e5-large-v2](https://huggingface.co/intfloat/e5-large-v2) | 1.34 | 1024| 512 | 62.25 | 50.56 | 44.49 | 86.03 | 56.61 | 82.05 | 30.19 | 75.24 |
@@ -137,8 +137,8 @@ print("Sentence embeddings:", sentence_embeddings)
 | Model | Embedding dimension | Avg | Retrieval | STS | PairClassification | Classification | Reranking | Clustering |
 |:-------------------------------|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|
-| [**baai-general-embedding-large-zh-instruction**](https://huggingface.co/BAAI/baai-general-embedding-large-zh-instruction) | 1024 | **64.20** | **71.53** | **53.23** | **78.94** | 72.26 | **65.11** | 48.39 |
-| [baai-general-embedding-large-zh](https://huggingface.co/BAAI/baai-general-embedding-large-zh) | 1024 | 63.53 | 70.55 | 50.98 | 76.77 | **72.49** | 64.91 | **50.01** |
+| [**baai-general-embedding-large-zh-instruction**](https://huggingface.co/BAAI/bge-large-zh) | 1024 | **64.20** | **71.53** | **53.23** | **78.94** | 72.26 | **65.11** | 48.39 |
+| [baai-general-embedding-large-zh](https://huggingface.co/BAAI/bge-large-zh-noinstruct) | 1024 | 63.53 | 70.55 | 50.98 | 76.77 | **72.49** | 64.91 | **50.01** |
 | [m3e-base](https://huggingface.co/moka-ai/m3e-base) | 768 | 57.10 |56.91 | 48.15 | 63.99 | 70.28 | 59.34 | 47.68 |
 | [m3e-large](https://huggingface.co/moka-ai/m3e-large) | 1024 | 57.05 |54.75 | 48.64 | 64.3 | 71.22 | 59.66 | 48.88 |
 | [text-embedding-ada-002(OpenAI)](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings) | 1536 | 53.02 | 52.0 | 40.61 | 69.56 | 67.38 | 54.28 | 45.68 |
diff --git a/benchmark/README.md b/benchmark/README.md
index bed7665..36774c0 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -27,7 +27,7 @@ results = evaluation.run(model, output_folder=f"results/{model_name}")
 * Reproduce the results of flag_embedding
 Using the provided python script (see [eval_C-MTEB.py]() )
 ```bash
-python eval_C-MTEB.py --model_name_or_path BAAI/baai-general-embedding-large-zh-instruction
+python eval_C-MTEB.py --model_name_or_path BAAI/bge-large-zh
 ```
 
 * Using a custom model
@@ -59,8 +59,8 @@ evaluation.run(model)
 ### overall
 | Model | Embedding dimension | Avg | Retrieval | STS | PairClassification | Classification | Reranking | Clustering |
 |:-------------------------------|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|
-| [**baai-general-embedding-large-zh-instruction**](https://huggingface.co/BAAI/baai-general-embedding-large-zh-instruction) | 1024 | **64.20** | **71.53** | **53.23** | **78.94** | 72.26 | **65.11** | 48.39 |
-| [baai-general-embedding-large-zh](https://huggingface.co/BAAI/baai-general-embedding-large-zh) | 1024 | 63.53 | 70.55 | 50.98 | 76.77 | **72.49** | 64.91 | **50.01** |
+| [**baai-general-embedding-large-zh-instruction**](https://huggingface.co/BAAI/bge-large-zh) | 1024 | **64.20** | **71.53** | **53.23** | **78.94** | 72.26 | **65.11** | 48.39 |
+| [baai-general-embedding-large-zh](https://huggingface.co/BAAI/bge-large-zh-noinstruct) | 1024 | 63.53 | 70.55 | 50.98 | 76.77 | **72.49** | 64.91 | **50.01** |
 | [m3e-base](https://huggingface.co/moka-ai/m3e-base) | 768 | 57.10 |56.91 | 48.15 | 63.99 | 70.28 | 59.34 | 47.68 |
 | [m3e-large](https://huggingface.co/moka-ai/m3e-large) | 1024 | 57.05 |54.75 | 48.64 | 64.3 | 71.22 | 59.66 | 48.88 |
 | [text-embedding-ada-002(OpenAI)](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings) | 1536 | 53.02 | 52.0 | 40.61 | 69.56 | 67.38 | 54.28 | 45.68 |
diff --git a/benchmark/eval_C-MTEB.py b/benchmark/eval_C-MTEB.py
index b8e6e46..3a4edf5 100644
--- a/benchmark/eval_C-MTEB.py
+++ b/benchmark/eval_C-MTEB.py
@@ -6,14 +6,14 @@ from models import UniversalModel
 from mteb import MTEB
 
 query_instruction_for_retrieval_dict = {
-    "BAAI/baai-general-embedding-large-zh-instruction": "为这个句子生成表示以用于检索相关文章:",
-    "BAAI/baai-general-embedding-large-zh": None
+    "BAAI/bge-large-zh": "为这个句子生成表示以用于检索相关文章:",
+    "BAAI/bge-large-zh-noinstruct": None
 }
 
 def get_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--model_name_or_path', default="BAAI/baai-general-embedding-large-zh-instruction", type=str)
+    parser.add_argument('--model_name_or_path', default="BAAI/bge-large-zh", type=str)
     parser.add_argument('--task_type', default=None, type=str)
     return parser.parse_args()
 
diff --git a/benchmark/eval_MTEB.py b/benchmark/eval_MTEB.py
index 6a19b51..914fb74 100644
--- a/benchmark/eval_MTEB.py
+++ b/benchmark/eval_MTEB.py
@@ -4,13 +4,13 @@ from models import UniversalModel
 from mteb import MTEB
 
 query_instruction_for_retrieval_dict = {
-    "BAAI/baai-general-embedding-large-en-instruction": "Represent this sentence for searching relevant passages: ",
+    "BAAI/bge-large-en": "Represent this sentence for searching relevant passages: ",
 }
 
 def get_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--model_name_or_path', default="BAAI/baai-general-embedding-large-en-instruction", type=str)
+    parser.add_argument('--model_name_or_path', default="BAAI/bge-large-en", type=str)
     parser.add_argument('--task_type', default=None, type=str)
     return parser.parse_args()
 
@@ -36,5 +36,5 @@ if __name__ == '__main__':
                            normlized=False,
                            query_instruction_for_retrieval=instruction)
 
-    evaluation = MTEB(tasks=[task], task_langs=['zh'])
+    evaluation = MTEB(tasks=[task], task_langs=['en'])
     evaluation.run(model, output_folder=f"en_results/{args.model_name_or_path.split('/')[-1]}")
diff --git a/examples/finetune/README.md b/examples/finetune/README.md
index 2ac4e35..14941a9 100644
--- a/examples/finetune/README.md
+++ b/examples/finetune/README.md
@@ -27,7 +27,7 @@ See [toy_finetune_data.jsonl]() for a toy data file.
 torchrun --nproc_per_node {number of gpus} \
 -m finetune.run \
 --output_dir {path to save model} \
---model_name_or_path BAAI/baai-general-embedding-large-zh \
+--model_name_or_path BAAI/bge-large-zh-noinstruct \
 --train_data {data file} \
 --learning_rate 1e-5 \
 --num_train_epochs 5 \
diff --git a/examples/pretrain/README.md b/examples/pretrain/README.md
index 205fe23..cc3d967 100644
--- a/examples/pretrain/README.md
+++ b/examples/pretrain/README.md
@@ -27,7 +27,7 @@ See [toy_pretrain_data.jsonl]() for a toy data file.
 torchrun --nproc_per_node {number of gpus} \
 -m retromae_pretrain.run \
 --output_dir {path to save model} \
---model_name_or_path BAAI/baai-general-embedding-large-zh \
+--model_name_or_path BAAI/bge-large-zh-noinstruct \
 --train_data toy_pretrain_data.jsonl \
 --learning_rate 2e-5 \
 --num_train_epochs 5 \
diff --git a/examples/search_demo/arguments.py b/examples/search_demo/arguments.py
index da799ef..483b709 100644
--- a/examples/search_demo/arguments.py
+++ b/examples/search_demo/arguments.py
@@ -4,7 +4,7 @@ from dataclasses import dataclass, field
 
 @dataclass
 class ModelArguments:
     model_name_or_path: str = field(
-        default='BAAI/baai-general-embedding-large-zh',
+        default='BAAI/bge-large-zh-noinstruct',
         metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
     )
 
diff --git a/examples/search_demo/readme.md b/examples/search_demo/readme.md
index 4dacc56..bb41f0f 100644
--- a/examples/search_demo/readme.md
+++ b/examples/search_demo/readme.md
@@ -57,7 +57,7 @@ This script will build a Q&A dialogue scenario.
 from tool import LocalDatasetLoader, BMVectorIndex, Agent
 
 loader = LocalDatasetLoader(data_path="./data/dataset", embedding_path="./data/emb/data.npy")
-index = BMVectorIndex(model_path="BAAI/baai-general-embedding-large-zh-instruction",
+index = BMVectorIndex(model_path="BAAI/bge-large-zh",
                       bm_index_path="./data/index",
                       data_loader=loader)
 agent = Agent(index)
diff --git a/flag_embedding/README.md b/flag_embedding/README.md
index 5c6a346..ebcb1a3 100644
--- a/flag_embedding/README.md
+++ b/flag_embedding/README.md
@@ -52,7 +52,7 @@ See [examples/finetune](../examples/finetune) for a toy data and training example.
 torchrun --nproc_per_node {number of gpus} \
 -m finetune.run \
 --output_dir {path to save model} \
---model_name_or_path BAAI/baai-general-embedding-large-zh \
+--model_name_or_path BAAI/bge-large-zh-noinstruct \
 --train_data {data file} \
 --learning_rate 1e-5 \
 --num_train_epochs 5 \
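For context beyond the patch itself: the `--train_data` file passed to the fine-tuning commands above holds the `(query, positive, negative)` triples described in the README hunk earlier. A minimal sketch of producing such a file follows; the JSONL layout and the field names `query`, `pos`, and `neg` are assumptions, not something this patch specifies.

```python
# Hypothetical sketch: write a toy JSONL training file for --train_data.
# Field names "query", "pos", "neg" are assumed, not taken from this patch.
import json

triples = [
    {"query": "手机开不了机怎么办?", "pos": ["样例段落-1"], "neg": ["样例段落-2"]},
]

with open("toy_finetune_data.jsonl", "w", encoding="utf-8") as f:
    for t in triples:
        f.write(json.dumps(t, ensure_ascii=False) + "\n")
```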