update readme

shitao 2023-09-15 13:30:53 +08:00
parent d96bbcc5be
commit 1e88cd5af0
5 changed files with 30 additions and 25 deletions

View File

@@ -79,8 +79,9 @@ For Chinese, we trained our model on 24 A100(40G) GPUs with a large batch size o
- Chinese: [wudao](https://data.baai.ac.cn/details/WuDaoCorporaText), [cmrc2018](https://huggingface.co/datasets/cmrc2018), [dureader](https://github.com/baidu/DuReader),
[simclue](https://github.com/CLUEbenchmark/SimCLUE), [csl](https://arxiv.org/abs/2209.05034), [amazon_reviews_multi](https://huggingface.co/datasets/amazon_reviews_multi),
[wiki_atomic_edits](https://huggingface.co/datasets/wiki_atomic_edits), [mlqa](https://huggingface.co/datasets/mlqa),
-[xlsum](https://huggingface.co/datasets/csebuetnlp/xlsum), and other data collected by BAAI teams.
+[xlsum](https://huggingface.co/datasets/csebuetnlp/xlsum), and other data collected by BAAI teams from the internet (including QA, news, and papers).
+We release the dataset at https://data.baai.ac.cn/details/BAAI-MTP .
#### 2.2 high-quality supervised pairs

View File

@@ -16,7 +16,7 @@ def get_args():
parser.add_argument('--output_file', default=None, type=str)
parser.add_argument('--range_for_sampling', default=None, type=str, help="range to sample negatives")
parser.add_argument('--use_gpu_for_searching', action='store_true', help='use faiss-gpu')
-parser.add_argument('--negative_number', default=15, help='use faiss-gpu')
+parser.add_argument('--negative_number', default=15, help='the number of negatives')
parser.add_argument('--query_instruction_for_retrieval', default="")
return parser.parse_args()
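These flags drive the hard-negative mining step: `--range_for_sampling` bounds the retrieval ranks that negatives are drawn from, and `--negative_number` sets how many negatives to keep per query. A minimal sketch of that sampling idea, with assumed semantics rather than the repository's actual implementation (the function name and the `(start, end)` tuple are hypothetical), using a faiss inner-product index:

```python
import numpy as np
import faiss

# Hypothetical sketch: sample hard negatives from a rank window.
def sample_hard_negatives(q_embs, p_embs, range_for_sampling=(10, 100), negative_number=15):
    index = faiss.IndexFlatIP(p_embs.shape[1])  # inner product == cosine on normalized vectors
    index.add(p_embs.astype(np.float32))
    start, end = range_for_sampling
    _, ranked_ids = index.search(q_embs.astype(np.float32), end)  # assumes the corpus has >= end passages
    rng = np.random.default_rng(0)
    # Sample from ranks [start, end): skipping the top hits avoids
    # picking unlabeled positives as negatives.
    return [rng.choice(ids[start:end], size=negative_number, replace=False)
            for ids in ranked_ids]
```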

View File

@@ -132,7 +132,9 @@ If it doesn't work for you, you can see [FlagEmbedding](https://github.com/FlagO
from FlagEmbedding import FlagModel
sentences_1 = ["样例数据-1", "样例数据-2"]
sentences_2 = ["样例数据-3", "样例数据-4"]
-model = FlagModel('BAAI/bge-large-zh', query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:")
+model = FlagModel('BAAI/bge-large-zh-v1.5',
+                  query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
+                  use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation
embeddings_1 = model.encode(sentences_1)
embeddings_2 = model.encode(sentences_2)
similarity = embeddings_1 @ embeddings_2.T
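For short-query to long-passage retrieval, the same model object also exposes `encode_queries`, which prepends the retrieval instruction to each query automatically while passages are encoded without it; a minimal sketch reusing the model above:

```python
queries = ['query_1', 'query_2']
passages = ["样例文档-1", "样例文档-2"]
q_embeddings = model.encode_queries(queries)  # the instruction is added to queries only
p_embeddings = model.encode(passages)
scores = q_embeddings @ p_embeddings.T
```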
@@ -178,7 +180,7 @@ queries = ['query_1', 'query_2']
passages = ["样例文档-1", "样例文档-2"]
instruction = "为这个句子生成表示以用于检索相关文章:"
-model = SentenceTransformer('BAAI/bge-large-zh')
+model = SentenceTransformer('BAAI/bge-large-zh-v1.5')
q_embeddings = model.encode([instruction+q for q in queries], normalize_embeddings=True)
p_embeddings = model.encode(passages, normalize_embeddings=True)
scores = q_embeddings @ p_embeddings.T
@@ -189,7 +191,7 @@ scores = q_embeddings @ p_embeddings.T
You can use `bge` in langchain like this:
```python
from langchain.embeddings import HuggingFaceBgeEmbeddings
model_name = "BAAI/bge-small-en"
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
model = HuggingFaceBgeEmbeddings(
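The hunk cuts off mid-call; for reference, a sketch of the complete constructor call using the variables defined above (`HuggingFaceBgeEmbeddings` also accepts a `query_instruction` keyword, shown here with the documented English bge instruction):

```python
model = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    query_instruction="Represent this sentence for searching relevant passages: "
)
```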
@@ -213,8 +215,8 @@ import torch
sentences = ["样例数据-1", "样例数据-2"]
# Load model from HuggingFace Hub
-tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-zh')
-model = AutoModel.from_pretrained('BAAI/bge-large-zh')
+tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-zh-v1.5')
+model = AutoModel.from_pretrained('BAAI/bge-large-zh-v1.5')
model.eval()
# Tokenize sentences
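The hunk ends before the pooling step; BGE models take the embedding from the [CLS] (first) token and then L2-normalize it so that dot products equal cosine similarities. A minimal sketch of that continuation, assuming the `tokenizer`, `model`, and `sentences` objects above:

```python
# Tokenize, run the model, take the [CLS] embedding, and normalize.
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    model_output = model(**encoded_input)
    sentence_embeddings = model_output[0][:, 0]  # [CLS] token of the last hidden state
sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
```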
@@ -244,10 +246,10 @@ The reranker is optimized based on cross-entropy loss, so the relevance score is no
pip install -U FlagEmbedding
```
-Get relevance score:
+Get relevance scores (higher scores indicate more relevance):
```python
from FlagEmbedding import FlagReranker
-reranker = FlagReranker('BAAI/bge-reranker-base', use_fp16=True) #use fp16 can speed up computing
+reranker = FlagReranker('BAAI/bge-reranker-large', use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation
score = reranker.compute_score(['query', 'passage'])
print(score)
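`compute_score` also accepts a batch of pairs and returns one score per pair:

```python
scores = reranker.compute_score([['what is panda?', 'hi'],
                                 ['what is panda?', 'The giant panda is a bear species endemic to China.']])
print(scores)  # a list of floats, one per pair
```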
@@ -261,10 +263,10 @@ print(scores)
```python
import torch
-from transformers import AutoModelForSequenceClassification, AutoTokenizer, BatchEncoding, PreTrainedTokenizerFast
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
-tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-base')
-model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-base')
+tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-large')
+model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-large')
model.eval()
pairs = [['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']]
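The hunk stops before the scoring step; with a sequence-classification reranker the single output logit is the relevance score. A minimal sketch of the continuation, assuming the `tokenizer`, `model`, and `pairs` objects above:

```python
# Tokenize each (query, passage) pair jointly and read the logit as the score.
with torch.no_grad():
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
    scores = model(**inputs, return_dict=True).logits.view(-1, ).float()
print(scores)
```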
@@ -343,7 +345,7 @@ See [C_MTEB](https://github.com/FlagOpen/FlagEmbedding/blob/master/C_MTEB/) for
| [BAAI/bge-reranker-base](https://huggingface.co/BAAI/bge-reranker-base) | 67.28 | 63.95 | 60.45 | 35.46 | 81.26 | 84.1 | 65.42 |
| [BAAI/bge-reranker-large](https://huggingface.co/BAAI/bge-reranker-large) | 67.6 | 64.03 | 61.44 | 37.16 | 82.15 | 84.18 | 66.09 |
-\* : T2RerankingZh2En and T2RerankingEn2Zh are cross-language retrieval task
+\* : T2RerankingZh2En and T2RerankingEn2Zh are cross-language retrieval tasks
## Train

View File

@@ -137,7 +137,9 @@ pip install -U FlagEmbedding
```python
from FlagEmbedding import FlagModel
sentences = ["样例数据-1", "样例数据-2"]
-model = FlagModel('BAAI/bge-large-zh', query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:")
+model = FlagModel('BAAI/bge-large-zh-v1.5',
+                  query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
+                  use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance drop
embeddings_1 = model.encode(sentences)
embeddings_2 = model.encode(sentences)
similarity = embeddings_1 @ embeddings_2.T
@@ -171,7 +173,7 @@ pip install -U sentence-transformers
```python
from sentence_transformers import SentenceTransformer
sentences = ["样例数据-1", "样例数据-2"]
-model = SentenceTransformer('BAAI/bge-large-zh')
+model = SentenceTransformer('BAAI/bge-large-zh-v1.5')
embeddings_1 = model.encode(sentences, normalize_embeddings=True)
embeddings_2 = model.encode(sentences, normalize_embeddings=True)
similarity = embeddings_1 @ embeddings_2.T
@@ -184,7 +186,7 @@ print(similarity)
queries = ['query_1', 'query_2']
passages = ["样例文档-1", "样例文档-2"]
instruction = "为这个句子生成表示以用于检索相关文章:"
-model = SentenceTransformer('BAAI/bge-large-zh')
+model = SentenceTransformer('BAAI/bge-large-zh-v1.5')
q_embeddings = model.encode([instruction+q for q in queries], normalize_embeddings=True)
p_embeddings = model.encode(passages, normalize_embeddings=True)
scores = q_embeddings @ p_embeddings.T
@@ -198,7 +200,7 @@ scores = q_embeddings @ p_embeddings.T
Use the bge model in LangChain:
```python
from langchain.embeddings import HuggingFaceBgeEmbeddings
model_name = "BAAI/bge-small-en"
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
model = HuggingFaceBgeEmbeddings(
@@ -218,8 +220,8 @@ import torch
sentences = ["样例数据-1", "样例数据-2"]
# Load model from HuggingFace Hub
-tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-zh')
-model = AutoModel.from_pretrained('BAAI/bge-large-zh')
+tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-zh-v1.5')
+model = AutoModel.from_pretrained('BAAI/bge-large-zh-v1.5')
# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
@@ -248,10 +250,10 @@ print("Sentence embeddings:", sentence_embeddings)
pip install -U FlagEmbedding
```
-Get relevance scores:
+Get relevance scores (higher scores indicate more relevance):
```python
from FlagEmbedding import FlagReranker
-reranker = FlagReranker('BAAI/bge-reranker-base', use_fp16=True) # Setting fp16 to True speeds up inference with a negligible performance drop
+reranker = FlagReranker('BAAI/bge-reranker-large', use_fp16=True) # Setting fp16 to True speeds up inference with a negligible performance drop
score = reranker.compute_score(['query', 'passage']) # compute the similarity between the query and passage
print(score)
@@ -265,10 +267,10 @@ print(scores)
```python
import torch
-from transformers import AutoModelForSequenceClassification, AutoTokenizer, BatchEncoding, PreTrainedTokenizerFast
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
-tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-base')
-model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-base')
+tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-large')
+model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-large')
model.eval()
pairs = [['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']]

View File

@@ -5,7 +5,7 @@ with open("README.md", mode="r", encoding="utf-8") as readme_file:
setup(
name='FlagEmbedding',
-version='1.1.0',
+version='1.1.1',
description='FlagEmbedding',
long_description=readme,
long_description_content_type="text/markdown",