Mirror of https://github.com/FlagOpen/FlagEmbedding.git, synced 2025-06-27 02:39:58 +00:00

Commit 1e88cd5af0 (parent d96bbcc5be): update readme
@@ -79,8 +79,9 @@ For Chinese, we trained our model on 24 A100 (40G) GPUs with a large batch size of
 - Chinese: [wudao](https://data.baai.ac.cn/details/WuDaoCorporaText), [cmrc2018](https://huggingface.co/datasets/cmrc2018), [dureader](https://github.com/baidu/DuReader),
 [simclue](https://github.com/CLUEbenchmark/SimCLUE), [csl](https://arxiv.org/abs/2209.05034), [amazon_reviews_multi](https://huggingface.co/datasets/amazon_reviews_multi),
 [wiki_atomic_edits](https://huggingface.co/datasets/wiki_atomic_edits), [mlqa](https://huggingface.co/datasets/mlqa),
-[xlsum](https://huggingface.co/datasets/csebuetnlp/xlsum), and other data collected by BAAI teams.
+[xlsum](https://huggingface.co/datasets/csebuetnlp/xlsum), and other data collected by BAAI teams from the internet (including QA, news, and papers).

+We release the dataset at https://data.baai.ac.cn/details/BAAI-MTP .
+
 #### 2.2 high-quality supervised pairs
@@ -16,7 +16,7 @@ def get_args():
     parser.add_argument('--output_file', default=None, type=str)
     parser.add_argument('--range_for_sampling', default=None, type=str, help="range to sample negatives")
     parser.add_argument('--use_gpu_for_searching', action='store_true', help='use faiss-gpu')
-    parser.add_argument('--negative_number', default=15, help='use faiss-gpu')
+    parser.add_argument('--negative_number', default=15, help='the number of negatives')
     parser.add_argument('--query_instruction_for_retrieval', default="")

     return parser.parse_args()
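Note: these flags come from the repository's hard-negative mining script. A minimal sketch of how `--range_for_sampling` and `--negative_number` typically combine, under the assumption (not quoted from the script) that negatives are sampled from a rank window of retrieval results:

```python
import random

def sample_hard_negatives(ranked_doc_ids, range_for_sampling="10-100", negative_number=15):
    """Sample hard negatives for one query from a window of its retrieval ranking."""
    start, end = (int(x) for x in range_for_sampling.split('-'))
    candidates = ranked_doc_ids[start:end]  # skip the top ranks, which are likely positives
    return random.sample(candidates, min(negative_number, len(candidates)))
```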
README.md (24 changed lines)
@@ -132,7 +132,9 @@ If it doesn't work for you, you can see [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)
 from FlagEmbedding import FlagModel
 sentences_1 = ["样例数据-1", "样例数据-2"]
 sentences_2 = ["样例数据-3", "样例数据-4"]
-model = FlagModel('BAAI/bge-large-zh', query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:")
+model = FlagModel('BAAI/bge-large-zh-v1.5',
+                  query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
+                  use_fp16=True)  # Setting use_fp16 to True speeds up computation with a slight performance degradation
 embeddings_1 = model.encode(sentences_1)
 embeddings_2 = model.encode(sentences_2)
 similarity = embeddings_1 @ embeddings_2.T
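Note: for retrieval, `FlagModel` also provides `encode_queries`, which prepends `query_instruction_for_retrieval` to each query automatically, while passages are encoded with plain `encode`. A minimal sketch reusing the model above:

```python
from FlagEmbedding import FlagModel

queries = ['query_1', 'query_2']
passages = ["样例文档-1", "样例文档-2"]
model = FlagModel('BAAI/bge-large-zh-v1.5',
                  query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
                  use_fp16=True)
q_embeddings = model.encode_queries(queries)  # instruction is prepended to each query
p_embeddings = model.encode(passages)         # passages need no instruction
scores = q_embeddings @ p_embeddings.T
```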
@@ -178,7 +180,7 @@ queries = ['query_1', 'query_2']
 passages = ["样例文档-1", "样例文档-2"]
 instruction = "为这个句子生成表示以用于检索相关文章:"

-model = SentenceTransformer('BAAI/bge-large-zh')
+model = SentenceTransformer('BAAI/bge-large-zh-v1.5')
 q_embeddings = model.encode([instruction+q for q in queries], normalize_embeddings=True)
 p_embeddings = model.encode(passages, normalize_embeddings=True)
 scores = q_embeddings @ p_embeddings.T
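Because `normalize_embeddings=True` returns unit-length vectors, the matrix product above is exactly cosine similarity. A small self-contained check of that identity (illustration only, random vectors):

```python
import numpy as np

q = np.random.randn(2, 1024)
q /= np.linalg.norm(q, axis=1, keepdims=True)  # what normalize_embeddings=True does
p = np.random.randn(2, 1024)
p /= np.linalg.norm(p, axis=1, keepdims=True)

dot = q @ p.T  # same operation as `scores` above
cos = dot / (np.linalg.norm(q, axis=1)[:, None] * np.linalg.norm(p, axis=1)[None, :])
assert np.allclose(dot, cos)  # dot product == cosine similarity for unit vectors
```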
@@ -189,7 +191,7 @@ scores = q_embeddings @ p_embeddings.T
 You can use `bge` in langchain like this:
 ```python
 from langchain.embeddings import HuggingFaceBgeEmbeddings
-model_name = "BAAI/bge-small-en"
+model_name = "BAAI/bge-large-en-v1.5"
 model_kwargs = {'device': 'cuda'}
 encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
 model = HuggingFaceBgeEmbeddings(
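The hunk is cut off in the middle of the constructor. A sketch of how the call is typically completed with the names defined above (the `query_instruction` value is an assumption, mirroring the BGE retrieval instruction for English models):

```python
model = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    query_instruction="Represent this sentence for searching relevant passages:"
)
query_vector = model.embed_query("what is a panda?")  # instruction is prepended to queries
doc_vectors = model.embed_documents(["The giant panda is a bear species endemic to China."])
```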
@@ -213,8 +215,8 @@ import torch
 sentences = ["样例数据-1", "样例数据-2"]

 # Load model from HuggingFace Hub
-tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-zh')
-model = AutoModel.from_pretrained('BAAI/bge-large-zh')
+tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-zh-v1.5')
+model = AutoModel.from_pretrained('BAAI/bge-large-zh-v1.5')
 model.eval()

 # Tokenize sentences
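The hunk ends just before the encoding step. The BGE recipe this snippet follows is a forward pass, CLS pooling, then L2 normalization; a sketch of the continuation (its last line matches the `print("Sentence embeddings:", ...)` context visible in a later hunk):

```python
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute embeddings: take the [CLS] token vector as the sentence embedding
with torch.no_grad():
    model_output = model(**encoded_input)
    sentence_embeddings = model_output[0][:, 0]
# L2-normalize so that dot products equal cosine similarities
sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
print("Sentence embeddings:", sentence_embeddings)
```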
@@ -244,10 +246,10 @@ The reranker is optimized based on cross-entropy loss, so the relevance score is not
 pip install -U FlagEmbedding
 ```

-Get relevance score:
+Get relevance scores (higher scores indicate more relevance):
 ```python
 from FlagEmbedding import FlagReranker
-reranker = FlagReranker('BAAI/bge-reranker-base', use_fp16=True)  # use fp16 can speed up computing
+reranker = FlagReranker('BAAI/bge-reranker-large', use_fp16=True)  # Setting use_fp16 to True speeds up computation with a slight performance degradation

 score = reranker.compute_score(['query', 'passage'])
 print(score)
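`compute_score` also accepts a list of pairs and returns one score per pair, which is where the `print(scores)` context in the next hunk comes from; a short sketch with the reranker above:

```python
scores = reranker.compute_score([
    ['what is panda?', 'hi'],
    ['what is panda?', 'The giant panda is a bear species endemic to China.'],
])
print(scores)  # one float per pair; the second pair should score much higher
```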
@@ -261,10 +263,10 @@ print(scores)

 ```python
 import torch
-from transformers import AutoModelForSequenceClassification, AutoTokenizer, BatchEncoding, PreTrainedTokenizerFast
+from transformers import AutoModelForSequenceClassification, AutoTokenizer

-tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-base')
-model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-base')
+tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-large')
+model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-large')
 model.eval()

 pairs = [['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']]
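The hunk stops at the pair list. With raw transformers, scoring is a single forward pass whose per-pair logit is the relevance score; a sketch of how the snippet continues:

```python
with torch.no_grad():
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
    scores = model(**inputs, return_dict=True).logits.view(-1, ).float()
    print(scores)
```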
@@ -343,7 +345,7 @@ See [C_MTEB](https://github.com/FlagOpen/FlagEmbedding/blob/master/C_MTEB/) for
 | [BAAI/bge-reranker-base](https://huggingface.co/BAAI/bge-reranker-base) | 67.28 | 63.95 | 60.45 | 35.46 | 81.26 | 84.1 | 65.42 |
 | [BAAI/bge-reranker-large](https://huggingface.co/BAAI/bge-reranker-large) | 67.6 | 64.03 | 61.44 | 37.16 | 82.15 | 84.18 | 66.09 |

-\* : T2RerankingZh2En and T2RerankingEn2Zh are cross-language retrieval task
+\* : T2RerankingZh2En and T2RerankingEn2Zh are cross-language retrieval tasks

 ## Train
README_zh.md (24 changed lines)
@@ -137,7 +137,9 @@ pip install -U FlagEmbedding
 ```python
 from FlagEmbedding import FlagModel
 sentences = ["样例数据-1", "样例数据-2"]
-model = FlagModel('BAAI/bge-large-zh', query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:")
+model = FlagModel('BAAI/bge-large-zh-v1.5',
+                  query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
+                  use_fp16=True)  # Setting use_fp16 to True speeds up computation with a slight performance drop
 embeddings_1 = model.encode(sentences)
 embeddings_2 = model.encode(sentences)
 similarity = embeddings_1 @ embeddings_2.T
@@ -171,7 +173,7 @@ pip install -U sentence-transformers
 ```python
 from sentence_transformers import SentenceTransformer
 sentences = ["样例数据-1", "样例数据-2"]
-model = SentenceTransformer('BAAI/bge-large-zh')
+model = SentenceTransformer('BAAI/bge-large-zh-v1.5')
 embeddings_1 = model.encode(sentences, normalize_embeddings=True)
 embeddings_2 = model.encode(sentences, normalize_embeddings=True)
 similarity = embeddings_1 @ embeddings_2.T
@@ -184,7 +186,7 @@ print(similarity)
 queries = ['query_1', 'query_2']
 passages = ["样例文档-1", "样例文档-2"]
 instruction = "为这个句子生成表示以用于检索相关文章:"
-model = SentenceTransformer('BAAI/bge-large-zh')
+model = SentenceTransformer('BAAI/bge-large-zh-v1.5')
 q_embeddings = model.encode([instruction+q for q in queries], normalize_embeddings=True)
 p_embeddings = model.encode(passages, normalize_embeddings=True)
 scores = q_embeddings @ p_embeddings.T
@@ -198,7 +200,7 @@ scores = q_embeddings @ p_embeddings.T
 Use the bge model in LangChain:
 ```python
 from langchain.embeddings import HuggingFaceBgeEmbeddings
-model_name = "BAAI/bge-small-en"
+model_name = "BAAI/bge-large-en-v1.5"
 model_kwargs = {'device': 'cpu'}
 encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
 model = HuggingFaceBgeEmbeddings(
@@ -218,8 +220,8 @@ import torch
 sentences = ["样例数据-1", "样例数据-2"]

 # Load model from HuggingFace Hub
-tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-zh')
-model = AutoModel.from_pretrained('BAAI/bge-large-zh')
+tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-zh-v1.5')
+model = AutoModel.from_pretrained('BAAI/bge-large-zh-v1.5')

 # Tokenize sentences
 encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
@@ -248,10 +250,10 @@ print("Sentence embeddings:", sentence_embeddings)
 pip install -U FlagEmbedding
 ```

-Compute relevance scores:
+Compute relevance scores (higher means more relevant):
 ```python
 from FlagEmbedding import FlagReranker
-reranker = FlagReranker('BAAI/bge-reranker-base', use_fp16=True)  # Setting use_fp16 to True speeds up inference with a negligible performance drop
+reranker = FlagReranker('BAAI/bge-reranker-large', use_fp16=True)  # Setting use_fp16 to True speeds up inference with a negligible performance drop

 score = reranker.compute_score(['query', 'passage'])  # compute the relevance score between the query and the passage
 print(score)
@@ -265,10 +267,10 @@ print(scores)

 ```python
 import torch
-from transformers import AutoModelForSequenceClassification, AutoTokenizer, BatchEncoding, PreTrainedTokenizerFast
+from transformers import AutoModelForSequenceClassification, AutoTokenizer

-tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-base')
-model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-base')
+tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-large')
+model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-large')
 model.eval()

 pairs = [['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']]