Mirror of https://github.com/FlagOpen/FlagEmbedding.git
update readme

parent 9876bb2b84
commit 35a5aedbfc
@@ -69,7 +69,7 @@ embeddings = model.encode(sentences)
 print(embeddings)
 
 # for retrieval task, please use encode_queries() which will automatically add the instruction to each query
-# corpus in retrieval task can still use encode() or encode_corpus()
+# corpus in retrieval task can still use encode() or encode_corpus(), since they don't need instruction
 queries = ['query_1', 'query_2']
 passages = ["样例段落-1", "样例段落-2"]
 q_embeddings = model.encode_queries(queries)
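For readers landing on this hunk, the split between encode_queries() and encode()/encode_corpus() plays out roughly as in the sketch below. This is a minimal illustration, not part of the commit: the FlagModel constructor arguments and the similarity computation are assumptions layered on top of the snippet above.

```python
from FlagEmbedding import FlagModel

# Assumed setup (not in this diff): a bge model with a retrieval instruction for queries.
model = FlagModel('BAAI/bge-large-en',
                  query_instruction_for_retrieval="Represent this sentence for searching relevant passages: ")

queries = ['query_1', 'query_2']
passages = ["样例段落-1", "样例段落-2"]

# encode_queries() prepends the instruction to every query;
# encode() / encode_corpus() embed the passages as-is, since passages need no instruction.
q_embeddings = model.encode_queries(queries)
p_embeddings = model.encode(passages)

# Assuming normalized embeddings, the inner product acts as a cosine-similarity score.
scores = q_embeddings @ p_embeddings.T
print(scores)
```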
@@ -117,8 +117,7 @@ You can use `bge` in langchain like this:
 from langchain.embeddings import HuggingFaceInstructEmbeddings
 encode_kwargs = {'normalize_embeddings': True}
 model = HuggingFaceInstructEmbeddings(model_name='BAAI/bge-large-en',
-                                      embed_instruction="",
-                                      # retrieval passages for short query, using query_instruction, else set it ""
+                                      embed_instruction="",  # no instruction is needed for candidate passages
                                       query_instruction="Represent this sentence for searching relevant passages: ",
                                       encode_kwargs=encode_kwargs)
 ```
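As a hedged usage note for the langchain snippet in this hunk: once the model is constructed as above, embed_query() applies query_instruction and embed_documents() applies embed_instruction (empty here). The texts and variable names below are illustrative only, not part of the commit.

```python
import numpy as np

# Illustrative texts; not part of this diff.
query_vec = model.embed_query("what is a bge embedding model?")
doc_vecs = model.embed_documents(["BGE is a general-purpose text embedding model.",
                                  "样例段落-2"])

# With normalize_embeddings=True, the dot product behaves like a cosine similarity.
scores = np.dot(np.array(doc_vecs), np.array(query_vec))
print(scores)
```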
@@ -235,7 +234,7 @@ We trained our model on 48 A100(40G) GPUs with a large batch size of 32,768 (so
 We used the AdamW optimizer and the learning rate is 1e-5.
 The temperature for contrastive loss is 0.01.
 
-Besides, we add instruction to the query for retrieval task in the training.
+Besides, we add instruction to the query for retrieval task in the training (add nothing to passages).
 For English, the instruction is `Represent this sentence for searching relevant passages: `;
 For Chinese, the instruction is `为这个句子生成表示以用于检索相关文章:`.
 In the evaluation, the instruction should be added for queries in retrieval task, not be added for other tasks.
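The large batch size and the 0.01 temperature mentioned in this hunk typically enter an in-batch contrastive (InfoNCE-style) objective. The sketch below is a generic illustration of that loss under those assumptions; it is not the repository's actual training code.

```python
import torch
import torch.nn.functional as F

def in_batch_contrastive_loss(q_emb, p_emb, temperature=0.01):
    """Generic in-batch contrastive loss (illustrative, not FlagEmbedding's trainer).

    q_emb: (batch, dim) query embeddings; p_emb: (batch, dim) embeddings of their
    positive passages. Every other passage in the batch serves as a negative,
    which is why a large batch size (32,768 here) yields many negatives.
    """
    q_emb = F.normalize(q_emb, dim=-1)
    p_emb = F.normalize(p_emb, dim=-1)
    # Cosine similarities scaled by the temperature from the README (0.01).
    logits = q_emb @ p_emb.T / temperature
    # The positive passage for query i sits at column i.
    labels = torch.arange(q_emb.size(0), device=q_emb.device)
    return F.cross_entropy(logits, labels)
```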
@@ -71,7 +71,7 @@ embeddings = model.encode(sentences)
 print(embeddings)
 
 # for queries in a retrieval task, please use encode_queries(), which automatically adds the instruction to each query
-# the corpus in a retrieval task can still use encode() or encode_corpus()
+# since candidate passages need no instruction, the corpus in a retrieval task can still use encode() or encode_corpus()
 queries = ['query_1', 'query_2']
 passages = ["样例段落-1", "样例段落-2"]
 q_embeddings = model.encode_queries(queries)
@@ -120,7 +120,6 @@ from langchain.embeddings import HuggingFaceInstructEmbeddings
 encode_kwargs = {'normalize_embeddings': True}
 model = HuggingFaceInstructEmbeddings(model_name='BAAI/bge-large-en',
                                       embed_instruction="",
-                                      # retrieval passages for short query, using query_instruction, else set it ""
                                       query_instruction="Represent this sentence for searching relevant passages: ",
                                       encode_kwargs=encode_kwargs)
 ```
@@ -143,7 +142,7 @@ encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tenso
 # for retrieval task, add an instruction to query (not add instruction for passages)
 # encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')
 
-# Compute token embeddings
+# Compute embeddings
 with torch.no_grad():
     model_output = model(**encoded_input)
 # Perform pooling. In this case, cls pooling.
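The hunk cuts off right at the pooling comment. For completeness, a hedged continuation of that snippet is sketched below; the variable names follow the snippet above, and the normalization step is an assumption consistent with the normalize_embeddings settings elsewhere in the README.

```python
# Continuation sketch (assumed variable names from the snippet above).
# CLS pooling: take the last hidden state of the first ([CLS]) token.
sentence_embeddings = model_output[0][:, 0]
# Normalize so that inner products behave like cosine similarities.
sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
print("Sentence embeddings:", sentence_embeddings)
```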