update readme

shitao 2023-08-08 15:37:33 +08:00
parent 9876bb2b84
commit 35a5aedbfc
2 changed files with 5 additions and 7 deletions


@@ -69,7 +69,7 @@ embeddings = model.encode(sentences)
print(embeddings)
# for retrieval task, please use encode_queries() which will automatically add the instruction to each query
-# corpus in retrieval task can still use encode() or encode_corpus()
+# corpus in retrieval task can still use encode() or encode_corpus(), since they don't need instruction
queries = ['query_1', 'query_2']
passages = ["样例段落-1", "样例段落-2"]
q_embeddings = model.encode_queries(queries)
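For context, a minimal end-to-end retrieval sketch built around the snippet above; it assumes FlagEmbedding's `FlagModel` with its `query_instruction_for_retrieval` argument (not shown in this hunk) and scores passages by inner product over the normalized embeddings:

```python
from FlagEmbedding import FlagModel

# query_instruction_for_retrieval is prepended by encode_queries() only;
# encode() / encode_corpus() leave passages untouched
model = FlagModel('BAAI/bge-large-zh',
                  query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:")

queries = ['query_1', 'query_2']
passages = ["样例段落-1", "样例段落-2"]

q_embeddings = model.encode_queries(queries)  # instruction added automatically
p_embeddings = model.encode(passages)         # no instruction added

# embeddings are normalized, so the inner product is the cosine similarity
scores = q_embeddings @ p_embeddings.T
print(scores)
```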
@@ -117,8 +117,7 @@ You can use `bge` in langchain like this:
from langchain.embeddings import HuggingFaceInstructEmbeddings
encode_kwargs = {'normalize_embeddings': True}
model = HuggingFaceInstructEmbeddings(model_name='BAAI/bge-large-en',
embed_instruction="",
# retrieval passages for short query, using query_instruction, else set it ""
embed_instruction="", # no instruction is needed for candidate passages
query_instruction="Represent this sentence for searching relevant passages: ",
encode_kwargs=encode_kwargs)
```
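As a usage note, the object above implements LangChain's generic `Embeddings` interface, so `query_instruction` is applied by `embed_query()` while `embed_documents()` uses the (empty) `embed_instruction`; a small sketch with made-up sample texts:

```python
# queries get the retrieval instruction, documents do not
q_vec = model.embed_query("what is a panda?")
d_vecs = model.embed_documents(["The giant panda is a bear native to China.",
                                "Pandas mainly eat bamboo."])
```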
@@ -235,7 +234,7 @@ We trained our model on 48 A100(40G) GPUs with a large batch size of 32,768 (so
We used the AdamW optimizer and the learning rate is 1e-5.
The temperature for contrastive loss is 0.01.
-Besides, we add instruction to the query for retrieval task in the training.
+Besides, we add instruction to the query for retrieval task in the training (add nothing to passages).
For English, the instruction is `Represent this sentence for searching relevant passages: `;
For Chinese, the instruction is `为这个句子生成表示以用于检索相关文章:`.
In the evaluation, the instruction should be added for queries in the retrieval task, but not for other tasks.
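Concretely, that evaluation-time rule is just string concatenation on the query side (a sketch; the sample texts are invented):

```python
instruction = "Represent this sentence for searching relevant passages: "

queries = ["what is a panda?"]
passages = ["The giant panda is a bear native to China."]

# only queries receive the instruction; passages are encoded as-is
query_inputs = [instruction + q for q in queries]
passage_inputs = passages
```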


@@ -71,7 +71,7 @@ embeddings = model.encode(sentences)
print(embeddings)
# for queries in the retrieval task, please use the encode_queries() function, which will automatically add the instruction to each query
-# the candidate corpus in retrieval can still use the encode() or encode_corpus() function
+# since candidate passages need no instruction, the candidate corpus in retrieval can still use the encode() or encode_corpus() function
queries = ['query_1', 'query_2']
passages = ["样例段落-1", "样例段落-2"]
q_embeddings = model.encode_queries(queries)
@@ -120,7 +120,6 @@ from langchain.embeddings import HuggingFaceInstructEmbeddings
encode_kwargs = {'normalize_embeddings': True}
model = HuggingFaceInstructEmbeddings(model_name='BAAI/bge-large-en',
    embed_instruction="",
-    # retrieval passages for short query, using query_instruction, else set it ""
    query_instruction="Represent this sentence for searching relevant passages: ",
    encode_kwargs=encode_kwargs)
```
@@ -143,7 +142,7 @@ encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
# for retrieval task, add an instruction to query (not add instruction for passages)
# encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')
-# Compute token embeddings
+# Compute embeddings
with torch.no_grad():
model_output = model(**encoded_input)
# Perform pooling. In this case, cls pooling.
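The hunk cuts off right at the pooling step; for completeness, a sketch of cls pooling plus the usual L2 normalization, assuming the `transformers` model/tokenizer pair implied by the context above:

```python
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-zh')
model = AutoModel.from_pretrained('BAAI/bge-large-zh')

sentences = ["样例数据-1", "样例数据-2"]
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    model_output = model(**encoded_input)
    # cls pooling: take the hidden state of the first ([CLS]) token
    sentence_embeddings = model_output[0][:, 0]

# normalize so that the inner product equals cosine similarity
sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
print(sentence_embeddings)
```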