update readme

shitao 2023-08-08 15:37:33 +08:00
parent 9876bb2b84
commit 35a5aedbfc
2 changed files with 5 additions and 7 deletions


@@ -69,7 +69,7 @@ embeddings = model.encode(sentences)
print(embeddings)
# for retrieval task, please use encode_queries() which will automatically add the instruction to each query
-# corpus in retrieval task can still use encode() or encode_corpus()
+# corpus in retrieval task can still use encode() or encode_corpus(), since they don't need instruction
queries = ['query_1', 'query_2']
passages = ["样例段落-1", "样例段落-2"]
q_embeddings = model.encode_queries(queries)
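For context, a minimal end-to-end retrieval sketch built around the snippet above; it assumes FlagEmbedding's `FlagModel` with its `query_instruction_for_retrieval` argument (not shown in this hunk) and scores passages by inner product over the normalized embeddings:

```python
from FlagEmbedding import FlagModel

# query_instruction_for_retrieval is prepended by encode_queries() only;
# encode() / encode_corpus() leave passages untouched
model = FlagModel('BAAI/bge-large-zh',
                  query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:")

queries = ['query_1', 'query_2']
passages = ["样例段落-1", "样例段落-2"]

q_embeddings = model.encode_queries(queries)  # instruction added automatically
p_embeddings = model.encode(passages)         # no instruction added

# embeddings are normalized, so the inner product is the cosine similarity
scores = q_embeddings @ p_embeddings.T
print(scores)
```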
@@ -117,8 +117,7 @@ You can use `bge` in langchain like this:
from langchain.embeddings import HuggingFaceInstructEmbeddings
encode_kwargs = {'normalize_embeddings': True}
model = HuggingFaceInstructEmbeddings(model_name='BAAI/bge-large-en',
embed_instruction="",
# retrieval passages for short query, using query_instruction, else set it ""
embed_instruction="", # no instruction is needed for candidate passages
query_instruction="Represent this sentence for searching relevant passages: ",
encode_kwargs=encode_kwargs)
```
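As a usage note, the object above implements LangChain's generic `Embeddings` interface, so `query_instruction` is applied by `embed_query()` while `embed_documents()` uses the (empty) `embed_instruction`; a small sketch with made-up sample texts:

```python
# queries get the retrieval instruction, documents do not
q_vec = model.embed_query("what is a panda?")
d_vecs = model.embed_documents(["The giant panda is a bear native to China.",
                                "Pandas mainly eat bamboo."])
```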
@@ -235,7 +234,7 @@ We trained our model on 48 A100(40G) GPUs with a large batch size of 32,768 (so
We used the AdamW optimizer and the learning rate is 1e-5.
The temperature for contrastive loss is 0.01.
-Besides, we add instruction to the query for retrieval task in the training.
+Besides, we add instruction to the query for retrieval task in the training (add nothing to passages).
For English, the instruction is `Represent this sentence for searching relevant passages: `;
For Chinese, the instruction is `为这个句子生成表示以用于检索相关文章:`.
In the evaluation, the instruction should be added for queries in the retrieval task, but not for other tasks.
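Concretely, that evaluation-time rule is just string concatenation on the query side (a sketch; the sample texts are invented):

```python
instruction = "Represent this sentence for searching relevant passages: "

queries = ["what is a panda?"]
passages = ["The giant panda is a bear native to China."]

# only queries receive the instruction; passages are encoded as-is
query_inputs = [instruction + q for q in queries]
passage_inputs = passages
```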


@@ -71,7 +71,7 @@ embeddings = model.encode(sentences)
print(embeddings)
# for queries in the retrieval task, please use the encode_queries() function, which will automatically add the instruction to each query
-# the candidate corpus in retrieval can still use the encode() or encode_corpus() function
+# since candidate passages need no instruction, the candidate corpus in retrieval can still use the encode() or encode_corpus() function
queries = ['query_1', 'query_2']
passages = ["样例段落-1", "样例段落-2"]
q_embeddings = model.encode_queries(queries)
@@ -120,7 +120,6 @@ from langchain.embeddings import HuggingFaceInstructEmbeddings
encode_kwargs = {'normalize_embeddings': True}
model = HuggingFaceInstructEmbeddings(model_name='BAAI/bge-large-en',
    embed_instruction="",
-    # retrieval passages for short query, using query_instruction, else set it ""
    query_instruction="Represent this sentence for searching relevant passages: ",
    encode_kwargs=encode_kwargs)
```
@@ -143,7 +142,7 @@ encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
# for retrieval task, add an instruction to query (not add instruction for passages)
# encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')
-# Compute token embeddings
+# Compute embeddings
with torch.no_grad():
model_output = model(**encoded_input)
# Perform pooling. In this case, cls pooling.
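The hunk cuts off right at the pooling step; for completeness, a sketch of cls pooling plus the usual L2 normalization, assuming the `transformers` model/tokenizer pair implied by the context above:

```python
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-zh')
model = AutoModel.from_pretrained('BAAI/bge-large-zh')

sentences = ["样例数据-1", "样例数据-2"]
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    model_output = model(**encoded_input)
    # cls pooling: take the hidden state of the first ([CLS]) token
    sentence_embeddings = model_output[0][:, 0]

# normalize so that the inner product equals cosine similarity
sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
print(sentence_embeddings)
```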