mirror of
https://github.com/FlagOpen/FlagEmbedding.git
synced 2025-06-27 02:39:58 +00:00
update tutorials & docs
This commit is contained in:
parent
74a671f69d
commit
d5292b6875
366
Tutorials/1_Embedding/1.2.6_BGE_VL.ipynb
Normal file
@ -0,0 +1,366 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# BGE-VL"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this tutorial, we will go through the BGE-VL series of multimodal retrieval models, which achieves state-of-the-art performance on four popular zero-shot composed image retrieval benchmarks and the massive multimodal embedding benchmark (MMEB)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 0. Installation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Install the required packages in your environment."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install numpy torch transformers pillow"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. BGE-VL-CLIP"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"| Model | Language | Parameters | Model Size | Description | Base Model |\n",
"|:-------|:--------:|:--------------:|:--------------:|:-----------------:|:----------------:|\n",
"| [BAAI/bge-vl-base](https://huggingface.co/BAAI/BGE-VL-base) | English | 150M | 299 MB | Lightweight multimodal embedder for image and text | CLIP-base |\n",
"| [BAAI/bge-vl-large](https://huggingface.co/BAAI/BGE-VL-large) | English | 428M | 855 MB | Large-scale multimodal embedder for image and text | CLIP-large |"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"BGE-VL-base and BGE-VL-large are trained from CLIP-base and CLIP-large, both of which contain a vision transformer and a text transformer:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/share/project/xzy/Envs/ft/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/share/project/xzy/Envs/ft/lib/python3.11/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml\n",
" warnings.warn(\n",
"/share/project/xzy/Envs/ft/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"CLIPModel(\n",
" (text_model): CLIPTextTransformer(\n",
" (embeddings): CLIPTextEmbeddings(\n",
" (token_embedding): Embedding(49408, 512)\n",
" (position_embedding): Embedding(77, 512)\n",
" )\n",
" (encoder): CLIPEncoder(\n",
" (layers): ModuleList(\n",
" (0-11): 12 x CLIPEncoderLayer(\n",
" (self_attn): CLIPSdpaAttention(\n",
" (k_proj): Linear(in_features=512, out_features=512, bias=True)\n",
" (v_proj): Linear(in_features=512, out_features=512, bias=True)\n",
" (q_proj): Linear(in_features=512, out_features=512, bias=True)\n",
" (out_proj): Linear(in_features=512, out_features=512, bias=True)\n",
" )\n",
" (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): CLIPMLP(\n",
" (activation_fn): QuickGELUActivation()\n",
" (fc1): Linear(in_features=512, out_features=2048, bias=True)\n",
" (fc2): Linear(in_features=2048, out_features=512, bias=True)\n",
" )\n",
" (layer_norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" )\n",
" )\n",
" (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" (vision_model): CLIPVisionTransformer(\n",
" (embeddings): CLIPVisionEmbeddings(\n",
" (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)\n",
" (position_embedding): Embedding(197, 768)\n",
" )\n",
" (pre_layrnorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (encoder): CLIPEncoder(\n",
" (layers): ModuleList(\n",
" (0-11): 12 x CLIPEncoderLayer(\n",
" (self_attn): CLIPSdpaAttention(\n",
" (k_proj): Linear(in_features=768, out_features=768, bias=True)\n",
" (v_proj): Linear(in_features=768, out_features=768, bias=True)\n",
" (q_proj): Linear(in_features=768, out_features=768, bias=True)\n",
" (out_proj): Linear(in_features=768, out_features=768, bias=True)\n",
" )\n",
" (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): CLIPMLP(\n",
" (activation_fn): QuickGELUActivation()\n",
" (fc1): Linear(in_features=768, out_features=3072, bias=True)\n",
" (fc2): Linear(in_features=3072, out_features=768, bias=True)\n",
" )\n",
" (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" )\n",
" )\n",
" (post_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" (visual_projection): Linear(in_features=768, out_features=512, bias=False)\n",
" (text_projection): Linear(in_features=512, out_features=512, bias=False)\n",
")"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import torch\n",
"from transformers import AutoModel\n",
"\n",
"MODEL_NAME = \"BAAI/BGE-VL-base\" # or \"BAAI/BGE-VL-large\"\n",
"\n",
"model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True) # You must set trust_remote_code=True\n",
"model.set_processor(MODEL_NAME)\n",
"model.eval()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[0.2647, 0.1242]])\n"
]
}
],
"source": [
"with torch.no_grad():\n",
"    query = model.encode(\n",
"        images = \"../../imgs/cir_query.png\",\n",
"        text = \"Make the background dark, as if the camera has taken the photo at night\"\n",
"    )\n",
"\n",
"    candidates = model.encode(\n",
"        images = [\"../../imgs/cir_candi_1.png\", \"../../imgs/cir_candi_2.png\"]\n",
"    )\n",
"\n",
"    scores = query @ candidates.T\n",
"print(scores)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. BGE-VL-MLLM"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"| Model | Language | Parameters | Model Size | Description | Base Model |\n",
"|:-------|:--------:|:--------------:|:--------------:|:-----------------:|:----------------:|\n",
"| [BAAI/bge-vl-MLLM-S1](https://huggingface.co/BAAI/BGE-VL-MLLM-S1) | English | 7.57B | 15.14 GB | SOTA in composed image retrieval, trained on the MegaPairs dataset | LLaVA-1.6 |\n",
"| [BAAI/bge-vl-MLLM-S2](https://huggingface.co/BAAI/BGE-VL-MLLM-S2) | English | 7.57B | 15.14 GB | BGE-VL-MLLM-S1 fine-tuned for one epoch on the MMEB training set | LLaVA-1.6 |"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/share/project/xzy/Envs/ft/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/share/project/xzy/Envs/ft/lib/python3.11/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml\n",
" warnings.warn(\n",
"Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00, 1.28it/s]\n"
]
},
{
"data": {
"text/plain": [
"LLaVANextForEmbedding(\n",
" (vision_tower): CLIPVisionModel(\n",
" (vision_model): CLIPVisionTransformer(\n",
" (embeddings): CLIPVisionEmbeddings(\n",
" (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)\n",
" (position_embedding): Embedding(577, 1024)\n",
" )\n",
" (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" (encoder): CLIPEncoder(\n",
" (layers): ModuleList(\n",
" (0-23): 24 x CLIPEncoderLayer(\n",
" (self_attn): CLIPSdpaAttention(\n",
" (k_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (v_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (q_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): CLIPMLP(\n",
" (activation_fn): QuickGELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (layer_norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" )\n",
" )\n",
" (post_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" )\n",
" (multi_modal_projector): LlavaNextMultiModalProjector(\n",
" (linear_1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (act): GELUActivation()\n",
" (linear_2): Linear(in_features=4096, out_features=4096, bias=True)\n",
" )\n",
" (language_model): MistralForCausalLM(\n",
" (model): MistralModel(\n",
" (embed_tokens): Embedding(32005, 4096)\n",
" (layers): ModuleList(\n",
" (0-31): 32 x MistralDecoderLayer(\n",
" (self_attn): MistralSdpaAttention(\n",
" (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
" (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
" (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
" (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
" (rotary_emb): MistralRotaryEmbedding()\n",
" )\n",
" (mlp): MistralMLP(\n",
" (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
" (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
" (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n",
" (act_fn): SiLU()\n",
" )\n",
" (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)\n",
" (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)\n",
" )\n",
" )\n",
" (norm): MistralRMSNorm((4096,), eps=1e-05)\n",
" )\n",
" (lm_head): Linear(in_features=4096, out_features=32005, bias=False)\n",
" )\n",
")"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import torch\n",
"from transformers import AutoModel\n",
"from PIL import Image\n",
"\n",
"MODEL_NAME = \"BAAI/BGE-VL-MLLM-S1\"\n",
"\n",
"model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)\n",
"model.eval()\n",
"model.cuda()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[0.4109, 0.1807]], device='cuda:0')\n"
]
}
],
"source": [
"with torch.no_grad():\n",
"    model.set_processor(MODEL_NAME)\n",
"\n",
"    query_inputs = model.data_process(\n",
"        text=\"Make the background dark, as if the camera has taken the photo at night\",\n",
"        images=\"../../imgs/cir_query.png\",\n",
"        q_or_c=\"q\",\n",
"        task_instruction=\"Retrieve the target image that best meets the combined criteria by using both the provided image and the image retrieval instructions: \"\n",
"    )\n",
"\n",
"    candidate_inputs = model.data_process(\n",
"        images=[\"../../imgs/cir_candi_1.png\", \"../../imgs/cir_candi_2.png\"],\n",
"        q_or_c=\"c\",\n",
"    )\n",
"\n",
"    query_embs = model(**query_inputs, output_hidden_states=True)[:, -1, :]\n",
"    candi_embs = model(**candidate_inputs, output_hidden_states=True)[:, -1, :]\n",
"\n",
"    query_embs = torch.nn.functional.normalize(query_embs, dim=-1)\n",
"    candi_embs = torch.nn.functional.normalize(candi_embs, dim=-1)\n",
"\n",
"    scores = torch.matmul(query_embs, candi_embs.T)\n",
"print(scores)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@ -27,4 +27,5 @@ Quickly get started with:
   IR
   embedder
   reranker
   similarity
   retrieval_demo
60
docs/source/Introduction/similarity.rst
Normal file
@ -0,0 +1,60 @@
Similarity
==========

A primary goal of retrieval is to find the most relevant documents in response to a user's query.
One of the core components of this process is measuring similarity between the query and candidates.
Similarity metrics quantify how closely related two pieces of data are, and guide the retrieval system in ranking results.

Jaccard Similarity
------------------

.. math::

    J(A,B)=\frac{|A\cap B|}{|A\cup B|}

The Jaccard similarity, or Jaccard index, is commonly used for set-based similarity, particularly with binary data (e.g., whether a term appears in a document or not).
It is calculated as the size of the intersection of two sets divided by the size of their union.
In information retrieval, it is often used to compare sets of keywords or phrases, with higher values indicating greater similarity.

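As a quick illustration (plain Python, not part of the FlagEmbedding API), the Jaccard index of two keyword sets can be computed with built-in sets:

.. code:: python

    def jaccard(a: set, b: set) -> float:
        # |A ∩ B| / |A ∪ B|; define J = 0 when both sets are empty
        return len(a & b) / len(a | b) if (a | b) else 0.0

    doc1 = {"neural", "retrieval", "embedding"}
    doc2 = {"sparse", "retrieval", "embedding"}
    print(jaccard(doc1, doc2))  # 0.5
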
Euclidean Distance
------------------

.. math::

    d(A, B) = \|A-B\|_2 = \sqrt{\sum_{i=1}^n (A_i-B_i)^2}

Euclidean distance measures the straight-line distance between two points in a vector space.
In IR, this can be used to assess the difference between document or query vectors.
A smaller distance indicates greater similarity.
This metric is intuitive but can sometimes be sensitive to the scale of the data, especially in high-dimensional spaces like text embeddings.

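A minimal NumPy sketch (the two vectors below are made up for illustration):

.. code:: python

    import numpy as np

    query = np.array([0.2, 0.1, 0.7])
    doc = np.array([0.3, 0.0, 0.6])

    # Straight-line (L2) distance between the two vectors
    dist = np.linalg.norm(query - doc)
    print(dist)  # ≈ 0.173
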
Cosine Similarity
-----------------

.. math::

    \cos(\theta)=\frac{A\cdot B}{\|A\|\|B\|}

Cosine similarity is one of the most widely used metrics in information retrieval, especially for text.
It measures the cosine of the angle between two vectors in a multi-dimensional space (typically representing term frequency vectors of documents and queries).
The closer the cosine similarity is to 1, the more similar the vectors.
A value of 0 indicates orthogonality, meaning no similarity.
It is a simple yet effective measure for text-based retrieval, as it considers the orientation but not the magnitude of the vectors.

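An illustrative computation with NumPy (toy vectors, not tied to any particular model):

.. code:: python

    import numpy as np

    a = np.array([0.2, 0.1, 0.7])
    b = np.array([0.3, 0.0, 0.6])

    # Cosine similarity: dot product divided by the product of the norms
    cos_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    print(cos_sim)
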
Dot Product
-----------

Coordinate definition:

.. math::

    A\cdot B = \sum_{i=1}^{n}A_i B_i

Geometric definition:

.. math::

    A\cdot B = \|A\|\|B\|\cos(\theta)

The dot product between two vectors provides a measure of how similar the vectors are in terms of direction and magnitude.
In information retrieval, the dot product is often used in vector space models, particularly when dealing with pre-trained word or sentence embeddings.
A higher dot product indicates that the query and document are closely aligned in the vector space.
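
A short NumPy sketch; note that for L2-normalized vectors (a common convention for embedding models) the dot product coincides with cosine similarity:

.. code:: python

    import numpy as np

    query = np.array([0.2, 0.1, 0.7])
    doc = np.array([0.3, 0.0, 0.6])

    print(np.dot(query, doc))  # raw dot product

    # After L2 normalization, the dot product equals cosine similarity
    q_n = query / np.linalg.norm(query)
    d_n = doc / np.linalg.norm(doc)
    print(np.dot(q_n, d_n))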
110
docs/source/bge/bge_vl.rst
Normal file
@ -0,0 +1,110 @@
BGE-VL
======

BGE-VL is a series of multimodal retrieval models trained on `MegaPairs <https://github.com/VectorSpaceLab/MegaPairs>`_.

BGE-VL contains lightweight CLIP-based models as well as more powerful LLaVA-NeXT-based MLLM models:

+----------------------------------------------------------------------+-----------+------------+--------------+-----------------------------------------------------------------------+
| Model | Language | Parameters | Model Size | Description |
+======================================================================+===========+============+==============+=======================================================================+
| `BAAI/bge-vl-base <https://huggingface.co/BAAI/BGE-VL-base>`_ | English | 150M | 299 MB | Lightweight multimodal embedder for image and text |
+----------------------------------------------------------------------+-----------+------------+--------------+-----------------------------------------------------------------------+
| `BAAI/bge-vl-large <https://huggingface.co/BAAI/BGE-VL-large>`_ | English | 428M | 855 MB | Large-scale multimodal embedder for image and text |
+----------------------------------------------------------------------+-----------+------------+--------------+-----------------------------------------------------------------------+
| `BAAI/bge-vl-MLLM-S1 <https://huggingface.co/BAAI/BGE-VL-MLLM-S1>`_ | English | 7.57B | 15.14 GB | SOTA in composed image retrieval, trained on the MegaPairs dataset |
+----------------------------------------------------------------------+-----------+------------+--------------+-----------------------------------------------------------------------+
| `BAAI/bge-vl-MLLM-S2 <https://huggingface.co/BAAI/BGE-VL-MLLM-S2>`_ | English | 7.57B | 15.14 GB | BGE-VL-MLLM-S1 fine-tuned for one epoch on the MMEB training set |
+----------------------------------------------------------------------+-----------+------------+--------------+-----------------------------------------------------------------------+

BGE-VL-CLIP
-----------

The base and large models are trained from CLIP-vit-base-patch16 and CLIP-vit-large-patch14, respectively.
For composed image-text data, the model uses score fusion: it directly sums the outputs of the visual encoder and the text encoder to obtain the final embedding.

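As a rough sketch of the idea only (not BGE-VL's actual implementation), score fusion on top of a vanilla CLIP model amounts to adding the image and text embeddings of a composed query and normalizing the sum; the checkpoint name below is the standard OpenAI CLIP model used purely for illustration:

.. code:: python

    import torch
    from PIL import Image
    from transformers import CLIPModel, CLIPProcessor

    # Illustrative sketch with plain CLIP, assuming a local query image path
    clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

    image = Image.open("./assets/cir_query.png")
    inputs = processor(text=["make the background dark"], images=image,
                       return_tensors="pt", padding=True)

    with torch.no_grad():
        img_emb = clip.get_image_features(pixel_values=inputs["pixel_values"])
        txt_emb = clip.get_text_features(input_ids=inputs["input_ids"],
                                         attention_mask=inputs["attention_mask"])

    # Score fusion: sum the two modality embeddings, then L2-normalize
    fused = torch.nn.functional.normalize(img_emb + txt_emb, dim=-1)
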
.. tip::

    Our code works well on transformers==4.45.2, and we recommend using this version.

You can easily use the BGE-VL-CLIP models with transformers:

.. code:: python

    import torch
    from transformers import AutoModel

    MODEL_NAME = "BAAI/BGE-VL-base" # or "BAAI/BGE-VL-large"
    model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True) # You must set trust_remote_code=True
    model.set_processor(MODEL_NAME)
    model.eval()

    with torch.no_grad():
        query = model.encode(
            images = "./assets/cir_query.png",
            text = "Make the background dark, as if the camera has taken the photo at night"
        )
        candidates = model.encode(
            images = ["./assets/cir_candi_1.png", "./assets/cir_candi_2.png"]
        )

        scores = query @ candidates.T
    print(scores)

BGE-VL-MLLM
-----------

Multimodal large language models (MLLMs) incorporate a visual encoder, typically based on a vision transformer, into a large language model (LLM).
This integration allows image tokens to be directly processed by the LLM.
Consequently, MLLMs can effectively handle diverse multimodal inputs by converting any type of input into a sequence of tokens.

BGE-VL-MLLM builds upon LLaVA-1.6. In both the training and inference stages, MMRet uses task-specific instructions for query inputs to improve generalization, aligning
with standard practices in LLM-based embedding models.
A typical multimodal query input is structured as follows:

.. math::

    \langle\text{instruct}\rangle\{task\_inst\}\ \langle\text{query}\rangle\{q_t\}\ \{q_i\}\ [\text{EOS}]

where :math:`task\_inst` represents the task-specific instruction, :math:`q_t` denotes the input query text, and
:math:`q_i` is the input query image.
The normalized last hidden state of the [EOS] token in the MLLM is used as the embedding of any given input sequence.

.. code:: python

    import torch
    from transformers import AutoModel
    from PIL import Image

    MODEL_NAME = "BAAI/BGE-VL-MLLM-S1"
    model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)
    model.eval()
    model.cuda()

    with torch.no_grad():
        model.set_processor(MODEL_NAME)

        query_inputs = model.data_process(
            text="Make the background dark, as if the camera has taken the photo at night",
            images="./assets/cir_query.png",
            q_or_c="q",
            task_instruction="Retrieve the target image that best meets the combined criteria by using both the provided image and the image retrieval instructions: "
        )
        candidate_inputs = model.data_process(
            images=["./assets/cir_candi_1.png", "./assets/cir_candi_2.png"],
            q_or_c="c",
        )

        query_embs = model(**query_inputs, output_hidden_states=True)[:, -1, :]
        candi_embs = model(**candidate_inputs, output_hidden_states=True)[:, -1, :]

        query_embs = torch.nn.functional.normalize(query_embs, dim=-1)
        candi_embs = torch.nn.functional.normalize(candi_embs, dim=-1)

        scores = torch.matmul(query_embs, candi_embs.T)
    print(scores)

For more details, check out the `MegaPairs <https://github.com/VectorSpaceLab/MegaPairs>`_ repository.
@ -14,6 +14,7 @@ BGE
   bge_v1_v1.5
   bge_m3
   bge_icl
   bge_vl

.. toctree::
   :maxdepth: 1

BIN
imgs/cir_candi_1.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 50 KiB |
BIN
imgs/cir_candi_2.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 880 KiB |
BIN
imgs/cir_query.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 149 KiB |