diff --git a/Tutorials/1_Embedding/1.2.6_BGE_VL.ipynb b/Tutorials/1_Embedding/1.2.6_BGE_VL.ipynb new file mode 100644 index 0000000..263b896 --- /dev/null +++ b/Tutorials/1_Embedding/1.2.6_BGE_VL.ipynb @@ -0,0 +1,366 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BGE-VL" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial, we will go through the multimodel retrieval models BGE-VL series, which achieved state-of-the-art performance on four popular zero-shot composed image retrieval benchmarks and the massive multimodal embedding benchmark (MMEB)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. Installation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install the required packages in your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install numpy torch transformers pillow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. BGE-VL-CLIP" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "| Model | Language | Parameters | Model Size | Description | Base Model |\n", + "|:-------|:--------:|:--------------:|:--------------:|:-----------------:|:----------------:|\n", + "| [BAAI/bge-vl-base](https://huggingface.co/BAAI/BGE-VL-base) | English | 150M | 299 MB | Light weight multimodel embedder among image and text | CLIP-base |\n", + "| [BAAI/bge-vl-large](https://huggingface.co/BAAI/BGE-VL-large) | English | 428M | 855 MB | Large scale multimodel embedder among image and text | CLIP-large |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "BGE-VL-base and BGE-VL-large are trained based on CLIP base and CLIP large, which both contain a vision transformer and a text transformer:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/share/project/xzy/Envs/ft/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/share/project/xzy/Envs/ft/lib/python3.11/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml\n", + " warnings.warn(\n", + "/share/project/xzy/Envs/ft/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. 
For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/plain": [ + "CLIPModel(\n", + " (text_model): CLIPTextTransformer(\n", + " (embeddings): CLIPTextEmbeddings(\n", + " (token_embedding): Embedding(49408, 512)\n", + " (position_embedding): Embedding(77, 512)\n", + " )\n", + " (encoder): CLIPEncoder(\n", + " (layers): ModuleList(\n", + " (0-11): 12 x CLIPEncoderLayer(\n", + " (self_attn): CLIPSdpaAttention(\n", + " (k_proj): Linear(in_features=512, out_features=512, bias=True)\n", + " (v_proj): Linear(in_features=512, out_features=512, bias=True)\n", + " (q_proj): Linear(in_features=512, out_features=512, bias=True)\n", + " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", + " )\n", + " (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n", + " (mlp): CLIPMLP(\n", + " (activation_fn): QuickGELUActivation()\n", + " (fc1): Linear(in_features=512, out_features=2048, bias=True)\n", + " (fc2): Linear(in_features=2048, out_features=512, bias=True)\n", + " )\n", + " (layer_norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " )\n", + " )\n", + " (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " (vision_model): CLIPVisionTransformer(\n", + " (embeddings): CLIPVisionEmbeddings(\n", + " (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)\n", + " (position_embedding): Embedding(197, 768)\n", + " )\n", + " (pre_layrnorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (encoder): CLIPEncoder(\n", + " (layers): ModuleList(\n", + " (0-11): 12 x CLIPEncoderLayer(\n", + " (self_attn): CLIPSdpaAttention(\n", + " (k_proj): Linear(in_features=768, out_features=768, bias=True)\n", + " (v_proj): Linear(in_features=768, out_features=768, bias=True)\n", + " (q_proj): Linear(in_features=768, out_features=768, bias=True)\n", + " (out_proj): Linear(in_features=768, out_features=768, bias=True)\n", + " )\n", + " (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (mlp): CLIPMLP(\n", + " (activation_fn): QuickGELUActivation()\n", + " (fc1): Linear(in_features=768, out_features=3072, bias=True)\n", + " (fc2): Linear(in_features=3072, out_features=768, bias=True)\n", + " )\n", + " (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " )\n", + " )\n", + " (post_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " (visual_projection): Linear(in_features=768, out_features=512, bias=False)\n", + " (text_projection): Linear(in_features=512, out_features=512, bias=False)\n", + ")" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "import torch\n", + "from transformers import AutoModel\n", + "\n", + "MODEL_NAME = \"BAAI/BGE-VL-base\" # or \"BAAI/BGE-VL-base\"\n", + "\n", + "model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True) # You must set trust_remote_code=True\n", + "model.set_processor(MODEL_NAME)\n", + "model.eval()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[0.2647, 0.1242]])\n" + ] + } + ], + "source": [ + "with torch.no_grad():\n", + " query = model.encode(\n", + " images = \"../../imgs/cir_query.png\", \n", + " text = \"Make the background dark, as if the camera 
has taken the photo at night\"\n", + " )\n", + "\n", + " candidates = model.encode(\n", + " images = [\"../../imgs/cir_candi_1.png\", \"../../imgs/cir_candi_2.png\"]\n", + " )\n", + " \n", + " scores = query @ candidates.T\n", + "print(scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. BGE-VL-MLLM" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "| Model | Language | Parameters | Model Size | Description | Base Model |\n", + "|:-------|:--------:|:--------------:|:--------------:|:-----------------:|:----------------:|\n", + "| [BAAI/bge-vl-MLLM-S1](https://huggingface.co/BAAI/BGE-VL-MLLM-S1) | English | 7.57B | 15.14 GB | SOTA in composed image retrieval, trained on MegaPairs dataset | LLaVA-1.6 |\n", + "| [BAAI/bge-vl-MLLM-S2](https://huggingface.co/BAAI/BGE-VL-MLLM-S2) | English | 7.57B | 15.14 GB | Finetune BGE-VL-MLLM-S1 with one epoch on MMEB training set | LLaVA-1.6 |" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/share/project/xzy/Envs/ft/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/share/project/xzy/Envs/ft/lib/python3.11/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml\n", + " warnings.warn(\n", + "Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00, 1.28it/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "LLaVANextForEmbedding(\n", + " (vision_tower): CLIPVisionModel(\n", + " (vision_model): CLIPVisionTransformer(\n", + " (embeddings): CLIPVisionEmbeddings(\n", + " (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)\n", + " (position_embedding): Embedding(577, 1024)\n", + " )\n", + " (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", + " (encoder): CLIPEncoder(\n", + " (layers): ModuleList(\n", + " (0-23): 24 x CLIPEncoderLayer(\n", + " (self_attn): CLIPSdpaAttention(\n", + " (k_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " (v_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " (q_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", + " (mlp): CLIPMLP(\n", + " (activation_fn): QuickGELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (layer_norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " )\n", + " )\n", + " (post_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " )\n", + " (multi_modal_projector): LlavaNextMultiModalProjector(\n", + " (linear_1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (act): GELUActivation()\n", + " (linear_2): Linear(in_features=4096, out_features=4096, bias=True)\n", + " 
)\n", + " (language_model): MistralForCausalLM(\n", + " (model): MistralModel(\n", + " (embed_tokens): Embedding(32005, 4096)\n", + " (layers): ModuleList(\n", + " (0-31): 32 x MistralDecoderLayer(\n", + " (self_attn): MistralSdpaAttention(\n", + " (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", + " (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\n", + " (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\n", + " (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", + " (rotary_emb): MistralRotaryEmbedding()\n", + " )\n", + " (mlp): MistralMLP(\n", + " (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n", + " (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n", + " (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n", + " (act_fn): SiLU()\n", + " )\n", + " (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)\n", + " (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)\n", + " )\n", + " )\n", + " (norm): MistralRMSNorm((4096,), eps=1e-05)\n", + " )\n", + " (lm_head): Linear(in_features=4096, out_features=32005, bias=False)\n", + " )\n", + ")" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import torch\n", + "from transformers import AutoModel\n", + "from PIL import Image\n", + "\n", + "MODEL_NAME= \"BAAI/BGE-VL-MLLM-S1\"\n", + "\n", + "model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)\n", + "model.eval()\n", + "model.cuda()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[0.4109, 0.1807]], device='cuda:0')\n" + ] + } + ], + "source": [ + "with torch.no_grad():\n", + " model.set_processor(MODEL_NAME)\n", + "\n", + " query_inputs = model.data_process(\n", + " text=\"Make the background dark, as if the camera has taken the photo at night\", \n", + " images=\"../../imgs/cir_query.png\",\n", + " q_or_c=\"q\",\n", + " task_instruction=\"Retrieve the target image that best meets the combined criteria by using both the provided image and the image retrieval instructions: \"\n", + " )\n", + "\n", + " candidate_inputs = model.data_process(\n", + " images=[\"../../imgs/cir_candi_1.png\", \"../../imgs/cir_candi_2.png\"],\n", + " q_or_c=\"c\",\n", + " )\n", + "\n", + " query_embs = model(**query_inputs, output_hidden_states=True)[:, -1, :]\n", + " candi_embs = model(**candidate_inputs, output_hidden_states=True)[:, -1, :]\n", + " \n", + " query_embs = torch.nn.functional.normalize(query_embs, dim=-1)\n", + " candi_embs = torch.nn.functional.normalize(candi_embs, dim=-1)\n", + "\n", + " scores = torch.matmul(query_embs, candi_embs.T)\n", + "print(scores)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/source/Introduction/index.rst b/docs/source/Introduction/index.rst index f7ea3c3..17c7b19 100644 --- a/docs/source/Introduction/index.rst +++ b/docs/source/Introduction/index.rst @@ -27,4 +27,5 @@ Quickly get started with: IR embedder reranker + similarity retrieval_demo \ No 
newline at end of file
diff --git a/docs/source/Introduction/similarity.rst b/docs/source/Introduction/similarity.rst
new file mode 100644
index 0000000..5e1b61b
--- /dev/null
+++ b/docs/source/Introduction/similarity.rst
@@ -0,0 +1,60 @@
+Similarity
+==========
+
+A primary goal of retrieval is to find the most relevant documents in response to a user's query.
+One of the core components of this process is measuring the similarity between the query and the candidates.
+Similarity metrics quantify how closely related two pieces of data are and guide the retrieval system in ranking results.
+
+Jaccard Similarity
+------------------
+
+.. math::
+
+   J(A,B)=\frac{|A\cap B|}{|A\cup B|}
+
+The Jaccard similarity, or Jaccard index, is commonly used for set-based similarity, particularly with binary data (e.g., whether a term appears in a document or not).
+It is calculated as the size of the intersection of two sets divided by the size of their union.
+In information retrieval, it is often used to compare sets of keywords or phrases, with higher values indicating greater similarity.
+
+Euclidean Distance
+------------------
+
+.. math::
+
+   d(A, B) = \|A-B\|_2 = \sqrt{\sum_{i=1}^n (A_i-B_i)^2}
+
+Euclidean distance measures the straight-line distance between two points in a vector space.
+In IR, it can be used to assess the difference between document and query vectors.
+A smaller distance indicates greater similarity.
+This metric is intuitive but can be sensitive to the scale of the data, especially in high-dimensional spaces such as text embeddings.
+
+Cosine Similarity
+-----------------
+
+.. math::
+
+   \cos(\theta)=\frac{A\cdot B}{\|A\|\|B\|}
+
+Cosine similarity is one of the most widely used metrics in information retrieval, especially for text.
+It measures the cosine of the angle between two vectors in a multi-dimensional space (typically term-frequency or embedding vectors of documents and queries).
+The closer the cosine similarity is to 1, the more similar the vectors are.
+A value of 0 indicates orthogonality, meaning no similarity.
+It is a simple yet effective measure for text-based retrieval, as it considers the orientation but not the magnitude of the vectors.
+
+Dot Product
+-----------
+
+Coordinate definition:
+
+.. math::
+
+   A\cdot B = \sum_{i=1}^{n}A_i B_i
+
+Geometric definition:
+
+.. math::
+
+   A\cdot B = \|A\|\|B\|\cos(\theta)
+
+The dot product of two vectors measures how similar they are in both direction and magnitude.
+In information retrieval, the dot product is often used in vector space models, particularly when dealing with pre-trained word or sentence embeddings.
+A higher dot product indicates that the query and document are closely aligned in the vector space.
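+
+As a concrete illustration (this snippet is not part of any BGE API, and the inputs are made up for the example), the sketch below computes the four metrics above with NumPy:
+
+.. code:: python
+
+    import numpy as np
+
+    # Jaccard similarity operates on sets, e.g. the sets of terms in a query and a document.
+    def jaccard(a: set, b: set) -> float:
+        return len(a & b) / len(a | b)
+
+    # The remaining metrics operate on vectors, e.g. dense embeddings.
+    def euclidean_distance(a: np.ndarray, b: np.ndarray) -> float:
+        return float(np.linalg.norm(a - b))
+
+    def dot_product(a: np.ndarray, b: np.ndarray) -> float:
+        return float(np.dot(a, b))
+
+    def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
+        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
+
+    query_terms = {"night", "photo", "dark"}
+    doc_terms = {"night", "photo", "city"}
+    print(jaccard(query_terms, doc_terms))         # 0.5
+
+    query_vec = np.array([0.1, 0.8, 0.3])
+    doc_vec = np.array([0.2, 0.7, 0.1])
+    print(euclidean_distance(query_vec, doc_vec))  # smaller means more similar
+    print(dot_product(query_vec, doc_vec))         # larger means more similar
+    print(cosine_similarity(query_vec, doc_vec))   # 1.0 means identical direction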
+
diff --git a/docs/source/bge/bge_vl.rst b/docs/source/bge/bge_vl.rst
new file mode 100644
index 0000000..5076ff0
--- /dev/null
+++ b/docs/source/bge/bge_vl.rst
@@ -0,0 +1,110 @@
+BGE-VL
+======
+
+BGE-VL is a series of multimodal retrieval models trained on the MegaPairs dataset.
+
+BGE-VL includes lightweight CLIP-based models as well as more powerful LLaVA-NeXT-based MLLM models:
+
+.. list-table::
+   :header-rows: 1
+
+   * - Model
+     - Language
+     - Parameters
+     - Model Size
+     - Description
+   * - `BAAI/bge-vl-base <https://huggingface.co/BAAI/BGE-VL-base>`_
+     - English
+     - 150M
+     - 299 MB
+     - Lightweight multimodal embedder for image and text
+   * - `BAAI/bge-vl-large <https://huggingface.co/BAAI/BGE-VL-large>`_
+     - English
+     - 428M
+     - 855 MB
+     - Large-scale multimodal embedder for image and text
+   * - `BAAI/bge-vl-MLLM-S1 <https://huggingface.co/BAAI/BGE-VL-MLLM-S1>`_
+     - English
+     - 7.57B
+     - 15.14 GB
+     - SOTA in composed image retrieval, trained on the MegaPairs dataset
+   * - `BAAI/bge-vl-MLLM-S2 <https://huggingface.co/BAAI/BGE-VL-MLLM-S2>`_
+     - English
+     - 7.57B
+     - 15.14 GB
+     - BGE-VL-MLLM-S1 fine-tuned for one epoch on the MMEB training set
+
+
+BGE-VL-CLIP
+-----------
+
+The base and large models are trained from CLIP-ViT-base-patch16 and CLIP-ViT-large-patch14, respectively.
+For composed image-text inputs, the model uses score fusion: the outputs of the visual encoder and the text encoder are summed directly to produce the final embedding.
+
+.. tip::
+
+    Our code works well with transformers==4.45.2, and we recommend using this version.
+
+You can easily use the BGE-VL-CLIP models with transformers:
+
+.. code:: python
+
+    import torch
+    from transformers import AutoModel
+
+    MODEL_NAME = "BAAI/BGE-VL-base"  # or "BAAI/BGE-VL-large"
+    model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)  # you must set trust_remote_code=True
+    model.set_processor(MODEL_NAME)
+    model.eval()
+
+    with torch.no_grad():
+        query = model.encode(
+            images="./assets/cir_query.png",
+            text="Make the background dark, as if the camera has taken the photo at night"
+        )
+        candidates = model.encode(
+            images=["./assets/cir_candi_1.png", "./assets/cir_candi_2.png"]
+        )
+
+        scores = query @ candidates.T
+    print(scores)
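+
+Whether ``encode`` returns L2-normalized embeddings may depend on the model code revision you load, so it is worth checking; if it does not, you can normalize the outputs yourself so that the dot-product scores become cosine similarities, as the BGE-VL-MLLM example below does explicitly:
+
+.. code:: python
+
+    # Optional post-processing, continuing from the example above.
+    # Only needed if `encode` does not already return L2-normalized embeddings.
+    query = torch.nn.functional.normalize(query, dim=-1)
+    candidates = torch.nn.functional.normalize(candidates, dim=-1)
+    cosine_scores = query @ candidates.T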
+
+
+BGE-VL-MLLM
+-----------
+
+Multimodal large language models (MLLMs) incorporate a visual encoder, typically based on a vision transformer, into a large language model (LLM).
+This integration allows image tokens to be processed directly by the LLM.
+Consequently, MLLMs can effectively handle diverse multimodal inputs by converting any type of input into a sequence of tokens.
+
+BGE-VL-MLLM builds upon LLaVA-1.6. In both the training and inference stages, BGE-VL-MLLM (referred to as MMRet in the MegaPairs paper)
+uses task-specific instructions for query inputs to improve generalization, aligning with standard practices in LLM-based embedding models.
+A typical multimodal query input is structured as follows:
+
+.. math::
+
+   \langle\text{instruct}\rangle\ \{task\_inst\}\ \langle\text{query}\rangle\ \{q_t\}\ \{q_i\}\ [\text{EOS}]
+
+where :math:`task\_inst` represents the task-specific instruction, :math:`q_t` denotes the input query text, and
+:math:`q_i` is the input query image.
+The normalized last hidden state of the [EOS] token in the MLLM is used as the embedding of any given input sequence.
+
+.. code:: python
+
+    import torch
+    from transformers import AutoModel
+    from PIL import Image
+
+    MODEL_NAME = "BAAI/BGE-VL-MLLM-S1"
+    model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)
+    model.eval()
+    model.cuda()
+
+    with torch.no_grad():
+        model.set_processor(MODEL_NAME)
+
+        query_inputs = model.data_process(
+            text="Make the background dark, as if the camera has taken the photo at night",
+            images="./assets/cir_query.png",
+            q_or_c="q",
+            task_instruction="Retrieve the target image that best meets the combined criteria by using both the provided image and the image retrieval instructions: "
+        )
+        candidate_inputs = model.data_process(
+            images=["./assets/cir_candi_1.png", "./assets/cir_candi_2.png"],
+            q_or_c="c",
+        )
+
+        query_embs = model(**query_inputs, output_hidden_states=True)[:, -1, :]
+        candi_embs = model(**candidate_inputs, output_hidden_states=True)[:, -1, :]
+
+        query_embs = torch.nn.functional.normalize(query_embs, dim=-1)
+        candi_embs = torch.nn.functional.normalize(candi_embs, dim=-1)
+
+        scores = torch.matmul(query_embs, candi_embs.T)
+    print(scores)
+
+
+For more details, check out the MegaPairs repository.
\ No newline at end of file
diff --git a/docs/source/bge/index.rst b/docs/source/bge/index.rst
index 7e83c92..82c31be 100644
--- a/docs/source/bge/index.rst
+++ b/docs/source/bge/index.rst
@@ -14,6 +14,7 @@ BGE
    bge_v1_v1.5
    bge_m3
    bge_icl
+   bge_vl
 
 .. toctree::
    :maxdepth: 1
diff --git a/imgs/cir_candi_1.png b/imgs/cir_candi_1.png
new file mode 100644
index 0000000..d44fabf
Binary files /dev/null and b/imgs/cir_candi_1.png differ
diff --git a/imgs/cir_candi_2.png b/imgs/cir_candi_2.png
new file mode 100644
index 0000000..143a39d
Binary files /dev/null and b/imgs/cir_candi_2.png differ
diff --git a/imgs/cir_query.png b/imgs/cir_query.png
new file mode 100644
index 0000000..4a28de6
Binary files /dev/null and b/imgs/cir_query.png differ