# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os
from typing import List, Union

import torch
import torch.nn as nn
from einops import rearrange
from transformers import AutoProcessor, AutoTokenizer, CLIPModel

from comps import CustomLogger, OpeaComponent, OpeaComponentRegistry, ServiceType
from comps.cores.proto.api_protocol import EmbeddingRequest, EmbeddingResponse, EmbeddingResponseData

logger = CustomLogger("opea_multimodal_embedding_clip")
logflag = os.getenv("LOGFLAG", False)


model_name = "openai/clip-vit-base-patch32"

clip = CLIPModel.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
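# Note: loading at import time fetches the checkpoint from the Hugging Face Hub
# on first use; for "openai/clip-vit-base-patch32" the shared text/image
# projection dimension is 512.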


class vCLIP(nn.Module):
    def __init__(self, cfg):
        super().__init__()

        self.num_frm = cfg["num_frm"]
        self.model_name = cfg["model_name"]

    def embed_query(self, texts):
        """Input is a list of texts; returns a (batch, dim) tensor of text features."""
        text_inputs = tokenizer(texts, padding=True, return_tensors="pt")
        with torch.no_grad():  # inference only; no autograd graph needed
            text_features = clip.get_text_features(**text_inputs)
        return text_features

    def get_embedding_length(self):
        text_features = self.embed_query(["sample_text"])
        return text_features.shape[1]

    def get_image_embeddings(self, images):
        """Input is a list of images; returns a (batch, dim) tensor of image features."""
        image_inputs = processor(images=images, return_tensors="pt")
        with torch.no_grad():  # inference only
            image_features = clip.get_image_features(**image_inputs)
        return image_features

    def get_video_embeddings(self, frames_batch):
        """Input is a list of per-video frame lists; returns one embedding per video."""
        self.batch_size = len(frames_batch)
        vid_embs = []
        for frames in frames_batch:
            # (num_frames, dim) features for this single video's frames.
            frame_embeddings = self.get_image_embeddings(frames)
            frame_embeddings = rearrange(frame_embeddings, "(b n) d -> b n d", b=1)
            # Normalize per frame, mean-aggregate over frames, then re-normalize
            # so each video embedding is unit length.
            frame_embeddings = frame_embeddings / frame_embeddings.norm(dim=-1, keepdim=True)
            video_embeddings = frame_embeddings.mean(dim=1)
            video_embeddings = video_embeddings / video_embeddings.norm(dim=-1, keepdim=True)
            vid_embs.append(video_embeddings)
        return torch.cat(vid_embs, dim=0)
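
# Example usage (a minimal sketch, not part of the service; assumes PIL is
# available and that each video is already decoded into RGB frames):
#
#   model = vCLIP({"model_name": model_name, "num_frm": 4})
#   text_emb = model.embed_query(["a cat", "a dog"])            # shape (2, 512)
#   frames = [[Image.new("RGB", (224, 224))] * 4 for _ in range(2)]
#   video_emb = model.get_video_embeddings(frames)              # shape (2, 512)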


@OpeaComponentRegistry.register("OPEA_CLIP_EMBEDDING")
class OpeaClipEmbedding(OpeaComponent):
    """A specialized embedding component derived from OpeaComponent for CLIP embedding services.

    This class initializes and configures the CLIP embedding service using the vCLIP model.
    It also performs a health check during initialization and logs an error if the check fails.

    Attributes:
        embeddings (vCLIP): An instance of the vCLIP model used for generating embeddings.
    """

    def __init__(self, name: str, description: str, config: dict = None):
        super().__init__(name, ServiceType.EMBEDDING.name.lower(), description, config)
        self.embeddings = vCLIP({"model_name": "openai/clip-vit-base-patch32", "num_frm": 4})

        health_status = self.check_health()
        if not health_status:
            logger.error("OpeaClipEmbedding health check failed.")
    async def invoke(self, input: EmbeddingRequest) -> EmbeddingResponse:
        """Invokes the embedding service to generate embeddings for the provided input.

        Args:
            input (EmbeddingRequest): The input in OpenAI embedding format, including text(s) and optional parameters like model.

        Returns:
            EmbeddingResponse: The response in OpenAI embedding format, including embeddings, model, and usage information.
        """
        # Parse the input according to the EmbeddingRequest format.
        if isinstance(input.input, str):
            texts = [input.input.replace("\n", " ")]
        elif isinstance(input.input, list):
            if all(isinstance(item, str) for item in input.input):
                texts = [text.replace("\n", " ") for text in input.input]
            else:
                raise ValueError("Invalid input format: only a string or a list of strings is supported.")
        else:
            raise TypeError("Unsupported input type: input must be a string or a list of strings.")
        embed_vector = self.get_embeddings(texts)
        if input.dimensions is not None:
            # Optionally truncate each vector to the requested number of dimensions.
            embed_vector = [embed_vector[i][: input.dimensions] for i in range(len(embed_vector))]

        # Wrap the vectors in the standard OpenAI embedding response format.
        res = EmbeddingResponse(
            data=[EmbeddingResponseData(index=i, embedding=embed_vector[i]) for i in range(len(embed_vector))]
        )
        return res
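
    # The returned object mirrors the OpenAI embeddings schema, e.g. for two
    # inputs: EmbeddingResponse(data=[EmbeddingResponseData(index=0, embedding=[...]),
    # EmbeddingResponseData(index=1, embedding=[...])]).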

    def check_health(self) -> bool:
        """Checks if the embedding model is healthy.

        Returns:
            bool: True if the embedding model is initialized, False otherwise.
        """
        return self.embeddings is not None

    def get_embeddings(self, text: Union[str, List[str]]) -> List[List[float]]:
        """Generates embeddings for input text.

        Args:
            text (Union[str, List[str]]): Input text or list of texts.

        Returns:
            List[List[float]]: List of embedding vectors.
        """
        texts = [text] if isinstance(text, str) else text
        embed_vector = self.embeddings.embed_query(texts).tolist()
        return embed_vector
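

# Minimal smoke test (a sketch, not part of the component; assumes
# EmbeddingRequest can be constructed with only the `input` field and that the
# model weights are downloadable or already cached):
if __name__ == "__main__":
    import asyncio

    component = OpeaClipEmbedding(name="opea_clip_embedding", description="CLIP embedding demo")
    request = EmbeddingRequest(input=["a photo of a cat", "a photo of a dog"])
    response = asyncio.run(component.invoke(request))
    # Expect 2 vectors of length 512 for clip-vit-base-patch32.
    print(len(response.data), len(response.data[0].embedding))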