26 changes: 26 additions & 0 deletions crates/grpc_client/proto/vllm_engine.proto
@@ -14,6 +14,9 @@ service VllmEngine {
// Submit an embedding request
rpc Embed(EmbedRequest) returns (EmbedResponse);

// Submit a scoring/reranking request
rpc Score(ScoreRequest) returns (ScoreResponse);

// Health check
rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);

@@ -265,6 +268,29 @@ message EmbedResponse {
uint32 embedding_dim = 3;
}

// =====================
// Score/Rerank Request
// =====================

message ScoreRequest {
string request_id = 1;
string text_1 = 2;
repeated string text_2 = 3;
}
Comment on lines +275 to +279
Contributor
medium

The ScoreRequest message is missing the truncate_prompt_tokens field, which is present in the ScoreRequest protocol definition in crates/protocols/src/rerank.rs. Without this field in the proto, the gateway cannot pass truncation settings to the vLLM worker.

Suggested change
message ScoreRequest {
string request_id = 1;
string text_1 = 2;
repeated string text_2 = 3;
}
message ScoreRequest {
string request_id = 1;
string text_1 = 2;
repeated string text_2 = 3;
optional uint32 truncate_prompt_tokens = 4;
}
References
  1. For protocol data structures that mirror an external API (e.g., OpenAI), prioritize alignment with the external specification over internal consistency.


message ScoreResult {
uint32 index = 1;
float score = 2;
}

message ScoreResponse {
repeated ScoreResult data = 1;
uint32 prompt_tokens = 2;
uint32 total_tokens = 3;
string request_id = 4;
int64 created = 5;
}

// =====================
// Management Operations
// =====================
2 changes: 1 addition & 1 deletion crates/grpc_client/python/pyproject.toml
@@ -9,7 +9,7 @@ description = "SMG gRPC proto definitions for SGLang, vLLM, and TRT-LLM"
requires-python = ">=3.10"
dependencies = [
"grpcio>=1.78.0",
"protobuf>=5.26.0",
"protobuf>=5.26.0,<7.0.0",
]
readme = "README.md"
license = { text = "Apache-2.0" }
2 changes: 1 addition & 1 deletion crates/grpc_client/python/smg_grpc_proto/proto
34 changes: 34 additions & 0 deletions crates/grpc_client/src/vllm_engine.rs
@@ -668,6 +668,40 @@ impl VllmEngineClient {
Ok(response.into_inner())
}

/// Build a ScoreRequest for cross-encoder reranking
#[expect(
clippy::unused_self,
reason = "method receiver kept for consistent public API across gRPC backends"
)]
pub fn build_score_request(
&self,
request_id: String,
text_1: String,
text_2: Vec<String>,
) -> proto::ScoreRequest {
proto::ScoreRequest {
request_id,
text_1,
text_2,
}
}
Comment on lines +676 to +687
Contributor
medium

Update build_score_request to include the truncate_prompt_tokens parameter to match the updated proto and protocol definitions.

Suggested change
pub fn build_score_request(
&self,
request_id: String,
text_1: String,
text_2: Vec<String>,
) -> proto::ScoreRequest {
proto::ScoreRequest {
request_id,
text_1,
text_2,
}
}
pub fn build_score_request(
&self,
request_id: String,
text_1: String,
text_2: Vec<String>,
truncate_prompt_tokens: Option<u32>,
) -> proto::ScoreRequest {
proto::ScoreRequest {
request_id,
text_1,
text_2,
truncate_prompt_tokens,
}
}
References
  1. For builder methods that construct data structures mapping directly to a wire format, it is acceptable to have many arguments if they correspond one-to-one with the wire-format fields.
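
To make the suggested signature concrete, here is a caller-side sketch. The `ScoreRequest` below is a hand-written stand-in with the same fields as the suggested message; the real `proto::ScoreRequest` is generated from `vllm_engine.proto` by prost/tonic, so names and types here are assumptions mirroring the diff:

```rust
// Stand-in mirroring the suggested proto::ScoreRequest fields
// (the real type is generated from vllm_engine.proto).
#[derive(Debug, Clone)]
pub struct ScoreRequest {
    pub request_id: String,
    pub text_1: String,
    pub text_2: Vec<String>,
    pub truncate_prompt_tokens: Option<u32>,
}

// Same shape as the suggested build_score_request:
// one argument per wire-format field.
pub fn build_score_request(
    request_id: String,
    text_1: String,
    text_2: Vec<String>,
    truncate_prompt_tokens: Option<u32>,
) -> ScoreRequest {
    ScoreRequest {
        request_id,
        text_1,
        text_2,
        truncate_prompt_tokens,
    }
}

fn main() {
    let req = build_score_request(
        "req-1".into(),
        "What is the capital of France?".into(),
        vec!["Paris is the capital.".into(), "London is in England.".into()],
        Some(512),
    );
    assert_eq!(req.text_2.len(), 2);
    assert_eq!(req.truncate_prompt_tokens, Some(512));
}
```

Because every argument maps one-to-one onto a wire field, callers that don't need truncation simply pass `None`, keeping the builder backward compatible at call sites.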


/// Submit a scoring request
pub async fn score(
&self,
req: proto::ScoreRequest,
) -> Result<proto::ScoreResponse, tonic::Status> {
let mut client = self.client.clone();
let mut request = Request::new(req);

if let Err(e) = self.trace_injector.inject(request.metadata_mut()) {
warn!("Failed to inject trace context: {}", e);
}

let response = client.score(request).await?;
Ok(response.into_inner())
}

fn build_grpc_sampling_params_from_completion(
request: &CompletionRequest,
) -> Result<proto::SamplingParams, String> {
25 changes: 25 additions & 0 deletions crates/protocols/src/model_type.rs
@@ -34,6 +34,8 @@ bitflags! {
const AUDIO = 1 << 10;
/// Content moderation models
const MODERATION = 1 << 11;
/// Score/cross-encoder reranker models (vLLM /v1/score)
const SCORE = 1 << 12;

Comment on lines +37 to 39
⚠️ Potential issue | 🟠 Major

Add "score" to ModelType JSON schema enum values.

CAPABILITY_NAMES/serde now accept "score", but the manual JsonSchema enum list still omits it. This can cause schema-based validation or generated clients to reject valid configs.

🛠️ Proposed fix
@@
             enum_values: Some(vec![
                 "chat".into(),
                 "completions".into(),
                 "responses".into(),
                 "embeddings".into(),
                 "rerank".into(),
                 "generate".into(),
                 "vision".into(),
                 "tools".into(),
                 "reasoning".into(),
                 "image_gen".into(),
                 "audio".into(),
                 "moderation".into(),
+                "score".into(),
             ]),

Also applies to: 87-87

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@crates/protocols/src/model_type.rs` around lines 37 - 39, The JSON schema
enum for ModelType is missing the "score" variant even though ModelType includes
the SCORE flag and CAPABILITY_NAMES/serde accept "score"; update the ModelType
JsonSchema implementation to add "score" to the enum list (and mirror this
change wherever the manual enum list is duplicated, e.g., the other
JsonSchema/enum generation block referenced at the second location). Locate the
ModelType definition (including the const SCORE) and the impl JsonSchema for
ModelType and add "score" to the returned enum values so schema-based validation
and generated clients accept the score capability.

/// Standard LLM: chat + completions + responses + tools
const LLM = Self::CHAT.bits() | Self::COMPLETIONS.bits()
@@ -62,6 +64,9 @@

/// Content moderation model only
const MODERATION_MODEL = Self::MODERATION.bits();

/// Score / cross-encoder reranker model only (vLLM /v1/score)
const SCORE_MODEL = Self::SCORE.bits();
}
}

@@ -79,6 +84,7 @@ const CAPABILITY_NAMES: &[(ModelType, &str)] = &[
(ModelType::IMAGE_GEN, "image_gen"),
(ModelType::AUDIO, "audio"),
(ModelType::MODERATION, "moderation"),
(ModelType::SCORE, "score"),
P2: Keep ModelType schema in sync with new score capability

Adding ModelType::SCORE to CAPABILITY_NAMES makes runtime serialization emit "score", but the manual JsonSchema enum list in the same file still omits "score". That creates a schema/runtime mismatch where generated OpenAPI/JSON-schema validation can reject payloads that the code itself now produces for score-capable models.


];
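
For readers unfamiliar with the bitflags pattern used here, the capability mechanics can be sketched with plain `u32` masks. The bit positions are taken from this diff (`AUDIO = 1 << 10`, `MODERATION = 1 << 11`, `SCORE = 1 << 12`); the actual `ModelType` uses the `bitflags` crate rather than raw constants:

```rust
// Plain-u32 illustration of the ModelType capability bits from this diff.
const AUDIO: u32 = 1 << 10;
const MODERATION: u32 = 1 << 11;
const SCORE: u32 = 1 << 12;

// Equivalent of bitflags' contains(): all bits of `cap` must be set.
fn contains(flags: u32, cap: u32) -> bool {
    flags & cap == cap
}

fn main() {
    // SCORE_MODEL is defined as the SCORE bits alone.
    let score_model = SCORE;
    assert!(contains(score_model, SCORE));
    assert!(!contains(score_model, MODERATION));

    // Capabilities compose with bitwise OR and are queried independently,
    // which is why supports_score() is just a contains() check.
    let combined = SCORE | AUDIO;
    assert!(contains(combined, AUDIO));
    assert!(contains(combined, SCORE));
}
```

This is also why the CAPABILITY_NAMES table and the JsonSchema enum must stay in sync by hand: the bit itself carries no name, so every string-facing surface needs its own "score" entry.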

impl ModelType {
@@ -154,6 +160,12 @@ impl ModelType {
self.contains(Self::MODERATION)
}

/// Check if this model type supports the score endpoint (vLLM /v1/score)
#[inline]
pub fn supports_score(self) -> bool {
self.contains(Self::SCORE)
}

/// Check if this model type supports a given endpoint
pub fn supports_endpoint(self, endpoint: Endpoint) -> bool {
match endpoint {
@@ -162,6 +174,7 @@
Endpoint::Responses => self.supports_responses(),
Endpoint::Embeddings => self.supports_embeddings(),
Endpoint::Rerank => self.supports_rerank(),
Endpoint::Score => self.supports_score(),
Endpoint::Generate => self.supports_generate(),
Endpoint::Models => true,
}
@@ -196,6 +209,12 @@ impl ModelType {
self.supports_rerank() && !self.supports_chat()
}

/// Check if this is a score/cross-encoder model (supports /v1/score)
#[inline]
pub fn is_score_model(self) -> bool {
self.supports_score() && !self.supports_chat()
}

/// Check if this is an image generation model
#[inline]
pub fn is_image_model(self) -> bool {
@@ -344,6 +363,8 @@ pub enum Endpoint {
Embeddings,
/// Rerank endpoint (/v1/rerank)
Rerank,
/// Score / cross-encoder endpoint (/v1/score)
Score,
/// SGLang generate endpoint (/generate)
Generate,
/// Models listing endpoint (/v1/models)
@@ -359,6 +380,7 @@ impl Endpoint {
Endpoint::Responses => "/v1/responses",
Endpoint::Embeddings => "/v1/embeddings",
Endpoint::Rerank => "/v1/rerank",
Endpoint::Score => "/v1/score",
Endpoint::Generate => "/generate",
Endpoint::Models => "/v1/models",
}
@@ -373,6 +395,7 @@ impl Endpoint {
"/v1/responses" => Some(Endpoint::Responses),
"/v1/embeddings" => Some(Endpoint::Embeddings),
"/v1/rerank" => Some(Endpoint::Rerank),
"/v1/score" => Some(Endpoint::Score),
"/generate" => Some(Endpoint::Generate),
"/v1/models" => Some(Endpoint::Models),
_ => None,
@@ -387,6 +410,7 @@ impl Endpoint {
Endpoint::Responses => Some(ModelType::RESPONSES),
Endpoint::Embeddings => Some(ModelType::EMBEDDINGS),
Endpoint::Rerank => Some(ModelType::RERANK),
Endpoint::Score => Some(ModelType::SCORE),
Endpoint::Generate => Some(ModelType::GENERATE),
Endpoint::Models => None,
}
@@ -401,6 +425,7 @@ impl std::fmt::Display for Endpoint {
Endpoint::Responses => write!(f, "responses"),
Endpoint::Embeddings => write!(f, "embeddings"),
Endpoint::Rerank => write!(f, "rerank"),
Endpoint::Score => write!(f, "score"),
Endpoint::Generate => write!(f, "generate"),
Endpoint::Models => write!(f, "models"),
}
129 changes: 129 additions & 0 deletions crates/protocols/src/rerank.rs
@@ -212,3 +212,132 @@ impl From<V1RerankReqInput> for RerankRequest {
}
}
}

// ============================================================================
// Score API (vLLM /v1/score)
// ============================================================================

/// vLLM-compatible score request for cross-encoder reranker models.
///
/// Matches the vLLM `/v1/score` request schema which uses `text_1`/`text_2`
/// pairs rather than the classic `query`/`documents` style.
///
/// # Example
/// ```json
/// {
/// "model": "modernbert-reranker",
/// "text_1": "What is the capital of France?",
/// "text_2": ["Paris is the capital.", "London is in England."]
/// }
/// ```
#[derive(Debug, Clone, Deserialize, Serialize, schemars::JsonSchema)]
pub struct ScoreRequest {
/// The model to use for scoring
pub model: String,

/// The query/source text (single string)
pub text_1: String,

/// The document(s) to score against the query.
/// Can be a single string or a list of strings.
pub text_2: StringOrVec,

/// Optional encoding format for the response
#[serde(skip_serializing_if = "Option::is_none")]
pub encoding_format: Option<String>,

/// Whether to truncate the input
#[serde(skip_serializing_if = "Option::is_none")]
pub truncate_prompt_tokens: Option<u32>,
Comment on lines +245 to +251
⚠️ Potential issue | 🟠 Major

truncate_prompt_tokens is exposed here but ignored on the native path.

This field is part of the public /v1/score contract now, but the native gRPC transport and ScoreNativeStage only forward request_id, text_1, and text_2. That means HTTP passthrough can honor truncation while native gRPC silently drops it, so the same request behaves differently between connection modes. Please either plumb it end-to-end or remove the field until it is supported.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@crates/protocols/src/rerank.rs` around lines 245 - 251, The
truncate_prompt_tokens field is declared on the public Rerank/score request but
is ignored on the native gRPC path; update the native plumbing to carry it or
remove it until supported. Either (preferred) add truncate_prompt_tokens to the
native request/message and propagate it in ScoreNativeStage (and any
ScoreNativeRequest/ScoreNativeResponse structs, gRPC proto/messages, and the
native transport marshal/unmarshal code) so the native path forwards and honors
the truncation flag exactly like the HTTP path, or remove truncate_prompt_tokens
from the public struct in rerank.rs so the contract is consistent; make sure to
update any tests and code that build/parse the native score request to reference
the truncate_prompt_tokens symbol and not silently drop it.

}
Comment on lines +233 to +252
🧹 Nitpick | 🔵 Trivial

Consider adding validation for ScoreRequest.

Unlike RerankRequest which has validation for non-empty query and documents, ScoreRequest lacks validation. Consider adding:

  • Non-empty text_1 validation
  • Non-empty text_2 validation (at least one document to score)

This would provide consistent error handling at the protocol layer rather than at the backend.

♻️ Example validation addition
+use validator::Validate;
+
 #[derive(Debug, Clone, Deserialize, Serialize, schemars::JsonSchema)]
+#[derive(Validate)]
 pub struct ScoreRequest {
     /// The model to use for scoring
     pub model: String,

     /// The query/source text (single string)
+    #[validate(custom(function = "validate_text_1"))]
     pub text_1: String,

     /// The document(s) to score against the query.
     /// Can be a single string or a list of strings.
+    #[validate(custom(function = "validate_text_2"))]
     pub text_2: StringOrVec,
     // ... rest unchanged
 }
+
+fn validate_text_1(text: &str) -> Result<(), validator::ValidationError> {
+    if text.trim().is_empty() {
+        return Err(validator::ValidationError::new("text_1 cannot be empty"));
+    }
+    Ok(())
+}
+
+fn validate_text_2(text_2: &StringOrVec) -> Result<(), validator::ValidationError> {
+    if text_2.is_empty() {
+        return Err(validator::ValidationError::new("text_2 cannot be empty"));
+    }
+    Ok(())
+}
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
#[derive(Debug, Clone, Deserialize, Serialize, schemars::JsonSchema)]
pub struct ScoreRequest {
/// The model to use for scoring
pub model: String,
/// The query/source text (single string)
pub text_1: String,
/// The document(s) to score against the query.
/// Can be a single string or a list of strings.
pub text_2: StringOrVec,
/// Optional encoding format for the response
#[serde(skip_serializing_if = "Option::is_none")]
pub encoding_format: Option<String>,
/// Whether to truncate the input
#[serde(skip_serializing_if = "Option::is_none")]
pub truncate_prompt_tokens: Option<u32>,
}
use validator::Validate;
#[derive(Debug, Clone, Deserialize, Serialize, schemars::JsonSchema, Validate)]
pub struct ScoreRequest {
/// The model to use for scoring
pub model: String,
/// The query/source text (single string)
#[validate(custom(function = "validate_text_1"))]
pub text_1: String,
/// The document(s) to score against the query.
/// Can be a single string or a list of strings.
#[validate(custom(function = "validate_text_2"))]
pub text_2: StringOrVec,
/// Optional encoding format for the response
#[serde(skip_serializing_if = "Option::is_none")]
pub encoding_format: Option<String>,
/// Whether to truncate the input
#[serde(skip_serializing_if = "Option::is_none")]
pub truncate_prompt_tokens: Option<u32>,
}
fn validate_text_1(text: &str) -> Result<(), validator::ValidationError> {
if text.trim().is_empty() {
return Err(validator::ValidationError::new("text_1 cannot be empty"));
}
Ok(())
}
fn validate_text_2(text_2: &StringOrVec) -> Result<(), validator::ValidationError> {
if text_2.is_empty() {
return Err(validator::ValidationError::new("text_2 cannot be empty"));
}
Ok(())
}
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@crates/protocols/src/rerank.rs` around lines 233 - 252, Add validation for
ScoreRequest similar to RerankRequest by implementing a validation method (e.g.,
impl ScoreRequest::validate or implementing the same Validate trait used by
RerankRequest) that checks that text_1 is not empty and that text_2 contains at
least one non-empty document (handle both String and Vec variants of
StringOrVec). Return a suitable error type on failure and call this validation
where other request types are validated so protocol-layer errors are consistent.


impl ScoreRequest {
/// Return text_2 as a slice of string references for routing/hashing.
pub fn texts(&self) -> Vec<&str> {
match &self.text_2 {
StringOrVec::Single(s) => vec![s.as_str()],
StringOrVec::Array(v) => v.iter().map(String::as_str).collect(),
}
}
}

impl GenerationRequest for ScoreRequest {
fn get_model(&self) -> Option<&str> {
Some(&self.model)
}

fn is_stream(&self) -> bool {
false // Score endpoint never streams
}

fn extract_text_for_routing(&self) -> String {
self.text_1.clone()
}
}

/// `text_2` field: either a single string or an array.
///
/// vLLM accepts both forms; we deserialize and normalize internally.
#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
#[serde(untagged)]
pub enum StringOrVec {
Single(String),
Array(Vec<String>),
}

impl StringOrVec {
/// Convert into an owned `Vec<String>` regardless of variant.
pub fn into_vec(self) -> Vec<String> {
match self {
Self::Single(s) => vec![s],
Self::Array(v) => v,
}
}

/// Return the number of texts.
pub fn len(&self) -> usize {
match self {
Self::Single(_) => 1,
Self::Array(v) => v.len(),
}
}

/// Return true if empty.
pub fn is_empty(&self) -> bool {
match self {
Self::Single(_) => false,
Self::Array(v) => v.is_empty(),
}
}
}
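
One subtlety in the impl above: `is_empty()` treats `Single` as never empty, even when the inner string is `""` — a `Single` always contributes exactly one text. A std-only sketch reproducing the helpers (the serde/untagged derives from the diff are elided here):

```rust
// Std-only replica of the PR's StringOrVec normalization helpers.
#[derive(Debug)]
enum StringOrVec {
    Single(String),
    Array(Vec<String>),
}

impl StringOrVec {
    // Normalize either accepted JSON shape into a Vec<String>.
    fn into_vec(self) -> Vec<String> {
        match self {
            Self::Single(s) => vec![s],
            Self::Array(v) => v,
        }
    }

    // A Single variant is one text by definition, so it is never "empty".
    fn is_empty(&self) -> bool {
        match self {
            Self::Single(_) => false,
            Self::Array(v) => v.is_empty(),
        }
    }
}

fn main() {
    assert!(!StringOrVec::Single(String::new()).is_empty());
    assert!(StringOrVec::Array(vec![]).is_empty());
    assert_eq!(
        StringOrVec::Single("doc".into()).into_vec(),
        vec!["doc".to_string()]
    );
}
```

If empty strings should also be rejected, that check belongs in request validation (as the nitpick comment above suggests), not in `is_empty()`.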

/// An individual score result from the vLLM score API.
#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
pub struct ScoreData {
/// Always `"score"` (vLLM compat)
pub object: String,
/// The relevance score as a float
pub score: f64,
/// 0-based index of this text in `text_2`
pub index: usize,
}

/// Response from the vLLM `/v1/score` endpoint.
///
/// Mirrors the structure returned by vLLM's `ScoringResponse`.
#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
pub struct ScoreResponse {
/// Unique identifier for this score response
pub id: String,
/// Always `"list"`
pub object: String,
/// Unix timestamp (seconds) when the response was created
pub created: i64,
/// The scored results, one per input in `text_2`
pub data: Vec<ScoreData>,
/// The model that produced the scores
pub model: String,
/// Usage information (if provided by backend)
#[serde(skip_serializing_if = "Option::is_none")]
pub usage: Option<UsageInfo>,
}