From bb3c69f76fd6f525be34b2fe4e15451244783940 Mon Sep 17 00:00:00 2001
From: Clayton Coleman
Date: Tue, 7 Jan 2025 12:45:52 -0500
Subject: [PATCH 1/2] Review comments proposal.md

---
 docs/proposals/002-api-proposal/proposal.md | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md
index 8f406d86..c2d4b3a2 100644
--- a/docs/proposals/002-api-proposal/proposal.md
+++ b/docs/proposals/002-api-proposal/proposal.md
@@ -78,6 +78,7 @@ The API design is based on these axioms:
 - This solution should be composable with other Gateway solutions and flexible to fit customer needs
 - The MVP will heavily assume requests are done using the OpenAI spec, but open to extension in the future
 - The Gateway should route in a way that does not generate a queue of requests at the model server level
+/* REVIEW: Note that models are served in a way that leads to many models running at the same time, vs many existing kube networking assumptions that assume there is a user-facing service that the implementation changes in user-invisible ways over time */
 
 The [PoC](https://youtu.be/NUBZg_uqqXk?si=v681EeYdGUGEVqQQ&t=1458) was focused on lower-level scheduling. And the API follows that similar logic, which lead to the proposal of the **InferencePool**.
 
@@ -129,6 +130,7 @@ type InferencePoolSpec struct {
   // that should be included in the InferencePool. ModelServers should not
   // be with any other Service or InferencePool, that behavior is not supported
   // and will result in sub-optimal utilization.
+/* REVIEW: This is generally true for all services, but since the ig needs to deal with monitoring, health checking, and changes to membership in the pool (i.e. a member removed from the selector or gracefully shutting down), I think it's more of a distraction than necessary. */
   ModelServerSelector map[string]string `json:"modelServerSelector,omitempty"`
 }
 ```
@@ -142,11 +144,15 @@ type InferencePoolSpec struct {
 // and rollout of new versions of those models, and defines the specific
 // performance and latency goals for the model. These workloads are
 // expected to operate within an InferencePool sharing compute capacity with other
+/* REVIEW: "operate" sounds weird */
 // InferenceModels, defined by the Inference Platform Admin. We allow a user who
 // has multiple InferenceModels across multiple pools (with the same config) to
-// specify the configuration exactly once, and deploy to many pools 
+// specify the configuration exactly once, and deploy to many pools
+/* REVIEW: be more consistent about "to" or "on" or "in" */
 // simultaneously. Enabling a simpler config and single source of truth
+/* REVIEW: broken clause, this should be part of the previous sentence or more naturally integrated */
 // for a given user. InferenceModel ModelNames are unique for a given InferencePool,
+/* REVIEW: missing clause? How is uniqueness enforced? */
 type InferenceModel struct {
   metav1.ObjectMeta
   metav1.TypeMeta
@@ -157,15 +163,18 @@ type InferenceModel struct {
 type InferenceModelSpec struct {
   // The name of the model as the users set in the "model" parameter in the requests.
   // The name should be unique among the workloads that reference the same backend pool.
+/* REVIEW: Describe what happens if the name isn't unique, and specify the behavior here */
   // This is the parameter that will be used to match the request with. In the future, we may
   // allow to match on other request parameters. The other approach to support matching on
   // on other request parameters is to use a different ModelName per HTTPFilter.
   // Names can be reserved without implementing an actual model in the pool.
+/* REVIEW: "without a model of that name already being configured in the pool"? "Implementing a model" doesn't sound right */
   // This can be done by specifying a target model and setting the weight to zero,
   // an error will be returned specifying that no valid target model is found.
   ModelName string
   // Optional
   // Defines how important it is to serve the model compared to other models referencing the same pool.
+/* REVIEW: define the behavior when multiple models have the same criticality in a sentence */
   Criticality *Criticality
   // Optional.
   // Allow multiple versions of a model for traffic splitting.
@@ -177,6 +186,7 @@ type InferenceModelSpec struct {
 }
 
 // Defines how important it is to serve the model compared to other models.
+/* REVIEW: Describe why there are only three levels and suggest that we don't expect to extend this enumeration, probably within the field name above. Open enumerations can be problematic, but a novice user reading this could reasonably ask "why not just have a priority number?" and so we should answer that. */
 type Criticality string
 const (
   // Most important. Requests to this band will be shed last.
@@ -200,6 +210,7 @@ type TargetModel struct {
   Name string
   // Weight is used to determine the percentage of traffic that should be
   // sent to this target model when multiple versions of the model are specified.
+/* REVIEW: What's the closest analog to weight in the existing API? */
   Weight int
 }
 

From 9f54009cf8efcda0fd146be20983f5e849307065 Mon Sep 17 00:00:00 2001
From: Clayton Coleman
Date: Tue, 7 Jan 2025 12:55:09 -0500
Subject: [PATCH 2/2] Review comments proposal.md

---
 docs/proposals/002-api-proposal/proposal.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md
index c2d4b3a2..a0486265 100644
--- a/docs/proposals/002-api-proposal/proposal.md
+++ b/docs/proposals/002-api-proposal/proposal.md
@@ -212,6 +212,7 @@ type TargetModel struct {
   // sent to this target model when multiple versions of the model are specified.
 /* REVIEW: What's the closest analog to weight in the existing API? */
   Weight int
+/* REVIEW: As discussed in review session, make this *int with no default because that is most flexible to future extensibility */
 }
 
 // LocalObjectReference identifies an API object within the namespace of the
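
To make the open weight question in the first patch concrete: the nearest analog in the existing Gateway API surface is the weight field on HTTPRoute backendRefs, where traffic is split in proportion to weight over the sum of weights across the list. The sketch below assumes TargetModel weights carry the same proportional semantics; the pickTargetModel helper, the model names, and the 90/10 split are illustrative assumptions, not part of the proposal.

```go
package main

import (
	"fmt"
	"math/rand"
)

// targetModel is a local, illustrative stand-in for the proposed TargetModel type.
type targetModel struct {
	Name   string
	Weight int
}

// pickTargetModel (hypothetical helper) chooses a target with probability
// proportional to its weight. A zero total weight is treated as "name reserved,
// no valid target model", mirroring the reservation note in the proposal.
func pickTargetModel(targets []targetModel, r *rand.Rand) (targetModel, error) {
	total := 0
	for _, t := range targets {
		total += t.Weight
	}
	if total <= 0 {
		return targetModel{}, fmt.Errorf("no valid target model found")
	}
	n := r.Intn(total)
	for _, t := range targets {
		if n < t.Weight {
			return t, nil
		}
		n -= t.Weight
	}
	return targets[len(targets)-1], nil
}

func main() {
	r := rand.New(rand.NewSource(1))
	targets := []targetModel{
		{Name: "llama-2-7b", Weight: 90},        // assumed stable version
		{Name: "llama-2-7b-canary", Weight: 10}, // assumed canary version
	}
	counts := map[string]int{}
	for i := 0; i < 10000; i++ {
		t, _ := pickTargetModel(targets, r)
		counts[t.Name]++
	}
	fmt.Println(counts) // roughly a 90/10 split over many requests
}
```

Making Weight a *int as the second patch suggests would let an unset value take on whatever default semantics the spec later chooses, rather than baking in 0.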
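
The criticality comments can be illustrated the same way. The excerpt shows only the "shed last" comment on the most important band, so all three constant names below are assumptions. The sketch shows the property a small closed enumeration provides, a single well-known shed order, which is one possible answer to the "why not just have a priority number" question raised in the review.

```go
package main

import (
	"fmt"
	"sort"
)

// Criticality mirrors the proposed closed enumeration; the constant names are
// assumptions for illustration only.
type Criticality string

const (
	Critical  Criticality = "Critical"  // shed last, per the proposal comment
	Standard  Criticality = "Standard"  // assumed middle band
	Sheddable Criticality = "Sheddable" // assumed: shed first
)

// shedRank assigns lower numbers to bands that are shed earlier. With three
// named levels every implementation agrees on the order; a free-form priority
// number would leave that ordering up to each user.
var shedRank = map[Criticality]int{
	Sheddable: 0,
	Standard:  1,
	Critical:  2,
}

func main() {
	queued := []Criticality{Critical, Sheddable, Standard, Sheddable, Critical}
	// When the pool is saturated, shed from the front of this order.
	sort.SliceStable(queued, func(i, j int) bool {
		return shedRank[queued[i]] < shedRank[queued[j]]
	})
	fmt.Println(queued) // [Sheddable Sheddable Standard Critical Critical]
}
```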