Skip to content

Commit e98d30f

Browse files
committed
feat(ui): Asynchronous VRAM estimates with multi-context support and use of known_usecases
Signed-off-by: Richard Palethorpe <io@richiejp.com>
1 parent c855cd7 commit e98d30f

File tree

13 files changed

+404
-308
lines changed

13 files changed

+404
-308
lines changed

core/application/startup.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"github.com/mudler/LocalAI/core/services/jobs"
1818
"github.com/mudler/LocalAI/core/services/nodes"
1919
"github.com/mudler/LocalAI/core/services/storage"
20+
"github.com/mudler/LocalAI/pkg/vram"
2021
coreStartup "github.com/mudler/LocalAI/core/startup"
2122
"github.com/mudler/LocalAI/internal"
2223

@@ -231,6 +232,10 @@ func New(opts ...config.AppOption) (*Application, error) {
231232
xlog.Error("error registering external backends", "error", err)
232233
}
233234

235+
// Wire gallery generation counter into VRAM caches so they invalidate
236+
// when gallery data refreshes instead of using a fixed TTL.
237+
vram.SetGalleryGenerationFunc(gallery.GalleryGeneration)
238+
234239
if options.ConfigFile != "" {
235240
if err := application.ModelConfigLoader().LoadMultipleModelConfigsSingleFile(options.ConfigFile, configLoaderOpts...); err != nil {
236241
xlog.Error("error loading config file", "error", err)

core/gallery/gallery.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,8 +301,14 @@ var (
301301
availableModelsMu sync.RWMutex
302302
availableModelsCache GalleryElements[*GalleryModel]
303303
refreshing atomic.Bool
304+
galleryGeneration atomic.Uint64
304305
)
305306

307+
// GalleryGeneration returns a counter that increments each time the gallery
308+
// model list is refreshed from upstream. VRAM estimation caches use this to
309+
// invalidate entries when the gallery data changes.
310+
func GalleryGeneration() uint64 { return galleryGeneration.Load() }
311+
306312
// AvailableGalleryModelsCached returns gallery models from an in-memory cache.
307313
// Local-only fields (installed status) are refreshed on every call. A background
308314
// goroutine is triggered to re-fetch the full model list (including network
@@ -335,6 +341,7 @@ func AvailableGalleryModelsCached(galleries []config.Gallery, systemState *syste
335341

336342
availableModelsMu.Lock()
337343
availableModelsCache = models
344+
galleryGeneration.Add(1)
338345
availableModelsMu.Unlock()
339346

340347
return models, nil
@@ -356,6 +363,7 @@ func triggerGalleryRefresh(galleries []config.Gallery, systemState *system.Syste
356363
}
357364
availableModelsMu.Lock()
358365
availableModelsCache = models
366+
galleryGeneration.Add(1)
359367
availableModelsMu.Unlock()
360368
}()
361369
}

core/http/endpoints/localai/import_model.go

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -51,18 +51,17 @@ func ImportModelURIEndpoint(cl *config.ModelConfigLoader, appConfig *config.Appl
5151
}
5252
estCtx, cancel := context.WithTimeout(c.Request().Context(), 5*time.Second)
5353
defer cancel()
54-
result, err := vram.EstimateModel(estCtx, vram.ModelEstimateInput{
55-
Files: files,
56-
Options: vram.EstimateOptions{ContextLength: 8192},
57-
})
54+
result, err := vram.EstimateModelMultiContext(estCtx, vram.ModelEstimateInput{
55+
Files: files,
56+
}, []uint32{8192})
5857
if err == nil {
5958
if result.SizeBytes > 0 {
6059
resp.EstimatedSizeBytes = result.SizeBytes
6160
resp.EstimatedSizeDisplay = result.SizeDisplay
6261
}
63-
if result.VRAMBytes > 0 {
64-
resp.EstimatedVRAMBytes = result.VRAMBytes
65-
resp.EstimatedVRAMDisplay = result.VRAMDisplay
62+
if v := result.VRAMForContext(8192); v > 0 {
63+
resp.EstimatedVRAMBytes = v
64+
resp.EstimatedVRAMDisplay = vram.FormatBytes(v)
6665
}
6766
}
6867
}

core/http/endpoints/localai/vram.go

Lines changed: 32 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@ package localai
22

33
import (
44
"context"
5-
"fmt"
65
"net/http"
76
"path/filepath"
7+
"slices"
88
"strings"
99
"time"
1010

@@ -14,16 +14,10 @@ import (
1414
)
1515

1616
type vramEstimateRequest struct {
17-
Model string `json:"model"` // model name (must be installed)
18-
ContextSize uint32 `json:"context_size,omitempty"` // context length to estimate for (default 8192)
19-
GPULayers int `json:"gpu_layers,omitempty"` // number of layers to offload to GPU (0 = all)
20-
KVQuantBits int `json:"kv_quant_bits,omitempty"` // KV cache quantization bits (0 = fp16)
21-
}
22-
23-
type vramEstimateResponse struct {
24-
vram.EstimateResult
25-
ContextNote string `json:"context_note,omitempty"` // note when context_size was defaulted
26-
ModelMaxContext uint64 `json:"model_max_context,omitempty"` // model's trained maximum context length
17+
Model string `json:"model"` // model name (must be installed)
18+
ContextSizes []uint32 `json:"context_sizes,omitempty"` // context sizes to estimate (default [8192])
19+
GPULayers int `json:"gpu_layers,omitempty"` // number of layers to offload to GPU (0 = all)
20+
KVQuantBits int `json:"kv_quant_bits,omitempty"` // KV cache quantization bits (0 = fp16)
2721
}
2822

2923
// resolveModelURI converts a relative model path to a file:// URI so the
@@ -36,8 +30,8 @@ func resolveModelURI(uri, modelsPath string) string {
3630
return "file://" + filepath.Join(modelsPath, uri)
3731
}
3832

39-
// addWeightFile appends a resolved weight file to files and tracks the first GGUF.
40-
func addWeightFile(uri, modelsPath string, files *[]vram.FileInput, firstGGUF *string, seen map[string]bool) {
33+
// addWeightFile appends a resolved weight file to files.
34+
func addWeightFile(uri, modelsPath string, files *[]vram.FileInput, seen map[string]bool) {
4135
if !vram.IsWeightFile(uri) {
4236
return
4337
}
@@ -47,21 +41,17 @@ func addWeightFile(uri, modelsPath string, files *[]vram.FileInput, firstGGUF *s
4741
}
4842
seen[resolved] = true
4943
*files = append(*files, vram.FileInput{URI: resolved, Size: 0})
50-
if *firstGGUF == "" && vram.IsGGUF(uri) {
51-
*firstGGUF = resolved
52-
}
5344
}
5445

5546
// VRAMEstimateEndpoint returns a handler that estimates VRAM usage for an
56-
// installed model configuration. For uninstalled models (gallery URLs), use
57-
// the gallery-level estimates in /api/models instead.
47+
// installed model configuration at multiple context sizes.
5848
// @Summary Estimate VRAM usage for a model
59-
// @Description Estimates VRAM based on model weight files, context size, and GPU layers
49+
// @Description Estimates VRAM based on model weight files at multiple context sizes
6050
// @Tags config
6151
// @Accept json
6252
// @Produce json
6353
// @Param request body vramEstimateRequest true "VRAM estimation parameters"
64-
// @Success 200 {object} vramEstimateResponse "VRAM estimate"
54+
// @Success 200 {object} vram.MultiContextEstimate "VRAM estimate"
6555
// @Router /api/models/vram-estimate [post]
6656
func VRAMEstimateEndpoint(cl *config.ModelConfigLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
6757
return func(c echo.Context) error {
@@ -82,17 +72,16 @@ func VRAMEstimateEndpoint(cl *config.ModelConfigLoader, appConfig *config.Applic
8272
modelsPath := appConfig.SystemState.Model.ModelsPath
8373

8474
var files []vram.FileInput
85-
var firstGGUF string
8675
seen := make(map[string]bool)
8776

8877
for _, f := range modelConfig.DownloadFiles {
89-
addWeightFile(string(f.URI), modelsPath, &files, &firstGGUF, seen)
78+
addWeightFile(string(f.URI), modelsPath, &files, seen)
9079
}
9180
if modelConfig.Model != "" {
92-
addWeightFile(modelConfig.Model, modelsPath, &files, &firstGGUF, seen)
81+
addWeightFile(modelConfig.Model, modelsPath, &files, seen)
9382
}
9483
if modelConfig.MMProj != "" {
95-
addWeightFile(modelConfig.MMProj, modelsPath, &files, &firstGGUF, seen)
84+
addWeightFile(modelConfig.MMProj, modelsPath, &files, seen)
9685
}
9786

9887
if len(files) == 0 {
@@ -101,45 +90,36 @@ func VRAMEstimateEndpoint(cl *config.ModelConfigLoader, appConfig *config.Applic
10190
})
10291
}
10392

104-
contextDefaulted := false
105-
opts := vram.EstimateOptions{
106-
ContextLength: req.ContextSize,
107-
GPULayers: req.GPULayers,
108-
KVQuantBits: req.KVQuantBits,
109-
}
110-
if opts.ContextLength == 0 {
93+
contextSizes := req.ContextSizes
94+
if len(contextSizes) == 0 {
11195
if modelConfig.ContextSize != nil {
112-
opts.ContextLength = uint32(*modelConfig.ContextSize)
96+
contextSizes = []uint32{uint32(*modelConfig.ContextSize)}
11397
} else {
114-
opts.ContextLength = 8192
115-
contextDefaulted = true
98+
contextSizes = []uint32{8192}
99+
}
100+
}
101+
102+
// Include model's configured context size alongside requested sizes
103+
if modelConfig.ContextSize != nil {
104+
modelCtx := uint32(*modelConfig.ContextSize)
105+
if !slices.Contains(contextSizes, modelCtx) {
106+
contextSizes = append(contextSizes, modelCtx)
116107
}
117108
}
118109

110+
opts := vram.EstimateOptions{
111+
GPULayers: req.GPULayers,
112+
KVQuantBits: req.KVQuantBits,
113+
}
114+
119115
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
120116
defer cancel()
121117

122-
result, err := vram.Estimate(ctx, files, opts, vram.DefaultCachedSizeResolver(), vram.DefaultCachedGGUFReader())
118+
result, err := vram.EstimateMultiContext(ctx, files, contextSizes, opts, vram.DefaultCachedSizeResolver(), vram.DefaultCachedGGUFReader())
123119
if err != nil {
124120
return c.JSON(http.StatusInternalServerError, map[string]any{"error": err.Error()})
125121
}
126122

127-
resp := vramEstimateResponse{EstimateResult: result}
128-
129-
// When context was defaulted to 8192, read the GGUF metadata to report
130-
// the model's trained maximum context length so callers know the estimate
131-
// may be conservative.
132-
if contextDefaulted && firstGGUF != "" {
133-
ggufMeta, err := vram.DefaultCachedGGUFReader().ReadMetadata(ctx, firstGGUF)
134-
if err == nil && ggufMeta != nil && ggufMeta.MaximumContextLength > 0 {
135-
resp.ModelMaxContext = ggufMeta.MaximumContextLength
136-
resp.ContextNote = fmt.Sprintf(
137-
"Estimate used default context_size=8192. The model's trained maximum context is %d; VRAM usage will be higher at larger context sizes.",
138-
ggufMeta.MaximumContextLength,
139-
)
140-
}
141-
}
142-
143-
return c.JSON(http.StatusOK, resp)
123+
return c.JSON(http.StatusOK, result)
144124
}
145125
}

core/http/react-ui/src/pages/Models.jsx

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,9 @@ function GalleryLoader() {
8686
}
8787

8888

89+
const CONTEXT_SIZES = [8192, 16384, 32768, 65536, 131072, 262144]
90+
const CONTEXT_LABELS = ['8K', '16K', '32K', '64K', '128K', '256K']
91+
8992
const FILTERS = [
9093
{ key: '', label: 'All', icon: 'fa-layer-group' },
9194
{ key: 'chat', label: 'Chat', icon: 'fa-brain' },
@@ -119,6 +122,7 @@ export default function Models() {
119122
const [allBackends, setAllBackends] = useState([])
120123
const [backendUsecases, setBackendUsecases] = useState({})
121124
const [estimates, setEstimates] = useState({})
125+
const [contextSize, setContextSize] = useState(CONTEXT_SIZES[0])
122126
const debounceRef = useRef(null)
123127
const [confirmDialog, setConfirmDialog] = useState(null)
124128

@@ -190,9 +194,9 @@ export default function Models() {
190194
models.forEach(model => {
191195
const id = model.name || model.id
192196
if (estimates[id]) return
193-
modelsApi.estimate(id).then(est => {
197+
modelsApi.estimate(id, CONTEXT_SIZES).then(est => {
194198
if (cancelled) return
195-
if (est && (est.SizeBytes || est.VRAMBytes)) {
199+
if (est && (est.sizeBytes || est.estimates)) {
196200
setEstimates(prev => ({ ...prev, [id]: est }))
197201
}
198202
}).catch(() => {})
@@ -371,6 +375,25 @@ export default function Models() {
371375
)}
372376
</div>
373377

378+
{/* Context size slider for VRAM estimates */}
379+
<div style={{ display: 'flex', alignItems: 'center', gap: 'var(--spacing-sm)', marginBottom: 'var(--spacing-md)', fontSize: '0.8125rem' }}>
380+
<label style={{ color: 'var(--color-text-muted)', whiteSpace: 'nowrap' }}>
381+
<i className="fas fa-memory" style={{ marginRight: 4 }} />
382+
Context:
383+
</label>
384+
<input
385+
type="range"
386+
min={0}
387+
max={CONTEXT_SIZES.length - 1}
388+
value={CONTEXT_SIZES.indexOf(contextSize)}
389+
onChange={(e) => setContextSize(CONTEXT_SIZES[e.target.value])}
390+
style={{ width: 140, accentColor: 'var(--color-primary)' }}
391+
/>
392+
<span style={{ fontWeight: 600, minWidth: '3em' }}>
393+
{CONTEXT_LABELS[CONTEXT_SIZES.indexOf(contextSize)]}
394+
</span>
395+
</div>
396+
374397
{/* Table */}
375398
{loading ? (
376399
<GalleryLoader />
@@ -415,10 +438,11 @@ export default function Models() {
415438
<tbody>
416439
{models.map((model, idx) => {
417440
const name = model.name || model.id
418-
const est = estimates[name] || {}
419-
const sizeDisplay = est.SizeDisplay || model.estimated_size_display
420-
const vramDisplay = est.VRAMDisplay || model.estimated_vram_display
421-
const vramBytes = est.VRAMBytes || model.estimated_vram_bytes
441+
const estData = estimates[name]
442+
const sizeDisplay = estData?.sizeDisplay
443+
const ctxEst = estData?.estimates?.[String(contextSize)]
444+
const vramDisplay = ctxEst?.vramDisplay
445+
const vramBytes = ctxEst?.vramBytes
422446
const installing = isInstalling(name)
423447
const progress = getOperationProgress(name)
424448
const fit = fitsGpu(vramBytes)

core/http/react-ui/src/utils/api.js

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,10 @@ export const modelsApi = {
7979
listCapabilities: () => fetchJSON(API_CONFIG.endpoints.modelsCapabilities),
8080
install: (id) => postJSON(API_CONFIG.endpoints.installModel(id), {}),
8181
delete: (id) => postJSON(API_CONFIG.endpoints.deleteModel(id), {}),
82-
estimate: (id) => fetchJSON(API_CONFIG.endpoints.modelEstimate(id)),
82+
estimate: (id, contexts) => fetchJSON(
83+
buildUrl(API_CONFIG.endpoints.modelEstimate(id),
84+
contexts?.length ? { contexts: contexts.join(',') } : {})
85+
),
8386
getConfig: (id) => postJSON(API_CONFIG.endpoints.modelConfig(id), {}),
8487
getConfigJson: (name) => fetchJSON(API_CONFIG.endpoints.modelConfigJson(name)),
8588
getJob: (uid) => fetchJSON(API_CONFIG.endpoints.modelJob(uid)),

0 commit comments

Comments (0)