Skip to content
Open
1 change: 1 addition & 0 deletions omlx/admin/i18n/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,7 @@
"settings.resource.restart_badge": "Restart needed",
"settings.cache.section_label": "Cache",
"settings.cache.enabled": "Cache Enabled",
"settings.cache.hot_cache_only": "Hot Cache Only",
"settings.cache.ssd_directory": "SSD Cache Directory",
"settings.generation.section_label": "Generation Defaults",
"settings.generation.max_context_window": "Max Context Window",
Expand Down
4 changes: 4 additions & 0 deletions omlx/admin/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ class GlobalSettingsRequest(BaseModel):
cache_enabled: Optional[bool] = None
ssd_cache_dir: Optional[str] = None
ssd_cache_max_size: Optional[str] = None
hot_cache_only: Optional[bool] = None
hot_cache_max_size: Optional[str] = None # "0" = disabled, "8GB", etc.
initial_cache_blocks: Optional[int] = None # Starting blocks (requires restart)

Expand Down Expand Up @@ -1741,6 +1742,7 @@ async def get_global_settings(is_admin: bool = Depends(require_admin)):
"ssd_cache_max_size": _format_cache_size(
global_settings.cache.get_ssd_cache_max_size_bytes(global_settings.base_path)
),
"hot_cache_only": global_settings.cache.hot_cache_only,
"hot_cache_max_size": global_settings.cache.hot_cache_max_size,
"initial_cache_blocks": global_settings.cache.initial_cache_blocks,
},
Expand Down Expand Up @@ -1934,6 +1936,8 @@ async def update_global_settings(
if request.ssd_cache_max_size is not None:
global_settings.cache.ssd_cache_max_size = request.ssd_cache_max_size
cache_changed = True
if request.hot_cache_only is not None:
global_settings.cache.hot_cache_only = request.hot_cache_only
if request.hot_cache_max_size is not None:
global_settings.cache.hot_cache_max_size = request.hot_cache_max_size
cache_changed = True
Expand Down
3 changes: 2 additions & 1 deletion omlx/admin/static/js/dashboard.js
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
model: { model_dirs: [''], max_model_memory: '' },
memory: { max_process_memory: 'auto', prefill_memory_guard: true },
scheduler: { max_concurrent_requests: 8 },
cache: { enabled: true, ssd_cache_dir: '', ssd_cache_max_size: 'auto', hot_cache_max_size: '0', initial_cache_blocks: 256 },
cache: { enabled: true, ssd_cache_dir: '', ssd_cache_max_size: 'auto', hot_cache_max_size: '0', initial_cache_blocks: 256, hot_cache_only: false },
sampling: { max_context_window: 32768, max_tokens: 32768, temperature: 1.0, top_p: 0.95, top_k: 0, repetition_penalty: 1.0 },
mcp: { config_path: '' },
huggingface: { endpoint: '' },
Expand Down Expand Up @@ -673,6 +673,7 @@
ssd_cache_max_size: this.globalSettings.cache.ssd_cache_max_size,
hot_cache_max_size: this.globalSettings.cache.hot_cache_max_size,
initial_cache_blocks: this.globalSettings.cache.initial_cache_blocks,
hot_cache_only: this.globalSettings.cache.hot_cache_only,
sampling_max_context_window: this.globalSettings.sampling.max_context_window,
sampling_max_tokens: this.globalSettings.sampling.max_tokens,
sampling_temperature: this.globalSettings.sampling.temperature,
Expand Down
16 changes: 15 additions & 1 deletion omlx/admin/templates/dashboard/_settings.html
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,20 @@ <h3 class="text-2xl font-bold tracking-tight text-neutral-900">{{ t('settings.gl
class="block w-5 h-5 bg-white rounded-full shadow-sm transform transition-transform duration-300 absolute top-0.5 left-0.5"></span>
</button>
</div>
<!-- Settings row for the "hot cache only" option: a translated label plus a badge on the left, and a switch-style button on the right. -->
<div class="flex flex-col sm:flex-row sm:items-center sm:justify-between gap-2 px-4 sm:px-6 py-4">
<div class="flex items-center gap-2">
<label class="text-sm text-neutral-700">{{ t('settings.cache.hot_cache_only') }}</label>
<!-- Badge text is looked up under the settings.global.restart_badge translation key. -->
<span class="px-1.5 py-0.5 text-[9px] font-medium rounded-full bg-amber-50 text-amber-600 border border-amber-200">
{{ t('settings.global.restart_badge') }}
</span>
</div>
<!-- Clicking the button inverts globalSettings.cache.hot_cache_only; the track colour (bg-black / bg-neutral-200) is bound to that same flag. -->
<button type="button" @click="globalSettings.cache.hot_cache_only = !globalSettings.cache.hot_cache_only"
:class="globalSettings.cache.hot_cache_only ? 'bg-black' : 'bg-neutral-200'"
class="relative w-11 h-6 rounded-full transition-colors duration-300 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-black">
<!-- Knob: shifted right (translate-x-5) while the flag is true, back at its origin (translate-x-0) otherwise. -->
<span :class="globalSettings.cache.hot_cache_only ? 'translate-x-5' : 'translate-x-0'"
class="block w-5 h-5 bg-white rounded-full shadow-sm transform transition-transform duration-300 absolute top-0.5 left-0.5"></span>
</button>
</div>
<div class="flex flex-col sm:flex-row sm:items-center sm:justify-between gap-2 px-4 sm:px-6 py-4">
<div class="flex items-center gap-2">
<label class="text-sm text-neutral-700">{{ t('settings.cache.ssd_directory') }}</label>
Expand Down Expand Up @@ -796,7 +810,7 @@ <h4 class="text-lg font-semibold text-neutral-900 mb-2">{{ t('settings.models.no
@click.stop="copyToClipboard(model.id); copied = true; setTimeout(() => copied = false, 2000)"
class="p-1 rounded-md transition-all flex-shrink-0"
:class="copied ? 'text-green-500' : 'text-neutral-300 hover:text-neutral-600 hover:bg-neutral-100'"
:title="window.t('settings.models.table.copy_model_name_tooltip')"
:title="window.t('settings.models.table.copy_model_name_tooltip')">
<svg x-show="!copied" class="w-3.5 h-3.5" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><rect width="14" height="14" x="8" y="8" rx="2" ry="2"/><path d="M4 16c-1.1 0-2-.9-2-2V4c0-1.1.9-2 2-2h10c1.1 0 2 .9 2 2"/></svg>
<svg x-show="copied" x-cloak class="w-3.5 h-3.5" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M20 6 9 17l-5-5"/></svg>
</button>
Expand Down
69 changes: 45 additions & 24 deletions omlx/cache/boundary_snapshot_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ def __init__(self, base_dir: Path) -> None:
self._pending_writes: Dict[Tuple[str, int], Dict] = {}
self._pending_lock = threading.Lock()

# Lock to synchronize directory deletions with background writes.
self._write_lock = threading.Lock()

# Requests whose snapshots have been cleaned up — writer thread
# skips queued items for these request IDs.
self._cancelled_requests: set[str] = set()
Expand Down Expand Up @@ -128,6 +131,9 @@ def save(
if not HAS_MLX:
return False

# Ensure this request is no longer marked as cancelled
self._cancelled_requests.discard(request_id)

try:
# 1. Extract dict-format states on inference thread.
extracted, model_cache_config = extract_cache_states_fn(snapshot_cache)
Expand Down Expand Up @@ -256,14 +262,20 @@ def cleanup_request(self, request_id: str) -> None:

# Remove files.
req_dir = self._snapshot_dir / request_id
if req_dir.exists():
try:
shutil.rmtree(req_dir)
except Exception as e:
logger.debug("Failed to clean up snapshots for %s: %s", request_id, e)
with self._write_lock:
if req_dir.exists():
try:
shutil.rmtree(req_dir)
except Exception as e:
logger.debug("Failed to clean up snapshots for %s: %s", request_id, e)

def cleanup_all(self) -> None:
"""Delete all snapshot files (for reset/startup)."""
# Mark all current requests as cancelled to stop in-flight writes
with self._registry_lock:
for req_id in self._file_registry:
self._cancelled_requests.add(req_id)

# Drain write queue so the writer thread doesn't process stale
# items after the directory is deleted.
while True:
Expand All @@ -279,14 +291,14 @@ def cleanup_all(self) -> None:
self._pending_writes.clear()
with self._registry_lock:
self._file_registry.clear()
self._cancelled_requests.clear()

if self._snapshot_dir.exists():
try:
shutil.rmtree(self._snapshot_dir)
except Exception as e:
logger.debug("Failed to clean up all boundary snapshots: %s", e)
self._snapshot_dir.mkdir(parents=True, exist_ok=True)
with self._write_lock:
if self._snapshot_dir.exists():
try:
shutil.rmtree(self._snapshot_dir)
except Exception as e:
logger.debug("Failed to clean up all boundary snapshots: %s", e)
self._snapshot_dir.mkdir(parents=True, exist_ok=True)

def shutdown(self) -> None:
"""Stop background writer thread."""
Expand Down Expand Up @@ -324,20 +336,26 @@ def _writer_loop(self) -> None:
continue

try:
file_path.parent.mkdir(parents=True, exist_ok=True)
temp_path = file_path.with_name(
file_path.stem + "_tmp.safetensors"
)
_write_safetensors_no_mx(str(temp_path), tensors_raw, metadata)
os.rename(str(temp_path), str(file_path))
with self._write_lock:
# Check again inside lock to avoid race with cleanup_all/cleanup_request
if pw_key[0] in self._cancelled_requests:
continue

file_path.parent.mkdir(parents=True, exist_ok=True)
temp_path = file_path.with_name(
file_path.stem + "_tmp.safetensors"
)
_write_safetensors_no_mx(str(temp_path), tensors_raw, metadata)
os.rename(str(temp_path), str(file_path))
except Exception as e:
logger.debug("Background snapshot write failed: %s", e)
for p in (temp_path, file_path):
try:
if p is not None and p.exists():
p.unlink()
except Exception:
pass
# Cleanup temp file if it exists
try:
temp_path = file_path.with_name(file_path.stem + "_tmp.safetensors")
if temp_path.exists():
temp_path.unlink()
except Exception:
pass
finally:
# Remove extracted cache objects from pending writes to free
# memory, but keep tensors_raw for read-back until file is on
Expand All @@ -364,6 +382,9 @@ def _serialize_extracted(
layer_info: List[Dict[str, str]] = []

for i, layer_state in enumerate(extracted):
if layer_state is None:
layer_state = {}

class_name = layer_state.get("class_name", "KVCache")
cache_type = layer_state.get("cache_type", "KVCache")
meta_state = layer_state.get("meta_state", ())
Expand Down
Loading