Commit 91f4380

Add cortex-cinder scheduler (#248)
Co-authored-by: Philipp Matthes <[email protected]>
Parent: b408996

File tree: 15 files changed (+1007 / -3 lines)

Tiltfile

Lines changed: 17 additions & 1 deletion
@@ -67,7 +67,7 @@ docker_build('ghcr.io/cobaltcore-dev/cortex-postgres', 'postgres')
 # Package the lib charts locally and sync them to the bundle charts. In this way
 # we can bump the lib charts locally and test them before pushing them to the OCI registry.
 lib_charts = ['cortex-core', 'cortex-postgres', 'cortex-mqtt']
-bundle_charts = ['cortex-nova', 'cortex-manila']
+bundle_charts = ['cortex-nova', 'cortex-manila', 'cortex-cinder']
 for lib_chart in lib_charts:
     watch_file('helm/library/' + lib_chart) # React to lib chart changes.
     local('sh helm/sync.sh helm/library/' + lib_chart)
@@ -81,6 +81,7 @@ for lib_chart in lib_charts:
         # Make sure the gen_tgz is removed from the local directory.
         local('rm -f ' + gen_tgz)
     else:
+        local('mkdir -p helm/bundles/' + bundle_chart + '/charts/')
         local('mv -f ' + gen_tgz + ' helm/bundles/' + bundle_chart + '/charts/')
 # Ensure the bundle charts are up to date.
 for bundle_chart in bundle_charts:
@@ -89,6 +90,7 @@ for bundle_chart in bundle_charts:
 # Deploy the Cortex bundles.
 k8s_yaml(helm('./helm/bundles/cortex-nova', name='cortex-nova', values=[tilt_values]))
 k8s_yaml(helm('./helm/bundles/cortex-manila', name='cortex-manila', values=[tilt_values]))
+k8s_yaml(helm('./helm/bundles/cortex-cinder', name='cortex-cinder', values=[tilt_values]))

 # Note: place resources higher in this list to ensure their local port stays the same.
 # Elements placed lower in the list will have their local port shifted by elements inserted above.
@@ -98,6 +100,7 @@ resources = [
         [
             'cortex-nova-mqtt',
             'cortex-manila-mqtt',
+            'cortex-cinder-mqtt',
         ],
         [(1883, 'tcp'), (15675, 'ws')],
     ),
@@ -106,6 +109,7 @@ resources = [
         [
             'cortex-nova-postgresql',
             'cortex-manila-postgresql',
+            'cortex-cinder-postgresql',
         ],
         [(5432, 'psql')],
     ),
@@ -134,6 +138,18 @@ resources = [
         ],
         [(2112, 'metrics'), (8080, 'api')],
     ),
+    (
+        'Cortex-Cinder',
+        [
+            'cortex-cinder-migrations',
+            'cortex-cinder-cli',
+            'cortex-cinder-syncer',
+            'cortex-cinder-extractor',
+            'cortex-cinder-kpis',
+            'cortex-cinder-scheduler',
+        ],
+        [(2112, 'metrics'), (8080, 'api')],
+    ),
 ]
 local_port = 8000
 for label, components, service_ports in resources:
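
Adding the cortex-cinder entries to the MQTT and PostgreSQL groups shifts the local ports of every resource listed below them, which is what the ordering note above refers to. The Tiltfile loop body that assigns local ports is not part of this diff, so the following Go snippet is only a rough sketch of the idea, assuming each forwarded service port consumes the next local port starting at 8000.

package main

import "fmt"

// Rough sketch of the sequential port assignment described by the Tiltfile
// comment (the real loop body is not shown in this diff): each forwarded
// service port takes the next local port, so inserting an entry shifts the
// local ports of everything listed after it.
func main() {
	type resource struct {
		label        string
		servicePorts []int // remote ports to forward
	}
	resources := []resource{
		{"MQTT", []int{1883, 15675}},
		{"PostgreSQL", []int{5432}},
		{"Cortex-Nova", []int{2112, 8080}},
		{"Cortex-Manila", []int{2112, 8080}},
		{"Cortex-Cinder", []int{2112, 8080}}, // newly appended entry
	}
	localPort := 8000
	for _, r := range resources {
		for _, sp := range r.servicePorts {
			fmt.Printf("%s: localhost:%d -> %d\n", r.label, localPort, sp)
			localPort++
		}
	}
}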

commands/checks/checks.go

Lines changed: 2 additions & 0 deletions
@@ -7,6 +7,7 @@ import (
 	"context"
 	"log/slog"

+	"github.com/cobaltcore-dev/cortex/commands/checks/cinder"
 	"github.com/cobaltcore-dev/cortex/commands/checks/manila"
 	"github.com/cobaltcore-dev/cortex/commands/checks/nova"
 	"github.com/cobaltcore-dev/cortex/internal/conf"
@@ -15,6 +16,7 @@ import (
 var checks = map[string]func(context.Context, conf.Config){
 	"nova":   nova.RunChecks,
 	"manila": manila.RunChecks,
+	"cinder": cinder.RunChecks,
 }

 // Run all checks.
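
For context, the checks command keeps a plain name-to-function registry, and wiring up Cinder is a single new map entry. Below is a minimal, self-contained sketch of that pattern; the Config placeholder and the run helper are stand-ins invented for illustration, since conf.Config and the dispatch code are not shown in this diff.

package main

import (
	"context"
	"fmt"
)

// Placeholder for conf.Config, whose definition is not part of this diff.
type Config struct{}

// Registry of named check suites, mirroring the map in commands/checks/checks.go.
var checks = map[string]func(context.Context, Config){
	"nova":   func(ctx context.Context, c Config) { fmt.Println("running nova checks") },
	"manila": func(ctx context.Context, c Config) { fmt.Println("running manila checks") },
	"cinder": func(ctx context.Context, c Config) { fmt.Println("running cinder checks") },
}

// run executes the suite selected by name, or all suites when name is empty.
// The real dispatch shape may differ; this only illustrates the registry idea.
func run(ctx context.Context, name string, c Config) {
	if name == "" {
		for n, check := range checks {
			fmt.Println("suite:", n)
			check(ctx, c)
		}
		return
	}
	if check, ok := checks[name]; ok {
		check(ctx, c)
		return
	}
	fmt.Println("unknown check suite:", name)
}

func main() {
	run(context.Background(), "cinder", Config{})
}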

commands/checks/cinder/checks.go

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+// Copyright 2025 SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package cinder
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"io"
+	"log/slog"
+	"net/http"
+	"strconv"
+
+	"github.com/cobaltcore-dev/cortex/internal/conf"
+	"github.com/cobaltcore-dev/cortex/internal/scheduler/cinder/api"
+	"github.com/sapcc/go-bits/must"
+)
+
+// Run all checks.
+func RunChecks(ctx context.Context, config conf.Config) {
+	checkCinderSchedulerReturnsValidHosts(ctx, config)
+}
+
+// Check that the Cinder external scheduler returns a valid set of volume hosts.
+func checkCinderSchedulerReturnsValidHosts(ctx context.Context, config conf.Config) {
+	// TODO ADD THIS CHECK
+
+	//
+
+	request := api.ExternalSchedulerRequest{
+		Hosts:     []api.ExternalSchedulerHost{},
+		Weights:   map[string]float64{},
+		Sandboxed: true,
+	}
+	port := strconv.Itoa(config.GetAPIConfig().Port)
+	apiURL := "http://cortex-cinder-scheduler:" + port + "/scheduler/cinder/external"
+	slog.Info("sending request to external scheduler", "apiURL", apiURL)
+
+	requestBody := must.Return(json.Marshal(request))
+	buf := bytes.NewBuffer(requestBody)
+	req := must.Return(http.NewRequestWithContext(ctx, http.MethodPost, apiURL, buf))
+	req.Header.Set("Content-Type", "application/json")
+	//nolint:bodyclose // We don't care about the body here.
+	respRaw := must.Return(http.DefaultClient.Do(req))
+	defer respRaw.Body.Close()
+	if respRaw.StatusCode != http.StatusOK {
+		// Log the response body for debugging
+		bodyBytes := must.Return(io.ReadAll(respRaw.Body))
+		slog.Error("external scheduler API returned non-200 status code",
+			"statusCode", respRaw.StatusCode,
+			"responseBody", string(bodyBytes),
+		)
+		panic("external scheduler API returned non-200 status code")
+	}
+	var resp api.ExternalSchedulerResponse
+	must.Succeed(json.NewDecoder(respRaw.Body).Decode(&resp))
+	if len(resp.Hosts) == 0 {
+		panic("no volume hosts found in response")
+	}
+	slog.Info("check successful, got volume hosts", "count", len(resp.Hosts))
+}
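
Because the check simply POSTs a JSON request and expects a 200 response containing at least one host, the same round trip can be exercised against a stubbed endpoint without a running cortex-cinder-scheduler. The sketch below uses only the standard library; the request and response structs are local stand-ins, since the actual field names and JSON tags of api.ExternalSchedulerRequest and api.ExternalSchedulerResponse are not part of this diff.

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
	"net/http/httptest"
)

// Local stand-ins for the api.ExternalScheduler{Request,Response} types; the
// real definitions live in internal/scheduler/cinder/api and are not shown here.
type schedulerRequest struct {
	Hosts     []string           `json:"hosts"`
	Weights   map[string]float64 `json:"weights"`
	Sandboxed bool               `json:"sandboxed"`
}

type schedulerResponse struct {
	Hosts []string `json:"hosts"`
}

func main() {
	// Stub the external scheduler endpoint so the request/decode round trip
	// can be exercised without a deployed scheduler.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		json.NewEncoder(w).Encode(schedulerResponse{Hosts: []string{"backend-a@lvm#pool1"}})
	}))
	defer srv.Close()

	body, err := json.Marshal(schedulerRequest{Sandboxed: true, Weights: map[string]float64{}})
	if err != nil {
		panic(err)
	}
	resp, err := http.Post(srv.URL+"/scheduler/cinder/external", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var out schedulerResponse
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	if len(out.Hosts) == 0 {
		panic("no volume hosts found in response")
	}
	fmt.Println("check successful, got volume hosts:", len(out.Hosts))
}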

helm/bundles/cortex-cinder/Chart.yaml

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+# Copyright 2025 SAP SE
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v3
+name: cortex-cinder
+description: A Helm chart deploying Cortex for Cinder.
+type: application
+version: 0.0.7
+appVersion: 0.1.0
+dependencies:
+  - name: cortex-core
+    repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
+    version: 0.24.6
+  - name: cortex-postgres
+    repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
+    version: 0.5.3
+  - name: cortex-mqtt
+    repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
+    version: 0.1.0
+  # Owner info adds a configmap to the kubernetes cluster with information on
+  # the service owner. This makes it easier to find out who to contact in case
+  # of issues. See: https://github.com/sapcc/helm-charts/pkgs/container/helm-charts%2Fowner-info
+  - name: owner-info
+    repository: oci://ghcr.io/sapcc/helm-charts
+    version: 1.0.0
Lines changed: 215 additions & 0 deletions
@@ -0,0 +1,215 @@
+groups:
+  - name: cortex-cinder-alerts
+    rules:
+      - alert: CortexCinderInitialPlacementDown
+        expr: |
+          up{component="cortex-cinder-scheduler"} != 1 or
+          absent(up{component="cortex-cinder-scheduler"})
+        for: 1m
+        labels:
+          context: liveness
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+          playbook: docs/support/playbook/cortex/down
+        annotations:
+          summary: "Cortex initial placement for Cinder is down"
+          description: >
+            The Cortex initial placement is down. Initial placement requests from Cinder will
+            not be served. This is no immediate problem, since Cinder will continue
+            placing new volumes. However, the placement will be less desirable.
+
+      - alert: CortexCinderSyncerDown
+        expr: |
+          up{component="cortex-cinder-syncer"} != 1 or
+          absent(up{component="cortex-cinder-syncer"})
+        for: 1m
+        labels:
+          context: liveness
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Cortex syncer is down"
+          description: >
+            The Cortex syncer is down. Cortex requires somewhat recent data from
+            its datasources (OpenStack, Prometheus, etc.) to make accurate
+            scheduling decisions. If this issue persists for a longer time, the
+            database will slowly drift away from the actual state of the
+            datacenter, which may lead to less desirable placement decisions.
+            This is no immediate problem, since Cinder will continue placing new
+            volumes.
+
+      - alert: CortexCinderExtractorDown
+        expr: |
+          up{component="cortex-cinder-extractor"} != 1 or
+          absent(up{component="cortex-cinder-extractor"})
+        for: 1m
+        labels:
+          context: liveness
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Cortex extractor is down"
+          description: >
+            The Cortex extractor is down. This means that newly available data
+            about the datacenter will not be used to extract scheduling knowledge.
+            This is no immediate problem, since Cinder will continue placing new
+            volumes. However, the placement will be less desirable.
+
+      - alert: CortexCinderHttpRequest400sTooHigh
+        expr: rate(cortex_scheduler_api_request_duration_seconds_count{component="cortex-cinder-scheduler",status=~"4.+"}[5m]) > 0.1
+        for: 5m
+        labels:
+          context: api
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "HTTP request 400 errors too high"
+          description: >
+            Cortex is responding to Cinder initial placement requests with HTTP 4xx
+            errors. This is expected when the scheduling request cannot be served
+            by Cortex. However, it could also indicate that the Cinder request
+            format has changed and Cortex is unable to parse it.
+
+      - alert: CortexCinderHttpRequest500sTooHigh
+        expr: rate(cortex_scheduler_api_request_duration_seconds_count{component="cortex-cinder-scheduler",status=~"5.+"}[5m]) > 0.1
+        for: 5m
+        labels:
+          context: api
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "HTTP request 500 errors too high"
+          description: >
+            Cortex is responding to Cinder initial placement requests with HTTP 5xx
+            errors. This is not expected and indicates that Cortex is having some
+            internal problem. Cinder will continue to place new volumes, but the
+            placement will be less desirable. Thus, no immediate action is needed.
+
+      - alert: CortexCinderHighMemoryUsage
+        expr: process_resident_memory_bytes{component=~"cortex-cinder-.*"} > 6000 * 1024 * 1024
+        for: 5m
+        labels:
+          context: memory
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Cortex `{{$labels.component}}` uses too much memory"
+          description: >
+            Cortex should not be using more than 6000 MiB of memory. Usually it
+            should use much less, so there may be a memory leak or other changes
+            that are causing the memory usage to increase significantly.
+
+      - alert: CortexCinderHighCPUUsage
+        expr: rate(process_cpu_seconds_total{component=~"cortex-cinder-.*"}[1m]) > 0.5
+        for: 5m
+        labels:
+          context: cpu
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Cortex `{{$labels.component}}` uses too much CPU"
+          description: >
+            Cortex should not be using more than 50% of a single CPU core. Usually
+            it should use much less, so there may be a CPU leak or other changes
+            that are causing the CPU usage to increase significantly.
+
+      - alert: CortexCinderSyncNotSuccessful
+        expr: cortex_sync_request_processed_total{component=~"cortex-cinder-.*"} - cortex_sync_request_duration_seconds_count{component=~"cortex-cinder-.*"} > 0
+        for: 5m
+        labels:
+          context: syncstatus
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Sync not successful"
+          description: >
+            Cortex experienced an issue syncing data from a datasource. This may
+            happen when the datasource (OpenStack, Prometheus, etc.) is down or
+            the sync module is misconfigured. No immediate action is needed, since
+            the sync module will retry the sync operation and the currently synced
+            data will be kept. However, when this problem persists for a longer
+            time the service will have a less recent view of the datacenter.
+
+      - alert: CortexCinderSyncObjectsDroppedToZero
+        expr: cortex_sync_objects{component=~"cortex-cinder-.*",datasource!="openstack_migrations"} == 0
+        for: 60m
+        labels:
+          context: syncobjects
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Cortex is not syncing any new data from `{{$labels.datasource}}`"
+          description: >
+            Cortex is not syncing any objects from a datasource. This may happen
+            when the datasource (OpenStack, Prometheus, etc.) is down or the sync
+            module is misconfigured. No immediate action is needed, since the sync
+            module will retry the sync operation and the currently synced data will
+            be kept. However, when this problem persists for a longer time the
+            service will have a less recent view of the datacenter.
+
+      - alert: CortexCinderSyncObjectsTooHigh
+        expr: cortex_sync_objects{component=~"cortex-cinder-.*"} > 10000000
+        for: 5m
+        labels:
+          context: syncobjects
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Cortex is syncing unexpectedly many objects from `{{$labels.datasource}}`"
+          description: >
+            Cortex is syncing more than 10 million objects from a datasource. This
+            may happen when the datasource (OpenStack, Prometheus, etc.) returns
+            unexpectedly many objects, or when the database cannot drop old objects.
+            No immediate action is needed, but should this condition persist for a
+            longer time, the database may fill up and crash.
+
+      - alert: CortexCinderTooManyMQTTConnectionAttempts
+        expr: rate(cortex_mqtt_connection_attempts_total{component=~"cortex-cinder-.*"}[5m]) > 0.1
+        for: 1m
+        labels:
+          context: mqtt
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Cortex is trying to connect to MQTT too often"
+          description: >
+            Cortex is trying to connect to the MQTT broker too often. This may
+            happen when the broker is down or the connection parameters are
+            misconfigured.
+
+      - alert: CortexCinderTooManyDBConnectionAttempts
+        expr: rate(cortex_db_connection_attempts_total{component=~"cortex-cinder-.*"}[5m]) > 0.1
+        for: 5m
+        labels:
+          context: db
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Cortex is trying to connect to the database too often"
+          description: >
+            Cortex is trying to connect to the database too often. This may happen
+            when the database is down or the connection parameters are misconfigured.
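
A cheap way to catch typos in alert expressions like the ones above is to run them through the upstream PromQL parser before deploying the chart. The snippet below is an illustrative sketch, not part of this commit; it hard-codes two of the expressions, whereas a real test would load them from the alerts YAML.

package main

import (
	"fmt"

	"github.com/prometheus/prometheus/promql/parser"
)

func main() {
	// Two of the alert expressions defined above; a real test would read
	// every expr from the alerts file instead of hard-coding them.
	exprs := []string{
		`up{component="cortex-cinder-scheduler"} != 1 or absent(up{component="cortex-cinder-scheduler"})`,
		`rate(cortex_db_connection_attempts_total{component=~"cortex-cinder-.*"}[5m]) > 0.1`,
	}
	for _, e := range exprs {
		if _, err := parser.ParseExpr(e); err != nil {
			fmt.Printf("invalid PromQL %q: %v\n", e, err)
			continue
		}
		fmt.Printf("ok: %s\n", e)
	}
}
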
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+# Copyright 2025 SAP SE
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.alerts.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: cortex-cinder-alerts
+  labels:
+    type: alerting-rules
+    prometheus: {{ required ".Values.alerts.prometheus missing" .Values.alerts.prometheus | quote }}
+spec:
+  {{- $files := .Files.Glob "alerts/*.alerts.yaml" }}
+  {{- range $path, $file := $files }}
+  {{ $file | toString | nindent 2 }}
+  {{- end }}
+{{- end }}
