Commit 91f4380

Add cortex-cinder scheduler (#248)
Co-authored-by: Philipp Matthes <[email protected]>
Parent: b408996

File tree: 15 files changed (+1007 / -3 lines)

Tiltfile

Lines changed: 17 additions & 1 deletion
@@ -67,7 +67,7 @@ docker_build('ghcr.io/cobaltcore-dev/cortex-postgres', 'postgres')
 # Package the lib charts locally and sync them to the bundle charts. In this way
 # we can bump the lib charts locally and test them before pushing them to the OCI registry.
 lib_charts = ['cortex-core', 'cortex-postgres', 'cortex-mqtt']
-bundle_charts = ['cortex-nova', 'cortex-manila']
+bundle_charts = ['cortex-nova', 'cortex-manila', 'cortex-cinder']
 for lib_chart in lib_charts:
     watch_file('helm/library/' + lib_chart) # React to lib chart changes.
     local('sh helm/sync.sh helm/library/' + lib_chart)
@@ -81,6 +81,7 @@ for lib_chart in lib_charts:
         # Make sure the gen_tgz is removed from the local directory.
         local('rm -f ' + gen_tgz)
     else:
+        local('mkdir -p helm/bundles/' + bundle_chart + '/charts/')
         local('mv -f ' + gen_tgz + ' helm/bundles/' + bundle_chart + '/charts/')
 # Ensure the bundle charts are up to date.
 for bundle_chart in bundle_charts:
@@ -89,6 +90,7 @@ for bundle_chart in bundle_charts:
 # Deploy the Cortex bundles.
 k8s_yaml(helm('./helm/bundles/cortex-nova', name='cortex-nova', values=[tilt_values]))
 k8s_yaml(helm('./helm/bundles/cortex-manila', name='cortex-manila', values=[tilt_values]))
+k8s_yaml(helm('./helm/bundles/cortex-cinder', name='cortex-cinder', values=[tilt_values]))

 # Note: place resources higher in this list to ensure their local port stays the same.
 # Elements placed lower in the list will have their local port shifted by elements inserted above.
@@ -98,6 +100,7 @@ resources = [
         [
             'cortex-nova-mqtt',
             'cortex-manila-mqtt',
+            'cortex-cinder-mqtt',
         ],
         [(1883, 'tcp'), (15675, 'ws')],
     ),
@@ -106,6 +109,7 @@ resources = [
         [
             'cortex-nova-postgresql',
             'cortex-manila-postgresql',
+            'cortex-cinder-postgresql',
         ],
         [(5432, 'psql')],
     ),
@@ -134,6 +138,18 @@ resources = [
         ],
         [(2112, 'metrics'), (8080, 'api')],
     ),
+    (
+        'Cortex-Cinder',
+        [
+            'cortex-cinder-migrations',
+            'cortex-cinder-cli',
+            'cortex-cinder-syncer',
+            'cortex-cinder-extractor',
+            'cortex-cinder-kpis',
+            'cortex-cinder-scheduler',
+        ],
+        [(2112, 'metrics'), (8080, 'api')],
+    ),
 ]
 local_port = 8000
 for label, components, service_ports in resources:
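
Adding the cortex-cinder entries to the MQTT and PostgreSQL groups shifts the local ports of every resource listed below them, which is what the ordering note above refers to. The Tiltfile loop body that assigns local ports is not part of this diff, so the following Go snippet is only a rough sketch of the idea, assuming each forwarded service port consumes the next local port starting at 8000.

package main

import "fmt"

// Rough sketch of the sequential port assignment described by the Tiltfile
// comment (the real loop body is not shown in this diff): each forwarded
// service port takes the next local port, so inserting an entry shifts the
// local ports of everything listed after it.
func main() {
	type resource struct {
		label        string
		servicePorts []int // remote ports to forward
	}
	resources := []resource{
		{"MQTT", []int{1883, 15675}},
		{"PostgreSQL", []int{5432}},
		{"Cortex-Nova", []int{2112, 8080}},
		{"Cortex-Manila", []int{2112, 8080}},
		{"Cortex-Cinder", []int{2112, 8080}}, // newly appended entry
	}
	localPort := 8000
	for _, r := range resources {
		for _, sp := range r.servicePorts {
			fmt.Printf("%s: localhost:%d -> %d\n", r.label, localPort, sp)
			localPort++
		}
	}
}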

commands/checks/checks.go

Lines changed: 2 additions & 0 deletions
@@ -7,6 +7,7 @@ import (
 	"context"
 	"log/slog"

+	"github.com/cobaltcore-dev/cortex/commands/checks/cinder"
 	"github.com/cobaltcore-dev/cortex/commands/checks/manila"
 	"github.com/cobaltcore-dev/cortex/commands/checks/nova"
 	"github.com/cobaltcore-dev/cortex/internal/conf"
@@ -15,6 +16,7 @@ import (
 var checks = map[string]func(context.Context, conf.Config){
 	"nova":   nova.RunChecks,
 	"manila": manila.RunChecks,
+	"cinder": cinder.RunChecks,
 }

 // Run all checks.
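
For context, the checks command keeps a plain name-to-function registry, and wiring up Cinder is a single new map entry. Below is a minimal, self-contained sketch of that pattern; the Config placeholder and the run helper are stand-ins invented for illustration, since conf.Config and the dispatch code are not shown in this diff.

package main

import (
	"context"
	"fmt"
)

// Placeholder for conf.Config, whose definition is not part of this diff.
type Config struct{}

// Registry of named check suites, mirroring the map in commands/checks/checks.go.
var checks = map[string]func(context.Context, Config){
	"nova":   func(ctx context.Context, c Config) { fmt.Println("running nova checks") },
	"manila": func(ctx context.Context, c Config) { fmt.Println("running manila checks") },
	"cinder": func(ctx context.Context, c Config) { fmt.Println("running cinder checks") },
}

// run executes the suite selected by name, or all suites when name is empty.
// The real dispatch shape may differ; this only illustrates the registry idea.
func run(ctx context.Context, name string, c Config) {
	if name == "" {
		for n, check := range checks {
			fmt.Println("suite:", n)
			check(ctx, c)
		}
		return
	}
	if check, ok := checks[name]; ok {
		check(ctx, c)
		return
	}
	fmt.Println("unknown check suite:", name)
}

func main() {
	run(context.Background(), "cinder", Config{})
}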

commands/checks/cinder/checks.go

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+// Copyright 2025 SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package cinder
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"io"
+	"log/slog"
+	"net/http"
+	"strconv"
+
+	"github.com/cobaltcore-dev/cortex/internal/conf"
+	"github.com/cobaltcore-dev/cortex/internal/scheduler/cinder/api"
+	"github.com/sapcc/go-bits/must"
+)
+
+// Run all checks.
+func RunChecks(ctx context.Context, config conf.Config) {
+	checkCinderSchedulerReturnsValidHosts(ctx, config)
+}
+
+// Check that the Cinder external scheduler returns a valid set of volume hosts.
+func checkCinderSchedulerReturnsValidHosts(ctx context.Context, config conf.Config) {
+	// TODO ADD THIS CHECK
+
+	//
+
+	request := api.ExternalSchedulerRequest{
+		Hosts:     []api.ExternalSchedulerHost{},
+		Weights:   map[string]float64{},
+		Sandboxed: true,
+	}
+	port := strconv.Itoa(config.GetAPIConfig().Port)
+	apiURL := "http://cortex-cinder-scheduler:" + port + "/scheduler/cinder/external"
+	slog.Info("sending request to external scheduler", "apiURL", apiURL)
+
+	requestBody := must.Return(json.Marshal(request))
+	buf := bytes.NewBuffer(requestBody)
+	req := must.Return(http.NewRequestWithContext(ctx, http.MethodPost, apiURL, buf))
+	req.Header.Set("Content-Type", "application/json")
+	//nolint:bodyclose // We don't care about the body here.
+	respRaw := must.Return(http.DefaultClient.Do(req))
+	defer respRaw.Body.Close()
+	if respRaw.StatusCode != http.StatusOK {
+		// Log the response body for debugging
+		bodyBytes := must.Return(io.ReadAll(respRaw.Body))
+		slog.Error("external scheduler API returned non-200 status code",
+			"statusCode", respRaw.StatusCode,
+			"responseBody", string(bodyBytes),
+		)
+		panic("external scheduler API returned non-200 status code")
+	}
+	var resp api.ExternalSchedulerResponse
+	must.Succeed(json.NewDecoder(respRaw.Body).Decode(&resp))
+	if len(resp.Hosts) == 0 {
+		panic("no volume hosts found in response")
+	}
+	slog.Info("check successful, got volume hosts", "count", len(resp.Hosts))
+}
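
Because the check simply POSTs a JSON request and expects a 200 response containing at least one host, the same round trip can be exercised against a stubbed endpoint without a running cortex-cinder-scheduler. The sketch below uses only the standard library; the request and response structs are local stand-ins, since the actual field names and JSON tags of api.ExternalSchedulerRequest and api.ExternalSchedulerResponse are not part of this diff.

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
	"net/http/httptest"
)

// Local stand-ins for the api.ExternalScheduler{Request,Response} types; the
// real definitions live in internal/scheduler/cinder/api and are not shown here.
type schedulerRequest struct {
	Hosts     []string           `json:"hosts"`
	Weights   map[string]float64 `json:"weights"`
	Sandboxed bool               `json:"sandboxed"`
}

type schedulerResponse struct {
	Hosts []string `json:"hosts"`
}

func main() {
	// Stub the external scheduler endpoint so the request/decode round trip
	// can be exercised without a deployed scheduler.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		json.NewEncoder(w).Encode(schedulerResponse{Hosts: []string{"backend-a@lvm#pool1"}})
	}))
	defer srv.Close()

	body, err := json.Marshal(schedulerRequest{Sandboxed: true, Weights: map[string]float64{}})
	if err != nil {
		panic(err)
	}
	resp, err := http.Post(srv.URL+"/scheduler/cinder/external", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var out schedulerResponse
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	if len(out.Hosts) == 0 {
		panic("no volume hosts found in response")
	}
	fmt.Println("check successful, got volume hosts:", len(out.Hosts))
}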

helm/bundles/cortex-cinder/Chart.yaml

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+# Copyright 2025 SAP SE
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v3
+name: cortex-cinder
+description: A Helm chart deploying Cortex for Cinder.
+type: application
+version: 0.0.7
+appVersion: 0.1.0
+dependencies:
+  - name: cortex-core
+    repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
+    version: 0.24.6
+  - name: cortex-postgres
+    repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
+    version: 0.5.3
+  - name: cortex-mqtt
+    repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
+    version: 0.1.0
+  # Owner info adds a configmap to the kubernetes cluster with information on
+  # the service owner. This makes it easier to find out who to contact in case
+  # of issues. See: https://github.com/sapcc/helm-charts/pkgs/container/helm-charts%2Fowner-info
+  - name: owner-info
+    repository: oci://ghcr.io/sapcc/helm-charts
+    version: 1.0.0
Lines changed: 215 additions & 0 deletions
@@ -0,0 +1,215 @@
+groups:
+  - name: cortex-cinder-alerts
+    rules:
+      - alert: CortexCinderInitialPlacementDown
+        expr: |
+          up{component="cortex-cinder-scheduler"} != 1 or
+          absent(up{component="cortex-cinder-scheduler"})
+        for: 1m
+        labels:
+          context: liveness
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+          playbook: docs/support/playbook/cortex/down
+        annotations:
+          summary: "Cortex initial placement for Cinder is down"
+          description: >
+            The Cortex initial placement is down. Initial placement requests from Cinder will
+            not be served. This is no immediate problem, since Cinder will continue
+            placing new volumes. However, the placement will be less desirable.
+
+      - alert: CortexCinderSyncerDown
+        expr: |
+          up{component="cortex-cinder-syncer"} != 1 or
+          absent(up{component="cortex-cinder-syncer"})
+        for: 1m
+        labels:
+          context: liveness
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Cortex syncer is down"
+          description: >
+            The Cortex syncer is down. Cortex requires somewhat recent data from
+            its datasources (OpenStack, Prometheus, etc.) to make accurate
+            scheduling decisions. If this issue persists for a longer time, the
+            database will slowly drift away from the actual state of the
+            datacenter, which may lead to less desirable placement decisions.
+            This is no immediate problem, since Cinder will continue placing new
+            volumes.
+
+      - alert: CortexCinderExtractorDown
+        expr: |
+          up{component="cortex-cinder-extractor"} != 1 or
+          absent(up{component="cortex-cinder-extractor"})
+        for: 1m
+        labels:
+          context: liveness
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Cortex extractor is down"
+          description: >
+            The Cortex extractor is down. This means that newly available data
+            about the datacenter will not be used to extract scheduling knowledge.
+            This is no immediate problem, since Cinder will continue placing new
+            volumes. However, the placement will be less desirable.
+
+      - alert: CortexCinderHttpRequest400sTooHigh
+        expr: rate(cortex_scheduler_api_request_duration_seconds_count{component="cortex-cinder-scheduler",status=~"4.+"}[5m]) > 0.1
+        for: 5m
+        labels:
+          context: api
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "HTTP request 400 errors too high"
+          description: >
+            Cortex is responding to Cinder initial placement requests with HTTP 4xx
+            errors. This is expected when the scheduling request cannot be served
+            by Cortex. However, it could also indicate that the Cinder request
+            format has changed and Cortex is unable to parse it.
+
+      - alert: CortexCinderHttpRequest500sTooHigh
+        expr: rate(cortex_scheduler_api_request_duration_seconds_count{component="cortex-cinder-scheduler",status=~"5.+"}[5m]) > 0.1
+        for: 5m
+        labels:
+          context: api
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "HTTP request 500 errors too high"
+          description: >
+            Cortex is responding to Cinder initial placement requests with HTTP 5xx
+            errors. This is not expected and indicates that Cortex is having some
+            internal problem. Cinder will continue to place new volumes, but the
+            placement will be less desirable. Thus, no immediate action is needed.
+
+      - alert: CortexCinderHighMemoryUsage
+        expr: process_resident_memory_bytes{component=~"cortex-cinder-.*"} > 6000 * 1024 * 1024
+        for: 5m
+        labels:
+          context: memory
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Cortex `{{$labels.component}}` uses too much memory"
+          description: >
+            Cortex should not be using more than 6000 MiB of memory. Usually it
+            should use much less, so there may be a memory leak or other changes
+            that are causing the memory usage to increase significantly.
+
+      - alert: CortexCinderHighCPUUsage
+        expr: rate(process_cpu_seconds_total{component=~"cortex-cinder-.*"}[1m]) > 0.5
+        for: 5m
+        labels:
+          context: cpu
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Cortex `{{$labels.component}}` uses too much CPU"
+          description: >
+            Cortex should not be using more than 50% of a single CPU core. Usually
+            it should use much less, so there may be a CPU leak or other changes
+            that are causing the CPU usage to increase significantly.
+
+      - alert: CortexCinderSyncNotSuccessful
+        expr: cortex_sync_request_processed_total{component=~"cortex-cinder-.*"} - cortex_sync_request_duration_seconds_count{component=~"cortex-cinder-.*"} > 0
+        for: 5m
+        labels:
+          context: syncstatus
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Sync not successful"
+          description: >
+            Cortex experienced an issue syncing data from a datasource. This may
+            happen when the datasource (OpenStack, Prometheus, etc.) is down or
+            the sync module is misconfigured. No immediate action is needed, since
+            the sync module will retry the sync operation and the currently synced
+            data will be kept. However, when this problem persists for a longer
+            time the service will have a less recent view of the datacenter.
+
+      - alert: CortexCinderSyncObjectsDroppedToZero
+        expr: cortex_sync_objects{component=~"cortex-cinder-.*",datasource!="openstack_migrations"} == 0
+        for: 60m
+        labels:
+          context: syncobjects
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Cortex is not syncing any new data from `{{$labels.datasource}}`"
+          description: >
+            Cortex is not syncing any objects from a datasource. This may happen
+            when the datasource (OpenStack, Prometheus, etc.) is down or the sync
+            module is misconfigured. No immediate action is needed, since the sync
+            module will retry the sync operation and the currently synced data will
+            be kept. However, when this problem persists for a longer time the
+            service will have a less recent view of the datacenter.
+
+      - alert: CortexCinderSyncObjectsTooHigh
+        expr: cortex_sync_objects{component=~"cortex-cinder-.*"} > 10000000
+        for: 5m
+        labels:
+          context: syncobjects
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Cortex is syncing unexpectedly many objects from `{{$labels.datasource}}`"
+          description: >
+            Cortex is syncing more than 10 million objects from a datasource. This
+            may happen when the datasource (OpenStack, Prometheus, etc.) returns
+            unexpectedly many objects, or when the database cannot drop old objects.
+            No immediate action is needed, but should this condition persist for a
+            longer time, the database may fill up and crash.
+
+      - alert: CortexCinderTooManyMQTTConnectionAttempts
+        expr: rate(cortex_mqtt_connection_attempts_total{component=~"cortex-cinder-.*"}[5m]) > 0.1
+        for: 1m
+        labels:
+          context: mqtt
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Cortex is trying to connect to MQTT too often"
+          description: >
+            Cortex is trying to connect to the MQTT broker too often. This may
+            happen when the broker is down or the connection parameters are
+            misconfigured.
+
+      - alert: CortexCinderTooManyDBConnectionAttempts
+        expr: rate(cortex_db_connection_attempts_total{component=~"cortex-cinder-.*"}[5m]) > 0.1
+        for: 5m
+        labels:
+          context: db
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Cortex is trying to connect to the database too often"
+          description: >
+            Cortex is trying to connect to the database too often. This may happen
+            when the database is down or the connection parameters are misconfigured.
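
A cheap way to catch typos in alert expressions like the ones above is to run them through the upstream PromQL parser before deploying the chart. The snippet below is an illustrative sketch, not part of this commit; it hard-codes two of the expressions, whereas a real test would load them from the alerts YAML.

package main

import (
	"fmt"

	"github.com/prometheus/prometheus/promql/parser"
)

func main() {
	// Two of the alert expressions defined above; a real test would read
	// every expr from the alerts file instead of hard-coding them.
	exprs := []string{
		`up{component="cortex-cinder-scheduler"} != 1 or absent(up{component="cortex-cinder-scheduler"})`,
		`rate(cortex_db_connection_attempts_total{component=~"cortex-cinder-.*"}[5m]) > 0.1`,
	}
	for _, e := range exprs {
		if _, err := parser.ParseExpr(e); err != nil {
			fmt.Printf("invalid PromQL %q: %v\n", e, err)
			continue
		}
		fmt.Printf("ok: %s\n", e)
	}
}
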
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+# Copyright 2025 SAP SE
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.alerts.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: cortex-cinder-alerts
+  labels:
+    type: alerting-rules
+    prometheus: {{ required ".Values.alerts.prometheus missing" .Values.alerts.prometheus | quote }}
+spec:
+  {{- $files := .Files.Glob "alerts/*.alerts.yaml" }}
+  {{- range $path, $file := $files }}
+  {{ $file | toString | nindent 2 }}
+  {{- end }}
+{{- end }}
