Skip to content

Commit 702591b

Browse files
committed
Merge branch 'master' into hotfix
2 parents d902c0a + c562746 commit 702591b

File tree

17 files changed

+195
-120
lines changed

17 files changed

+195
-120
lines changed

.gitignore

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,23 @@
11
/cc-backend
2+
/.env
3+
/config.json
24

35
/var/job-archive
4-
/var/*.db
56
/var/machine-state
6-
7-
/.env
8-
/config.json
7+
/var/job.db-shm
8+
/var/job.db-wal
9+
/var/*.db
10+
/var/*.txt
911

1012
/web/frontend/public/build
1113
/web/frontend/node_modules
12-
/.vscode/*
14+
1315
/archive-migration
1416
/archive-manager
15-
var/job.db-shm
16-
var/job.db-wal
1717

18+
/internal/repository/testdata/job.db-shm
19+
/internal/repository/testdata/job.db-wal
20+
21+
/.vscode/*
1822
dist/
1923
*.db
20-
internal/repository/testdata/job.db-shm
21-
internal/repository/testdata/job.db-wal

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ tags:
8282
@ctags -R
8383

8484
$(VAR):
85-
@mkdir $(VAR)
85+
@mkdir -p $(VAR)
8686

8787
config.json:
8888
$(info ===> Initialize config.json file)

internal/archiver/archiver.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,13 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
6060
max = math.Max(max, series.Statistics.Max)
6161
}
6262

63+
// Round AVG Result to 2 Digits
6364
jobMeta.Statistics[metric] = schema.JobStatistics{
6465
Unit: schema.Unit{
6566
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
6667
Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
6768
},
68-
Avg: avg / float64(job.NumNodes),
69+
Avg: (math.Round((avg/float64(job.NumNodes))*100) / 100),
6970
Min: min,
7071
Max: max,
7172
}

internal/graph/schema.resolvers.go

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

internal/metricDataDispatcher/dataLoader.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,9 @@ func LoadData(job *schema.Job,
170170
jd.AddNodeScope("mem_bw")
171171
}
172172

173+
// Round Resulting Stat Values
174+
jd.RoundMetricStats()
175+
173176
return jd, ttl, size
174177
})
175178

internal/metricdata/cc-metric-store.go

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,23 @@ func (ccms *CCMetricStore) buildQueries(
440440
continue
441441
}
442442

443+
// Core -> Socket
444+
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeSocket {
445+
sockets, _ := topology.GetSocketsFromCores(hwthreads)
446+
for _, socket := range sockets {
447+
queries = append(queries, ApiQuery{
448+
Metric: remoteName,
449+
Hostname: host.Hostname,
450+
Aggregate: true,
451+
Type: &coreString,
452+
TypeIds: intToStringSlice(topology.Socket[socket]),
453+
Resolution: resolution,
454+
})
455+
assignedScope = append(assignedScope, scope)
456+
}
457+
continue
458+
}
459+
443460
// Core -> Node
444461
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode {
445462
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
@@ -627,7 +644,7 @@ func (ccms *CCMetricStore) LoadNodeData(
627644
req.Queries = append(req.Queries, ApiQuery{
628645
Hostname: node,
629646
Metric: ccms.toRemoteName(metric),
630-
Resolution: 60, // Default for Node Queries
647+
Resolution: 0, // Default for Node Queries: Will return metric $Timestep Resolution
631648
})
632649
}
633650
}
@@ -1038,6 +1055,23 @@ func (ccms *CCMetricStore) buildNodeQueries(
10381055
continue
10391056
}
10401057

1058+
// Core -> Socket
1059+
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeSocket {
1060+
sockets, _ := topology.GetSocketsFromCores(topology.Node)
1061+
for _, socket := range sockets {
1062+
queries = append(queries, ApiQuery{
1063+
Metric: remoteName,
1064+
Hostname: hostname,
1065+
Aggregate: true,
1066+
Type: &coreString,
1067+
TypeIds: intToStringSlice(topology.Socket[socket]),
1068+
Resolution: resolution,
1069+
})
1070+
assignedScope = append(assignedScope, scope)
1071+
}
1072+
continue
1073+
}
1074+
10411075
// Core -> Node
10421076
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode {
10431077
cores, _ := topology.GetCoresFromHWThreads(topology.Node)

internal/repository/job.go

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -217,11 +217,6 @@ func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err er
217217

218218
func (r *JobRepository) FetchFootprint(job *schema.Job) (map[string]float64, error) {
219219
start := time.Now()
220-
cachekey := fmt.Sprintf("footprint:%d", job.ID)
221-
if cached := r.cache.Get(cachekey, nil); cached != nil {
222-
job.Footprint = cached.(map[string]float64)
223-
return job.Footprint, nil
224-
}
225220

226221
if err := sq.Select("job.footprint").From("job").Where("job.id = ?", job.ID).
227222
RunWith(r.stmtCache).QueryRow().Scan(&job.RawFootprint); err != nil {
@@ -238,7 +233,6 @@ func (r *JobRepository) FetchFootprint(job *schema.Job) (map[string]float64, err
238233
return nil, err
239234
}
240235

241-
r.cache.Put(cachekey, job.Footprint, len(job.Footprint), 24*time.Hour)
242236
log.Debugf("Timer FetchFootprint %s", time.Since(start))
243237
return job.Footprint, nil
244238
}
@@ -606,8 +600,11 @@ func (r *JobRepository) UpdateEnergy(
606600
// FIXME: Needs sum as stats type
607601
} else if sc.MetricConfig[i].Energy == "power" { // this metric has power as unit (Watt)
608602
// Energy: Power (in Watts) * Time (in Seconds)
609-
// Unit: ( W * s ) / 3600 / 1000 = kWh ; Rounded to 2 nearest digits
610-
energy = math.Round(((LoadJobStat(jobMeta, fp, "avg")*float64(jobMeta.Duration))/3600/1000)*100) / 100
603+
// Unit: (( W * s ) / 3600) / 1000 = kWh ; rounded to 2 decimal places: round(Energy * 100) / 100
604+
// Here: All-Node Metric Average * Number of Nodes * Job Runtime
605+
// Note: Shared jobs are handled correctly, since the "Node Average" is based on partial resources while the "numNodes" factor is 1
606+
metricNodeSum := LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.NumNodes) * float64(jobMeta.Duration)
607+
energy = math.Round(((metricNodeSum/3600)/1000)*100) / 100
611608
}
612609
} else {
613610
log.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, jobMeta.ID)

internal/taskManager/updateFootprintService.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ func RegisterFootprintWorker() {
9494
}
9595
}
9696

97-
// Add values rounded to 2 digits
97+
// Add values rounded to 2 digits: repo.LoadStats may return unrounded values
9898
jobMeta.Statistics[metric] = schema.JobStatistics{
9999
Unit: schema.Unit{
100100
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,

pkg/schema/cluster.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,38 @@ func (topo *Topology) GetSocketsFromHWThreads(
122122
return sockets, exclusive
123123
}
124124

125+
// Return a list of socket IDs given a list of core IDs. Even if just one
126+
// core is in that socket, add it to the list. If no cores other than
127+
// those in the argument list are assigned to one of the sockets in the first
128+
// return value, return true as the second value. TODO: Optimize this, there
129+
// must be a more efficient way/algorithm.
130+
func (topo *Topology) GetSocketsFromCores (
131+
cores []int,
132+
) (sockets []int, exclusive bool) {
133+
socketsMap := map[int]int{}
134+
for _, core := range cores {
135+
for _, hwthreadInCore := range topo.Core[core] {
136+
for socket, hwthreadsInSocket := range topo.Socket {
137+
for _, hwthreadInSocket := range hwthreadsInSocket {
138+
if hwthreadInCore == hwthreadInSocket {
139+
socketsMap[socket] += 1
140+
}
141+
}
142+
}
143+
}
144+
}
145+
146+
exclusive = true
147+
hwthreadsPerSocket := len(topo.Node) / len(topo.Socket)
148+
sockets = make([]int, 0, len(socketsMap))
149+
for socket, count := range socketsMap {
150+
sockets = append(sockets, socket)
151+
exclusive = exclusive && count == hwthreadsPerSocket
152+
}
153+
154+
return sockets, exclusive
155+
}
156+
125157
// Return a list of core IDs given a list of hwthread IDs. Even if just one
126158
// hwthread is in that core, add it to the list. If no hwthreads other than
127159
// those in the argument list are assigned to one of the cores in the first

pkg/schema/metrics.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,21 @@ func (jd *JobData) AddNodeScope(metric string) bool {
291291
return true
292292
}
293293

294+
func (jd *JobData) RoundMetricStats() {
295+
// TODO: Make Digit-Precision Configurable? (Currently: Fixed to 2 Digits)
296+
for _, scopes := range *jd {
297+
for _, jm := range scopes {
298+
for index := range jm.Series {
299+
jm.Series[index].Statistics = MetricStatistics{
300+
Avg: (math.Round(jm.Series[index].Statistics.Avg*100) / 100),
301+
Min: (math.Round(jm.Series[index].Statistics.Min*100) / 100),
302+
Max: (math.Round(jm.Series[index].Statistics.Max*100) / 100),
303+
}
304+
}
305+
}
306+
}
307+
}
308+
294309
func (jm *JobMetric) AddPercentiles(ps []int) bool {
295310
if jm.StatisticsSeries == nil {
296311
jm.AddStatisticsSeries()

pkg/schema/schemas/config.schema.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -446,7 +446,7 @@
446446
}
447447
},
448448
"job_view_selectedMetrics": {
449-
"description": "",
449+
"description": "Initial metrics shown as plots in single job view",
450450
"type": "array",
451451
"items": {
452452
"type": "string",

web/frontend/src/Job.root.svelte

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -117,27 +117,41 @@
117117
}
118118
`;
119119
120+
const roofQuery = gql`
121+
query ($dbid: ID!, $selectedMetrics: [String!]!, $selectedScopes: [MetricScope!]!, $selectedResolution: Int) {
122+
jobMetrics(id: $dbid, metrics: $selectedMetrics, scopes: $selectedScopes, resolution: $selectedResolution) {
123+
name
124+
scope
125+
metric {
126+
series {
127+
data
128+
}
129+
}
130+
}
131+
}
132+
`;
133+
120134
$: jobMetrics = queryStore({
121135
client: client,
122136
query: query,
123137
variables: { dbid, selectedMetrics, selectedScopes },
124138
});
125139
140+
// Roofline: Always load roofMetrics with configured timestep (Resolution: 0)
141+
$: roofMetrics = queryStore({
142+
client: client,
143+
query: roofQuery,
144+
variables: { dbid, selectedMetrics: ["flops_any", "mem_bw"], selectedScopes: ["node"], selectedResolution: 0 },
145+
});
146+
126147
// Handle Job Query on Init -> is not executed anymore
127148
getContext("on-init")(() => {
128149
let job = $initq.data.job;
129150
if (!job) return;
130151
131152
const pendingMetrics = [
132-
"flops_any",
133-
"mem_bw",
134153
...(ccconfig[`job_view_selectedMetrics:${job.cluster}`] ||
135-
$initq.data.globalMetrics.reduce((names, gm) => {
136-
if (gm.availability.find((av) => av.cluster === job.cluster)) {
137-
names.push(gm.name);
138-
}
139-
return names;
140-
}, [])
154+
ccconfig[`job_view_selectedMetrics`]
141155
),
142156
...(ccconfig[`job_view_nodestats_selectedMetrics:${job.cluster}`] ||
143157
ccconfig[`job_view_nodestats_selectedMetrics`]
@@ -276,12 +290,12 @@
276290
277291
<!-- Column 3: Job Roofline; If footprint Enabled: full width, else half width -->
278292
<Col xs={12} md={12} xl={5} xxl={6}>
279-
{#if $initq.error || $jobMetrics.error}
293+
{#if $initq.error || $roofMetrics.error}
280294
<Card body color="danger">
281295
<p>Initq Error: {$initq.error?.message}</p>
282-
<p>jobMetrics Error: {$jobMetrics.error?.message}</p>
296+
<p>roofMetrics (jobMetrics) Error: {$roofMetrics.error?.message}</p>
283297
</Card>
284-
{:else if $initq?.data && $jobMetrics?.data}
298+
{:else if $initq?.data && $roofMetrics?.data}
285299
<Card style="height: 400px;">
286300
<div bind:clientWidth={roofWidth}>
287301
<Roofline
@@ -292,10 +306,10 @@
292306
.find((c) => c.name == $initq.data.job.cluster)
293307
.subClusters.find((sc) => sc.name == $initq.data.job.subCluster)}
294308
data={transformDataForRoofline(
295-
$jobMetrics.data?.jobMetrics?.find(
309+
$roofMetrics.data?.jobMetrics?.find(
296310
(m) => m.name == "flops_any" && m.scope == "node",
297311
)?.metric,
298-
$jobMetrics.data?.jobMetrics?.find(
312+
$roofMetrics.data?.jobMetrics?.find(
299313
(m) => m.name == "mem_bw" && m.scope == "node",
300314
)?.metric,
301315
)}

web/frontend/src/Status.root.svelte

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@
8080
: ccconfig.user_view_histogramMetrics || [];
8181
8282
const client = getContextClient();
83+
// Note: nodeMetrics are requested on configured $timestep resolution
8384
$: mainQuery = queryStore({
8485
client: client,
8586
query: gql`

web/frontend/src/Systems.root.svelte

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@
7777
for (let sm of systemMetrics) {
7878
systemUnits[sm.name] = (sm?.unit?.prefix ? sm.unit.prefix : "") + (sm?.unit?.base ? sm.unit.base : "")
7979
}
80+
if (!selectedMetric) selectedMetric = systemMetrics[0].name
8081
}
8182
8283
$: loadMetrics($initialized)

0 commit comments

Comments
 (0)