Skip to content

Commit 5765565

Browse files
committed
Improve CPU stats and add average heartbeat count before reset/crash
1 parent 717b0d7 commit 5765565

File tree

3 files changed

+186
-85
lines changed

3 files changed

+186
-85
lines changed

src/main.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,8 @@ int main(int argc, char *argv[])
352352
// Update resource usage stats (1 min)
353353
if((uptime % 60) == 0 && process_is_running(i))
354354
{
355-
stats_update_resource_usage(i, get_app_pid(i));
355+
stats_update_cpu_usage(i, get_app_pid(i));
356+
stats_update_memory_usage(i, get_app_pid(i));
356357
}
357358

358359
// Update stats files periodically (15 mins)

src/stats.c

Lines changed: 174 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -50,11 +50,12 @@ typedef struct
5050
size_t heartbeat_count; /**< Number of heartbeats received. */
5151
size_t heartbeat_count_old; /**< Number of old heartbeats received. */
5252
size_t heartbeat_reset_count; /**< Number of restarts due to late heartbeats. */
53+
size_t avg_heartbeat_count_old; /**< Average heartbeat count before crashes/resets. */
5354
// CPU and Memory usage statistics
54-
double current_cpu_percent; /**< Current CPU usage percentage. */
55-
double max_cpu_percent; /**< Maximum CPU usage percentage. */
56-
double min_cpu_percent; /**< Minimum CPU usage percentage. */
57-
double avg_cpu_percent; /**< Average CPU usage percentage. */
55+
float current_cpu_percent; /**< Current CPU usage percentage. */
56+
float max_cpu_percent; /**< Maximum CPU usage percentage. */
57+
float min_cpu_percent; /**< Minimum CPU usage percentage. */
58+
float avg_cpu_percent; /**< Average CPU usage percentage. */
5859
size_t current_memory_kb; /**< Current memory usage in KB. */
5960
size_t max_memory_kb; /**< Maximum memory usage in KB. */
6061
size_t min_memory_kb; /**< Minimum memory usage in KB. */
@@ -65,77 +66,136 @@ typedef struct
6566

6667
static Statistic_t stats[MAX_APPS]; // statistics for the apps
6768

69+
// Per-application baseline used by get_process_cpu_usage() to turn two
// successive samples of a process's CPU time into a usage percentage.
typedef struct
70+
{
71+
// Sum of utime + stime + cutime + cstime (clock ticks) stored at the previous sample.
unsigned long long prev_process_time;
72+
// CLOCK_MONOTONIC timestamp stored together with prev_process_time.
struct timespec prev_ts;
73+
// Set to 1 once a first baseline has been stored; 0 means no previous sample yet.
int initialized;
74+
} CpuState_t;
75+
76+
static CpuState_t cpustates[MAX_APPS] = {0}; // CPU sampling baselines, one per app index (zeroed = not initialized)
77+
6878
/**
69-
@brief Reads CPU usage percentage for a specific process from /proc/[pid]/stat
70-
@param pid Process ID
79+
@brief Reads instantaneous CPU usage percentage for a specific process.
80+
Works regardless of sampling interval. Can exceed 100% on multicore systems.
81+
@param index Index into cpustates[] array
82+
@param pid Process ID
7183
@return CPU usage percentage, or -1.0 on error
7284
*/
73-
static double get_process_cpu_usage(int pid)
85+
static float get_process_cpu_usage(int index, int pid)
7486
{
75-
static unsigned long long prev_total_time = 0;
76-
static unsigned long long prev_process_time = 0;
77-
static int prev_pid = -1;
78-
char stat_path[64];
87+
char stat_path[48];
88+
char buf[512];
7989
snprintf(stat_path, sizeof(stat_path), "/proc/%d/stat", pid);
8090
FILE *fp = fopen(stat_path, "r");
8191

8292
if(!fp)
8393
{
84-
return -1.0;
94+
LOGE("Failed to open %s", stat_path);
95+
return -1.0f;
8596
}
8697

87-
unsigned long long utime, stime, cutime, cstime;
88-
89-
// Skip first 13 fields, then read utime, stime, cutime, cstime
90-
if(fscanf(fp, "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %llu %llu %llu %llu",
91-
&utime, &stime, &cutime, &cstime) != 4)
98+
if(!fgets(buf, sizeof(buf), fp))
9299
{
93100
fclose(fp);
94-
return -1.0;
101+
LOGE("Failed to read line from %s", stat_path);
102+
return -1.0f;
95103
}
96104

97105
fclose(fp);
98-
// Get system total time from /proc/stat
99-
fp = fopen("/proc/stat", "r");
106+
// find end of comm field (inside parentheses)
107+
char *paren = strrchr(buf, ')');
100108

101-
if(!fp)
109+
if(!paren)
102110
{
103-
return -1.0;
111+
LOGE("Failed to find closing parenthesis in stat line");
112+
return -1.0f;
104113
}
105114

106-
unsigned long long user, nice, system, idle, iowait, irq, softirq, steal;
115+
// tokenize remaining fields after ") "
116+
char *saveptr;
117+
char *token = strtok_r(paren + 2, " ", &saveptr);
118+
int field = 3; // already counted pid(1), comm(2), state(3)
119+
unsigned long long utime = 0, stime = 0, cutime = 0, cstime = 0;
107120

108-
if(fscanf(fp, "cpu %llu %llu %llu %llu %llu %llu %llu %llu",
109-
&user, &nice, &system, &idle, &iowait, &irq, &softirq, &steal) != 8)
121+
while(token)
110122
{
111-
fclose(fp);
112-
return -1.0;
123+
field++;
124+
125+
if(field == 14)
126+
{
127+
utime = strtoull(token, NULL, 10);
128+
}
129+
else if(field == 15)
130+
{
131+
stime = strtoull(token, NULL, 10);
132+
}
133+
else if(field == 16)
134+
{
135+
cutime = strtoull(token, NULL, 10);
136+
}
137+
else if(field == 17)
138+
{
139+
cstime = strtoull(token, NULL, 10);
140+
break;
141+
}
142+
143+
token = strtok_r(NULL, " ", &saveptr);
113144
}
114145

115-
fclose(fp);
116-
unsigned long long total_time = user + nice + system + idle + iowait + irq + softirq + steal;
117146
unsigned long long process_time = utime + stime + cutime + cstime;
147+
LOGD("PID=%d utime=%llu stime=%llu cutime=%llu cstime=%llu total_process_time=%llu",
148+
pid, utime, stime, cutime, cstime, process_time);
149+
long ticks_per_sec = sysconf(_SC_CLK_TCK);
150+
//long nprocs = sysconf(_SC_NPROCESSORS_ONLN);
151+
struct timespec now;
152+
clock_gettime(CLOCK_MONOTONIC, &now);
153+
CpuState_t *st = &cpustates[index];
154+
155+
/* if(nprocs <= 0)
156+
{
157+
nprocs = 1;
158+
}*/
118159

119-
// Calculate CPU percentage only if we have previous values and same PID
120-
if(prev_pid == pid && prev_total_time > 0)
160+
if(st->initialized)
121161
{
122-
unsigned long long total_diff = total_time - prev_total_time;
123-
unsigned long long process_diff = process_time - prev_process_time;
162+
float elapsed = (float)(now.tv_sec - st->prev_ts.tv_sec) +
163+
(float)(now.tv_nsec - st->prev_ts.tv_nsec) / 1e9f;
124164

125-
if(total_diff > 0)
165+
if(elapsed < 1e-6f)
126166
{
127-
double cpu_percent = (100.0 * process_diff) / total_diff;
128-
prev_total_time = total_time;
129-
prev_process_time = process_time;
130-
return cpu_percent;
167+
// avoid division by near-zero
168+
return -1.0f;
131169
}
170+
171+
unsigned long long diff = process_time - st->prev_process_time;
172+
173+
if(process_time < st->prev_process_time)
174+
{
175+
// counter wrapped or PID restarted, reset baseline
176+
st->prev_process_time = process_time;
177+
st->prev_ts = now;
178+
LOGD("Process time decreased, resetting baseline for PID=%d", pid);
179+
return -1.0f;
180+
}
181+
182+
float cpu_sec = (float)diff / (float)ticks_per_sec;
183+
LOGD("PID=%d diff=%llu ticks_per_sec=%ld cpu_sec=%.6f elapsed=%.6f",
184+
pid, diff, ticks_per_sec, cpu_sec, elapsed);
185+
st->prev_process_time = process_time;
186+
st->prev_ts = now;
187+
float percent = (cpu_sec / elapsed) * 100.0f;
188+
LOGD("PID=%d CPU usage=%.2f%% (can exceed 100%% on multicore)",
189+
pid, percent);
190+
return percent;
132191
}
133192

134-
// Store current values for next calculation
135-
prev_pid = pid;
136-
prev_total_time = total_time;
137-
prev_process_time = process_time;
138-
return 0.0; // First measurement, return 0
193+
// first call for this pid/index
194+
st->prev_process_time = process_time;
195+
st->prev_ts = now;
196+
st->initialized = 1;
197+
LOGD("First call for PID=%d, initializing baseline", pid);
198+
return -1.0f;
139199
}
140200

141201
/**
@@ -145,7 +205,7 @@ static double get_process_cpu_usage(int pid)
145205
*/
146206
static size_t get_process_memory_usage(int pid)
147207
{
148-
char status_path[64];
208+
char status_path[48];
149209
snprintf(status_path, sizeof(status_path), "/proc/%d/status", pid);
150210

151211
if(!f_exist(status_path))
@@ -182,6 +242,28 @@ static void clearHeartbeatCount(int index)
182242
stats[index].heartbeat_count = 0;
183243
}
184244

245+
static void updateHeartbeatCountAverage(int index)
246+
{
247+
// Update average heartbeat count old when process crashes or gets reset
248+
if(stats[index].heartbeat_count_old > 0)
249+
{
250+
size_t total_events = stats[index].crash_count + stats[index].heartbeat_reset_count;
251+
252+
if(total_events == 1)
253+
{
254+
// First crash/reset
255+
stats[index].avg_heartbeat_count_old = stats[index].heartbeat_count_old;
256+
}
257+
else if(total_events > 1)
258+
{
259+
// Update running average
260+
stats[index].avg_heartbeat_count_old =
261+
((stats[index].avg_heartbeat_count_old * (total_events - 1)) +
262+
stats[index].heartbeat_count_old) / total_events;
263+
}
264+
}
265+
}
266+
185267
void stats_started_at(int index)
186268
{
187269
stats[index].started_at = time(NULL);
@@ -193,13 +275,15 @@ void stats_crashed_at(int index)
193275
{
194276
stats[index].crashed_at = time(NULL);
195277
stats[index].crash_count++;
278+
updateHeartbeatCountAverage(index);
196279
clearHeartbeatCount(index);
197280
}
198281

199282
void stats_heartbeat_reset_at(int index)
200283
{
201284
stats[index].heartbeat_reset_at = time(NULL);
202285
stats[index].heartbeat_reset_count++;
286+
updateHeartbeatCountAverage(index);
203287
clearHeartbeatCount(index);
204288
}
205289

@@ -241,31 +325,24 @@ void stats_update_first_heartbeat_time(int index, time_t heartbeatTime)
241325
}
242326
}
243327

244-
void stats_update_resource_usage(int index, int pid)
328+
void stats_update_cpu_usage(int index, int pid)
245329
{
246-
if(pid <= 0)
247-
{
248-
return; // Process not running
249-
}
330+
// Get current CPU usage
331+
float cpu_percent = get_process_cpu_usage(index, pid);
250332

251-
// Get current CPU and memory usage
252-
double cpu_percent = get_process_cpu_usage(pid);
253-
size_t memory_kb = get_process_memory_usage(pid);
254-
255-
if(cpu_percent < 0.0)
333+
if(cpu_percent < 0.0f)
256334
{
335+
LOGD("Invalid CPU reading (%.2f%%) for PID=%d, skipping sample", cpu_percent, pid);
257336
return; // Error reading CPU usage
258337
}
259338

260-
// Update current values
339+
// Update current CPU value
261340
stats[index].current_cpu_percent = cpu_percent;
262-
stats[index].current_memory_kb = memory_kb;
263-
stats[index].resource_sample_count++;
264341

265-
// Update CPU statistics
266-
if(stats[index].resource_sample_count == 1)
342+
// Initialize or update CPU statistics
343+
if(stats[index].max_cpu_percent == 0.0f && stats[index].min_cpu_percent == 100.0f)
267344
{
268-
// First sample
345+
// First CPU sample or after reset
269346
stats[index].max_cpu_percent = cpu_percent;
270347
stats[index].min_cpu_percent = cpu_percent;
271348
stats[index].avg_cpu_percent = cpu_percent;
@@ -283,36 +360,50 @@ void stats_update_resource_usage(int index, int pid)
283360
stats[index].min_cpu_percent = cpu_percent;
284361
}
285362

286-
// Update average CPU
287-
stats[index].avg_cpu_percent = ((stats[index].avg_cpu_percent * (stats[index].resource_sample_count - 1)) + cpu_percent) / stats[index].resource_sample_count;
363+
// Use exponential moving average for CPU
364+
float alpha = 0.1f; // Smoothing factor
365+
stats[index].avg_cpu_percent = stats[index].avg_cpu_percent * (1.0f - alpha) + cpu_percent * alpha;
288366
}
367+
}
289368

290-
// Update memory statistics
291-
if(memory_kb > 0)
369+
void stats_update_memory_usage(int index, int pid)
370+
{
371+
// Get current memory usage
372+
size_t memory_kb = get_process_memory_usage(pid);
373+
374+
if(memory_kb == 0)
375+
{
376+
LOGD("Failed to read memory usage for PID=%d", pid);
377+
return; // Error reading memory usage
378+
}
379+
380+
// Update current memory value
381+
stats[index].current_memory_kb = memory_kb;
382+
stats[index].resource_sample_count++;
383+
384+
// Initialize or update memory statistics
385+
if(stats[index].resource_sample_count == 1 || stats[index].max_memory_kb == 0)
292386
{
293-
if(stats[index].resource_sample_count == 1)
387+
// First memory sample
388+
stats[index].max_memory_kb = memory_kb;
389+
stats[index].min_memory_kb = memory_kb;
390+
stats[index].avg_memory_kb = memory_kb;
391+
}
392+
else
393+
{
394+
// Update max/min memory
395+
if(memory_kb > stats[index].max_memory_kb)
294396
{
295-
// First sample
296397
stats[index].max_memory_kb = memory_kb;
297-
stats[index].min_memory_kb = memory_kb;
298-
stats[index].avg_memory_kb = memory_kb;
299398
}
300-
else
399+
400+
if(memory_kb < stats[index].min_memory_kb)
301401
{
302-
// Update max/min memory
303-
if(memory_kb > stats[index].max_memory_kb)
304-
{
305-
stats[index].max_memory_kb = memory_kb;
306-
}
307-
308-
if(memory_kb < stats[index].min_memory_kb)
309-
{
310-
stats[index].min_memory_kb = memory_kb;
311-
}
312-
313-
// Update average memory
314-
stats[index].avg_memory_kb = ((stats[index].avg_memory_kb * (stats[index].resource_sample_count - 1)) + memory_kb) / stats[index].resource_sample_count;
402+
stats[index].min_memory_kb = memory_kb;
315403
}
404+
405+
// Update average memory using running average
406+
stats[index].avg_memory_kb = ((stats[index].avg_memory_kb * (stats[index].resource_sample_count - 1)) + memory_kb) / stats[index].resource_sample_count;
316407
}
317408
}
318409

@@ -353,6 +444,7 @@ void stats_print_to_file(int index, const char *app_name)
353444
fprintf(fp, "Heartbeat reset count: %zu\n", stats[index].heartbeat_reset_count);
354445
fprintf(fp, "Heartbeat count: %zu\n", stats[index].heartbeat_count);
355446
fprintf(fp, "Heartbeat count old: %zu\n", stats[index].heartbeat_count_old);
447+
fprintf(fp, "Average heartbeat count old: %zu\n", stats[index].avg_heartbeat_count_old);
356448
fprintf(fp, "Average first heartbeat time: %lld seconds\n", (long long)stats[index].avg_first_heartbeat_time);
357449
fprintf(fp, "Maximum first heartbeat time: %lld seconds\n", (long long)stats[index].max_first_heartbeat_time);
358450
fprintf(fp, "Minimum first heartbeat time: %lld seconds\n", (long long)stats[index].min_first_heartbeat_time);

0 commit comments

Comments
 (0)