@@ -50,11 +50,12 @@ typedef struct
5050 size_t heartbeat_count ; /**< Number of heartbeats received. */
5151 size_t heartbeat_count_old ; /**< Number of old heartbeats received. */
5252 size_t heartbeat_reset_count ; /**< Number of restarts due to late heartbeats. */
53+ size_t avg_heartbeat_count_old ; /**< Average heartbeat count before crashes/resets. */
5354 // CPU and Memory usage statistics
54- double current_cpu_percent ; /**< Current CPU usage percentage. */
55- double max_cpu_percent ; /**< Maximum CPU usage percentage. */
56- double min_cpu_percent ; /**< Minimum CPU usage percentage. */
57- double avg_cpu_percent ; /**< Average CPU usage percentage. */
55+ float current_cpu_percent ; /**< Current CPU usage percentage. */
56+ float max_cpu_percent ; /**< Maximum CPU usage percentage. */
57+ float min_cpu_percent ; /**< Minimum CPU usage percentage. */
58+ float avg_cpu_percent ; /**< Average CPU usage percentage. */
5859 size_t current_memory_kb ; /**< Current memory usage in KB. */
5960 size_t max_memory_kb ; /**< Maximum memory usage in KB. */
6061 size_t min_memory_kb ; /**< Minimum memory usage in KB. */
@@ -65,77 +66,136 @@ typedef struct
6566
6667static Statistic_t stats [MAX_APPS ]; // statistics for the apps
6768
69+ typedef struct
70+ {
71+ unsigned long long prev_process_time ;
72+ struct timespec prev_ts ;
73+ int initialized ;
74+ } CpuState_t ;
75+
76+ static CpuState_t cpustates [MAX_APPS ] = {0 }; // CPU utilization for the apps
77+
6878/**
69- @brief Reads CPU usage percentage for a specific process from /proc/[pid]/stat
70- @param pid Process ID
79+ @brief Reads instantaneous CPU usage percentage for a specific process.
80+ Works regardless of sampling interval. Can exceed 100% on multicore systems.
81+ @param index Index into cpustates[] array
82+ @param pid Process ID
7183 @return CPU usage percentage, or -1.0 on error
7284*/
73- static double get_process_cpu_usage (int pid )
85+ static float get_process_cpu_usage (int index , int pid )
7486{
75- static unsigned long long prev_total_time = 0 ;
76- static unsigned long long prev_process_time = 0 ;
77- static int prev_pid = -1 ;
78- char stat_path [64 ];
87+ char stat_path [48 ];
88+ char buf [512 ];
7989 snprintf (stat_path , sizeof (stat_path ), "/proc/%d/stat" , pid );
8090 FILE * fp = fopen (stat_path , "r" );
8191
8292 if (!fp )
8393 {
84- return -1.0 ;
94+ LOGE ("Failed to open %s" , stat_path );
95+ return -1.0f ;
8596 }
8697
87- unsigned long long utime , stime , cutime , cstime ;
88-
89- // Skip first 13 fields, then read utime, stime, cutime, cstime
90- if (fscanf (fp , "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %llu %llu %llu %llu" ,
91- & utime , & stime , & cutime , & cstime ) != 4 )
98+ if (!fgets (buf , sizeof (buf ), fp ))
9299 {
93100 fclose (fp );
94- return -1.0 ;
101+ LOGE ("Failed to read line from %s" , stat_path );
102+ return -1.0f ;
95103 }
96104
97105 fclose (fp );
98- // Get system total time from /proc/stat
99- fp = fopen ( "/proc/stat" , "r" );
106+ // find end of comm field (inside parentheses)
107+ char * paren = strrchr ( buf , ')' );
100108
101- if (!fp )
109+ if (!paren )
102110 {
103- return -1.0 ;
111+ LOGE ("Failed to find closing parenthesis in stat line" );
112+ return -1.0f ;
104113 }
105114
106- unsigned long long user , nice , system , idle , iowait , irq , softirq , steal ;
115+ // tokenize remaining fields after ") "
116+ char * saveptr ;
117+ char * token = strtok_r (paren + 2 , " " , & saveptr );
118+ int field = 3 ; // already counted pid(1), comm(2), state(3)
119+ unsigned long long utime = 0 , stime = 0 , cutime = 0 , cstime = 0 ;
107120
108- if (fscanf (fp , "cpu %llu %llu %llu %llu %llu %llu %llu %llu" ,
109- & user , & nice , & system , & idle , & iowait , & irq , & softirq , & steal ) != 8 )
121+ while (token )
110122 {
111- fclose (fp );
112- return -1.0 ;
123+ field ++ ;
124+
125+ if (field == 14 )
126+ {
127+ utime = strtoull (token , NULL , 10 );
128+ }
129+ else if (field == 15 )
130+ {
131+ stime = strtoull (token , NULL , 10 );
132+ }
133+ else if (field == 16 )
134+ {
135+ cutime = strtoull (token , NULL , 10 );
136+ }
137+ else if (field == 17 )
138+ {
139+ cstime = strtoull (token , NULL , 10 );
140+ break ;
141+ }
142+
143+ token = strtok_r (NULL , " " , & saveptr );
113144 }
114145
115- fclose (fp );
116- unsigned long long total_time = user + nice + system + idle + iowait + irq + softirq + steal ;
117146 unsigned long long process_time = utime + stime + cutime + cstime ;
147+ LOGD ("PID=%d utime=%llu stime=%llu cutime=%llu cstime=%llu total_process_time=%llu" ,
148+ pid , utime , stime , cutime , cstime , process_time );
149+ long ticks_per_sec = sysconf (_SC_CLK_TCK );
150+ //long nprocs = sysconf(_SC_NPROCESSORS_ONLN);
151+ struct timespec now ;
152+ clock_gettime (CLOCK_MONOTONIC , & now );
153+ CpuState_t * st = & cpustates [index ];
154+
155+ /* if(nprocs <= 0)
156+ {
157+ nprocs = 1;
158+ }*/
118159
119- // Calculate CPU percentage only if we have previous values and same PID
120- if (prev_pid == pid && prev_total_time > 0 )
160+ if (st -> initialized )
121161 {
122- unsigned long long total_diff = total_time - prev_total_time ;
123- unsigned long long process_diff = process_time - prev_process_time ;
162+ float elapsed = ( float )( now . tv_sec - st -> prev_ts . tv_sec ) +
163+ ( float )( now . tv_nsec - st -> prev_ts . tv_nsec ) / 1e9f ;
124164
125- if (total_diff > 0 )
165+ if (elapsed < 1e-6f )
126166 {
127- double cpu_percent = (100.0 * process_diff ) / total_diff ;
128- prev_total_time = total_time ;
129- prev_process_time = process_time ;
130- return cpu_percent ;
167+ // avoid division by near-zero
168+ return -1.0f ;
131169 }
170+
171+ unsigned long long diff = process_time - st -> prev_process_time ;
172+
173+ if (process_time < st -> prev_process_time )
174+ {
175+ // counter wrapped or PID restarted, reset baseline
176+ st -> prev_process_time = process_time ;
177+ st -> prev_ts = now ;
178+ LOGD ("Process time decreased, resetting baseline for PID=%d" , pid );
179+ return -1.0f ;
180+ }
181+
182+ float cpu_sec = (float )diff / (float )ticks_per_sec ;
183+ LOGD ("PID=%d diff=%llu ticks_per_sec=%ld cpu_sec=%.6f elapsed=%.6f" ,
184+ pid , diff , ticks_per_sec , cpu_sec , elapsed );
185+ st -> prev_process_time = process_time ;
186+ st -> prev_ts = now ;
187+ float percent = (cpu_sec / elapsed ) * 100.0f ;
188+ LOGD ("PID=%d CPU usage=%.2f%% (can exceed 100%% on multicore)" ,
189+ pid , percent );
190+ return percent ;
132191 }
133192
134- // Store current values for next calculation
135- prev_pid = pid ;
136- prev_total_time = total_time ;
137- prev_process_time = process_time ;
138- return 0.0 ; // First measurement, return 0
193+ // first call for this pid/index
194+ st -> prev_process_time = process_time ;
195+ st -> prev_ts = now ;
196+ st -> initialized = 1 ;
197+ LOGD ("First call for PID=%d, initializing baseline" , pid );
198+ return -1.0f ;
139199}
140200
141201/**
@@ -145,7 +205,7 @@ static double get_process_cpu_usage(int pid)
145205*/
146206static size_t get_process_memory_usage (int pid )
147207{
148- char status_path [64 ];
208+ char status_path [48 ];
149209 snprintf (status_path , sizeof (status_path ), "/proc/%d/status" , pid );
150210
151211 if (!f_exist (status_path ))
@@ -182,6 +242,28 @@ static void clearHeartbeatCount(int index)
182242 stats [index ].heartbeat_count = 0 ;
183243}
184244
245+ static void updateHeartbeatCountAverage (int index )
246+ {
247+ // Update average heartbeat count old when process crashes or gets reset
248+ if (stats [index ].heartbeat_count_old > 0 )
249+ {
250+ size_t total_events = stats [index ].crash_count + stats [index ].heartbeat_reset_count ;
251+
252+ if (total_events == 1 )
253+ {
254+ // First crash/reset
255+ stats [index ].avg_heartbeat_count_old = stats [index ].heartbeat_count_old ;
256+ }
257+ else if (total_events > 1 )
258+ {
259+ // Update running average
260+ stats [index ].avg_heartbeat_count_old =
261+ ((stats [index ].avg_heartbeat_count_old * (total_events - 1 )) +
262+ stats [index ].heartbeat_count_old ) / total_events ;
263+ }
264+ }
265+ }
266+
185267void stats_started_at (int index )
186268{
187269 stats [index ].started_at = time (NULL );
@@ -193,13 +275,15 @@ void stats_crashed_at(int index)
193275{
194276 stats [index ].crashed_at = time (NULL );
195277 stats [index ].crash_count ++ ;
278+ updateHeartbeatCountAverage (index );
196279 clearHeartbeatCount (index );
197280}
198281
199282void stats_heartbeat_reset_at (int index )
200283{
201284 stats [index ].heartbeat_reset_at = time (NULL );
202285 stats [index ].heartbeat_reset_count ++ ;
286+ updateHeartbeatCountAverage (index );
203287 clearHeartbeatCount (index );
204288}
205289
@@ -241,31 +325,24 @@ void stats_update_first_heartbeat_time(int index, time_t heartbeatTime)
241325 }
242326}
243327
244- void stats_update_resource_usage (int index , int pid )
328+ void stats_update_cpu_usage (int index , int pid )
245329{
246- if (pid <= 0 )
247- {
248- return ; // Process not running
249- }
330+ // Get current CPU usage
331+ float cpu_percent = get_process_cpu_usage (index , pid );
250332
251- // Get current CPU and memory usage
252- double cpu_percent = get_process_cpu_usage (pid );
253- size_t memory_kb = get_process_memory_usage (pid );
254-
255- if (cpu_percent < 0.0 )
333+ if (cpu_percent < 0.0f )
256334 {
335+ LOGD ("Invalid CPU reading (%.2f%%) for PID=%d, skipping sample" , cpu_percent , pid );
257336 return ; // Error reading CPU usage
258337 }
259338
260- // Update current values
339+ // Update current CPU value
261340 stats [index ].current_cpu_percent = cpu_percent ;
262- stats [index ].current_memory_kb = memory_kb ;
263- stats [index ].resource_sample_count ++ ;
264341
265- // Update CPU statistics
266- if (stats [index ].resource_sample_count == 1 )
342+ // Initialize or update CPU statistics
343+ if (stats [index ].max_cpu_percent == 0.0f && stats [ index ]. min_cpu_percent == 100.0f )
267344 {
268- // First sample
345+ // First CPU sample or after reset
269346 stats [index ].max_cpu_percent = cpu_percent ;
270347 stats [index ].min_cpu_percent = cpu_percent ;
271348 stats [index ].avg_cpu_percent = cpu_percent ;
@@ -283,36 +360,50 @@ void stats_update_resource_usage(int index, int pid)
283360 stats [index ].min_cpu_percent = cpu_percent ;
284361 }
285362
286- // Update average CPU
287- stats [index ].avg_cpu_percent = ((stats [index ].avg_cpu_percent * (stats [index ].resource_sample_count - 1 )) + cpu_percent ) / stats [index ].resource_sample_count ;
363+ // Use exponential moving average for CPU
364+ float alpha = 0.1f ; // Smoothing factor
365+ stats [index ].avg_cpu_percent = stats [index ].avg_cpu_percent * (1.0f - alpha ) + cpu_percent * alpha ;
288366 }
367+ }
289368
290- // Update memory statistics
291- if (memory_kb > 0 )
369+ void stats_update_memory_usage (int index , int pid )
370+ {
371+ // Get current memory usage
372+ size_t memory_kb = get_process_memory_usage (pid );
373+
374+ if (memory_kb == 0 )
375+ {
376+ LOGD ("Failed to read memory usage for PID=%d" , pid );
377+ return ; // Error reading memory usage
378+ }
379+
380+ // Update current memory value
381+ stats [index ].current_memory_kb = memory_kb ;
382+ stats [index ].resource_sample_count ++ ;
383+
384+ // Initialize or update memory statistics
385+ if (stats [index ].resource_sample_count == 1 || stats [index ].max_memory_kb == 0 )
292386 {
293- if (stats [index ].resource_sample_count == 1 )
387+ // First memory sample
388+ stats [index ].max_memory_kb = memory_kb ;
389+ stats [index ].min_memory_kb = memory_kb ;
390+ stats [index ].avg_memory_kb = memory_kb ;
391+ }
392+ else
393+ {
394+ // Update max/min memory
395+ if (memory_kb > stats [index ].max_memory_kb )
294396 {
295- // First sample
296397 stats [index ].max_memory_kb = memory_kb ;
297- stats [index ].min_memory_kb = memory_kb ;
298- stats [index ].avg_memory_kb = memory_kb ;
299398 }
300- else
399+
400+ if (memory_kb < stats [index ].min_memory_kb )
301401 {
302- // Update max/min memory
303- if (memory_kb > stats [index ].max_memory_kb )
304- {
305- stats [index ].max_memory_kb = memory_kb ;
306- }
307-
308- if (memory_kb < stats [index ].min_memory_kb )
309- {
310- stats [index ].min_memory_kb = memory_kb ;
311- }
312-
313- // Update average memory
314- stats [index ].avg_memory_kb = ((stats [index ].avg_memory_kb * (stats [index ].resource_sample_count - 1 )) + memory_kb ) / stats [index ].resource_sample_count ;
402+ stats [index ].min_memory_kb = memory_kb ;
315403 }
404+
405+ // Update average memory using running average
406+ stats [index ].avg_memory_kb = ((stats [index ].avg_memory_kb * (stats [index ].resource_sample_count - 1 )) + memory_kb ) / stats [index ].resource_sample_count ;
316407 }
317408}
318409
@@ -353,6 +444,7 @@ void stats_print_to_file(int index, const char *app_name)
353444 fprintf (fp , "Heartbeat reset count: %zu\n" , stats [index ].heartbeat_reset_count );
354445 fprintf (fp , "Heartbeat count: %zu\n" , stats [index ].heartbeat_count );
355446 fprintf (fp , "Heartbeat count old: %zu\n" , stats [index ].heartbeat_count_old );
447+ fprintf (fp , "Average heartbeat count old: %zu\n" , stats [index ].avg_heartbeat_count_old );
356448 fprintf (fp , "Average first heartbeat time: %lld seconds\n" , (long long )stats [index ].avg_first_heartbeat_time );
357449 fprintf (fp , "Maximum first heartbeat time: %lld seconds\n" , (long long )stats [index ].max_first_heartbeat_time );
358450 fprintf (fp , "Minimum first heartbeat time: %lld seconds\n" , (long long )stats [index ].min_first_heartbeat_time );
0 commit comments