Skip to content

Commit 895146d

Browse files
Zhen Leichantra
authored andcommitted
rcu: Add RCU stall diagnosis information
In some extreme cases, such as the I/O pressure test, the CPU usage may be 100%, causing RCU stall. In this case, the printed information about current is not useful. Displays the number and usage of hard interrupts, soft interrupts, and context switches that are generated within half of the CPU stall timeout, can help us make a general judgment. In other cases, we can preliminarily determine whether an infinite loop occurs when local_irq, local_bh or preempt is disabled. For example: rcu: INFO: rcu_preempt self-detected stall on CPU rcu: 0-....: (1250 ticks this GP) <omitted> rcu: hardirqs softirqs csw/system rcu: number: 624 45 0 rcu: cputime: 69 1 2425 ==> 2500(ms) The example above shows that the number of hard and soft interrupts is small, there is zero context switching, and the system takes up a lot of time. We can quickly conclude that the current task is infinitely looped with preempt_disable(). The impact on system performance is negligible because snapshot is recorded only one time after 1/2 CPU stall timeout. Signed-off-by: Zhen Lei <[email protected]>
1 parent ac2b214 commit 895146d

File tree

2 files changed

+51
-0
lines changed

2 files changed

+51
-0
lines changed

kernel/rcu/tree.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,16 @@ union rcu_noqs {
158158
u16 s; /* Set of bits, aggregate OR here. */
159159
};
160160

161+
struct rcu_snap_record {
162+
unsigned long gp_seq;
163+
u64 cputime_irq;
164+
u64 cputime_softirq;
165+
u64 cputime_system;
166+
unsigned int nr_hardirqs;
167+
unsigned int nr_softirqs;
168+
unsigned long long nr_csw;
169+
};
170+
161171
/* Per-CPU data for read-copy update. */
162172
struct rcu_data {
163173
/* 1) quiescent-state and grace-period handling : */
@@ -262,6 +272,7 @@ struct rcu_data {
262272
short rcu_onl_gp_flags; /* ->gp_flags at last online. */
263273
unsigned long last_fqs_resched; /* Time of last rcu_resched(). */
264274
unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */
275+
struct rcu_snap_record snap_record;
265276

266277
long lazy_len; /* Length of buffered lazy callbacks. */
267278
int cpu;

kernel/rcu/tree_stall.h

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,8 +211,10 @@ static void rcu_stall_kick_kthreads(void)
211211
*/
212212
static void rcu_iw_handler(struct irq_work *iwp)
213213
{
214+
int cpu = smp_processor_id();
214215
struct rcu_data *rdp;
215216
struct rcu_node *rnp;
217+
struct rcu_snap_record *r;
216218

217219
rdp = container_of(iwp, struct rcu_data, rcu_iw);
218220
rnp = rdp->mynode;
@@ -222,6 +224,16 @@ static void rcu_iw_handler(struct irq_work *iwp)
222224
rdp->rcu_iw_pending = false;
223225
}
224226
raw_spin_unlock_rcu_node(rnp);
227+
228+
r = &rdp->snap_record;
229+
r->cputime_irq = kcpustat_this_cpu->cpustat[CPUTIME_IRQ];
230+
r->cputime_softirq = kcpustat_this_cpu->cpustat[CPUTIME_SOFTIRQ];
231+
r->cputime_system = kcpustat_this_cpu->cpustat[CPUTIME_SYSTEM];
232+
r->nr_hardirqs = kstat_cpu_irqs_sum(cpu);
233+
r->nr_softirqs = kstat_cpu_softirqs_sum(cpu);
234+
r->nr_csw = nr_context_switches_cpu(cpu);
235+
barrier();
236+
r->gp_seq = rdp->gp_seq;
225237
}
226238

227239
//////////////////////////////////////////////////////////////////////////////
@@ -428,6 +440,32 @@ static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp
428440
return j > 2 * HZ;
429441
}
430442

443+
static void print_cpu_stat_info(int cpu)
444+
{
445+
u64 *cpustat;
446+
unsigned long half_timeout;
447+
struct rcu_snap_record *r;
448+
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
449+
450+
r = &rdp->snap_record;
451+
if (r->gp_seq != rdp->gp_seq)
452+
return;
453+
454+
cpustat = kcpustat_this_cpu->cpustat;
455+
half_timeout = rcu_jiffies_till_stall_check() / 2;
456+
457+
pr_err(" hardirqs softirqs csw/system\n");
458+
pr_err(" number: %8d %10d %12lld\n",
459+
kstat_cpu_irqs_sum(cpu) - r->nr_hardirqs,
460+
kstat_cpu_softirqs_sum(cpu) - r->nr_softirqs,
461+
nr_context_switches_cpu(cpu) - r->nr_csw);
462+
pr_err("cputime: %8lld %10lld %12lld ==> %lld(ms)\n",
463+
div_u64(cpustat[CPUTIME_IRQ] - r->cputime_irq, NSEC_PER_MSEC),
464+
div_u64(cpustat[CPUTIME_SOFTIRQ] - r->cputime_softirq, NSEC_PER_MSEC),
465+
div_u64(cpustat[CPUTIME_SYSTEM] - r->cputime_system, NSEC_PER_MSEC),
466+
jiffies64_to_msecs(half_timeout));
467+
}
468+
431469
/*
432470
* Print out diagnostic information for the specified stalled CPU.
433471
*
@@ -484,6 +522,8 @@ static void print_cpu_stall_info(int cpu)
484522
data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
485523
rcuc_starved ? buf : "",
486524
falsepositive ? " (false positive?)" : "");
525+
526+
print_cpu_stat_info(cpu);
487527
}
488528

489529
/* Complain about starvation of grace-period kthread. */

0 commit comments

Comments
 (0)