Commit 8a1dd1e5 authored by Tejun Heo's avatar Tejun Heo
Browse files

workqueue: Track and monitor per-workqueue CPU time usage



Now that wq_worker_tick() is there, we can easily track the rough CPU time
consumption of each workqueue by charging the whole tick whenever a tick
hits an active workqueue. While not super accurate, it provides reasonable
visibility into the workqueues that consume a lot of CPU cycles.
wq_monitor.py is updated to report the per-workqueue CPU times.

v2: wq_monitor.py was using "cputime" as the key when outputting in json
    format. Use "cpu_time" instead for consistency with other fields.

Signed-off-by: default avatarTejun Heo <tj@kernel.org>
parent 63638450
Loading
Loading
Loading
Loading
+19 −19
Original line number Diff line number Diff line
@@ -354,25 +354,25 @@ Monitoring
Use tools/workqueue/wq_monitor.py to monitor workqueue operations: ::

  $ tools/workqueue/wq_monitor.py events
                              total  infl CPUitsv  CMwake  mayday rescued
  events                      18545     0       0       5       -       -
  events_highpri                  8     0       0       0       -       -
  events_long                     3     0       0       0       -       -
  events_unbound              38306     0       -       -       -       -
  events_freezable                0     0       0       0       -       -
  events_power_efficient      29598     0       0       0       -       -
  events_freezable_power_        10     0       0       0       -       -
  sock_diag_events                0     0       0       0       -       -

                              total  infl CPUitsv  CMwake  mayday rescued
  events                      18548     0       0       5       -       -
  events_highpri                  8     0       0       0       -       -
  events_long                     3     0       0       0       -       -
  events_unbound              38322     0       -       -       -       -
  events_freezable                0     0       0       0       -       -
  events_power_efficient      29603     0       0       0       -       -
  events_freezable_power_        10     0       0       0       -       -
  sock_diag_events                0     0       0       0       -       -
                              total  infl  CPUtime  CPUhog  CMwake  mayday rescued
  events                      18545     0      6.1       0       5       -       -
  events_highpri                  8     0      0.0       0       0       -       -
  events_long                     3     0      0.0       0       0       -       -
  events_unbound              38306     0      0.1       -       -       -       -
  events_freezable                0     0      0.0       0       0       -       -
  events_power_efficient      29598     0      0.2       0       0       -       -
  events_freezable_power_        10     0      0.0       0       0       -       -
  sock_diag_events                0     0      0.0       0       0       -       -

                              total  infl  CPUtime  CPUhog  CMwake  mayday rescued
  events                      18548     0      6.1       0       5       -       -
  events_highpri                  8     0      0.0       0       0       -       -
  events_long                     3     0      0.0       0       0       -       -
  events_unbound              38322     0      0.1       -       -       -       -
  events_freezable                0     0      0.0       0       0       -       -
  events_power_efficient      29603     0      0.2       0       0       -       -
  events_freezable_power_        10     0      0.0       0       0       -       -
  sock_diag_events                0     0      0.0       0       0       -       -

  ...

+3 −0
Original line number Diff line number Diff line
@@ -212,6 +212,7 @@ struct worker_pool {
enum pool_workqueue_stats {
	PWQ_STAT_STARTED,	/* work items started execution */
	PWQ_STAT_COMPLETED,	/* work items completed execution */
	PWQ_STAT_CPU_TIME,	/* total CPU time consumed */
	PWQ_STAT_CPU_INTENSIVE,	/* wq_cpu_intensive_thresh_us violations */
	PWQ_STAT_CM_WAKEUP,	/* concurrency-management worker wakeups */
	PWQ_STAT_MAYDAY,	/* maydays to rescuer */
@@ -1136,6 +1137,8 @@ void wq_worker_tick(struct task_struct *task)
	if (!pwq)
		return;

	pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC;

	/*
	 * If the current worker is concurrency managed and hogged the CPU for
	 * longer than wq_cpu_intensive_thresh_us, it's automatically marked
+8 −1
Original line number Diff line number Diff line
@@ -11,6 +11,10 @@ https://github.com/osandov/drgn.

  infl     The number of currently in-flight work items.

  CPUtime  Total CPU time consumed by the workqueue in seconds. This is
           sampled from scheduler ticks and only provides ballpark
           measurement. "nohz_full=" CPUs are excluded from measurement.

  CPUitsv  The number of times a concurrency-managed work item hogged CPU
           longer than the threshold (workqueue.cpu_intensive_thresh_us)
           and got excluded from concurrency management to avoid stalling
@@ -58,6 +62,7 @@ WQ_MEM_RECLAIM = prog['WQ_MEM_RECLAIM']

PWQ_STAT_STARTED        = prog['PWQ_STAT_STARTED']      # work items started execution
PWQ_STAT_COMPLETED      = prog['PWQ_STAT_COMPLETED']	# work items completed execution
PWQ_STAT_CPU_TIME       = prog['PWQ_STAT_CPU_TIME']     # total CPU time consumed
PWQ_STAT_CPU_INTENSIVE  = prog['PWQ_STAT_CPU_INTENSIVE'] # wq_cpu_intensive_thresh_us violations
PWQ_STAT_CM_WAKEUP      = prog['PWQ_STAT_CM_WAKEUP']    # concurrency-management worker wakeups
PWQ_STAT_MAYDAY         = prog['PWQ_STAT_MAYDAY']	# maydays to rescuer
@@ -81,13 +86,14 @@ class WqStats:
                 'mem_reclaim'          : self.mem_reclaim,
                 'started'              : self.stats[PWQ_STAT_STARTED],
                 'completed'            : self.stats[PWQ_STAT_COMPLETED],
                 'cpu_time'             : self.stats[PWQ_STAT_CPU_TIME],
                 'cpu_intensive'        : self.stats[PWQ_STAT_CPU_INTENSIVE],
                 'cm_wakeup'            : self.stats[PWQ_STAT_CM_WAKEUP],
                 'mayday'               : self.stats[PWQ_STAT_MAYDAY],
                 'rescued'              : self.stats[PWQ_STAT_RESCUED], }

    def table_header_str():
        return f'{"":>24} {"total":>8} {"infl":>5} '\
        return f'{"":>24} {"total":>8} {"infl":>5} {"CPUtime":>8} '\
            f'{"CPUitsv":>7} {"CMwake":>7} {"mayday":>7} {"rescued":>7}'

    def table_row_str(self):
@@ -107,6 +113,7 @@ class WqStats:
        out = f'{self.name[-24:]:24} ' \
              f'{self.stats[PWQ_STAT_STARTED]:8} ' \
              f'{max(self.stats[PWQ_STAT_STARTED] - self.stats[PWQ_STAT_COMPLETED], 0):5} ' \
              f'{self.stats[PWQ_STAT_CPU_TIME] / 1000000:8.1f} ' \
              f'{cpu_intensive:>7} ' \
              f'{cm_wakeup:>7} ' \
              f'{mayday:>7} ' \