Commit 5256184b authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'timers-urgent-2024-07-26' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull timer migration updates from Thomas Gleixner:
 "Fixes and minor updates for the timer migration code:

   - Stop testing the group->parent pointer as it is not guaranteed to
     be stable over a chain of operations by design.

     This includes a warning which would be nice to have but it produces
     false positives due to the racy nature of the check.

   - Plug a race between CPUs going in and out of idle and a CPU hotplug
     operation. The latter can create and connect a new hierarchy level
     which is missed in the concurrent updates of CPUs which go into
     idle. As a result the events of such a CPU might not be processed
     and timers go stale.

     Cure it by splitting the hotplug operation into a prepare and
     online callback. The prepare callback is guaranteed to run on an
     online and therefore active CPU. This CPU updates the hierarchy and
     being online ensures that there is always at least one migrator
     active which handles the modified hierarchy correctly when going
     idle. The online callback which runs on the incoming CPU then just
     marks the CPU active and brings it into operation.

   - Improve tracing and polish the code further so it is more obvious
     what's going on"

* tag 'timers-urgent-2024-07-26' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  timers/migration: Fix grammar in comment
  timers/migration: Spare write when nothing changed
  timers/migration: Rename childmask by groupmask to make naming more obvious
  timers/migration: Read childmask and parent pointer in a single place
  timers/migration: Use a single struct for hierarchy walk data
  timers/migration: Improve tracing
  timers/migration: Move hierarchy setup into cpuhotplug prepare callback
  timers/migration: Do not rely always on group->parent
parents c9f33436 f004bf9d
...@@ -122,6 +122,7 @@ enum cpuhp_state { ...@@ -122,6 +122,7 @@ enum cpuhp_state {
CPUHP_KVM_PPC_BOOK3S_PREPARE, CPUHP_KVM_PPC_BOOK3S_PREPARE,
CPUHP_ZCOMP_PREPARE, CPUHP_ZCOMP_PREPARE,
CPUHP_TIMERS_PREPARE, CPUHP_TIMERS_PREPARE,
CPUHP_TMIGR_PREPARE,
CPUHP_MIPS_SOC_PREPARE, CPUHP_MIPS_SOC_PREPARE,
CPUHP_BP_PREPARE_DYN, CPUHP_BP_PREPARE_DYN,
CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20, CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20,
......
...@@ -43,7 +43,7 @@ TRACE_EVENT(tmigr_connect_child_parent, ...@@ -43,7 +43,7 @@ TRACE_EVENT(tmigr_connect_child_parent,
__field( unsigned int, lvl ) __field( unsigned int, lvl )
__field( unsigned int, numa_node ) __field( unsigned int, numa_node )
__field( unsigned int, num_children ) __field( unsigned int, num_children )
__field( u32, childmask ) __field( u32, groupmask )
), ),
TP_fast_assign( TP_fast_assign(
...@@ -52,11 +52,11 @@ TRACE_EVENT(tmigr_connect_child_parent, ...@@ -52,11 +52,11 @@ TRACE_EVENT(tmigr_connect_child_parent,
__entry->lvl = child->parent->level; __entry->lvl = child->parent->level;
__entry->numa_node = child->parent->numa_node; __entry->numa_node = child->parent->numa_node;
__entry->num_children = child->parent->num_children; __entry->num_children = child->parent->num_children;
__entry->childmask = child->childmask; __entry->groupmask = child->groupmask;
), ),
TP_printk("group=%p childmask=%0x parent=%p lvl=%d numa=%d num_children=%d", TP_printk("group=%p groupmask=%0x parent=%p lvl=%d numa=%d num_children=%d",
__entry->child, __entry->childmask, __entry->parent, __entry->child, __entry->groupmask, __entry->parent,
__entry->lvl, __entry->numa_node, __entry->num_children) __entry->lvl, __entry->numa_node, __entry->num_children)
); );
...@@ -72,7 +72,7 @@ TRACE_EVENT(tmigr_connect_cpu_parent, ...@@ -72,7 +72,7 @@ TRACE_EVENT(tmigr_connect_cpu_parent,
__field( unsigned int, lvl ) __field( unsigned int, lvl )
__field( unsigned int, numa_node ) __field( unsigned int, numa_node )
__field( unsigned int, num_children ) __field( unsigned int, num_children )
__field( u32, childmask ) __field( u32, groupmask )
), ),
TP_fast_assign( TP_fast_assign(
...@@ -81,11 +81,11 @@ TRACE_EVENT(tmigr_connect_cpu_parent, ...@@ -81,11 +81,11 @@ TRACE_EVENT(tmigr_connect_cpu_parent,
__entry->lvl = tmc->tmgroup->level; __entry->lvl = tmc->tmgroup->level;
__entry->numa_node = tmc->tmgroup->numa_node; __entry->numa_node = tmc->tmgroup->numa_node;
__entry->num_children = tmc->tmgroup->num_children; __entry->num_children = tmc->tmgroup->num_children;
__entry->childmask = tmc->childmask; __entry->groupmask = tmc->groupmask;
), ),
TP_printk("cpu=%d childmask=%0x parent=%p lvl=%d numa=%d num_children=%d", TP_printk("cpu=%d groupmask=%0x parent=%p lvl=%d numa=%d num_children=%d",
__entry->cpu, __entry->childmask, __entry->parent, __entry->cpu, __entry->groupmask, __entry->parent,
__entry->lvl, __entry->numa_node, __entry->num_children) __entry->lvl, __entry->numa_node, __entry->num_children)
); );
......
This diff is collapsed.
...@@ -22,7 +22,17 @@ struct tmigr_event { ...@@ -22,7 +22,17 @@ struct tmigr_event {
* struct tmigr_group - timer migration hierarchy group * struct tmigr_group - timer migration hierarchy group
* @lock: Lock protecting the event information and group hierarchy * @lock: Lock protecting the event information and group hierarchy
* information during setup * information during setup
* @parent: Pointer to the parent group * @parent: Pointer to the parent group. Pointer is updated when a
* new hierarchy level is added because of a CPU coming
* online the first time. Once it is set, the pointer will
* not be removed or updated. When accessing parent pointer
* lock less to decide whether to abort a propagation or
* not, it is not a problem. The worst outcome is an
* unnecessary/early CPU wake up. But do not access parent
* pointer several times in the same 'action' (like
* activation, deactivation, check for remote expiry,...)
* without holding the lock as it is not ensured that value
* will not change.
* @groupevt: Next event of the group which is only used when the * @groupevt: Next event of the group which is only used when the
* group is !active. The group event is then queued into * group is !active. The group event is then queued into
* the parent timer queue. * the parent timer queue.
...@@ -41,9 +51,8 @@ struct tmigr_event { ...@@ -41,9 +51,8 @@ struct tmigr_event {
* @num_children: Counter of group children to make sure the group is only * @num_children: Counter of group children to make sure the group is only
* filled with TMIGR_CHILDREN_PER_GROUP; Required for setup * filled with TMIGR_CHILDREN_PER_GROUP; Required for setup
* only * only
* @childmask: childmask of the group in the parent group; is set * @groupmask: mask of the group in the parent group; is set during
* during setup and will never change; can be read * setup and will never change; can be read lockless
* lockless
* @list: List head that is added to the per level * @list: List head that is added to the per level
* tmigr_level_list; is required during setup when a * tmigr_level_list; is required during setup when a
* new group needs to be connected to the existing * new group needs to be connected to the existing
...@@ -59,7 +68,7 @@ struct tmigr_group { ...@@ -59,7 +68,7 @@ struct tmigr_group {
unsigned int level; unsigned int level;
int numa_node; int numa_node;
unsigned int num_children; unsigned int num_children;
u8 childmask; u8 groupmask;
struct list_head list; struct list_head list;
}; };
...@@ -79,7 +88,7 @@ struct tmigr_group { ...@@ -79,7 +88,7 @@ struct tmigr_group {
* hierarchy * hierarchy
* @remote: Is set when timers of the CPU are expired remotely * @remote: Is set when timers of the CPU are expired remotely
* @tmgroup: Pointer to the parent group * @tmgroup: Pointer to the parent group
* @childmask: childmask of tmigr_cpu in the parent group * @groupmask: mask of tmigr_cpu in the parent group
* @wakeup: Stores the first timer when the timer migration * @wakeup: Stores the first timer when the timer migration
* hierarchy is completely idle and remote expiry was done; * hierarchy is completely idle and remote expiry was done;
* is returned to timer code in the idle path and is only * is returned to timer code in the idle path and is only
...@@ -92,7 +101,7 @@ struct tmigr_cpu { ...@@ -92,7 +101,7 @@ struct tmigr_cpu {
bool idle; bool idle;
bool remote; bool remote;
struct tmigr_group *tmgroup; struct tmigr_group *tmgroup;
u8 childmask; u8 groupmask;
u64 wakeup; u64 wakeup;
struct tmigr_event cpuevt; struct tmigr_event cpuevt;
}; };
...@@ -108,8 +117,8 @@ union tmigr_state { ...@@ -108,8 +117,8 @@ union tmigr_state {
u32 state; u32 state;
/** /**
* struct - split state of tmigr_group * struct - split state of tmigr_group
* @active: Contains each childmask bit of the active children * @active: Contains each mask bit of the active children
* @migrator: Contains childmask of the child which is migrator * @migrator: Contains mask of the child which is migrator
* @seq: Sequence counter needs to be increased when an update * @seq: Sequence counter needs to be increased when an update
* to the tmigr_state is done. It prevents a race when * to the tmigr_state is done. It prevents a race when
* updates in the child groups are propagated in changed * updates in the child groups are propagated in changed
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment