[ALL] Real-time kernel/system changes.

- Raspberry Pi: apply the RT patches and select PREEMPT_RT as the preemption model
- PipeWire: promote itself to RT priority 95 and nice -19 (see the sketch below)
- Other small kernel tweaks and fixes
This commit is contained in:
j1nx 2024-01-04 12:38:55 +00:00
parent 6e78f7b4b9
commit a68ff2c35e
69 changed files with 10145 additions and 10 deletions
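As a rough illustration of the PipeWire bullet above: a process "promotes itself" by switching its scheduling class and nice level. The sketch below is hypothetical; PipeWire actually does this through its RT module, and promote_self() and the exact values come from the commit message, not from PipeWire's sources.

#include <sched.h>
#include <stdio.h>
#include <sys/resource.h>

/* Hypothetical sketch of the self-promotion described above: move the
 * calling process to SCHED_FIFO priority 95 and nice -19. This is not
 * PipeWire's actual code. */
int promote_self(void)
{
	struct sched_param sp = { .sched_priority = 95 };

	/* Needs CAP_SYS_NICE or a matching RLIMIT_RTPRIO. */
	if (sched_setscheduler(0, SCHED_FIFO, &sp) < 0) {
		perror("sched_setscheduler");
		return -1;
	}
	/* Nice only matters for the remaining SCHED_OTHER threads. */
	if (setpriority(PRIO_PROCESS, 0, -19) < 0) {
		perror("setpriority");
		return -1;
	}
	return 0;
}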

View File

@@ -50,5 +50,11 @@ CONFIG_PCI_HYPERV=m
CONFIG_PCI_HYPERV_INTERFACE=m
CONFIG_FB_HYPERV=y
CONFIG_PREEMPT_VOLUNTARY=y
CONFIG_HAVE_PREEMPT_LAZY=y
CONFIG_PREEMPT_LAZY=y
# CONFIG_PREEMPT_NONE is not set
# CONFIG_PREEMPT_VOLUNTARY is not set
# CONFIG_PREEMPT is not set
CONFIG_PREEMPT_RT=y
CONFIG_PREEMPT_COUNT=y
CONFIG_PREEMPTION=y

View File

@@ -66,7 +66,7 @@ CONFIG_RT2800USB_UNKNOWN=y
# Multimedia core support
CONFIG_MEDIA_SUPPORT=y
CONFIG_MEDIA_CONTROLLER=y
-CONFIG_MEDIA_CONTROLLER_DVB=m
+CONFIG_MEDIA_CONTROLLER_DVB=y
CONFIG_MEDIA_CAMERA_SUPPORT=y
CONFIG_MEDIA_ANALOG_TV_SUPPORT=y
CONFIG_MEDIA_DIGITAL_TV_SUPPORT=y
@@ -186,7 +186,7 @@ CONFIG_SND_HDA=y
CONFIG_SND_HDA_CORE=y
CONFIG_SND_HDA_GENERIC=y
CONFIG_SND_HDA_INTEL=m
-CONFIG_SND_HDA_I915=m
+CONFIG_SND_HDA_I915=y
CONFIG_SND_HDA_CODEC_REALTEK=m
CONFIG_SND_HDA_CODEC_ANALOG=m
CONFIG_SND_HDA_CODEC_VIA=m

View File

@@ -6,6 +6,7 @@ CONFIG_NET_SCHED=y
# CONFIG_RT_GROUP_SCHED is not set
CONFIG_CGROUPS=y
CONFIG_CPUSETS=y
CONFIG_HUGETLB_PAGE=y
CONFIG_BLK_CGROUP=y
CONFIG_BLK_DEV_THROTTLING=y
@@ -16,7 +17,7 @@ CONFIG_CGROUP_HUGETLB=y
CONFIG_CGROUP_DEVICE=y
CONFIG_CGROUP_CPUACCT=y
CONFIG_CGROUP_PERF=y
CONFIG_CGROUP_HUGETLB=y
CONFIG_NET_CLS_CGROUP=y
CONFIG_CGROUP_NET_PRIO=y
CONFIG_CGROUP_BPF=y
@@ -73,7 +74,7 @@ CONFIG_IP_NF_TARGET_MASQUERADE=y
CONFIG_IP_NF_TARGET_REDIRECT=y
CONFIG_BRIDGE=y
CONFIG_BRIDGE_NETFILTER=y
-CONFIG_XFRM=y
+CONFIG_XFRM=m
CONFIG_XFRM_USER=y
CONFIG_XFRM_ALGO=y

View File

@@ -1,11 +1,18 @@
CONFIG_LOCALVERSION="-ovos-buildroot"
CONFIG_KERNEL_LZO=y
# CONFIG_KERNEL_GZIP is not set
# GCC plugins are disabled by linux.mk, disable them here to reduce
# linux-diff-config noise
CONFIG_GCC_PLUGINS=n
CONFIG_CMDLINE=""
CONFIG_PANIC_TIMEOUT=5
CONFIG_PSI=y
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
CONFIG_MODULE_COMPRESS_NONE=y
# CONFIG_MODULE_COMPRESS_XZ is not set
CONFIG_ZRAM=y
CONFIG_KSM=y

View File

@@ -0,0 +1,32 @@
From 52072a197524e62baa4ac9a5f33d15cd8b27fb17 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 16 Aug 2022 09:45:22 +0200
Subject: [PATCH 01/62] vduse: Remove include of rwlock.h
rwlock.h should not be included directly. Instead linux/spinlock.h
should be included. Including it directly will break the RT build.
Remove the rwlock.h include.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Link: https://lkml.kernel.org/r/20221026134407.711768-1-bigeasy@linutronix.de
---
drivers/vdpa/vdpa_user/iova_domain.h | 1 -
1 file changed, 1 deletion(-)
diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h
index 4e0e50e7ac15..173e979b84a9 100644
--- a/drivers/vdpa/vdpa_user/iova_domain.h
+++ b/drivers/vdpa/vdpa_user/iova_domain.h
@@ -14,7 +14,6 @@
#include <linux/iova.h>
#include <linux/dma-mapping.h>
#include <linux/vhost_iotlb.h>
-#include <linux/rwlock.h>
#define IOVA_START_PFN 1
--
2.43.0

View File

@@ -0,0 +1,65 @@
From d5541b6ef4eccee650abfe3095b9e7365773494c Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 22 Jun 2022 11:36:17 +0200
Subject: [PATCH 02/62] signal: Don't disable preemption in ptrace_stop() on
PREEMPT_RT.
Commit
53da1d9456fe7 ("fix ptrace slowness")
is just a band aid around the problem.
The invocation of do_notify_parent_cldstop() wakes the parent and makes
it runnable. The scheduler then wants to replace this still running task
with the parent. With the read_lock() acquired this is not possible
because preemption is disabled and so this is deferred until read_unlock().
This scheduling point is undesired and is avoided by disabling preemption
around the unlock operation; preemption is enabled again before the
schedule() invocation, without adding a preemption point.
This is only undesired because the parent sleeps a cycle in
wait_task_inactive() until the traced task leaves the run-queue in
schedule(). It is not a correctness issue; it is just a band aid to avoid the
visible delay, which sums up over multiple invocations.
The task can still be preempted if an interrupt occurs between
preempt_enable_no_resched() and freezable_schedule() because on the IRQ-exit
path of the interrupt scheduling _will_ happen. This is ignored since it does
not happen very often.
On PREEMPT_RT keeping preemption disabled during the invocation of
cgroup_enter_frozen() becomes a problem because the function acquires
css_set_lock which is a sleeping lock on PREEMPT_RT and must not be
acquired with disabled preemption.
Don't disable preemption on PREEMPT_RT. Remove the TODO regarding adding
read_unlock_no_resched() as there is no need for it and it would cause harm.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lkml.kernel.org/r/20220720154435.232749-2-bigeasy@linutronix.de
---
kernel/signal.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/kernel/signal.c b/kernel/signal.c
index 5d45f5da2b36..58e919c7c936 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2302,13 +2302,13 @@ static int ptrace_stop(int exit_code, int why, unsigned long message,
/*
* Don't want to allow preemption here, because
* sys_ptrace() needs this task to be inactive.
- *
- * XXX: implement read_unlock_no_resched().
*/
- preempt_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
read_unlock(&tasklist_lock);
cgroup_enter_frozen();
- preempt_enable_no_resched();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable_no_resched();
schedule();
cgroup_leave_frozen(true);
--
2.43.0

View File

@@ -0,0 +1,151 @@
From e4742fc784660e012dc23090a72614bf1f9a0ca1 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 22 Jun 2022 12:27:05 +0200
Subject: [PATCH 03/62] sched: Consider task_struct::saved_state in
wait_task_inactive().
Ptrace is using wait_task_inactive() to wait for the tracee to reach a
certain task state. On PREEMPT_RT that state may be stored in
task_struct::saved_state while the tracee blocks on a sleeping lock and
task_struct::__state is set to TASK_RTLOCK_WAIT.
It is not possible to check only for TASK_RTLOCK_WAIT to be sure that the task
is blocked on a sleeping lock because during wake up (after the sleeping lock
has been acquired) the task state is set to TASK_RUNNING. After the task is on a CPU
and has acquired the pi_lock it will reset the state accordingly, but until then
TASK_RUNNING will be observed (with the desired state saved in saved_state).
Check also for task_struct::saved_state if the desired match was not found in
task_struct::__state on PREEMPT_RT. If the state was found in saved_state, wait
until the task is idle and state is visible in task_struct::__state.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Link: https://lkml.kernel.org/r/Yt%2FpQAFQ1xKNK0RY@linutronix.de
---
kernel/sched/core.c | 81 ++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 76 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 18a4f8f28a25..6bd06122850a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3281,6 +3281,76 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p,
}
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_PREEMPT_RT
+
+/*
+ * Consider:
+ *
+ * set_special_state(X);
+ *
+ * do_things()
+ * // Somewhere in there is an rtlock that can be contended:
+ * current_save_and_set_rtlock_wait_state();
+ * [...]
+ * schedule_rtlock(); (A)
+ * [...]
+ * current_restore_rtlock_saved_state();
+ *
+ * schedule(); (B)
+ *
+ * If p->saved_state is anything else than TASK_RUNNING, then p blocked on an
+ * rtlock (A) *before* voluntarily calling into schedule() (B) after setting its
+ * state to X. For things like ptrace (X=TASK_TRACED), the task could have more
+ * work to do upon acquiring the lock in do_things() before whoever called
+ * wait_task_inactive() should return. IOW, we have to wait for:
+ *
+ * p.saved_state = TASK_RUNNING
+ * p.__state = X
+ *
+ * which implies the task isn't blocked on an RT lock and got to schedule() (B).
+ *
+ * Also see comments in ttwu_state_match().
+ */
+
+static __always_inline bool state_mismatch(struct task_struct *p, unsigned int match_state)
+{
+ unsigned long flags;
+ bool mismatch;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ if (READ_ONCE(p->__state) & match_state)
+ mismatch = false;
+ else if (READ_ONCE(p->saved_state) & match_state)
+ mismatch = false;
+ else
+ mismatch = true;
+
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ return mismatch;
+}
+static __always_inline bool state_match(struct task_struct *p, unsigned int match_state,
+ bool *wait)
+{
+ if (READ_ONCE(p->__state) & match_state)
+ return true;
+ if (READ_ONCE(p->saved_state) & match_state) {
+ *wait = true;
+ return true;
+ }
+ return false;
+}
+#else
+static __always_inline bool state_mismatch(struct task_struct *p, unsigned int match_state)
+{
+ return !(READ_ONCE(p->__state) & match_state);
+}
+static __always_inline bool state_match(struct task_struct *p, unsigned int match_state,
+ bool *wait)
+{
+ return (READ_ONCE(p->__state) & match_state);
+}
+#endif
+
/*
* wait_task_inactive - wait for a thread to unschedule.
*
@@ -3299,7 +3369,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p,
*/
unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
{
- int running, queued;
+ bool running, wait;
struct rq_flags rf;
unsigned long ncsw;
struct rq *rq;
@@ -3325,7 +3395,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
* is actually now running somewhere else!
*/
while (task_on_cpu(rq, p)) {
- if (!(READ_ONCE(p->__state) & match_state))
+ if (state_mismatch(p, match_state))
return 0;
cpu_relax();
}
@@ -3338,9 +3408,10 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
rq = task_rq_lock(p, &rf);
trace_sched_wait_task(p);
running = task_on_cpu(rq, p);
- queued = task_on_rq_queued(p);
+ wait = task_on_rq_queued(p);
ncsw = 0;
- if (READ_ONCE(p->__state) & match_state)
+
+ if (state_match(p, match_state, &wait))
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
task_rq_unlock(rq, p, &rf);
@@ -3370,7 +3441,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
* running right now), it's preempted, and we should
* yield - it could be a while.
*/
- if (unlikely(queued)) {
+ if (unlikely(wait)) {
ktime_t to = NSEC_PER_SEC / HZ;
set_current_state(TASK_UNINTERRUPTIBLE);
--
2.43.0

View File

@@ -0,0 +1,39 @@
From 638117350cb3452dd5043156c7e394befe7d6eb9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 25 Aug 2022 16:15:32 +0200
Subject: [PATCH 04/62] spi: Remove the obsolete u64_stats_fetch_*_irq() users.
Now that the 32bit UP oddity is gone and 32bit always uses a sequence
count, there is no need for the fetch_irq() variants anymore.
Convert to the regular interface.
Cc: Mark Brown <broonie@kernel.org>
Cc: linux-spi@vger.kernel.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
drivers/spi/spi.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 5d046be8b2dd..716e6d6ecf98 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -127,10 +127,10 @@ do { \
unsigned int start; \
pcpu_stats = per_cpu_ptr(in, i); \
do { \
- start = u64_stats_fetch_begin_irq( \
+ start = u64_stats_fetch_begin( \
&pcpu_stats->syncp); \
inc = u64_stats_read(&pcpu_stats->field); \
- } while (u64_stats_fetch_retry_irq( \
+ } while (u64_stats_fetch_retry( \
&pcpu_stats->syncp, start)); \
ret += inc; \
} \
--
2.43.0
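All of the u64_stats conversions in this patch and the three that follow rely on the same seqcount retry discipline; the begin/retry pair simply loses its _irq suffix. Below is a deliberately simplified userspace sketch of that protocol, assuming C11 atomics; the names are illustrative, it is not the kernel implementation, and it glosses over the finer memory-model points of the real seqcount.

#include <stdatomic.h>
#include <stdint.h>

/* Simplified sketch of the protocol behind u64_stats_fetch_begin()/
 * u64_stats_fetch_retry(): the writer makes the sequence odd while
 * updating; readers retry if the sequence moved under them. */
struct pcpu_stats {
	atomic_uint seq;
	uint64_t packets, bytes;
};

static void stats_update(struct pcpu_stats *s, uint64_t p, uint64_t b)
{
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_acq_rel); /* odd: write in progress */
	s->packets += p;
	s->bytes += b;
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_release); /* even: write done */
}

static void stats_read(struct pcpu_stats *s, uint64_t *p, uint64_t *b)
{
	unsigned int start;

	do {
		/* begin: wait for an even (quiescent) sequence */
		while ((start = atomic_load_explicit(&s->seq, memory_order_acquire)) & 1)
			;
		*p = s->packets;
		*b = s->bytes;
		atomic_thread_fence(memory_order_acquire);
		/* retry: a changed sequence means a writer interleaved */
	} while (atomic_load_explicit(&s->seq, memory_order_relaxed) != start);
}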

View File

@@ -0,0 +1,392 @@
From a0c8ef7e6160582c71c0d8b1786d8e45dcc02132 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 25 Aug 2022 16:17:37 +0200
Subject: [PATCH 06/62] net: Remove the obsolete u64_stats_fetch_*_irq() users
(net).
Now that the 32bit UP oddity is gone and 32bit always uses a sequence
count, there is no need for the fetch_irq() variants anymore.
Convert to the regular interface.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
net/8021q/vlan_dev.c | 4 ++--
net/bridge/br_multicast.c | 4 ++--
net/bridge/br_vlan.c | 4 ++--
net/core/dev.c | 4 ++--
net/core/drop_monitor.c | 8 ++++----
net/core/gen_stats.c | 16 ++++++++--------
net/devlink/leftover.c | 4 ++--
net/dsa/slave.c | 4 ++--
net/ipv4/af_inet.c | 4 ++--
net/ipv6/seg6_local.c | 4 ++--
net/mac80211/sta_info.c | 8 ++++----
net/mpls/af_mpls.c | 4 ++--
net/netfilter/ipvs/ip_vs_ctl.c | 4 ++--
net/netfilter/nf_tables_api.c | 4 ++--
net/openvswitch/datapath.c | 4 ++--
net/openvswitch/flow_table.c | 9 ++++-----
16 files changed, 44 insertions(+), 45 deletions(-)
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index d3e511e1eba8..0fa52bcc296b 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -712,13 +712,13 @@ static void vlan_dev_get_stats64(struct net_device *dev,
p = per_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats, i);
do {
- start = u64_stats_fetch_begin_irq(&p->syncp);
+ start = u64_stats_fetch_begin(&p->syncp);
rxpackets = u64_stats_read(&p->rx_packets);
rxbytes = u64_stats_read(&p->rx_bytes);
rxmulticast = u64_stats_read(&p->rx_multicast);
txpackets = u64_stats_read(&p->tx_packets);
txbytes = u64_stats_read(&p->tx_bytes);
- } while (u64_stats_fetch_retry_irq(&p->syncp, start));
+ } while (u64_stats_fetch_retry(&p->syncp, start));
stats->rx_packets += rxpackets;
stats->rx_bytes += rxbytes;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index db4f2641d1cd..7e2a9fb5786c 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -4899,9 +4899,9 @@ void br_multicast_get_stats(const struct net_bridge *br,
unsigned int start;
do {
- start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ start = u64_stats_fetch_begin(&cpu_stats->syncp);
memcpy(&temp, &cpu_stats->mstats, sizeof(temp));
- } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
mcast_stats_add_dir(tdst.igmp_v1queries, temp.igmp_v1queries);
mcast_stats_add_dir(tdst.igmp_v2queries, temp.igmp_v2queries);
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 9ffd40b8270c..bc75fa1e4666 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -1389,12 +1389,12 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v,
cpu_stats = per_cpu_ptr(v->stats, i);
do {
- start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ start = u64_stats_fetch_begin(&cpu_stats->syncp);
rxpackets = u64_stats_read(&cpu_stats->rx_packets);
rxbytes = u64_stats_read(&cpu_stats->rx_bytes);
txbytes = u64_stats_read(&cpu_stats->tx_bytes);
txpackets = u64_stats_read(&cpu_stats->tx_packets);
- } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
u64_stats_add(&stats->rx_packets, rxpackets);
u64_stats_add(&stats->rx_bytes, rxbytes);
diff --git a/net/core/dev.c b/net/core/dev.c
index 0d5aa820fd83..070039f9296c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10505,12 +10505,12 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
stats = per_cpu_ptr(netstats, cpu);
do {
- start = u64_stats_fetch_begin_irq(&stats->syncp);
+ start = u64_stats_fetch_begin(&stats->syncp);
rx_packets = u64_stats_read(&stats->rx_packets);
rx_bytes = u64_stats_read(&stats->rx_bytes);
tx_packets = u64_stats_read(&stats->tx_packets);
tx_bytes = u64_stats_read(&stats->tx_bytes);
- } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+ } while (u64_stats_fetch_retry(&stats->syncp, start));
s->rx_packets += rx_packets;
s->rx_bytes += rx_bytes;
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 8e0a90b45df2..4d5e8b317c47 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -1432,9 +1432,9 @@ static void net_dm_stats_read(struct net_dm_stats *stats)
u64 dropped;
do {
- start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ start = u64_stats_fetch_begin(&cpu_stats->syncp);
dropped = u64_stats_read(&cpu_stats->dropped);
- } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
u64_stats_add(&stats->dropped, dropped);
}
@@ -1476,9 +1476,9 @@ static void net_dm_hw_stats_read(struct net_dm_stats *stats)
u64 dropped;
do {
- start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ start = u64_stats_fetch_begin(&cpu_stats->syncp);
dropped = u64_stats_read(&cpu_stats->dropped);
- } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
u64_stats_add(&stats->dropped, dropped);
}
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index c8d137ef5980..b71ccaec0991 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -135,10 +135,10 @@ static void gnet_stats_add_basic_cpu(struct gnet_stats_basic_sync *bstats,
u64 bytes, packets;
do {
- start = u64_stats_fetch_begin_irq(&bcpu->syncp);
+ start = u64_stats_fetch_begin(&bcpu->syncp);
bytes = u64_stats_read(&bcpu->bytes);
packets = u64_stats_read(&bcpu->packets);
- } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start));
+ } while (u64_stats_fetch_retry(&bcpu->syncp, start));
t_bytes += bytes;
t_packets += packets;
@@ -162,10 +162,10 @@ void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats,
}
do {
if (running)
- start = u64_stats_fetch_begin_irq(&b->syncp);
+ start = u64_stats_fetch_begin(&b->syncp);
bytes = u64_stats_read(&b->bytes);
packets = u64_stats_read(&b->packets);
- } while (running && u64_stats_fetch_retry_irq(&b->syncp, start));
+ } while (running && u64_stats_fetch_retry(&b->syncp, start));
_bstats_update(bstats, bytes, packets);
}
@@ -187,10 +187,10 @@ static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets,
u64 bytes, packets;
do {
- start = u64_stats_fetch_begin_irq(&bcpu->syncp);
+ start = u64_stats_fetch_begin(&bcpu->syncp);
bytes = u64_stats_read(&bcpu->bytes);
packets = u64_stats_read(&bcpu->packets);
- } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start));
+ } while (u64_stats_fetch_retry(&bcpu->syncp, start));
t_bytes += bytes;
t_packets += packets;
@@ -201,10 +201,10 @@ static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets,
}
do {
if (running)
- start = u64_stats_fetch_begin_irq(&b->syncp);
+ start = u64_stats_fetch_begin(&b->syncp);
*ret_bytes = u64_stats_read(&b->bytes);
*ret_packets = u64_stats_read(&b->packets);
- } while (running && u64_stats_fetch_retry_irq(&b->syncp, start));
+ } while (running && u64_stats_fetch_retry(&b->syncp, start));
}
static int
diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c
index 032c7af065cd..94e8cc3de330 100644
--- a/net/devlink/leftover.c
+++ b/net/devlink/leftover.c
@@ -8307,10 +8307,10 @@ static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats,
cpu_stats = per_cpu_ptr(trap_stats, i);
do {
- start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ start = u64_stats_fetch_begin(&cpu_stats->syncp);
rx_packets = u64_stats_read(&cpu_stats->rx_packets);
rx_bytes = u64_stats_read(&cpu_stats->rx_bytes);
- } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
u64_stats_add(&stats->rx_packets, rx_packets);
u64_stats_add(&stats->rx_bytes, rx_bytes);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 5fe075bf479e..28ee63ec1d1d 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -976,12 +976,12 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev,
s = per_cpu_ptr(dev->tstats, i);
do {
- start = u64_stats_fetch_begin_irq(&s->syncp);
+ start = u64_stats_fetch_begin(&s->syncp);
tx_packets = u64_stats_read(&s->tx_packets);
tx_bytes = u64_stats_read(&s->tx_bytes);
rx_packets = u64_stats_read(&s->rx_packets);
rx_bytes = u64_stats_read(&s->rx_bytes);
- } while (u64_stats_fetch_retry_irq(&s->syncp, start));
+ } while (u64_stats_fetch_retry(&s->syncp, start));
data[0] += tx_packets;
data[1] += tx_bytes;
data[2] += rx_packets;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5d379df90c82..312c730b725f 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1708,9 +1708,9 @@ u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt,
bhptr = per_cpu_ptr(mib, cpu);
syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
do {
- start = u64_stats_fetch_begin_irq(syncp);
+ start = u64_stats_fetch_begin(syncp);
v = *(((u64 *)bhptr) + offt);
- } while (u64_stats_fetch_retry_irq(syncp, start));
+ } while (u64_stats_fetch_retry(syncp, start));
return v;
}
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 8370726ae7bf..487f8e98deaa 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -1644,13 +1644,13 @@ static int put_nla_counters(struct sk_buff *skb, struct seg6_local_lwt *slwt)
pcounters = per_cpu_ptr(slwt->pcpu_counters, i);
do {
- start = u64_stats_fetch_begin_irq(&pcounters->syncp);
+ start = u64_stats_fetch_begin(&pcounters->syncp);
packets = u64_stats_read(&pcounters->packets);
bytes = u64_stats_read(&pcounters->bytes);
errors = u64_stats_read(&pcounters->errors);
- } while (u64_stats_fetch_retry_irq(&pcounters->syncp, start));
+ } while (u64_stats_fetch_retry(&pcounters->syncp, start));
counters.packets += packets;
counters.bytes += bytes;
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 49b71453dec3..c462e20ccc8d 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2397,9 +2397,9 @@ static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats,
u64 value;
do {
- start = u64_stats_fetch_begin_irq(&rxstats->syncp);
+ start = u64_stats_fetch_begin(&rxstats->syncp);
value = rxstats->msdu[tid];
- } while (u64_stats_fetch_retry_irq(&rxstats->syncp, start));
+ } while (u64_stats_fetch_retry(&rxstats->syncp, start));
return value;
}
@@ -2465,9 +2465,9 @@ static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats)
u64 value;
do {
- start = u64_stats_fetch_begin_irq(&rxstats->syncp);
+ start = u64_stats_fetch_begin(&rxstats->syncp);
value = rxstats->bytes;
- } while (u64_stats_fetch_retry_irq(&rxstats->syncp, start));
+ } while (u64_stats_fetch_retry(&rxstats->syncp, start));
return value;
}
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index f1f43894efb8..dc5165d3eec4 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -1079,9 +1079,9 @@ static void mpls_get_stats(struct mpls_dev *mdev,
p = per_cpu_ptr(mdev->stats, i);
do {
- start = u64_stats_fetch_begin_irq(&p->syncp);
+ start = u64_stats_fetch_begin(&p->syncp);
local = p->stats;
- } while (u64_stats_fetch_retry_irq(&p->syncp, start));
+ } while (u64_stats_fetch_retry(&p->syncp, start));
stats->rx_packets += local.rx_packets;
stats->rx_bytes += local.rx_bytes;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 17a1b731a76b..2be696513629 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2299,13 +2299,13 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
u64 conns, inpkts, outpkts, inbytes, outbytes;
do {
- start = u64_stats_fetch_begin_irq(&u->syncp);
+ start = u64_stats_fetch_begin(&u->syncp);
conns = u64_stats_read(&u->cnt.conns);
inpkts = u64_stats_read(&u->cnt.inpkts);
outpkts = u64_stats_read(&u->cnt.outpkts);
inbytes = u64_stats_read(&u->cnt.inbytes);
outbytes = u64_stats_read(&u->cnt.outbytes);
- } while (u64_stats_fetch_retry_irq(&u->syncp, start));
+ } while (u64_stats_fetch_retry(&u->syncp, start));
seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n",
i, (u64)conns, (u64)inpkts,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 05fa5141af51..ab1888991ae5 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1692,10 +1692,10 @@ static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats)
for_each_possible_cpu(cpu) {
cpu_stats = per_cpu_ptr(stats, cpu);
do {
- seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ seq = u64_stats_fetch_begin(&cpu_stats->syncp);
pkts = cpu_stats->pkts;
bytes = cpu_stats->bytes;
- } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq));
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, seq));
total.pkts += pkts;
total.bytes += bytes;
}
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 3c7b24535409..0953f531f984 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -716,9 +716,9 @@ static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats,
percpu_stats = per_cpu_ptr(dp->stats_percpu, i);
do {
- start = u64_stats_fetch_begin_irq(&percpu_stats->syncp);
+ start = u64_stats_fetch_begin(&percpu_stats->syncp);
local_stats = *percpu_stats;
- } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start));
+ } while (u64_stats_fetch_retry(&percpu_stats->syncp, start));
stats->n_hit += local_stats.n_hit;
stats->n_missed += local_stats.n_missed;
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index d4a2db0b2299..0a0e4c283f02 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -205,9 +205,9 @@ static void tbl_mask_array_reset_counters(struct mask_array *ma)
stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
do {
- start = u64_stats_fetch_begin_irq(&stats->syncp);
+ start = u64_stats_fetch_begin(&stats->syncp);
counter = stats->usage_cntrs[i];
- } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+ } while (u64_stats_fetch_retry(&stats->syncp, start));
ma->masks_usage_zero_cntr[i] += counter;
}
@@ -1136,10 +1136,9 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
do {
- start = u64_stats_fetch_begin_irq(&stats->syncp);
+ start = u64_stats_fetch_begin(&stats->syncp);
counter = stats->usage_cntrs[i];
- } while (u64_stats_fetch_retry_irq(&stats->syncp,
- start));
+ } while (u64_stats_fetch_retry(&stats->syncp, start));
masks_and_count[i].counter += counter;
}
--
2.43.0

View File

@@ -0,0 +1,50 @@
From b8cff7d0320e3b39f098d9562373e1c16c54c46c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 25 Aug 2022 16:17:57 +0200
Subject: [PATCH 07/62] bpf: Remove the obsolete u64_stats_fetch_*_irq() users.
Now that the 32bit UP oddity is gone and 32bit always uses a sequence
count, there is no need for the fetch_irq() variants anymore.
Convert to the regular interface.
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Hao Luo <haoluo@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: KP Singh <kpsingh@kernel.org>
Cc: Martin KaFai Lau <martin.lau@linux.dev>
Cc: Song Liu <song@kernel.org>
Cc: Stanislav Fomichev <sdf@google.com>
Cc: Yonghong Song <yhs@fb.com>
Cc: bpf@vger.kernel.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/bpf/syscall.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0c8b7733573e..c0915e2424f1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2115,11 +2115,11 @@ static void bpf_prog_get_stats(const struct bpf_prog *prog,
st = per_cpu_ptr(prog->stats, cpu);
do {
- start = u64_stats_fetch_begin_irq(&st->syncp);
+ start = u64_stats_fetch_begin(&st->syncp);
tnsecs = u64_stats_read(&st->nsecs);
tcnt = u64_stats_read(&st->cnt);
tmisses = u64_stats_read(&st->misses);
- } while (u64_stats_fetch_retry_irq(&st->syncp, start));
+ } while (u64_stats_fetch_retry(&st->syncp, start));
nsecs += tnsecs;
cnt += tcnt;
misses += tmisses;
--
2.43.0

View File

@@ -0,0 +1,41 @@
From b908a7b47d95003c498f2f575285f528148602d6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 25 Aug 2022 16:43:46 +0200
Subject: [PATCH 08/62] u64_stat: Remove the obsolete fetch_irq() variants.
Now that the 32bit UP oddity is gone and 32bit always uses a sequence
count, there is no need for the fetch_irq() variants anymore.
Delete the obsolete interfaces.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/u64_stats_sync.h | 12 ------------
1 file changed, 12 deletions(-)
diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
index 46040d66334a..ffe48e69b3f3 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -213,16 +213,4 @@ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
return __u64_stats_fetch_retry(syncp, start);
}
-/* Obsolete interfaces */
-static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp)
-{
- return u64_stats_fetch_begin(syncp);
-}
-
-static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp,
- unsigned int start)
-{
- return u64_stats_fetch_retry(syncp, start);
-}
-
#endif /* _LINUX_U64_STATS_SYNC_H */
--
2.43.0

View File

@@ -0,0 +1,124 @@
From c910f301d71266e18f63407ec6c65d19ae90e779 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 15 Aug 2022 17:29:50 +0200
Subject: [PATCH 09/62] net: Avoid the IPI to free the skb
skb_attempt_defer_free() collects skbs, which were allocated on a
remote CPU, on a per-CPU list. These skbs are either freed on that
remote CPU once the CPU enters NET_RX, or a remote IPI function is
invoked to raise the NET_RX softirq if a threshold of pending skbs has
been exceeded.
This remote IPI can cause the wakeup of ksoftirqd on PREEMPT_RT if the
remote CPU was idle. This is undesired because once ksoftirqd
is running it will acquire all pending softirqs and they will not be
executed as part of the threaded interrupt until ksoftirqd goes idle
again.
To avoid all this, schedule the deferred cleanup from a worker.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/netdevice.h | 4 ++++
net/core/dev.c | 37 ++++++++++++++++++++++++++++---------
net/core/skbuff.c | 7 ++++++-
3 files changed, 38 insertions(+), 10 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0373e0935990..55d698367883 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3169,7 +3169,11 @@ struct softnet_data {
int defer_count;
int defer_ipi_scheduled;
struct sk_buff *defer_list;
+#ifndef CONFIG_PREEMPT_RT
call_single_data_t defer_csd;
+#else
+ struct work_struct defer_work;
+#endif
};
static inline void input_queue_head_incr(struct softnet_data *sd)
diff --git a/net/core/dev.c b/net/core/dev.c
index 070039f9296c..a3caa23be3cf 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4618,15 +4618,6 @@ static void rps_trigger_softirq(void *data)
#endif /* CONFIG_RPS */
-/* Called from hardirq (IPI) context */
-static void trigger_rx_softirq(void *data)
-{
- struct softnet_data *sd = data;
-
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
- smp_store_release(&sd->defer_ipi_scheduled, 0);
-}
-
/*
* Check if this softnet_data structure is another cpu one
* If yes, queue it to our IPI list and return 1
@@ -6684,6 +6675,30 @@ static void skb_defer_free_flush(struct softnet_data *sd)
}
}
+#ifndef CONFIG_PREEMPT_RT
+/* Called from hardirq (IPI) context */
+static void trigger_rx_softirq(void *data)
+{
+ struct softnet_data *sd = data;
+
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ smp_store_release(&sd->defer_ipi_scheduled, 0);
+}
+
+#else
+
+static void trigger_rx_softirq(struct work_struct *defer_work)
+{
+ struct softnet_data *sd;
+
+ sd = container_of(defer_work, struct softnet_data, defer_work);
+ smp_store_release(&sd->defer_ipi_scheduled, 0);
+ local_bh_disable();
+ skb_defer_free_flush(sd);
+ local_bh_enable();
+}
+#endif
+
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
@@ -11435,7 +11450,11 @@ static int __init net_dev_init(void)
INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
sd->cpu = i;
#endif
+#ifndef CONFIG_PREEMPT_RT
INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
+#else
+ INIT_WORK(&sd->defer_work, trigger_rx_softirq);
+#endif
spin_lock_init(&sd->defer_lock);
init_gro_hash(&sd->backlog);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 73b1e0e53534..a457a3445469 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -6680,6 +6680,11 @@ nodefer: __kfree_skb(skb);
/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
* if we are unlucky enough (this seems very unlikely).
*/
- if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
+ if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
+#ifndef CONFIG_PREEMPT_RT
smp_call_function_single_async(cpu, &sd->defer_csd);
+#else
+ schedule_work_on(cpu, &sd->defer_work);
+#endif
+ }
}
--
2.43.0
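The cmpxchg() in the last hunk is a classic "kick at most once" guard: whoever flips the flag from 0 to 1 queues the deferred work, and the worker clears the flag before draining so a new kick can be posted for later skbs. A minimal sketch of that guard with C11 atomics, where queue_deferred_work() is a hypothetical stand-in for smp_call_function_single_async()/schedule_work_on():

#include <stdatomic.h>

static atomic_int defer_scheduled;

void queue_deferred_work(void); /* assumed to exist elsewhere */

void maybe_kick(void)
{
	int expected = 0;

	/* mirrors: if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) */
	if (atomic_compare_exchange_strong(&defer_scheduled, &expected, 1))
		queue_deferred_work();
}

void deferred_worker(void)
{
	/* mirrors: smp_store_release(&sd->defer_ipi_scheduled, 0) */
	atomic_store_explicit(&defer_scheduled, 0, memory_order_release);
	/* ... then drain the per-CPU defer list, cf. skb_defer_free_flush() */
}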

View File

@@ -0,0 +1,28 @@
From ce04e41eb149fcd93a71b63a605423d7f18ec8b4 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 7 Aug 2019 18:15:38 +0200
Subject: [PATCH 10/62] x86: Allow to enable RT
Allow to select RT.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/x86/Kconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4c9bfc4be58d..f7f81e3012cc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,6 +27,7 @@ config X86_64
# Options that are inherently 64-bit kernel only:
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
+ select ARCH_SUPPORTS_RT
select ARCH_USE_CMPXCHG_LOCKREF
select HAVE_ARCH_SOFT_DIRTY
select MODULES_USE_ELF_RELA
--
2.43.0

View File

@@ -0,0 +1,34 @@
From 28dbe0fc30a2d3e519fac1ffe18fe7427f1f49b3 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 7 Nov 2019 17:49:20 +0100
Subject: [PATCH 11/62] x86: Enable RT also on 32bit
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/x86/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f7f81e3012cc..c9bed9c69423 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,7 +27,6 @@ config X86_64
# Options that are inherently 64-bit kernel only:
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
- select ARCH_SUPPORTS_RT
select ARCH_USE_CMPXCHG_LOCKREF
select HAVE_ARCH_SOFT_DIRTY
select MODULES_USE_ELF_RELA
@@ -114,6 +113,7 @@ config X86
select ARCH_USES_CFI_TRAPS if X86_64 && CFI_CLANG
select ARCH_SUPPORTS_LTO_CLANG
select ARCH_SUPPORTS_LTO_CLANG_THIN
+ select ARCH_SUPPORTS_RT
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_MEMTEST
select ARCH_USE_QUEUED_RWLOCKS
--
2.43.0

View File

@@ -0,0 +1,233 @@
From 454343a4f08e5de772024588aec2bd396177ee89 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 1 Dec 2021 17:41:09 +0100
Subject: [PATCH 12/62] softirq: Use a dedicated thread for timer wakeups.
A timer/hrtimer softirq is raised in-IRQ context. With threaded
interrupts enabled or on PREEMPT_RT this leads to waking the ksoftirqd
for the processing of the softirq.
Once the ksoftirqd is marked as pending (or is running) it will collect
all raised softirqs. This in turn means that a softirq which would have
been processed at the end of the threaded interrupt, which runs at an
elevated priority, is now moved to ksoftirqd which runs at SCHED_OTHER
priority and competes with every regular task for CPU resources.
This introduces long delays on heavily loaded systems and is not desired,
especially if the system is not overloaded by the softirqs.
Split the TIMER_SOFTIRQ and HRTIMER_SOFTIRQ processing into a dedicated
timers thread and let it run at the lowest SCHED_FIFO priority.
RT tasks are woken up from hardirq context, so only timer_list timers
and hrtimers for "regular" tasks are processed here. The higher priority
ensures that wakeups are performed before scheduling SCHED_OTHER tasks.
Using a dedicated variable to store the pending softirq bit values
ensures that the timers are not accidentally picked up by ksoftirqd or
other threaded interrupts.
It shouldn't be picked up by ksoftirqd since it runs at lower priority.
However if the timer bits are ORed while a threaded interrupt is
running, then the timer softirq would be performed at higher priority.
The new timer thread will block on the softirq lock before it starts
softirq work. This "race window" isn't closed because, while the timer
thread is performing the softirq, it can get PI-boosted via the softirq lock by
a random force-threaded thread.
The timer thread can pick up pending softirqs from ksoftirqd but only
if the softirq load is high. It is not desired that the picked-up
softirqs are processed at SCHED_FIFO priority under high softirq load,
but this can already happen via a PI-boost by a force-threaded interrupt.
Reported-by: kernel test robot <lkp@intel.com> [ static timer_threads ]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/interrupt.h | 16 +++++++
kernel/softirq.c | 92 +++++++++++++++++++++++++++++++++++++--
kernel/time/hrtimer.c | 4 +-
kernel/time/timer.c | 2 +-
4 files changed, 108 insertions(+), 6 deletions(-)
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 4a1dc88ddbff..0efba74a835c 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -609,6 +609,22 @@ extern void __raise_softirq_irqoff(unsigned int nr);
extern void raise_softirq_irqoff(unsigned int nr);
extern void raise_softirq(unsigned int nr);
+#ifdef CONFIG_PREEMPT_RT
+extern void raise_timer_softirq(void);
+extern void raise_hrtimer_softirq(void);
+
+#else
+static inline void raise_timer_softirq(void)
+{
+ raise_softirq(TIMER_SOFTIRQ);
+}
+
+static inline void raise_hrtimer_softirq(void)
+{
+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+}
+#endif
+
DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
static inline struct task_struct *this_cpu_ksoftirqd(void)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index c8a6913c067d..ed6d7c41aa17 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -637,6 +637,29 @@ static inline void tick_irq_exit(void)
#endif
}
+#ifdef CONFIG_PREEMPT_RT
+static DEFINE_PER_CPU(struct task_struct *, timersd);
+static DEFINE_PER_CPU(unsigned long, pending_timer_softirq);
+
+static unsigned int local_pending_timers(void)
+{
+ return __this_cpu_read(pending_timer_softirq);
+}
+
+static void wake_timersd(void)
+{
+ struct task_struct *tsk = __this_cpu_read(timersd);
+
+ if (tsk)
+ wake_up_process(tsk);
+}
+
+#else
+
+static inline void wake_timersd(void) { }
+
+#endif
+
static inline void __irq_exit_rcu(void)
{
#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@ -646,8 +669,13 @@ static inline void __irq_exit_rcu(void)
#endif
account_hardirq_exit(current);
preempt_count_sub(HARDIRQ_OFFSET);
- if (!in_interrupt() && local_softirq_pending())
- invoke_softirq();
+ if (!in_interrupt()) {
+ if (local_softirq_pending())
+ invoke_softirq();
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && local_pending_timers())
+ wake_timersd();
+ }
tick_irq_exit();
}
@@ -976,12 +1004,70 @@ static struct smp_hotplug_thread softirq_threads = {
.thread_comm = "ksoftirqd/%u",
};
+#ifdef CONFIG_PREEMPT_RT
+static void timersd_setup(unsigned int cpu)
+{
+ sched_set_fifo_low(current);
+}
+
+static int timersd_should_run(unsigned int cpu)
+{
+ return local_pending_timers();
+}
+
+static void run_timersd(unsigned int cpu)
+{
+ unsigned int timer_si;
+
+ ksoftirqd_run_begin();
+
+ timer_si = local_pending_timers();
+ __this_cpu_write(pending_timer_softirq, 0);
+ or_softirq_pending(timer_si);
+
+ __do_softirq();
+
+ ksoftirqd_run_end();
+}
+
+static void raise_ktimers_thread(unsigned int nr)
+{
+ trace_softirq_raise(nr);
+ __this_cpu_or(pending_timer_softirq, 1 << nr);
+}
+
+void raise_hrtimer_softirq(void)
+{
+ raise_ktimers_thread(HRTIMER_SOFTIRQ);
+}
+
+void raise_timer_softirq(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ raise_ktimers_thread(TIMER_SOFTIRQ);
+ wake_timersd();
+ local_irq_restore(flags);
+}
+
+static struct smp_hotplug_thread timer_threads = {
+ .store = &timersd,
+ .setup = timersd_setup,
+ .thread_should_run = timersd_should_run,
+ .thread_fn = run_timersd,
+ .thread_comm = "ktimers/%u",
+};
+#endif
+
static __init int spawn_ksoftirqd(void)
{
cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
takeover_tasklets);
BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
-
+#ifdef CONFIG_PREEMPT_RT
+ BUG_ON(smpboot_register_percpu_thread(&timer_threads));
+#endif
return 0;
}
early_initcall(spawn_ksoftirqd);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5561dabc9b22..c5d480d5da15 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1805,7 +1805,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
if (!ktime_before(now, cpu_base->softirq_expires_next)) {
cpu_base->softirq_expires_next = KTIME_MAX;
cpu_base->softirq_activated = 1;
- raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ raise_hrtimer_softirq();
}
__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
@@ -1918,7 +1918,7 @@ void hrtimer_run_queues(void)
if (!ktime_before(now, cpu_base->softirq_expires_next)) {
cpu_base->softirq_expires_next = KTIME_MAX;
cpu_base->softirq_activated = 1;
- raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ raise_hrtimer_softirq();
}
__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 717fcb9fb14a..e6219da89933 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1822,7 +1822,7 @@ static void run_local_timers(void)
if (time_before(jiffies, base->next_expiry))
return;
}
- raise_softirq(TIMER_SOFTIRQ);
+ raise_timer_softirq();
}
/*
--
2.43.0
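For context on sched_set_fifo_low() above: the new ktimers/%u threads run at the lowest SCHED_FIFO priority, above every SCHED_OTHER task but below all other RT work. A userspace analogue of that setup, assuming the usual POSIX thread APIs and sufficient privileges (CAP_SYS_NICE or an RLIMIT_RTPRIO permitting priority 1); names are illustrative:

#include <pthread.h>
#include <sched.h>
#include <stddef.h>

static void *timers_worker(void *arg)
{
	(void)arg;
	/* ... block until timer softirq bits are pending, then run them ... */
	return NULL;
}

int spawn_fifo_low_worker(pthread_t *thread)
{
	pthread_attr_t attr;
	struct sched_param sp = { .sched_priority = 1 }; /* FIFO minimum on Linux */

	pthread_attr_init(&attr);
	pthread_attr_setschedpolicy(&attr, SCHED_FIFO);
	pthread_attr_setschedparam(&attr, &sp);
	/* Without this, the new thread inherits the creator's policy. */
	pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED);

	return pthread_create(thread, &attr, timers_worker, NULL);
}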

View File

@@ -0,0 +1,80 @@
From a10aa54ef224bbd46ca1777e4b9fe960360961cc Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 5 Apr 2022 03:07:51 +0200
Subject: [PATCH 13/62] rcutorture: Also force sched priority to timersd on
boosting test.
ksoftirqd is statically boosted to the priority level right above the
one of rcu_torture_boost() so that timers, which torture readers rely on,
get a chance to run while rcu_torture_boost() is polling.
However, timer processing got split from ksoftirqd into its own kthread
(timersd), which isn't boosted. It has the same SCHED_FIFO low prio as
rcu_torture_boost() and therefore timers can't preempt it and may
starve.
The issue can be triggered in practice on v5.17.1-rt17 using:
./kvm.sh --allcpus --configs TREE04 --duration 10m --kconfig "CONFIG_EXPERT=y CONFIG_PREEMPT_RT=y"
Fix this by statically boosting timersd, just as is done with
ksoftirqd in commit
ea6d962e80b61 ("rcutorture: Judge RCU priority boosting on grace periods, not callbacks")
Suggested-by: Mel Gorman <mgorman@suse.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lkml.kernel.org/r/20220405010752.1347437-1-frederic@kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/interrupt.h | 1 +
kernel/rcu/rcutorture.c | 6 ++++++
kernel/softirq.c | 2 +-
3 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 0efba74a835c..f459b0f27c94 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -610,6 +610,7 @@ extern void raise_softirq_irqoff(unsigned int nr);
extern void raise_softirq(unsigned int nr);
#ifdef CONFIG_PREEMPT_RT
+DECLARE_PER_CPU(struct task_struct *, timersd);
extern void raise_timer_softirq(void);
extern void raise_hrtimer_softirq(void);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 503c2aa845a4..dcd8c0e44c00 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -2363,6 +2363,12 @@ static int rcutorture_booster_init(unsigned int cpu)
WARN_ON_ONCE(!t);
sp.sched_priority = 2;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+#ifdef CONFIG_PREEMPT_RT
+ t = per_cpu(timersd, cpu);
+ WARN_ON_ONCE(!t);
+ sp.sched_priority = 2;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+#endif
}
/* Don't allow time recalculation while creating a new task. */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ed6d7c41aa17..1892af494cdd 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -638,7 +638,7 @@ static inline void tick_irq_exit(void)
}
#ifdef CONFIG_PREEMPT_RT
-static DEFINE_PER_CPU(struct task_struct *, timersd);
+DEFINE_PER_CPU(struct task_struct *, timersd);
static DEFINE_PER_CPU(unsigned long, pending_timer_softirq);
static unsigned int local_pending_timers(void)
--
2.43.0

View File

@@ -0,0 +1,115 @@
From ae3e63c4320c0c2d3865ba8ecff64a6d03948ce7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 5 Apr 2022 03:07:52 +0200
Subject: [PATCH 14/62] tick: Fix timer storm since introduction of timersd
If timers are pending while the tick is reprogrammed on nohz_mode, the
next expiry is not armed to fire now, it is delayed one jiffy forward
instead, so as not to raise an inextinguishable timer storm in a
scenario like this:
1) IRQ triggers and queues a timer
2) ksoftirqd() is woken up
3) IRQ tail: timer is reprogrammed to fire now
4) IRQ exit
5) TIMER interrupt
6) goto 3)
...all that until we finally reach ksoftirqd.
Unfortunately we are checking the wrong softirq vector bitmask since
the timersd kthread split from ksoftirqd. Timers now have their own
vector state field that must be checked separately. As a result, the
old timer storm is back. This shows up early on boot with extremely long
initcalls:
[ 333.004807] initcall dquot_init+0x0/0x111 returned 0 after 323822879 usecs
and the cause is uncovered with the right trace events showing just
10 microseconds between ticks (~100 000 Hz):
|swapper/-1 1dn.h111 60818582us : hrtimer_expire_entry: hrtimer=00000000e0ef0f6b function=tick_sched_timer now=60415486608
|swapper/-1 1dn.h111 60818592us : hrtimer_expire_entry: hrtimer=00000000e0ef0f6b function=tick_sched_timer now=60415496082
|swapper/-1 1dn.h111 60818601us : hrtimer_expire_entry: hrtimer=00000000e0ef0f6b function=tick_sched_timer now=60415505550
Fix this by checking the right timer vector state from the nohz code.
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lkml.kernel.org/r/20220405010752.1347437-2-frederic@kernel.org
---
include/linux/interrupt.h | 12 ++++++++++++
kernel/softirq.c | 7 +------
kernel/time/tick-sched.c | 2 +-
3 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index f459b0f27c94..a5091ac97fc6 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -611,9 +611,16 @@ extern void raise_softirq(unsigned int nr);
#ifdef CONFIG_PREEMPT_RT
DECLARE_PER_CPU(struct task_struct *, timersd);
+DECLARE_PER_CPU(unsigned long, pending_timer_softirq);
+
extern void raise_timer_softirq(void);
extern void raise_hrtimer_softirq(void);
+static inline unsigned int local_pending_timers(void)
+{
+ return __this_cpu_read(pending_timer_softirq);
+}
+
#else
static inline void raise_timer_softirq(void)
{
@@ -624,6 +631,11 @@ static inline void raise_hrtimer_softirq(void)
{
raise_softirq_irqoff(HRTIMER_SOFTIRQ);
}
+
+static inline unsigned int local_pending_timers(void)
+{
+ return local_softirq_pending();
+}
#endif
DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 1892af494cdd..ab1fe34326ba 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -639,12 +639,7 @@ static inline void tick_irq_exit(void)
#ifdef CONFIG_PREEMPT_RT
DEFINE_PER_CPU(struct task_struct *, timersd);
-static DEFINE_PER_CPU(unsigned long, pending_timer_softirq);
-
-static unsigned int local_pending_timers(void)
-{
- return __this_cpu_read(pending_timer_softirq);
-}
+DEFINE_PER_CPU(unsigned long, pending_timer_softirq);
static void wake_timersd(void)
{
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 798e1841d286..b52e1861b913 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -800,7 +800,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
static inline bool local_timer_softirq_pending(void)
{
- return local_softirq_pending() & BIT(TIMER_SOFTIRQ);
+ return local_pending_timers() & BIT(TIMER_SOFTIRQ);
}
static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
--
2.43.0

View File

@@ -0,0 +1,49 @@
From ca98adaa69af0a5f3bb28cccb6543ee3e0c4a23f Mon Sep 17 00:00:00 2001
From: Junxiao Chang <junxiao.chang@intel.com>
Date: Mon, 20 Feb 2023 09:12:20 +0100
Subject: [PATCH 15/62] softirq: Wake ktimers thread also in softirq.
If the hrtimer is raised while a softirq is processed then it does not
wake the corresponding ktimers thread. This is due to the optimisation in the
irq-exit path which is also used to wake the ktimers thread. For the other
softirqs, this is okay because the additional softirq bits will be handled by
the currently running softirq handler.
The timer related softirq bits are added to a different variable and rely on
the ktimers thread.
As a consequence, the wakeup of ktimersd is delayed until the next timer tick.
Always wake the ktimers thread if a timer related softirq is pending.
Reported-by: Peh, Hock Zhang <hock.zhang.peh@intel.com>
Signed-off-by: Junxiao Chang <junxiao.chang@intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/softirq.c | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ab1fe34326ba..82f3e68fbe22 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -664,13 +664,12 @@ static inline void __irq_exit_rcu(void)
#endif
account_hardirq_exit(current);
preempt_count_sub(HARDIRQ_OFFSET);
- if (!in_interrupt()) {
- if (local_softirq_pending())
- invoke_softirq();
+ if (!in_interrupt() && local_softirq_pending())
+ invoke_softirq();
- if (IS_ENABLED(CONFIG_PREEMPT_RT) && local_pending_timers())
- wake_timersd();
- }
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && local_pending_timers() &&
+ !(in_nmi() | in_hardirq()))
+ wake_timersd();
tick_irq_exit();
}
--
2.43.0

View File

@@ -0,0 +1,81 @@
From 87e5c70f401b5230b5125dedc88e10f54909a37e Mon Sep 17 00:00:00 2001
From: Haris Okanovic <haris.okanovic@ni.com>
Date: Tue, 15 Aug 2017 15:13:08 -0500
Subject: [PATCH 16/62] tpm_tis: fix stall after iowrite*()s
ioread8() operations to TPM MMIO addresses can stall the cpu when
immediately following a sequence of iowrite*()'s to the same region.
For example, cyclictest measures ~400us latency spikes when a non-RT
usermode application communicates with an SPI-based TPM chip (Intel Atom
E3940 system, PREEMPT_RT kernel). The spikes are caused by a
stalling ioread8() operation following a sequence of 30+ iowrite8()s to
the same address. I believe this happens because the write sequence is
buffered (in cpu or somewhere along the bus), and gets flushed on the
first LOAD instruction (ioread*()) that follows.
The enclosed change appears to fix this issue: read the TPM chip's
access register (status code) after every iowrite*() operation to
amortize the cost of flushing data to chip across multiple instructions.
Signed-off-by: Haris Okanovic <haris.okanovic@ni.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
drivers/char/tpm/tpm_tis.c | 29 +++++++++++++++++++++++++++--
1 file changed, 27 insertions(+), 2 deletions(-)
diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c
index 0d084d6652c4..5d620322bdc2 100644
--- a/drivers/char/tpm/tpm_tis.c
+++ b/drivers/char/tpm/tpm_tis.c
@@ -50,6 +50,31 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da
return container_of(data, struct tpm_tis_tcg_phy, priv);
}
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * Flushes previous write operations to chip so that a subsequent
+ * ioread*()s won't stall a cpu.
+ */
+static inline void tpm_tis_flush(void __iomem *iobase)
+{
+ ioread8(iobase + TPM_ACCESS(0));
+}
+#else
+#define tpm_tis_flush(iobase) do { } while (0)
+#endif
+
+static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr)
+{
+ iowrite8(b, iobase + addr);
+ tpm_tis_flush(iobase);
+}
+
+static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr)
+{
+ iowrite32(b, iobase + addr);
+ tpm_tis_flush(iobase);
+}
+
static int interrupts = -1;
module_param(interrupts, int, 0444);
MODULE_PARM_DESC(interrupts, "Enable interrupts");
@@ -202,12 +227,12 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len,
switch (io_mode) {
case TPM_TIS_PHYS_8:
while (len--)
- iowrite8(*value++, phy->iobase + addr);
+ tpm_tis_iowrite8(*value++, phy->iobase, addr);
break;
case TPM_TIS_PHYS_16:
return -EINVAL;
case TPM_TIS_PHYS_32:
- iowrite32(le32_to_cpu(*((__le32 *)value)), phy->iobase + addr);
+ tpm_tis_iowrite32(le32_to_cpu(*((__le32 *)value)), phy->iobase, addr);
break;
}
--
2.43.0
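The pattern behind tpm_tis_flush() generalizes: on buses with posted writes, a read from the same device region forces the buffered writes to reach the device before the CPU continues. A generic sketch of the idea; the base pointer and register offsets are illustrative, and in the kernel this is done with iowrite8()/ioread8() as shown above:

#include <stdint.h>

static inline void mmio_write8_flushed(volatile uint8_t *base,
				       uint32_t reg, uint8_t val,
				       uint32_t flush_reg)
{
	base[reg] = val;	/* posted write: may linger in a buffer */
	(void)base[flush_reg];	/* read-back completes the pending writes */
}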

View File

@@ -0,0 +1,99 @@
From 8397109d43ef57d5e91d738354b9c30f49cb2f95 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <umgwanakikbuti@gmail.com>
Date: Thu, 31 Mar 2016 04:08:28 +0200
Subject: [PATCH 17/62] zram: Replace bit spinlocks with spinlock_t for
PREEMPT_RT.
The bit spinlock disables preemption on PREEMPT_RT. With disabled preemption it
is not allowed to acquire other sleeping locks which includes invoking
zs_free().
Use a spinlock_t on PREEMPT_RT for locking and set/clear ZRAM_LOCK after the
lock has been acquired/dropped.
Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lkml.kernel.org/r/YqIbMuHCPiQk+Ac2@linutronix.de
---
drivers/block/zram/zram_drv.c | 36 +++++++++++++++++++++++++++++++++++
drivers/block/zram/zram_drv.h | 3 +++
2 files changed, 39 insertions(+)
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 966aab902d19..ee69e4443691 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -57,6 +57,40 @@ static void zram_free_page(struct zram *zram, size_t index);
static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
u32 index, int offset, struct bio *bio);
+#ifdef CONFIG_PREEMPT_RT
+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages)
+{
+ size_t index;
+
+ for (index = 0; index < num_pages; index++)
+ spin_lock_init(&zram->table[index].lock);
+}
+
+static int zram_slot_trylock(struct zram *zram, u32 index)
+{
+ int ret;
+
+ ret = spin_trylock(&zram->table[index].lock);
+ if (ret)
+ __set_bit(ZRAM_LOCK, &zram->table[index].flags);
+ return ret;
+}
+
+static void zram_slot_lock(struct zram *zram, u32 index)
+{
+ spin_lock(&zram->table[index].lock);
+ __set_bit(ZRAM_LOCK, &zram->table[index].flags);
+}
+
+static void zram_slot_unlock(struct zram *zram, u32 index)
+{
+ __clear_bit(ZRAM_LOCK, &zram->table[index].flags);
+ spin_unlock(&zram->table[index].lock);
+}
+
+#else
+
+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { }
static int zram_slot_trylock(struct zram *zram, u32 index)
{
@@ -72,6 +106,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index)
{
bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags);
}
+#endif
static inline bool init_done(struct zram *zram)
{
@@ -1187,6 +1222,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
if (!huge_class_size)
huge_class_size = zs_huge_class_size(zram->mem_pool);
+ zram_meta_init_table_locks(zram, num_pages);
return true;
}
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index a2bda53020fd..ae7950b26db5 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -62,6 +62,9 @@ struct zram_table_entry {
unsigned long element;
};
unsigned long flags;
+#ifdef CONFIG_PREEMPT_RT
+ spinlock_t lock;
+#endif
#ifdef CONFIG_ZRAM_MEMORY_TRACKING
ktime_t ac_time;
#endif
--
2.43.0
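(To make the locking contract above concrete, a hedged usage sketch — example_reset_slot() is hypothetical; zram_free_page() is the real helper declared in the hunk above. Per-slot table accesses are bracketed by zram_slot_lock()/zram_slot_unlock(), and because the slot lock is a spinlock_t on PREEMPT_RT, operations that may sleep, such as zs_free(), are legal inside the section:)

static void example_reset_slot(struct zram *zram, u32 index)
{
	zram_slot_lock(zram, index);	/* spinlock_t on PREEMPT_RT */
	zram_free_page(zram, index);	/* may end up calling zs_free() */
	zram_slot_unlock(zram, index);
}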

View File

@ -0,0 +1,33 @@
From dd162e2589601c792a81a3c19ef4a87510ed6ce5 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 11 Mar 2022 17:44:57 +0100
Subject: [PATCH 18/62] locking/lockdep: Remove lockdep_init_map_crosslock.
The cross-release bits have been removed; lockdep_init_map_crosslock() is a
leftover.
Remove lockdep_init_map_crosslock().
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Waiman Long <longman@redhat.com>
Link: https://lore.kernel.org/r/20220311164457.46461-1-bigeasy@linutronix.de
Link: https://lore.kernel.org/r/YqITgY+2aPITu96z@linutronix.de
---
include/linux/lockdep.h | 1 -
1 file changed, 1 deletion(-)
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 1f1099dac3f0..1023f349af71 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -435,7 +435,6 @@ enum xhlock_context_t {
XHLOCK_CTX_NR,
};
-#define lockdep_init_map_crosslock(m, n, k, s) do {} while (0)
/*
* To initialize a lockdep_map statically use this macro.
* Note that _name must not be NULL.
--
2.43.0

File diff suppressed because it is too large

View File

@ -0,0 +1,607 @@
From 18343f23a5f1d466a0c74806983066efba932d5d Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Fri, 4 Feb 2022 16:01:17 +0106
Subject: [PATCH 20/62] printk: add infrastructure for atomic consoles
Many times it is not possible to see the console output on
panic because printing threads cannot be scheduled and/or the
console is already taken, and forcibly overtaking/busting the
locks does not provide the hoped-for results.
Introduce a new infrastructure to support "atomic consoles".
A new optional callback in struct console, write_atomic(), is
available for consoles to provide an implementation for writing
console messages. The implementation must be NMI safe if it
can run on an architecture where NMIs exist.
Console drivers implementing the write_atomic() callback must
also select CONFIG_HAVE_ATOMIC_CONSOLE in order to enable the
atomic console code within the printk subsystem.
If atomic consoles are available, panic() will flush the kernel
log only to the atomic consoles (before busting spinlocks).
Afterwards, panic() will continue as before, which includes
attempting to flush the other (non-atomic) consoles.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/console.h | 16 ++-
init/Kconfig | 4 +
kernel/panic.c | 6 +-
kernel/printk/printk.c | 293 ++++++++++++++++++++++++++++++++++++----
4 files changed, 290 insertions(+), 29 deletions(-)
diff --git a/include/linux/console.h b/include/linux/console.h
index 143653090c48..8a813cbaf928 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -138,9 +138,19 @@ static inline int con_debug_leave(void)
#define CON_BRL (32) /* Used for a braille device */
#define CON_EXTENDED (64) /* Use the extended output format a la /dev/kmsg */
+#ifdef CONFIG_HAVE_ATOMIC_CONSOLE
+struct console_atomic_data {
+ u64 seq;
+ char *text;
+ char *ext_text;
+ char *dropped_text;
+};
+#endif
+
struct console {
char name[16];
void (*write)(struct console *, const char *, unsigned);
+ void (*write_atomic)(struct console *, const char *, unsigned);
int (*read)(struct console *, char *, unsigned);
struct tty_driver *(*device)(struct console *, int *);
void (*unblank)(void);
@@ -153,7 +163,10 @@ struct console {
uint ispeed;
uint ospeed;
u64 seq;
- unsigned long dropped;
+ atomic_long_t dropped;
+#ifdef CONFIG_HAVE_ATOMIC_CONSOLE
+ struct console_atomic_data *atomic_data;
+#endif
struct task_struct *thread;
bool blocked;
@@ -184,6 +197,7 @@ extern int console_set_on_cmdline;
extern struct console *early_console;
enum con_flush_mode {
+ CONSOLE_ATOMIC_FLUSH_PENDING,
CONSOLE_FLUSH_PENDING,
CONSOLE_REPLAY_ALL,
};
diff --git a/init/Kconfig b/init/Kconfig
index de255842f5d0..d45312780b3a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1582,6 +1582,10 @@ config PRINTK
very difficult to diagnose system problems, saying N here is
strongly discouraged.
+config HAVE_ATOMIC_CONSOLE
+ bool
+ default n
+
config BUG
bool "BUG() support" if EXPERT
default y
diff --git a/kernel/panic.c b/kernel/panic.c
index 88cd873c7c30..97cc495d95f8 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -322,7 +322,6 @@ void panic(const char *fmt, ...)
panic_smp_self_stop();
console_verbose();
- bust_spinlocks(1);
va_start(args, fmt);
len = vscnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
@@ -339,6 +338,11 @@ void panic(const char *fmt, ...)
dump_stack();
#endif
+ /* If atomic consoles are available, flush the kernel log. */
+ console_flush_on_panic(CONSOLE_ATOMIC_FLUSH_PENDING);
+
+ bust_spinlocks(1);
+
/*
* If kgdb is enabled, give it a chance to run before we stop all
* the other CPUs or else we won't be able to debug processes left
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index e9f9b66608a0..73b1727087c7 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -44,6 +44,7 @@
#include <linux/irq_work.h>
#include <linux/ctype.h>
#include <linux/uio.h>
+#include <linux/clocksource.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/task_stack.h>
@@ -2060,19 +2061,28 @@ static int console_trylock_spinning(void)
* dropped, a dropped message will be written out first.
*/
static void call_console_driver(struct console *con, const char *text, size_t len,
- char *dropped_text)
+ char *dropped_text, bool atomic_printing)
{
+ unsigned long dropped = 0;
size_t dropped_len;
- if (con->dropped && dropped_text) {
+ if (dropped_text)
+ dropped = atomic_long_xchg_relaxed(&con->dropped, 0);
+
+ if (dropped) {
dropped_len = snprintf(dropped_text, DROPPED_TEXT_MAX,
"** %lu printk messages dropped **\n",
- con->dropped);
- con->dropped = 0;
- con->write(con, dropped_text, dropped_len);
+ dropped);
+ if (atomic_printing)
+ con->write_atomic(con, dropped_text, dropped_len);
+ else
+ con->write(con, dropped_text, dropped_len);
}
- con->write(con, text, len);
+ if (atomic_printing)
+ con->write_atomic(con, text, len);
+ else
+ con->write(con, text, len);
}
/*
@@ -2430,6 +2440,76 @@ asmlinkage __visible int _printk(const char *fmt, ...)
}
EXPORT_SYMBOL(_printk);
+#ifdef CONFIG_HAVE_ATOMIC_CONSOLE
+static void __free_atomic_data(struct console_atomic_data *d)
+{
+ kfree(d->text);
+ kfree(d->ext_text);
+ kfree(d->dropped_text);
+}
+
+static void free_atomic_data(struct console_atomic_data *d)
+{
+ int count = 1;
+ int i;
+
+ if (!d)
+ return;
+
+#ifdef CONFIG_HAVE_NMI
+ count = 2;
+#endif
+
+ for (i = 0; i < count; i++)
+ __free_atomic_data(&d[i]);
+ kfree(d);
+}
+
+static int __alloc_atomic_data(struct console_atomic_data *d, short flags)
+{
+ d->text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
+ if (!d->text)
+ return -1;
+
+ if (flags & CON_EXTENDED) {
+ d->ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL);
+ if (!d->ext_text)
+ return -1;
+ } else {
+ d->dropped_text = kmalloc(DROPPED_TEXT_MAX, GFP_KERNEL);
+ if (!d->dropped_text)
+ return -1;
+ }
+
+ return 0;
+}
+
+static struct console_atomic_data *alloc_atomic_data(short flags)
+{
+ struct console_atomic_data *d;
+ int count = 1;
+ int i;
+
+#ifdef CONFIG_HAVE_NMI
+ count = 2;
+#endif
+
+ d = kzalloc(sizeof(*d) * count, GFP_KERNEL);
+ if (!d)
+ goto err_out;
+
+ for (i = 0; i < count; i++) {
+ if (__alloc_atomic_data(&d[i], flags) != 0)
+ goto err_out;
+ }
+
+ return d;
+err_out:
+ free_atomic_data(d);
+ return NULL;
+}
+#endif /* CONFIG_HAVE_ATOMIC_CONSOLE */
+
static bool pr_flush(int timeout_ms, bool reset_on_progress);
static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress);
@@ -2445,6 +2525,8 @@ static void printk_start_kthread(struct console *con);
#define prb_first_valid_seq(rb) 0
#define prb_next_seq(rb) 0
+#define free_atomic_data(d)
+
static u64 syslog_seq;
static size_t record_print_text(const struct printk_record *r,
@@ -2463,7 +2545,7 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,
static void console_lock_spinning_enable(void) { }
static int console_lock_spinning_disable_and_check(void) { return 0; }
static void call_console_driver(struct console *con, const char *text, size_t len,
- char *dropped_text)
+ char *dropped_text, bool atomic_printing)
{
}
static bool suppress_message_printing(int level) { return false; }
@@ -2819,10 +2901,20 @@ static inline bool __console_is_usable(short flags)
*
* Requires holding the console_lock.
*/
-static inline bool console_is_usable(struct console *con)
+static inline bool console_is_usable(struct console *con, bool atomic_printing)
{
- if (!con->write)
+ if (atomic_printing) {
+#ifdef CONFIG_HAVE_ATOMIC_CONSOLE
+ if (!con->write_atomic)
+ return false;
+ if (!con->atomic_data)
+ return false;
+#else
+ return false;
+#endif
+ } else if (!con->write) {
return false;
+ }
return __console_is_usable(con->flags);
}
@@ -2847,6 +2939,66 @@ static void __console_unlock(void)
up_console_sem();
}
+static u64 read_console_seq(struct console *con)
+{
+#ifdef CONFIG_HAVE_ATOMIC_CONSOLE
+ unsigned long flags;
+ u64 seq2;
+ u64 seq;
+
+ if (!con->atomic_data)
+ return con->seq;
+
+ printk_cpu_sync_get_irqsave(flags);
+
+ seq = con->seq;
+ seq2 = con->atomic_data[0].seq;
+ if (seq2 > seq)
+ seq = seq2;
+#ifdef CONFIG_HAVE_NMI
+ seq2 = con->atomic_data[1].seq;
+ if (seq2 > seq)
+ seq = seq2;
+#endif
+
+ printk_cpu_sync_put_irqrestore(flags);
+
+ return seq;
+#else /* CONFIG_HAVE_ATOMIC_CONSOLE */
+ return con->seq;
+#endif
+}
+
+static void write_console_seq(struct console *con, u64 val, bool atomic_printing)
+{
+#ifdef CONFIG_HAVE_ATOMIC_CONSOLE
+ unsigned long flags;
+ u64 *seq;
+
+ if (!con->atomic_data) {
+ con->seq = val;
+ return;
+ }
+
+ printk_cpu_sync_get_irqsave(flags);
+
+ if (atomic_printing) {
+ seq = &con->atomic_data[0].seq;
+#ifdef CONFIG_HAVE_NMI
+ if (in_nmi())
+ seq = &con->atomic_data[1].seq;
+#endif
+ } else {
+ seq = &con->seq;
+ }
+ *seq = val;
+
+ printk_cpu_sync_put_irqrestore(flags);
+#else /* CONFIG_HAVE_ATOMIC_CONSOLE */
+ con->seq = val;
+#endif
+}
+
/*
* Print one record for the given console. The record printed is whatever
* record is the next available record for the given console.
@@ -2859,6 +3011,8 @@ static void __console_unlock(void)
* If dropped messages should be printed, @dropped_text is a buffer of size
* DROPPED_TEXT_MAX. Otherwise @dropped_text must be NULL.
*
+ * @atomic_printing specifies if atomic printing should be used.
+ *
* @handover will be set to true if a printk waiter has taken over the
* console_lock, in which case the caller is no longer holding the
* console_lock. Otherwise it is set to false. A NULL pointer may be provided
@@ -2871,7 +3025,8 @@ static void __console_unlock(void)
* Requires con->lock otherwise.
*/
static bool __console_emit_next_record(struct console *con, char *text, char *ext_text,
- char *dropped_text, bool *handover)
+ char *dropped_text, bool atomic_printing,
+ bool *handover)
{
static atomic_t panic_console_dropped = ATOMIC_INIT(0);
struct printk_info info;
@@ -2879,18 +3034,22 @@ static bool __console_emit_next_record(struct console *con, char *text, char *ex
unsigned long flags;
char *write_text;
size_t len;
+ u64 seq;
prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
if (handover)
*handover = false;
- if (!prb_read_valid(prb, con->seq, &r))
+ seq = read_console_seq(con);
+
+ if (!prb_read_valid(prb, seq, &r))
return false;
- if (con->seq != r.info->seq) {
- con->dropped += r.info->seq - con->seq;
- con->seq = r.info->seq;
+ if (seq != r.info->seq) {
+ atomic_long_add((unsigned long)(r.info->seq - seq), &con->dropped);
+ write_console_seq(con, r.info->seq, atomic_printing);
+ seq = r.info->seq;
if (panic_in_progress() &&
atomic_fetch_inc_relaxed(&panic_console_dropped) > 10) {
suppress_panic_printk = 1;
@@ -2900,7 +3059,7 @@ static bool __console_emit_next_record(struct console *con, char *text, char *ex
/* Skip record that has level above the console loglevel. */
if (suppress_message_printing(r.info->level)) {
- con->seq++;
+ write_console_seq(con, seq + 1, atomic_printing);
goto skip;
}
@@ -2932,9 +3091,9 @@ static bool __console_emit_next_record(struct console *con, char *text, char *ex
stop_critical_timings();
}
- call_console_driver(con, write_text, len, dropped_text);
+ call_console_driver(con, write_text, len, dropped_text, atomic_printing);
- con->seq++;
+ write_console_seq(con, seq + 1, atomic_printing);
if (handover) {
start_critical_timings();
@@ -2966,7 +3125,7 @@ static bool console_emit_next_record_transferable(struct console *con, char *tex
handover = NULL;
}
- return __console_emit_next_record(con, text, ext_text, dropped_text, handover);
+ return __console_emit_next_record(con, text, ext_text, dropped_text, false, handover);
}
/*
@@ -3014,7 +3173,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
for_each_console(con) {
bool progress;
- if (!console_is_usable(con))
+ if (!console_is_usable(con, false))
continue;
any_usable = true;
@@ -3049,6 +3208,68 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
return any_usable;
}
+#if defined(CONFIG_HAVE_ATOMIC_CONSOLE) && defined(CONFIG_PRINTK)
+static bool console_emit_next_record(struct console *con, char *text, char *ext_text,
+ char *dropped_text, bool atomic_printing);
+
+static void atomic_console_flush_all(void)
+{
+ unsigned long flags;
+ struct console *con;
+ bool any_progress;
+ int index = 0;
+
+ if (console_suspended)
+ return;
+
+#ifdef CONFIG_HAVE_NMI
+ if (in_nmi())
+ index = 1;
+#endif
+
+ printk_cpu_sync_get_irqsave(flags);
+
+ do {
+ any_progress = false;
+
+ for_each_console(con) {
+ bool progress;
+
+ if (!console_is_usable(con, true))
+ continue;
+
+ if (con->flags & CON_EXTENDED) {
+ /* Extended consoles do not print "dropped messages". */
+ progress = console_emit_next_record(con,
+ &con->atomic_data->text[index],
+ &con->atomic_data->ext_text[index],
+ NULL,
+ true);
+ } else {
+ progress = console_emit_next_record(con,
+ &con->atomic_data->text[index],
+ NULL,
+ &con->atomic_data->dropped_text[index],
+ true);
+ }
+
+ if (!progress)
+ continue;
+ any_progress = true;
+
+ touch_softlockup_watchdog_sync();
+ clocksource_touch_watchdog();
+ rcu_cpu_stall_reset();
+ touch_nmi_watchdog();
+ }
+ } while (any_progress);
+
+ printk_cpu_sync_put_irqrestore(flags);
+}
+#else /* CONFIG_HAVE_ATOMIC_CONSOLE && CONFIG_PRINTK */
+#define atomic_console_flush_all()
+#endif
+
/**
* console_unlock - unlock the console system
*
@@ -3164,6 +3385,11 @@ void console_unblank(void)
*/
void console_flush_on_panic(enum con_flush_mode mode)
{
+ if (mode == CONSOLE_ATOMIC_FLUSH_PENDING) {
+ atomic_console_flush_all();
+ return;
+ }
+
/*
* If someone else is holding the console lock, trylock will fail
* and may_schedule may be set. Ignore and proceed to unlock so
@@ -3180,7 +3406,7 @@ void console_flush_on_panic(enum con_flush_mode mode)
seq = prb_first_valid_seq(prb);
for_each_console(c)
- c->seq = seq;
+ write_console_seq(c, seq, false);
}
console_unlock();
}
@@ -3420,19 +3646,22 @@ void register_console(struct console *newcon)
console_drivers->next = newcon;
}
- newcon->dropped = 0;
+ atomic_long_set(&newcon->dropped, 0);
newcon->thread = NULL;
newcon->blocked = true;
mutex_init(&newcon->lock);
+#ifdef CONFIG_HAVE_ATOMIC_CONSOLE
+ newcon->atomic_data = NULL;
+#endif
if (newcon->flags & CON_PRINTBUFFER) {
/* Get a consistent copy of @syslog_seq. */
mutex_lock(&syslog_lock);
- newcon->seq = syslog_seq;
+ write_console_seq(newcon, syslog_seq, false);
mutex_unlock(&syslog_lock);
} else {
/* Begin with next message. */
- newcon->seq = prb_next_seq(prb);
+ write_console_seq(newcon, prb_next_seq(prb), false);
}
if (printk_kthreads_available)
@@ -3515,6 +3744,10 @@ int unregister_console(struct console *console)
console_sysfs_notify();
+#ifdef CONFIG_HAVE_ATOMIC_CONSOLE
+ free_atomic_data(console->atomic_data);
+#endif
+
if (console->exit)
res = console->exit(console);
@@ -3645,7 +3878,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
for_each_console(c) {
if (con && con != c)
continue;
- if (!console_is_usable(c))
+ if (!console_is_usable(c, false))
continue;
printk_seq = c->seq;
if (printk_seq < seq)
@@ -3734,9 +3967,10 @@ static void printk_fallback_preferred_direct(void)
* See __console_emit_next_record() for argument and return details.
*/
static bool console_emit_next_record(struct console *con, char *text, char *ext_text,
- char *dropped_text)
+ char *dropped_text, bool atomic_printing)
{
- return __console_emit_next_record(con, text, ext_text, dropped_text, NULL);
+ return __console_emit_next_record(con, text, ext_text, dropped_text,
+ atomic_printing, NULL);
}
static bool printer_should_wake(struct console *con, u64 seq)
@@ -3777,6 +4011,11 @@ static int printk_kthread_func(void *data)
char *text;
int error;
+#ifdef CONFIG_HAVE_ATOMIC_CONSOLE
+ if (con->write_atomic)
+ con->atomic_data = alloc_atomic_data(con->flags);
+#endif
+
text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
if (!text) {
con_printk(KERN_ERR, con, "failed to allocate text buffer\n");
@@ -3854,7 +4093,7 @@ static int printk_kthread_func(void *data)
* which can conditionally invoke cond_resched().
*/
console_may_schedule = 0;
- console_emit_next_record(con, text, ext_text, dropped_text);
+ console_emit_next_record(con, text, ext_text, dropped_text, false);
seq = con->seq;
--
2.43.0
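(A hedged sketch of how a console driver opts in to this infrastructure — the foo_* names are hypothetical; the 8250 driver does exactly this in the next patch. The driver provides both callbacks and must select CONFIG_HAVE_ATOMIC_CONSOLE in its Kconfig entry:)

static void foo_console_write(struct console *co, const char *s,
			      unsigned int count)
{
	/* Normal path: may take the port lock, may be preempted. */
}

static void foo_console_write_atomic(struct console *co, const char *s,
				     unsigned int count)
{
	/* NMI-safe path used by CONSOLE_ATOMIC_FLUSH_PENDING on panic. */
}

static struct console foo_console = {
	.name		= "foo",
	.write		= foo_console_write,
	.write_atomic	= foo_console_write_atomic,
	.flags		= CON_PRINTBUFFER,
	.index		= -1,
};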

View File

@ -0,0 +1,937 @@
From 08b8c0b589806331dc645a8ead6be51c174d93e0 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Fri, 4 Feb 2022 16:01:17 +0106
Subject: [PATCH 21/62] serial: 8250: implement write_atomic
Implement a non-sleeping NMI-safe write_atomic() console function in
order to support atomic console printing during a panic.
Transmitting data requires disabling interrupts. Since write_atomic()
can be called from any context, it may be called while another CPU
is executing in console code. In order to maintain the correct state
of the IER register, use the global cpu_sync to synchronize all
access to the IER register. This synchronization is only necessary
for serial ports that are being used as consoles.
The global cpu_sync is also used to synchronize between the write()
and write_atomic() callbacks. write() synchronizes per character,
write_atomic() synchronizes per line.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/tty/serial/8250/8250.h | 41 ++++-
drivers/tty/serial/8250/8250_aspeed_vuart.c | 2 +-
drivers/tty/serial/8250/8250_bcm7271.c | 21 ++-
drivers/tty/serial/8250/8250_core.c | 24 ++-
drivers/tty/serial/8250/8250_exar.c | 4 +-
drivers/tty/serial/8250/8250_fsl.c | 3 +-
drivers/tty/serial/8250/8250_ingenic.c | 3 +-
drivers/tty/serial/8250/8250_mtk.c | 32 +++-
drivers/tty/serial/8250/8250_omap.c | 18 +--
drivers/tty/serial/8250/8250_port.c | 158 ++++++++++++++++----
drivers/tty/serial/8250/Kconfig | 1 +
include/linux/serial_8250.h | 5 +
12 files changed, 261 insertions(+), 51 deletions(-)
diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h
index eeb7b43ebe53..b17715d340c3 100644
--- a/drivers/tty/serial/8250/8250.h
+++ b/drivers/tty/serial/8250/8250.h
@@ -176,12 +176,49 @@ static inline void serial_dl_write(struct uart_8250_port *up, int value)
up->dl_write(up, value);
}
+static inline int serial8250_in_IER(struct uart_8250_port *up)
+{
+ struct uart_port *port = &up->port;
+ unsigned long flags;
+ bool is_console;
+ int ier;
+
+ is_console = uart_console(port);
+
+ if (is_console)
+ printk_cpu_sync_get_irqsave(flags);
+
+ ier = serial_in(up, UART_IER);
+
+ if (is_console)
+ printk_cpu_sync_put_irqrestore(flags);
+
+ return ier;
+}
+
+static inline void serial8250_set_IER(struct uart_8250_port *up, int ier)
+{
+ struct uart_port *port = &up->port;
+ unsigned long flags;
+ bool is_console;
+
+ is_console = uart_console(port);
+
+ if (is_console)
+ printk_cpu_sync_get_irqsave(flags);
+
+ serial_out(up, UART_IER, ier);
+
+ if (is_console)
+ printk_cpu_sync_put_irqrestore(flags);
+}
+
static inline bool serial8250_set_THRI(struct uart_8250_port *up)
{
if (up->ier & UART_IER_THRI)
return false;
up->ier |= UART_IER_THRI;
- serial_out(up, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
return true;
}
@@ -190,7 +227,7 @@ static inline bool serial8250_clear_THRI(struct uart_8250_port *up)
if (!(up->ier & UART_IER_THRI))
return false;
up->ier &= ~UART_IER_THRI;
- serial_out(up, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
return true;
}
diff --git a/drivers/tty/serial/8250/8250_aspeed_vuart.c b/drivers/tty/serial/8250/8250_aspeed_vuart.c
index 9d2a7856784f..7cc6b527c088 100644
--- a/drivers/tty/serial/8250/8250_aspeed_vuart.c
+++ b/drivers/tty/serial/8250/8250_aspeed_vuart.c
@@ -278,7 +278,7 @@ static void __aspeed_vuart_set_throttle(struct uart_8250_port *up,
up->ier &= ~irqs;
if (!throttle)
up->ier |= irqs;
- serial_out(up, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
}
static void aspeed_vuart_set_throttle(struct uart_port *port, bool throttle)
{
diff --git a/drivers/tty/serial/8250/8250_bcm7271.c b/drivers/tty/serial/8250/8250_bcm7271.c
index ffc7f67e27e3..8b211e668bc0 100644
--- a/drivers/tty/serial/8250/8250_bcm7271.c
+++ b/drivers/tty/serial/8250/8250_bcm7271.c
@@ -609,7 +609,7 @@ static int brcmuart_startup(struct uart_port *port)
* will handle this.
*/
up->ier &= ~UART_IER_RDI;
- serial_port_out(port, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
priv->tx_running = false;
priv->dma.rx_dma = NULL;
@@ -775,10 +775,12 @@ static int brcmuart_handle_irq(struct uart_port *p)
unsigned int iir = serial_port_in(p, UART_IIR);
struct brcmuart_priv *priv = p->private_data;
struct uart_8250_port *up = up_to_u8250p(p);
+ unsigned long cs_flags;
unsigned int status;
unsigned long flags;
unsigned int ier;
unsigned int mcr;
+ bool is_console;
int handled = 0;
/*
@@ -789,6 +791,10 @@ static int brcmuart_handle_irq(struct uart_port *p)
spin_lock_irqsave(&p->lock, flags);
status = serial_port_in(p, UART_LSR);
if ((status & UART_LSR_DR) == 0) {
+ is_console = uart_console(p);
+
+ if (is_console)
+ printk_cpu_sync_get_irqsave(cs_flags);
ier = serial_port_in(p, UART_IER);
/*
@@ -809,6 +815,9 @@ static int brcmuart_handle_irq(struct uart_port *p)
serial_port_in(p, UART_RX);
}
+ if (is_console)
+ printk_cpu_sync_put_irqrestore(cs_flags);
+
handled = 1;
}
spin_unlock_irqrestore(&p->lock, flags);
@@ -823,8 +832,10 @@ static enum hrtimer_restart brcmuart_hrtimer_func(struct hrtimer *t)
struct brcmuart_priv *priv = container_of(t, struct brcmuart_priv, hrt);
struct uart_port *p = priv->up;
struct uart_8250_port *up = up_to_u8250p(p);
+ unsigned long cs_flags;
unsigned int status;
unsigned long flags;
+ bool is_console;
if (priv->shutdown)
return HRTIMER_NORESTART;
@@ -846,12 +857,20 @@ static enum hrtimer_restart brcmuart_hrtimer_func(struct hrtimer *t)
/* re-enable receive unless upper layer has disabled it */
if ((up->ier & (UART_IER_RLSI | UART_IER_RDI)) ==
(UART_IER_RLSI | UART_IER_RDI)) {
+ is_console = uart_console(p);
+
+ if (is_console)
+ printk_cpu_sync_get_irqsave(cs_flags);
+
status = serial_port_in(p, UART_IER);
status |= (UART_IER_RLSI | UART_IER_RDI);
serial_port_out(p, UART_IER, status);
status = serial_port_in(p, UART_MCR);
status |= UART_MCR_RTS;
serial_port_out(p, UART_MCR, status);
+
+ if (is_console)
+ printk_cpu_sync_put_irqrestore(cs_flags);
}
spin_unlock_irqrestore(&p->lock, flags);
return HRTIMER_NORESTART;
diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
index 81a5dab1a828..536f639ff56c 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -255,8 +255,11 @@ static void serial8250_timeout(struct timer_list *t)
static void serial8250_backup_timeout(struct timer_list *t)
{
struct uart_8250_port *up = from_timer(up, t, timer);
+ struct uart_port *port = &up->port;
unsigned int iir, ier = 0, lsr;
+ unsigned long cs_flags;
unsigned long flags;
+ bool is_console;
spin_lock_irqsave(&up->port.lock, flags);
@@ -265,8 +268,16 @@ static void serial8250_backup_timeout(struct timer_list *t)
* based handler.
*/
if (up->port.irq) {
+ is_console = uart_console(port);
+
+ if (is_console)
+ printk_cpu_sync_get_irqsave(cs_flags);
+
ier = serial_in(up, UART_IER);
serial_out(up, UART_IER, 0);
+
+ if (is_console)
+ printk_cpu_sync_put_irqrestore(cs_flags);
}
iir = serial_in(up, UART_IIR);
@@ -289,7 +300,7 @@ static void serial8250_backup_timeout(struct timer_list *t)
serial8250_tx_chars(up);
if (up->port.irq)
- serial_out(up, UART_IER, ier);
+ serial8250_set_IER(up, ier);
spin_unlock_irqrestore(&up->port.lock, flags);
@@ -575,6 +586,14 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev)
#ifdef CONFIG_SERIAL_8250_CONSOLE
+static void univ8250_console_write_atomic(struct console *co, const char *s,
+ unsigned int count)
+{
+ struct uart_8250_port *up = &serial8250_ports[co->index];
+
+ serial8250_console_write_atomic(up, s, count);
+}
+
static void univ8250_console_write(struct console *co, const char *s,
unsigned int count)
{
@@ -668,6 +687,7 @@ static int univ8250_console_match(struct console *co, char *name, int idx,
static struct console univ8250_console = {
.name = "ttyS",
+ .write_atomic = univ8250_console_write_atomic,
.write = univ8250_console_write,
.device = uart_console_device,
.setup = univ8250_console_setup,
@@ -961,7 +981,7 @@ static void serial_8250_overrun_backoff_work(struct work_struct *work)
spin_lock_irqsave(&port->lock, flags);
up->ier |= UART_IER_RLSI | UART_IER_RDI;
up->port.read_status_mask |= UART_LSR_DR;
- serial_out(up, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
spin_unlock_irqrestore(&port->lock, flags);
}
diff --git a/drivers/tty/serial/8250/8250_exar.c b/drivers/tty/serial/8250/8250_exar.c
index b406cba10b0e..246c32c75a4c 100644
--- a/drivers/tty/serial/8250/8250_exar.c
+++ b/drivers/tty/serial/8250/8250_exar.c
@@ -189,6 +189,8 @@ static void xr17v35x_set_divisor(struct uart_port *p, unsigned int baud,
static int xr17v35x_startup(struct uart_port *port)
{
+ struct uart_8250_port *up = up_to_u8250p(port);
+
/*
* First enable access to IER [7:5], ISR [5:4], FCR [5:4],
* MCR [7:5] and MSR [7:0]
@@ -199,7 +201,7 @@ static int xr17v35x_startup(struct uart_port *port)
* Make sure all interrups are masked until initialization is
* complete and the FIFOs are cleared
*/
- serial_port_out(port, UART_IER, 0);
+ serial8250_set_IER(up, 0);
return serial8250_do_startup(port);
}
diff --git a/drivers/tty/serial/8250/8250_fsl.c b/drivers/tty/serial/8250/8250_fsl.c
index 8adfaa183f77..eaf148245a10 100644
--- a/drivers/tty/serial/8250/8250_fsl.c
+++ b/drivers/tty/serial/8250/8250_fsl.c
@@ -58,7 +58,8 @@ int fsl8250_handle_irq(struct uart_port *port)
if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) {
unsigned long delay;
- up->ier = port->serial_in(port, UART_IER);
+ up->ier = serial8250_in_IER(up);
+
if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) {
port->ops->stop_rx(port);
} else {
diff --git a/drivers/tty/serial/8250/8250_ingenic.c b/drivers/tty/serial/8250/8250_ingenic.c
index 2b2f5d8d24b9..2b78e6c394fb 100644
--- a/drivers/tty/serial/8250/8250_ingenic.c
+++ b/drivers/tty/serial/8250/8250_ingenic.c
@@ -146,6 +146,7 @@ OF_EARLYCON_DECLARE(x1000_uart, "ingenic,x1000-uart",
static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value)
{
+ struct uart_8250_port *up = up_to_u8250p(p);
int ier;
switch (offset) {
@@ -167,7 +168,7 @@ static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value)
* If we have enabled modem status IRQs we should enable
* modem mode.
*/
- ier = p->serial_in(p, UART_IER);
+ ier = serial8250_in_IER(up);
if (ier & UART_IER_MSI)
value |= UART_MCR_MDCE | UART_MCR_FCM;
diff --git a/drivers/tty/serial/8250/8250_mtk.c b/drivers/tty/serial/8250/8250_mtk.c
index fb1d5ec0940e..3e7203909d6a 100644
--- a/drivers/tty/serial/8250/8250_mtk.c
+++ b/drivers/tty/serial/8250/8250_mtk.c
@@ -222,12 +222,40 @@ static void mtk8250_shutdown(struct uart_port *port)
static void mtk8250_disable_intrs(struct uart_8250_port *up, int mask)
{
- serial_out(up, UART_IER, serial_in(up, UART_IER) & (~mask));
+ struct uart_port *port = &up->port;
+ unsigned long flags;
+ bool is_console;
+ int ier;
+
+ is_console = uart_console(port);
+
+ if (is_console)
+ printk_cpu_sync_get_irqsave(flags);
+
+ ier = serial_in(up, UART_IER);
+ serial_out(up, UART_IER, ier & (~mask));
+
+ if (is_console)
+ printk_cpu_sync_put_irqrestore(flags);
}
static void mtk8250_enable_intrs(struct uart_8250_port *up, int mask)
{
- serial_out(up, UART_IER, serial_in(up, UART_IER) | mask);
+ struct uart_port *port = &up->port;
+ unsigned long flags;
+ bool is_console;
+ int ier;
+
+ is_console = uart_console(port);
+
+ if (is_console)
+ printk_cpu_sync_get_irqsave(flags);
+
+ ier = serial_in(up, UART_IER);
+ serial_out(up, UART_IER, ier | mask);
+
+ if (is_console)
+ printk_cpu_sync_put_irqrestore(flags);
}
static void mtk8250_set_flow_ctrl(struct uart_8250_port *up, int mode)
diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c
index 0b04d810b3e6..2b8ad5176399 100644
--- a/drivers/tty/serial/8250/8250_omap.c
+++ b/drivers/tty/serial/8250/8250_omap.c
@@ -330,7 +330,7 @@ static void omap8250_restore_regs(struct uart_8250_port *up)
/* drop TCR + TLR access, we setup XON/XOFF later */
serial8250_out_MCR(up, mcr);
- serial_out(up, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
serial_dl_write(up, priv->quot);
@@ -520,7 +520,7 @@ static void omap_8250_pm(struct uart_port *port, unsigned int state,
serial_out(up, UART_EFR, efr | UART_EFR_ECB);
serial_out(up, UART_LCR, 0);
- serial_out(up, UART_IER, (state != 0) ? UART_IERX_SLEEP : 0);
+ serial8250_set_IER(up, (state != 0) ? UART_IERX_SLEEP : 0);
serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
serial_out(up, UART_EFR, efr);
serial_out(up, UART_LCR, 0);
@@ -703,7 +703,7 @@ static int omap_8250_startup(struct uart_port *port)
goto err;
up->ier = UART_IER_RLSI | UART_IER_RDI;
- serial_out(up, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
#ifdef CONFIG_PM
up->capabilities |= UART_CAP_RPM;
@@ -744,7 +744,7 @@ static void omap_8250_shutdown(struct uart_port *port)
serial_out(up, UART_OMAP_EFR2, 0x0);
up->ier = 0;
- serial_out(up, UART_IER, 0);
+ serial8250_set_IER(up, 0);
if (up->dma)
serial8250_release_dma(up);
@@ -792,7 +792,7 @@ static void omap_8250_unthrottle(struct uart_port *port)
up->dma->rx_dma(up);
up->ier |= UART_IER_RLSI | UART_IER_RDI;
port->read_status_mask |= UART_LSR_DR;
- serial_out(up, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
spin_unlock_irqrestore(&port->lock, flags);
pm_runtime_mark_last_busy(port->dev);
@@ -883,7 +883,7 @@ static void __dma_rx_complete(void *param)
__dma_rx_do_complete(p);
if (!priv->throttled) {
p->ier |= UART_IER_RLSI | UART_IER_RDI;
- serial_out(p, UART_IER, p->ier);
+ serial8250_set_IER(p, p->ier);
if (!(priv->habit & UART_HAS_EFR2))
omap_8250_rx_dma(p);
}
@@ -940,7 +940,7 @@ static int omap_8250_rx_dma(struct uart_8250_port *p)
* callback to run.
*/
p->ier &= ~(UART_IER_RLSI | UART_IER_RDI);
- serial_out(p, UART_IER, p->ier);
+ serial8250_set_IER(p, p->ier);
}
goto out;
}
@@ -1153,12 +1153,12 @@ static void am654_8250_handle_rx_dma(struct uart_8250_port *up, u8 iir,
* periodic timeouts, re-enable interrupts.
*/
up->ier &= ~(UART_IER_RLSI | UART_IER_RDI);
- serial_out(up, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
omap_8250_rx_dma_flush(up);
serial_in(up, UART_IIR);
serial_out(up, UART_OMAP_EFR2, 0x0);
up->ier |= UART_IER_RLSI | UART_IER_RDI;
- serial_out(up, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
}
}
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 8efe31448df3..975c16267196 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -744,7 +744,7 @@ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep)
serial_out(p, UART_EFR, UART_EFR_ECB);
serial_out(p, UART_LCR, 0);
}
- serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0);
+ serial8250_set_IER(p, sleep ? UART_IERX_SLEEP : 0);
if (p->capabilities & UART_CAP_EFR) {
serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B);
serial_out(p, UART_EFR, efr);
@@ -755,12 +755,29 @@ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep)
serial8250_rpm_put(p);
}
-static void serial8250_clear_IER(struct uart_8250_port *up)
+static unsigned int serial8250_clear_IER(struct uart_8250_port *up)
{
+ struct uart_port *port = &up->port;
+ unsigned int clearval = 0;
+ unsigned long flags;
+ bool is_console;
+ unsigned int prior;
+
+ is_console = uart_console(port);
+
if (up->capabilities & UART_CAP_UUE)
- serial_out(up, UART_IER, UART_IER_UUE);
- else
- serial_out(up, UART_IER, 0);
+ clearval = UART_IER_UUE;
+
+ if (is_console)
+ printk_cpu_sync_get_irqsave(flags);
+
+ prior = serial_in(up, UART_IER);
+ serial_out(up, UART_IER, clearval);
+
+ if (is_console)
+ printk_cpu_sync_put_irqrestore(flags);
+
+ return prior;
}
#ifdef CONFIG_SERIAL_8250_RSA
@@ -1026,8 +1043,11 @@ static int broken_efr(struct uart_8250_port *up)
*/
static void autoconfig_16550a(struct uart_8250_port *up)
{
+ struct uart_port *port = &up->port;
unsigned char status1, status2;
unsigned int iersave;
+ unsigned long flags;
+ bool is_console;
up->port.type = PORT_16550A;
up->capabilities |= UART_CAP_FIFO;
@@ -1139,6 +1159,11 @@ static void autoconfig_16550a(struct uart_8250_port *up)
return;
}
+ is_console = uart_console(port);
+
+ if (is_console)
+ printk_cpu_sync_get_irqsave(flags);
+
/*
* Try writing and reading the UART_IER_UUE bit (b6).
* If it works, this is probably one of the Xscale platform's
@@ -1174,6 +1199,9 @@ static void autoconfig_16550a(struct uart_8250_port *up)
}
serial_out(up, UART_IER, iersave);
+ if (is_console)
+ printk_cpu_sync_put_irqrestore(flags);
+
/*
* We distinguish between 16550A and U6 16550A by counting
* how many bytes are in the FIFO.
@@ -1196,8 +1224,10 @@ static void autoconfig(struct uart_8250_port *up)
unsigned char status1, scratch, scratch2, scratch3;
unsigned char save_lcr, save_mcr;
struct uart_port *port = &up->port;
+ unsigned long cs_flags;
unsigned long flags;
unsigned int old_capabilities;
+ bool is_console;
if (!port->iobase && !port->mapbase && !port->membase)
return;
@@ -1215,6 +1245,11 @@ static void autoconfig(struct uart_8250_port *up)
up->bugs = 0;
if (!(port->flags & UPF_BUGGY_UART)) {
+ is_console = uart_console(port);
+
+ if (is_console)
+ printk_cpu_sync_get_irqsave(cs_flags);
+
/*
* Do a simple existence test first; if we fail this,
* there's no point trying anything else.
@@ -1244,6 +1279,10 @@ static void autoconfig(struct uart_8250_port *up)
#endif
scratch3 = serial_in(up, UART_IER) & 0x0f;
serial_out(up, UART_IER, scratch);
+
+ if (is_console)
+ printk_cpu_sync_put_irqrestore(cs_flags);
+
if (scratch2 != 0 || scratch3 != 0x0F) {
/*
* We failed; there's nothing here
@@ -1367,7 +1406,9 @@ static void autoconfig_irq(struct uart_8250_port *up)
unsigned char save_mcr, save_ier;
unsigned char save_ICP = 0;
unsigned int ICP = 0;
+ unsigned long flags;
unsigned long irqs;
+ bool is_console;
int irq;
if (port->flags & UPF_FOURPORT) {
@@ -1377,8 +1418,12 @@ static void autoconfig_irq(struct uart_8250_port *up)
inb_p(ICP);
}
- if (uart_console(port))
+ is_console = uart_console(port);
+
+ if (is_console) {
console_lock();
+ printk_cpu_sync_get_irqsave(flags);
+ }
/* forget possible initially masked and pending IRQ */
probe_irq_off(probe_irq_on());
@@ -1410,8 +1455,10 @@ static void autoconfig_irq(struct uart_8250_port *up)
if (port->flags & UPF_FOURPORT)
outb_p(save_ICP, ICP);
- if (uart_console(port))
+ if (is_console) {
+ printk_cpu_sync_put_irqrestore(flags);
console_unlock();
+ }
port->irq = (irq > 0) ? irq : 0;
}
@@ -1424,7 +1471,7 @@ static void serial8250_stop_rx(struct uart_port *port)
up->ier &= ~(UART_IER_RLSI | UART_IER_RDI);
up->port.read_status_mask &= ~UART_LSR_DR;
- serial_port_out(port, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
serial8250_rpm_put(up);
}
@@ -1454,7 +1501,7 @@ void serial8250_em485_stop_tx(struct uart_8250_port *p)
serial8250_clear_and_reinit_fifos(p);
p->ier |= UART_IER_RLSI | UART_IER_RDI;
- serial_port_out(&p->port, UART_IER, p->ier);
+ serial8250_set_IER(p, p->ier);
}
}
EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx);
@@ -1703,7 +1750,7 @@ static void serial8250_disable_ms(struct uart_port *port)
mctrl_gpio_disable_ms(up->gpios);
up->ier &= ~UART_IER_MSI;
- serial_port_out(port, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
}
static void serial8250_enable_ms(struct uart_port *port)
@@ -1719,7 +1766,7 @@ static void serial8250_enable_ms(struct uart_port *port)
up->ier |= UART_IER_MSI;
serial8250_rpm_get(up);
- serial_port_out(port, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
serial8250_rpm_put(up);
}
@@ -2174,8 +2221,7 @@ static void serial8250_put_poll_char(struct uart_port *port,
/*
* First save the IER then disable the interrupts
*/
- ier = serial_port_in(port, UART_IER);
- serial8250_clear_IER(up);
+ ier = serial8250_clear_IER(up);
wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);
/*
@@ -2188,7 +2234,7 @@ static void serial8250_put_poll_char(struct uart_port *port,
* and restore the IER
*/
wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);
- serial_port_out(port, UART_IER, ier);
+ serial8250_set_IER(up, ier);
serial8250_rpm_put(up);
}
@@ -2197,8 +2243,10 @@ static void serial8250_put_poll_char(struct uart_port *port,
int serial8250_do_startup(struct uart_port *port)
{
struct uart_8250_port *up = up_to_u8250p(port);
+ unsigned long cs_flags;
unsigned long flags;
unsigned char iir;
+ bool is_console;
int retval;
u16 lsr;
@@ -2219,7 +2267,7 @@ int serial8250_do_startup(struct uart_port *port)
up->acr = 0;
serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B);
serial_port_out(port, UART_EFR, UART_EFR_ECB);
- serial_port_out(port, UART_IER, 0);
+ serial8250_set_IER(up, 0);
serial_port_out(port, UART_LCR, 0);
serial_icr_write(up, UART_CSR, 0); /* Reset the UART */
serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B);
@@ -2229,7 +2277,7 @@ int serial8250_do_startup(struct uart_port *port)
if (port->type == PORT_DA830) {
/* Reset the port */
- serial_port_out(port, UART_IER, 0);
+ serial8250_set_IER(up, 0);
serial_port_out(port, UART_DA830_PWREMU_MGMT, 0);
mdelay(10);
@@ -2328,6 +2376,8 @@ int serial8250_do_startup(struct uart_port *port)
if (retval)
goto out;
+ is_console = uart_console(port);
+
if (port->irq && !(up->port.flags & UPF_NO_THRE_TEST)) {
unsigned char iir1;
@@ -2344,6 +2394,9 @@ int serial8250_do_startup(struct uart_port *port)
*/
spin_lock_irqsave(&port->lock, flags);
+ if (is_console)
+ printk_cpu_sync_get_irqsave(cs_flags);
+
wait_for_xmitr(up, UART_LSR_THRE);
serial_port_out_sync(port, UART_IER, UART_IER_THRI);
udelay(1); /* allow THRE to set */
@@ -2354,6 +2407,9 @@ int serial8250_do_startup(struct uart_port *port)
iir = serial_port_in(port, UART_IIR);
serial_port_out(port, UART_IER, 0);
+ if (is_console)
+ printk_cpu_sync_put_irqrestore(cs_flags);
+
spin_unlock_irqrestore(&port->lock, flags);
if (port->irqflags & IRQF_SHARED)
@@ -2408,10 +2464,14 @@ int serial8250_do_startup(struct uart_port *port)
* Do a quick test to see if we receive an interrupt when we enable
* the TX irq.
*/
+ if (is_console)
+ printk_cpu_sync_get_irqsave(cs_flags);
serial_port_out(port, UART_IER, UART_IER_THRI);
lsr = serial_port_in(port, UART_LSR);
iir = serial_port_in(port, UART_IIR);
serial_port_out(port, UART_IER, 0);
+ if (is_console)
+ printk_cpu_sync_put_irqrestore(cs_flags);
if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) {
if (!(up->bugs & UART_BUG_TXEN)) {
@@ -2443,7 +2503,7 @@ int serial8250_do_startup(struct uart_port *port)
if (up->dma) {
const char *msg = NULL;
- if (uart_console(port))
+ if (is_console)
msg = "forbid DMA for kernel console";
else if (serial8250_request_dma(up))
msg = "failed to request DMA";
@@ -2494,7 +2554,7 @@ void serial8250_do_shutdown(struct uart_port *port)
*/
spin_lock_irqsave(&port->lock, flags);
up->ier = 0;
- serial_port_out(port, UART_IER, 0);
+ serial8250_set_IER(up, 0);
spin_unlock_irqrestore(&port->lock, flags);
synchronize_irq(port->irq);
@@ -2856,7 +2916,7 @@ serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios,
if (up->capabilities & UART_CAP_RTOIE)
up->ier |= UART_IER_RTOIE;
- serial_port_out(port, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
if (up->capabilities & UART_CAP_EFR) {
unsigned char efr = 0;
@@ -3321,7 +3381,7 @@ EXPORT_SYMBOL_GPL(serial8250_set_defaults);
#ifdef CONFIG_SERIAL_8250_CONSOLE
-static void serial8250_console_putchar(struct uart_port *port, unsigned char ch)
+static void serial8250_console_putchar_locked(struct uart_port *port, unsigned char ch)
{
struct uart_8250_port *up = up_to_u8250p(port);
@@ -3329,6 +3389,18 @@ static void serial8250_console_putchar(struct uart_port *port, unsigned char ch)
serial_port_out(port, UART_TX, ch);
}
+static void serial8250_console_putchar(struct uart_port *port, unsigned char ch)
+{
+ struct uart_8250_port *up = up_to_u8250p(port);
+ unsigned long flags;
+
+ wait_for_xmitr(up, UART_LSR_THRE);
+
+ printk_cpu_sync_get_irqsave(flags);
+ serial8250_console_putchar_locked(port, ch);
+ printk_cpu_sync_put_irqrestore(flags);
+}
+
/*
* Restore serial console when h/w power-off detected
*/
@@ -3355,6 +3427,32 @@ static void serial8250_console_restore(struct uart_8250_port *up)
serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS);
}
+void serial8250_console_write_atomic(struct uart_8250_port *up,
+ const char *s, unsigned int count)
+{
+ struct uart_port *port = &up->port;
+ unsigned long flags;
+ unsigned int ier;
+
+ printk_cpu_sync_get_irqsave(flags);
+
+ touch_nmi_watchdog();
+
+ ier = serial8250_clear_IER(up);
+
+ if (atomic_fetch_inc(&up->console_printing)) {
+ uart_console_write(port, "\n", 1,
+ serial8250_console_putchar_locked);
+ }
+ uart_console_write(port, s, count, serial8250_console_putchar_locked);
+ atomic_dec(&up->console_printing);
+
+ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);
+ serial8250_set_IER(up, ier);
+
+ printk_cpu_sync_put_irqrestore(flags);
+}
+
/*
* Print a string to the serial port using the device FIFO
*
@@ -3400,20 +3498,15 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
struct uart_port *port = &up->port;
unsigned long flags;
unsigned int ier, use_fifo;
- int locked = 1;
touch_nmi_watchdog();
- if (oops_in_progress)
- locked = spin_trylock_irqsave(&port->lock, flags);
- else
- spin_lock_irqsave(&port->lock, flags);
+ spin_lock_irqsave(&port->lock, flags);
/*
* First save the IER then disable the interrupts
*/
- ier = serial_port_in(port, UART_IER);
- serial8250_clear_IER(up);
+ ier = serial8250_clear_IER(up);
/* check scratch reg to see if port powered off during system sleep */
if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) {
@@ -3447,10 +3540,12 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
*/
!(up->port.flags & UPF_CONS_FLOW);
+ atomic_inc(&up->console_printing);
if (likely(use_fifo))
serial8250_console_fifo_write(up, s, count);
else
uart_console_write(port, s, count, serial8250_console_putchar);
+ atomic_dec(&up->console_printing);
/*
* Finally, wait for transmitter to become empty
@@ -3463,8 +3558,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
if (em485->tx_stopped)
up->rs485_stop_tx(up);
}
-
- serial_port_out(port, UART_IER, ier);
+ serial8250_set_IER(up, ier);
/*
* The receive handling will happen properly because the
@@ -3476,8 +3570,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
if (up->msr_saved_flags)
serial8250_modem_status(up);
- if (locked)
- spin_unlock_irqrestore(&port->lock, flags);
+ spin_unlock_irqrestore(&port->lock, flags);
}
static unsigned int probe_baud(struct uart_port *port)
@@ -3497,6 +3590,7 @@ static unsigned int probe_baud(struct uart_port *port)
int serial8250_console_setup(struct uart_port *port, char *options, bool probe)
{
+ struct uart_8250_port *up = up_to_u8250p(port);
int baud = 9600;
int bits = 8;
int parity = 'n';
@@ -3506,6 +3600,8 @@ int serial8250_console_setup(struct uart_port *port, char *options, bool probe)
if (!port->iobase && !port->membase)
return -ENODEV;
+ atomic_set(&up->console_printing, 0);
+
if (options)
uart_parse_options(options, &baud, &parity, &bits, &flow);
else if (probe)
diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig
index 583a340f9934..1f31320820a6 100644
--- a/drivers/tty/serial/8250/Kconfig
+++ b/drivers/tty/serial/8250/Kconfig
@@ -9,6 +9,7 @@ config SERIAL_8250
depends on !S390
select SERIAL_CORE
select SERIAL_MCTRL_GPIO if GPIOLIB
+ select HAVE_ATOMIC_CONSOLE
help
This selects whether you want to include the driver for the standard
serial ports. The standard answer is Y. People who might say N
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 79b328861c5f..35f44352e641 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -7,6 +7,7 @@
#ifndef _LINUX_SERIAL_8250_H
#define _LINUX_SERIAL_8250_H
+#include <linux/atomic.h>
#include <linux/serial_core.h>
#include <linux/serial_reg.h>
#include <linux/platform_device.h>
@@ -124,6 +125,8 @@ struct uart_8250_port {
#define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA
unsigned char msr_saved_flags;
+ atomic_t console_printing;
+
struct uart_8250_dma *dma;
const struct uart_8250_ops *ops;
@@ -179,6 +182,8 @@ void serial8250_init_port(struct uart_8250_port *up);
void serial8250_set_defaults(struct uart_8250_port *up);
void serial8250_console_write(struct uart_8250_port *up, const char *s,
unsigned int count);
+void serial8250_console_write_atomic(struct uart_8250_port *up, const char *s,
+ unsigned int count);
int serial8250_console_setup(struct uart_port *port, char *options, bool probe);
int serial8250_console_exit(struct uart_port *port);
--
2.43.0
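(The same synchronization pattern recurs throughout the driver changes above; condensed, every IER access on a port that is used as a console is bracketed by the global printk cpu_sync:)

	unsigned long flags;
	bool is_console = uart_console(port);

	if (is_console)
		printk_cpu_sync_get_irqsave(flags);

	/* ... read or write UART_IER ... */
	serial_out(up, UART_IER, ier);

	if (is_console)
		printk_cpu_sync_put_irqrestore(flags);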

View File

@ -0,0 +1,95 @@
From a549c0dff2a1a3b11b40abc969358d8cb367871d Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Fri, 4 Feb 2022 16:01:17 +0106
Subject: [PATCH 22/62] printk: avoid preempt_disable() for PREEMPT_RT
During non-normal operation, printk() calls will attempt to
write the messages directly to the consoles. This involves
using console_trylock() to acquire @console_sem.
Preemption is disabled while directly printing to the consoles
in order to ensure that the printing task is not scheduled away
while holding @console_sem, thus blocking all other printers
and causing delays in printing.
Commit fd5f7cde1b85 ("printk: Never set console_may_schedule in
console_trylock()") specifically reverted a previous attempt at
allowing preemption while printing.
However, on PREEMPT_RT systems, disabling preemption while
printing is not allowed because console drivers typically
acquire a spin lock (which under PREEMPT_RT is an rtmutex).
Since direct printing is only used during early boot and
non-panic dumps, the risks of delayed print output for these
scenarios will be accepted under PREEMPT_RT.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 21 ++++++++++++++++++++-
1 file changed, 20 insertions(+), 1 deletion(-)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 73b1727087c7..3d0ff49cca29 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1981,6 +1981,7 @@ static int console_lock_spinning_disable_and_check(void)
return 1;
}
+#if !IS_ENABLED(CONFIG_PREEMPT_RT)
/**
* console_trylock_spinning - try to get console_lock by busy waiting
*
@@ -2054,6 +2055,7 @@ static int console_trylock_spinning(void)
return 1;
}
+#endif /* CONFIG_PREEMPT_RT */
/*
* Call the specified console driver, asking it to write out the specified
@@ -2393,6 +2395,18 @@ asmlinkage int vprintk_emit(int facility, int level,
/* If called from the scheduler, we can not call up(). */
if (!in_sched && allow_direct_printing()) {
+#if IS_ENABLED(CONFIG_PREEMPT_RT)
+ /*
+ * Use the non-spinning trylock since PREEMPT_RT does not
+ * support console lock handovers.
+ *
+ * Direct printing will most likely involve taking spinlocks.
+ * For PREEMPT_RT, this is only allowed if in a preemptible
+ * context.
+ */
+ if (preemptible() && console_trylock())
+ console_unlock();
+#else
/*
* The caller may be holding system-critical or
* timing-sensitive locks. Disable preemption during direct
@@ -2410,6 +2424,7 @@ asmlinkage int vprintk_emit(int facility, int level,
if (console_trylock_spinning())
console_unlock();
preempt_enable();
+#endif
}
if (in_sched)
@@ -3119,8 +3134,12 @@ static bool console_emit_next_record_transferable(struct console *con, char *tex
/*
* Handovers are only supported if threaded printers are atomically
* blocked. The context taking over the console_lock may be atomic.
+ *
+ * PREEMPT_RT also does not support handovers because the spinning
+ * waiter can cause large latencies.
*/
- if (!console_kthreads_atomically_blocked()) {
+ if (!console_kthreads_atomically_blocked() ||
+ IS_ENABLED(CONFIG_PREEMPT_RT)) {
*handover = false;
handover = NULL;
}
--
2.43.0
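(Condensed, the direct-printing decision in vprintk_emit() after this patch looks as follows — a paraphrase of the hunk above, not a separate change:)

	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
		/* No handover, no preempt_disable(): print directly
		 * only from a preemptible context. */
		if (preemptible() && console_trylock())
			console_unlock();
	} else {
		preempt_disable();
		if (console_trylock_spinning())
			console_unlock();
		preempt_enable();
	}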

View File

@ -0,0 +1,61 @@
From f99a0874dccc1ba9f9ead4473ba09603d10946b4 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <umgwanakikbuti@gmail.com>
Date: Sat, 27 Feb 2016 08:09:11 +0100
Subject: [PATCH 23/62] drm/i915: Use preempt_disable/enable_rt() where
recommended
Mario Kleiner suggested in commit
ad3543ede630f ("drm/intel: Push get_scanout_position() timestamping into kms driver.")
spots where preemption should be disabled on PREEMPT_RT. The
difference is that on PREEMPT_RT the intel_uncore::lock disables neither
preemption nor interrupts and so the region remains preemptible.
The area covers only register reads and writes. The part that worries me
is:
- __intel_get_crtc_scanline() the worst case is 100us if no match is
found.
- intel_crtc_scanlines_since_frame_timestamp() not sure how long this
may take in the worst case.
It was in the RT queue for a while and nobody complained.
Disable preemption on PREEMPT_RT during timestamping.
[bigeasy: patch description.]
Cc: Mario Kleiner <mario.kleiner.de@gmail.com>
Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/gpu/drm/i915/i915_irq.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index f93ffa6626a5..6e9d033cf808 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -917,7 +917,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc,
*/
spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
- /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
/* Get optional system timestamp before query. */
if (stime)
@@ -981,7 +982,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc,
if (etime)
*etime = ktime_get();
- /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
--
2.43.0

View File

@ -0,0 +1,90 @@
From 7540de1a42a1a7409b4367b67c76480f7e96d25b Mon Sep 17 00:00:00 2001
From: Mike Galbraith <umgwanakikbuti@gmail.com>
Date: Sat, 27 Feb 2016 09:01:42 +0100
Subject: [PATCH 24/62] drm/i915: Don't disable interrupts on PREEMPT_RT during
atomic updates
Commit
8d7849db3eab7 ("drm/i915: Make sprite updates atomic")
started disabling interrupts across atomic updates. This breaks on PREEMPT_RT
because within this section the code attempts to acquire spinlock_t locks which
are sleeping locks on PREEMPT_RT.
According to the comment, the interrupts are disabled to avoid random delays and
are not required for protection or synchronisation.
If this needs to happen with disabled interrupts on PREEMPT_RT, and the
whole section is restricted to register access, then all sleeping locks
need to be acquired before interrupts are disabled and some functions
may be moved to after interrupts are enabled again.
This includes:
- prepare_to_wait() + finish_wait() due its wake queue.
- drm_crtc_vblank_put() -> vblank_disable_fn() drm_device::vbl_lock.
- skl_pfit_enable(), intel_update_plane(), vlv_atomic_update_fifo() and
maybe others due to intel_uncore::lock
- drm_crtc_arm_vblank_event() due to drm_device::event_lock and
drm_device::vblank_time_lock.
Don't disable interrupts on PREEMPT_RT during atomic updates.
[bigeasy: drop local locks, commit message]
Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/gpu/drm/i915/display/intel_crtc.c | 15 ++++++++++-----
1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/i915/display/intel_crtc.c b/drivers/gpu/drm/i915/display/intel_crtc.c
index 6792a9056f46..43cedfef104f 100644
--- a/drivers/gpu/drm/i915/display/intel_crtc.c
+++ b/drivers/gpu/drm/i915/display/intel_crtc.c
@@ -521,7 +521,8 @@ void intel_pipe_update_start(struct intel_crtc_state *new_crtc_state)
*/
intel_psr_wait_for_idle_locked(new_crtc_state);
- local_irq_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_disable();
crtc->debug.min_vbl = min;
crtc->debug.max_vbl = max;
@@ -546,11 +547,13 @@ void intel_pipe_update_start(struct intel_crtc_state *new_crtc_state)
break;
}
- local_irq_enable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_enable();
timeout = schedule_timeout(timeout);
- local_irq_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_disable();
}
finish_wait(wq, &wait);
@@ -583,7 +586,8 @@ void intel_pipe_update_start(struct intel_crtc_state *new_crtc_state)
return;
irq_disable:
- local_irq_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_disable();
}
#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_VBLANK_EVADE)
@@ -684,7 +688,8 @@ void intel_pipe_update_end(struct intel_crtc_state *new_crtc_state)
*/
intel_vrr_send_push(new_crtc_state);
- local_irq_enable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_enable();
if (intel_vgpu_active(dev_priv))
return;
--
2.43.0

View File

@ -0,0 +1,35 @@
From d9970290d810087f44a37de7ae6f3638ecddd3a0 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 25 Oct 2021 15:05:18 +0200
Subject: [PATCH 25/62] drm/i915: Don't check for atomic context on PREEMPT_RT
The !in_atomic() check in _wait_for_atomic() triggers on PREEMPT_RT
because the uncore::lock is a spinlock_t and does not disable
preemption or interrupts.
Changing the uncore::lock to a raw_spinlock_t doubles the worst-case
latency on an otherwise idle testbox during testing. Therefore I'm
currently unsure about changing this.
Link: https://lore.kernel.org/all/20211006164628.s2mtsdd2jdbfyf7g@linutronix.de/
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/gpu/drm/i915/i915_utils.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
index 6c14d13364bf..de58855e6926 100644
--- a/drivers/gpu/drm/i915/i915_utils.h
+++ b/drivers/gpu/drm/i915/i915_utils.h
@@ -294,7 +294,7 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms)
#define wait_for(COND, MS) _wait_for((COND), (MS) * 1000, 10, 1000)
/* If CONFIG_PREEMPT_COUNT is disabled, in_atomic() always reports false. */
-#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT)
+#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) && !defined(CONFIG_PREEMPT_RT)
# define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) WARN_ON_ONCE((ATOMIC) && !in_atomic())
#else
# define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) do { } while (0)
--
2.43.0

View File

@ -0,0 +1,50 @@
From 05b1ad300df55e576476a0dee533c90068480372 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 6 Dec 2018 09:52:20 +0100
Subject: [PATCH 26/62] drm/i915: Disable tracing points on PREEMPT_RT
Luca Abeni reported this:
| BUG: scheduling while atomic: kworker/u8:2/15203/0x00000003
| CPU: 1 PID: 15203 Comm: kworker/u8:2 Not tainted 4.19.1-rt3 #10
| Call Trace:
| rt_spin_lock+0x3f/0x50
| gen6_read32+0x45/0x1d0 [i915]
| g4x_get_vblank_counter+0x36/0x40 [i915]
| trace_event_raw_event_i915_pipe_update_start+0x7d/0xf0 [i915]
The tracing events, trace_i915_pipe_update_start() among others,
use functions that acquire spinlock_t locks, which are transformed into
sleeping locks on PREEMPT_RT. A few trace points use
intel_get_crtc_scanline(), others use ->get_vblank_counter() which also
might acquire sleeping locks on PREEMPT_RT.
At the time the arguments are evaluated within a trace point, preemption
is disabled and so the locks must not be acquired on PREEMPT_RT.
Based on this I don't see any other way than to disable trace points on
PREEMPT_RT.
Reported-by: Luca Abeni <lucabe72@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/gpu/drm/i915/i915_trace.h | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
index 37b5c9e9d260..1434485cb536 100644
--- a/drivers/gpu/drm/i915/i915_trace.h
+++ b/drivers/gpu/drm/i915/i915_trace.h
@@ -6,6 +6,10 @@
#if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ)
#define _I915_TRACE_H_
+#ifdef CONFIG_PREEMPT_RT
+#define NOTRACE
+#endif
+
#include <linux/stringify.h>
#include <linux/types.h>
#include <linux/tracepoint.h>
--
2.43.0
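
The NOTRACE define works because linux/tracepoint.h only enables real tracepoints when NOTRACE is not set; roughly (paraphrased from the tracepoint header, not part of this patch):

/* linux/tracepoint.h, paraphrased:
 *
 *   #if defined(CONFIG_TRACEPOINTS) && !defined(NOTRACE)
 *   #define TRACEPOINTS_ENABLED
 *   #endif
 *
 * With NOTRACE defined before the include, TRACE_EVENT() in
 * i915_trace.h compiles to empty stubs, so no tracepoint argument
 * evaluation (and no lock acquisition) happens on PREEMPT_RT. */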

View File

@ -0,0 +1,34 @@
From 1fcbddeff5b9a56382a6ba0aba49578f8cdf9aa4 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 19 Dec 2018 10:47:02 +0100
Subject: [PATCH 27/62] drm/i915: skip DRM_I915_LOW_LEVEL_TRACEPOINTS with
NOTRACE
The order of the header files is important. If this header file is
included after tracepoint.h has been included, the NOTRACE here becomes
a nop. Currently this happens for two .c files which use the tracepoints
behind DRM_I915_LOW_LEVEL_TRACEPOINTS.
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
drivers/gpu/drm/i915/i915_trace.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
index 1434485cb536..73f29d8008f0 100644
--- a/drivers/gpu/drm/i915/i915_trace.h
+++ b/drivers/gpu/drm/i915/i915_trace.h
@@ -327,7 +327,7 @@ DEFINE_EVENT(i915_request, i915_request_add,
TP_ARGS(rq)
);
-#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS)
+#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) && !defined(NOTRACE)
DEFINE_EVENT(i915_request, i915_request_guc_submit,
TP_PROTO(struct i915_request *rq),
TP_ARGS(rq)
--
2.43.0

View File

@ -0,0 +1,47 @@
From 6dfc680fe2808eaf10a9feed7e3116df60b6032f Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 8 Sep 2021 17:18:00 +0200
Subject: [PATCH 28/62] drm/i915/gt: Queue and wait for the irq_work item.
Disabling interrupts and invoking the irq_work function directly breaks
on PREEMPT_RT.
PREEMPT_RT does not invoke all irq_work from hardirq context because
some of the users have spinlock_t locking in the callback function.
These locks are then turned into sleeping locks which cannot be
acquired with disabled interrupts.
Using irq_work_queue() has the benefit that the irqwork will be invoked
in the regular context. In general there is "no" delay between enqueuing
the callback and its invocation because the interrupt is raised right
away on architectures which support it (which includes x86).
Use irq_work_queue() + irq_work_sync() instead of invoking the callback
directly.
Reported-by: Clark Williams <williams@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
---
drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
index ecc990ec1b95..8d04b10681f0 100644
--- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
@@ -312,10 +312,9 @@ void __intel_breadcrumbs_park(struct intel_breadcrumbs *b)
/* Kick the work once more to drain the signalers, and disarm the irq */
irq_work_sync(&b->irq_work);
while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) {
- local_irq_disable();
- signal_irq_work(&b->irq_work);
- local_irq_enable();
+ irq_work_queue(&b->irq_work);
cond_resched();
+ irq_work_sync(&b->irq_work);
}
}
--
2.43.0
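
The replacement pattern, condensed (sketch; drain_signalers() is a made-up name, the API calls are the real ones used by the patch):

#include <linux/irq_work.h>

/* Instead of running the callback by hand with IRQs off -- which
 * breaks RT when the callback takes spinlock_t locks -- queue the
 * work so it runs in its normal context, then wait for it. */
static void drain_signalers(struct irq_work *work)
{
	irq_work_queue(work);	/* raises the self-IPI right away on x86 */
	irq_work_sync(work);	/* returns once the callback has finished */
}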

View File

@ -0,0 +1,94 @@
From 6ca6d59038e0a61a7bb4904310525b1df57a2867 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 8 Sep 2021 19:03:41 +0200
Subject: [PATCH 29/62] drm/i915/gt: Use spin_lock_irq() instead of
local_irq_disable() + spin_lock()
execlists_dequeue() is invoked from a function which uses
local_irq_disable() to disable interrupts so the spin_lock() behaves
like spin_lock_irq().
This breaks PREEMPT_RT because local_irq_disable() + spin_lock() is not
the same as spin_lock_irq().
execlists_dequeue_irq() and execlists_dequeue() each have only one
caller. If intel_engine_cs::active::lock is acquired and released with
the _irq suffix then it behaves almost as if execlists_dequeue() were
invoked with disabled interrupts. The difference is the last part of the
function, which is then invoked with interrupts enabled.
I can't tell if this makes a difference. From looking at it, it might
work to move the last unlock to the end of the function, as I didn't
find anything that would acquire the lock again.
Reported-by: Clark Williams <williams@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
---
.../drm/i915/gt/intel_execlists_submission.c | 17 +++++------------
1 file changed, 5 insertions(+), 12 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index f903ee1ce06e..f54059b63ea9 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -1302,7 +1302,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
* and context switches) submission.
*/
- spin_lock(&sched_engine->lock);
+ spin_lock_irq(&sched_engine->lock);
/*
* If the queue is higher priority than the last
@@ -1402,7 +1402,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
* Even if ELSP[1] is occupied and not worthy
* of timeslices, our queue might be.
*/
- spin_unlock(&sched_engine->lock);
+ spin_unlock_irq(&sched_engine->lock);
return;
}
}
@@ -1428,7 +1428,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
if (last && !can_merge_rq(last, rq)) {
spin_unlock(&ve->base.sched_engine->lock);
- spin_unlock(&engine->sched_engine->lock);
+ spin_unlock_irq(&engine->sched_engine->lock);
return; /* leave this for another sibling */
}
@@ -1590,7 +1590,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
*/
sched_engine->queue_priority_hint = queue_prio(sched_engine);
i915_sched_engine_reset_on_empty(sched_engine);
- spin_unlock(&sched_engine->lock);
+ spin_unlock_irq(&sched_engine->lock);
/*
* We can skip poking the HW if we ended up with exactly the same set
@@ -1616,13 +1616,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
}
}
-static void execlists_dequeue_irq(struct intel_engine_cs *engine)
-{
- local_irq_disable(); /* Suspend interrupts across request submission */
- execlists_dequeue(engine);
- local_irq_enable(); /* flush irq_work (e.g. breadcrumb enabling) */
-}
-
static void clear_ports(struct i915_request **ports, int count)
{
memset_p((void **)ports, NULL, count);
@@ -2476,7 +2469,7 @@ static void execlists_submission_tasklet(struct tasklet_struct *t)
}
if (!engine->execlists.pending[0]) {
- execlists_dequeue_irq(engine);
+ execlists_dequeue(engine);
start_timeslice(engine);
}
--
2.43.0
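
The distinction the patch relies on, in sketch form (demo_lock and demo() are illustrative):

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);

static void demo(void)
{
	/*
	 * Wrong on RT:  local_irq_disable(); spin_lock(&demo_lock);
	 * really turns IRQs off and then tries to take a sleeping
	 * rtmutex, which may schedule with IRQs disabled.
	 *
	 * Correct on both:  spin_lock_irq() lets the lock type decide --
	 * on !RT it disables IRQs and spins, on RT it just takes the
	 * rtmutex and leaves IRQs enabled.
	 */
	spin_lock_irq(&demo_lock);
	/* ... */
	spin_unlock_irq(&demo_lock);
}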

View File

@ -0,0 +1,44 @@
From 6779399c1acfc91f6e9b0bd4dc4abb4f3cb30c78 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 1 Oct 2021 20:01:03 +0200
Subject: [PATCH 30/62] drm/i915: Drop the irqs_disabled() check
The !irqs_disabled() check triggers on PREEMPT_RT even with
i915_sched_engine::lock acquired. The reason is the lock is transformed
into a sleeping lock on PREEMPT_RT and does not disable interrupts.
There is no need to check for disabled interrupts. The lockdep
annotation below already checks if the lock has been acquired by the
caller and will yell if the interrupts are not disabled.
Remove the !irqs_disabled() check.
Reported-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/gpu/drm/i915/i915_request.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 7ce126a01cbf..64a032dfaa90 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -609,7 +609,6 @@ bool __i915_request_submit(struct i915_request *request)
RQ_TRACE(request, "\n");
- GEM_BUG_ON(!irqs_disabled());
lockdep_assert_held(&engine->sched_engine->lock);
/*
@@ -718,7 +717,6 @@ void __i915_request_unsubmit(struct i915_request *request)
*/
RQ_TRACE(request, "\n");
- GEM_BUG_ON(!irqs_disabled());
lockdep_assert_held(&engine->sched_engine->lock);
/*
--
2.43.0
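
The reasoning in short (comment sketch, not from the patch):

/* On !RT the sched_engine lock is taken with the _irq variants, so
 * lockdep_assert_held() already implies the expected IRQ state and
 * lockdep will complain about any mismatch.  On RT the same lock is
 * a sleeping lock that leaves IRQs enabled, so
 * GEM_BUG_ON(!irqs_disabled()) could only fire as a false positive. */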

View File

@ -0,0 +1,28 @@
From fe8c8e1f1ec5c61814ee0c4c90a2cef9d35ecad6 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 21 Feb 2022 17:59:14 +0100
Subject: [PATCH 31/62] Revert "drm/i915: Depend on !PREEMPT_RT."
Once the known issues are addressed, it should be safe to enable the
driver.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/gpu/drm/i915/Kconfig | 1 -
1 file changed, 1 deletion(-)
diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index 6b10868ec72f..1fbdb7b4e6e1 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -3,7 +3,6 @@ config DRM_I915
tristate "Intel 8xx/9xx/G3x/G4x/HD Graphics"
depends on DRM
depends on X86 && PCI
- depends on !PREEMPT_RT
select INTEL_GTT if X86
select INTERVAL_TREE
# we need shmfs for the swappable backing store, and in particular
--
2.43.0

View File

@ -0,0 +1,713 @@
From 87194c420f8ef3b1a8b9b63ae640180e2414e8c4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 26 Oct 2012 18:50:54 +0100
Subject: [PATCH 32/62] sched: Add support for lazy preemption
It has become an obsession to mitigate the determinism vs. throughput
loss of RT. Looking at the mainline semantics of preemption points
gives a hint why RT sucks throughput-wise for ordinary SCHED_OTHER
tasks. One major issue is the wakeup of tasks which are right away
preempting the waking task while the waking task holds a lock on which
the woken task will block right after having preempted the waker. In
mainline this is prevented due to the implicit preemption disable of
spin/rw_lock held regions. On RT this is not possible due to the fully
preemptible nature of sleeping spinlocks.
Though for a SCHED_OTHER task preempting another SCHED_OTHER task this
is really not a correctness issue. RT folks are concerned about
SCHED_FIFO/RR task preemption and not about the purely
fairness-driven SCHED_OTHER preemption latencies.
So I introduced a lazy preemption mechanism which only applies to
SCHED_OTHER tasks preempting another SCHED_OTHER task. Aside from the
existing preempt_count, each task now sports a preempt_lazy_count
which is manipulated on lock acquisition and release. This is slightly
incorrect, as for laziness reasons I coupled this to
migrate_disable/enable so some other mechanisms get the same treatment
(e.g. get_cpu_light).
Now on the scheduler side, instead of setting NEED_RESCHED this sets
NEED_RESCHED_LAZY in case of a SCHED_OTHER/SCHED_OTHER preemption and
therefore allows the waking task to exit the lock-held region before
the woken task preempts. That also works better for cross-CPU wakeups,
as the other side can stay in the adaptive spinning loop.
For RT class preemption there is no change. This simply sets
NEED_RESCHED and forgoes the lazy preemption counter.
Initial tests do not expose any observable latency increase, but
history shows that I've been proven wrong before :)
The lazy preemption mode is on by default, but with
CONFIG_SCHED_DEBUG enabled it can be disabled via:
# echo NO_PREEMPT_LAZY >/sys/kernel/debug/sched_features
and re-enabled via
# echo PREEMPT_LAZY >/sys/kernel/debug/sched_features
The test results so far are very machine- and workload-dependent, but
there is a clear trend that it enhances non-RT workload
performance.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/preempt.h | 54 ++++++++++++++++++++++--
include/linux/sched.h | 37 +++++++++++++++++
include/linux/thread_info.h | 12 +++++-
include/linux/trace_events.h | 10 ++++-
kernel/Kconfig.preempt | 6 +++
kernel/sched/core.c | 79 +++++++++++++++++++++++++++++++++++-
kernel/sched/fair.c | 16 ++++----
kernel/sched/features.h | 3 ++
kernel/sched/sched.h | 9 ++++
kernel/trace/trace.c | 50 ++++++++++++++---------
kernel/trace/trace_events.c | 1 +
kernel/trace/trace_output.c | 18 +++++++-
12 files changed, 260 insertions(+), 35 deletions(-)
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 8cfcc5d45451..9fc4c4bb320f 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -207,6 +207,20 @@ extern void preempt_count_sub(int val);
#define preempt_count_inc() preempt_count_add(1)
#define preempt_count_dec() preempt_count_sub(1)
+#ifdef CONFIG_PREEMPT_LAZY
+#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0)
+#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0)
+#define inc_preempt_lazy_count() add_preempt_lazy_count(1)
+#define dec_preempt_lazy_count() sub_preempt_lazy_count(1)
+#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count)
+#else
+#define add_preempt_lazy_count(val) do { } while (0)
+#define sub_preempt_lazy_count(val) do { } while (0)
+#define inc_preempt_lazy_count() do { } while (0)
+#define dec_preempt_lazy_count() do { } while (0)
+#define preempt_lazy_count() (0)
+#endif
+
#ifdef CONFIG_PREEMPT_COUNT
#define preempt_disable() \
@@ -215,6 +229,12 @@ do { \
barrier(); \
} while (0)
+#define preempt_lazy_disable() \
+do { \
+ inc_preempt_lazy_count(); \
+ barrier(); \
+} while (0)
+
#define sched_preempt_enable_no_resched() \
do { \
barrier(); \
@@ -246,6 +266,18 @@ do { \
__preempt_schedule(); \
} while (0)
+/*
+ * open code preempt_check_resched() because it is not exported to modules and
+ * used by local_unlock() or bpf_enable_instrumentation().
+ */
+#define preempt_lazy_enable() \
+do { \
+ dec_preempt_lazy_count(); \
+ barrier(); \
+ if (should_resched(0)) \
+ __preempt_schedule(); \
+} while (0)
+
#else /* !CONFIG_PREEMPTION */
#define preempt_enable() \
do { \
@@ -253,6 +285,12 @@ do { \
preempt_count_dec(); \
} while (0)
+#define preempt_lazy_enable() \
+do { \
+ dec_preempt_lazy_count(); \
+ barrier(); \
+} while (0)
+
#define preempt_enable_notrace() \
do { \
barrier(); \
@@ -293,6 +331,9 @@ do { \
#define preempt_enable_notrace() barrier()
#define preemptible() 0
+#define preempt_lazy_disable() barrier()
+#define preempt_lazy_enable() barrier()
+
#endif /* CONFIG_PREEMPT_COUNT */
#ifdef MODULE
@@ -311,7 +352,7 @@ do { \
} while (0)
#define preempt_fold_need_resched() \
do { \
- if (tif_need_resched()) \
+ if (tif_need_resched_now()) \
set_preempt_need_resched(); \
} while (0)
@@ -427,8 +468,15 @@ extern void migrate_enable(void);
#else
-static inline void migrate_disable(void) { }
-static inline void migrate_enable(void) { }
+static inline void migrate_disable(void)
+{
+ preempt_lazy_disable();
+}
+
+static inline void migrate_enable(void)
+{
+ preempt_lazy_enable();
+}
#endif /* CONFIG_SMP */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0cac69902ec5..67ec36dbfacf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2061,6 +2061,43 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
}
+#ifdef CONFIG_PREEMPT_LAZY
+static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
+{
+ set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
+}
+
+static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
+{
+ clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
+}
+
+static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
+{
+ return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
+}
+
+static inline int need_resched_lazy(void)
+{
+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
+}
+
+static inline int need_resched_now(void)
+{
+ return test_thread_flag(TIF_NEED_RESCHED);
+}
+
+#else
+static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
+static inline int need_resched_lazy(void) { return 0; }
+
+static inline int need_resched_now(void)
+{
+ return test_thread_flag(TIF_NEED_RESCHED);
+}
+
+#endif
+
/*
* cond_resched() and cond_resched_lock(): latency reduction via
* explicit rescheduling in places that are safe. The return
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 9f392ec76f2b..779e0e96b9cb 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -177,7 +177,17 @@ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti
clear_ti_thread_flag(task_thread_info(t), TIF_##fl)
#endif /* !CONFIG_GENERIC_ENTRY */
-#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
+#ifdef CONFIG_PREEMPT_LAZY
+#define tif_need_resched() (test_thread_flag(TIF_NEED_RESCHED) || \
+ test_thread_flag(TIF_NEED_RESCHED_LAZY))
+#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
+#define tif_need_resched_lazy() test_thread_flag(TIF_NEED_RESCHED_LAZY)
+
+#else
+#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
+#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
+#define tif_need_resched_lazy() 0
+#endif
#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
static inline int arch_within_stack_frames(const void * const stack,
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index c8b5e9781d01..743b1183d184 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -70,6 +70,7 @@ struct trace_entry {
unsigned char flags;
unsigned char preempt_count;
int pid;
+ unsigned char preempt_lazy_count;
};
#define TRACE_EVENT_TYPE_MAX \
@@ -159,9 +160,10 @@ static inline void tracing_generic_entry_update(struct trace_entry *entry,
unsigned int trace_ctx)
{
entry->preempt_count = trace_ctx & 0xff;
+ entry->preempt_lazy_count = (trace_ctx >> 16) & 0xff;
entry->pid = current->pid;
entry->type = type;
- entry->flags = trace_ctx >> 16;
+ entry->flags = trace_ctx >> 24;
}
unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status);
@@ -172,7 +174,13 @@ enum trace_flag_type {
TRACE_FLAG_NEED_RESCHED = 0x04,
TRACE_FLAG_HARDIRQ = 0x08,
TRACE_FLAG_SOFTIRQ = 0x10,
+#ifdef CONFIG_PREEMPT_LAZY
+ TRACE_FLAG_PREEMPT_RESCHED = 0x00,
+ TRACE_FLAG_NEED_RESCHED_LAZY = 0x20,
+#else
+ TRACE_FLAG_NEED_RESCHED_LAZY = 0x00,
TRACE_FLAG_PREEMPT_RESCHED = 0x20,
+#endif
TRACE_FLAG_NMI = 0x40,
TRACE_FLAG_BH_OFF = 0x80,
};
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index c2f1fd95a821..260c08efeb48 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,5 +1,11 @@
# SPDX-License-Identifier: GPL-2.0-only
+config HAVE_PREEMPT_LAZY
+ bool
+
+config PREEMPT_LAZY
+ def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT
+
config PREEMPT_NONE_BUILD
bool
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6bd06122850a..b72fc7d336e4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1040,6 +1040,46 @@ void resched_curr(struct rq *rq)
trace_sched_wake_idle_without_ipi(cpu);
}
+#ifdef CONFIG_PREEMPT_LAZY
+
+static int tsk_is_polling(struct task_struct *p)
+{
+#ifdef TIF_POLLING_NRFLAG
+ return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
+#else
+ return 0;
+#endif
+}
+
+void resched_curr_lazy(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ int cpu;
+
+ if (!sched_feat(PREEMPT_LAZY)) {
+ resched_curr(rq);
+ return;
+ }
+
+ if (test_tsk_need_resched(curr))
+ return;
+
+ if (test_tsk_need_resched_lazy(curr))
+ return;
+
+ set_tsk_need_resched_lazy(curr);
+
+ cpu = cpu_of(rq);
+ if (cpu == smp_processor_id())
+ return;
+
+ /* NEED_RESCHED_LAZY must be visible before we test polling */
+ smp_mb();
+ if (!tsk_is_polling(curr))
+ smp_send_reschedule(cpu);
+}
+#endif
+
void resched_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -2224,6 +2264,7 @@ void migrate_disable(void)
preempt_disable();
this_rq()->nr_pinned++;
p->migration_disabled = 1;
+ preempt_lazy_disable();
preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_disable);
@@ -2255,6 +2296,7 @@ void migrate_enable(void)
barrier();
p->migration_disabled = 0;
this_rq()->nr_pinned--;
+ preempt_lazy_enable();
preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);
@@ -4722,6 +4764,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->on_cpu = 0;
#endif
init_task_preempt_count(p);
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
+ task_thread_info(p)->preempt_lazy_count = 0;
+#endif
#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
@@ -6592,6 +6637,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
next = pick_next_task(rq, prev, &rf);
clear_tsk_need_resched(prev);
+ clear_tsk_need_resched_lazy(prev);
clear_preempt_need_resched();
#ifdef CONFIG_SCHED_DEBUG
rq->last_seen_need_resched_ns = 0;
@@ -6806,6 +6852,30 @@ static void __sched notrace preempt_schedule_common(void)
} while (need_resched());
}
+#ifdef CONFIG_PREEMPT_LAZY
+/*
+ * If TIF_NEED_RESCHED is set then we allow being scheduled away since this
+ * is set by an RT task. Otherwise we try to avoid being scheduled out as long
+ * as the preempt_lazy_count counter is > 0.
+ */
+static __always_inline int preemptible_lazy(void)
+{
+ if (test_thread_flag(TIF_NEED_RESCHED))
+ return 1;
+ if (current_thread_info()->preempt_lazy_count)
+ return 0;
+ return 1;
+}
+
+#else
+
+static inline int preemptible_lazy(void)
+{
+ return 1;
+}
+
+#endif
+
#ifdef CONFIG_PREEMPTION
/*
* This is the entry point to schedule() from in-kernel preemption
@@ -6819,6 +6889,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
*/
if (likely(!preemptible()))
return;
+ if (!preemptible_lazy())
+ return;
preempt_schedule_common();
}
NOKPROBE_SYMBOL(preempt_schedule);
@@ -6866,6 +6938,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
if (likely(!preemptible()))
return;
+ if (!preemptible_lazy())
+ return;
+
do {
/*
* Because the function tracer can trace preempt_count_sub()
@@ -9131,7 +9206,9 @@ void __init init_idle(struct task_struct *idle, int cpu)
/* Set the preempt count _outside_ the spinlocks! */
init_idle_preempt_count(idle, cpu);
-
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
+ task_thread_info(idle)->preempt_lazy_count = 0;
+#endif
/*
* The idle tasks have their own, simple scheduling class:
*/
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2558ab9033be..2dc35af7b5a6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4914,7 +4914,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
if (delta_exec > ideal_runtime) {
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
/*
* The current task ran long enough, ensure it doesn't get
* re-elected due to buddy favours.
@@ -4938,7 +4938,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
return;
if (delta > ideal_runtime)
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
}
static void
@@ -5084,7 +5084,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
* validating it and just reschedule.
*/
if (queued) {
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
return;
}
/*
@@ -5233,7 +5233,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
* hierarchy can be throttled
*/
if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
}
static __always_inline
@@ -5984,7 +5984,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
if (delta < 0) {
if (task_current(rq, p))
- resched_curr(rq);
+ resched_curr_lazy(rq);
return;
}
hrtick_start(rq, delta);
@@ -7712,7 +7712,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
preempt:
- resched_curr(rq);
+ resched_curr_lazy(rq);
/*
* Only set the backward buddy when the current task is still
* on the rq. This can happen when a wakeup gets interleaved
@@ -11877,7 +11877,7 @@ static void task_fork_fair(struct task_struct *p)
* 'current' within the tree based on its new key value.
*/
swap(curr->vruntime, se->vruntime);
- resched_curr(rq);
+ resched_curr_lazy(rq);
}
se->vruntime -= cfs_rq->min_vruntime;
@@ -11904,7 +11904,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
*/
if (task_current(rq, p)) {
if (p->prio > oldprio)
- resched_curr(rq);
+ resched_curr_lazy(rq);
} else
check_preempt_curr(rq, p, 0);
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index ee7f23c76bd3..e13090e33f3c 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -48,6 +48,9 @@ SCHED_FEAT(NONTASK_CAPACITY, true)
#ifdef CONFIG_PREEMPT_RT
SCHED_FEAT(TTWU_QUEUE, false)
+# ifdef CONFIG_PREEMPT_LAZY
+SCHED_FEAT(PREEMPT_LAZY, true)
+# endif
#else
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b62d53d7c264..f2577f511a41 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2350,6 +2350,15 @@ extern void reweight_task(struct task_struct *p, int prio);
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
+#ifdef CONFIG_PREEMPT_LAZY
+extern void resched_curr_lazy(struct rq *rq);
+#else
+static inline void resched_curr_lazy(struct rq *rq)
+{
+ resched_curr(rq);
+}
+#endif
+
extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 87eca95b57fb..462564d652be 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2616,11 +2616,19 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
if (softirq_count() >> (SOFTIRQ_SHIFT + 1))
trace_flags |= TRACE_FLAG_BH_OFF;
- if (tif_need_resched())
+ if (tif_need_resched_now())
trace_flags |= TRACE_FLAG_NEED_RESCHED;
+#ifdef CONFIG_PREEMPT_LAZY
+ /* Run out of bits. Share the LAZY and PREEMPT_RESCHED */
+ if (need_resched_lazy())
+ trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY;
+#else
if (test_preempt_need_resched())
trace_flags |= TRACE_FLAG_PREEMPT_RESCHED;
- return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) |
+#endif
+
+ return (trace_flags << 24) | (min_t(unsigned int, pc & 0xff, 0xf)) |
+ (preempt_lazy_count() & 0xff) << 16 |
(min_t(unsigned int, migration_disable_value(), 0xf)) << 4;
}
@@ -4212,15 +4220,17 @@ unsigned long trace_total_entries(struct trace_array *tr)
static void print_lat_help_header(struct seq_file *m)
{
- seq_puts(m, "# _------=> CPU# \n"
- "# / _-----=> irqs-off/BH-disabled\n"
- "# | / _----=> need-resched \n"
- "# || / _---=> hardirq/softirq \n"
- "# ||| / _--=> preempt-depth \n"
- "# |||| / _-=> migrate-disable \n"
- "# ||||| / delay \n"
- "# cmd pid |||||| time | caller \n"
- "# \\ / |||||| \\ | / \n");
+ seq_puts(m, "# _--------=> CPU# \n"
+ "# / _-------=> irqs-off/BH-disabled\n"
+ "# | / _------=> need-resched \n"
+ "# || / _-----=> need-resched-lazy\n"
+ "# ||| / _----=> hardirq/softirq \n"
+ "# |||| / _---=> preempt-depth \n"
+ "# ||||| / _--=> preempt-lazy-depth\n"
+ "# |||||| / _-=> migrate-disable \n"
+ "# ||||||| / delay \n"
+ "# cmd pid |||||||| time | caller \n"
+ "# \\ / |||||||| \\ | / \n");
}
static void print_event_info(struct array_buffer *buf, struct seq_file *m)
@@ -4254,14 +4264,16 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file
print_event_info(buf, m);
- seq_printf(m, "# %.*s _-----=> irqs-off/BH-disabled\n", prec, space);
- seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space);
- seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space);
- seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space);
- seq_printf(m, "# %.*s||| / _-=> migrate-disable\n", prec, space);
- seq_printf(m, "# %.*s|||| / delay\n", prec, space);
- seq_printf(m, "# TASK-PID %.*s CPU# ||||| TIMESTAMP FUNCTION\n", prec, " TGID ");
- seq_printf(m, "# | | %.*s | ||||| | |\n", prec, " | ");
+ seq_printf(m, "# %.*s _-------=> irqs-off/BH-disabled\n", prec, space);
+ seq_printf(m, "# %.*s / _------=> need-resched\n", prec, space);
+ seq_printf(m, "# %.*s| / _-----=> need-resched-lazy\n", prec, space);
+ seq_printf(m, "# %.*s|| / _----=> hardirq/softirq\n", prec, space);
+ seq_printf(m, "# %.*s||| / _---=> preempt-depth\n", prec, space);
+ seq_printf(m, "# %.*s|||| / _--=> preempt-lazy-depth\n", prec, space);
+ seq_printf(m, "# %.*s||||| / _-=> migrate-disable\n", prec, space);
+ seq_printf(m, "# %.*s|||||| / delay\n", prec, space);
+ seq_printf(m, "# TASK-PID %.*s CPU# ||||||| TIMESTAMP FUNCTION\n", prec, " TGID ");
+ seq_printf(m, "# | | %.*s | ||||||| | |\n", prec, " | ");
}
void
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index a6d2f99f847d..493c3f9cf01a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -208,6 +208,7 @@ static int trace_define_common_fields(void)
/* Holds both preempt_count and migrate_disable */
__common_field(unsigned char, preempt_count);
__common_field(int, pid);
+ __common_field(unsigned char, preempt_lazy_count);
return ret;
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 5cd4fb656306..3c227e2843ae 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -442,6 +442,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
{
char hardsoft_irq;
char need_resched;
+ char need_resched_lazy;
char irqs_off;
int hardirq;
int softirq;
@@ -462,20 +463,27 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
switch (entry->flags & (TRACE_FLAG_NEED_RESCHED |
TRACE_FLAG_PREEMPT_RESCHED)) {
+#ifndef CONFIG_PREEMPT_LAZY
case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED:
need_resched = 'N';
break;
+#endif
case TRACE_FLAG_NEED_RESCHED:
need_resched = 'n';
break;
+#ifndef CONFIG_PREEMPT_LAZY
case TRACE_FLAG_PREEMPT_RESCHED:
need_resched = 'p';
break;
+#endif
default:
need_resched = '.';
break;
}
+ need_resched_lazy =
+ (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
+
hardsoft_irq =
(nmi && hardirq) ? 'Z' :
nmi ? 'z' :
@@ -484,14 +492,20 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
softirq ? 's' :
'.' ;
- trace_seq_printf(s, "%c%c%c",
- irqs_off, need_resched, hardsoft_irq);
+ trace_seq_printf(s, "%c%c%c%c",
+ irqs_off, need_resched, need_resched_lazy,
+ hardsoft_irq);
if (entry->preempt_count & 0xf)
trace_seq_printf(s, "%x", entry->preempt_count & 0xf);
else
trace_seq_putc(s, '.');
+ if (entry->preempt_lazy_count)
+ trace_seq_printf(s, "%x", entry->preempt_lazy_count);
+ else
+ trace_seq_putc(s, '.');
+
if (entry->preempt_count & 0xf0)
trace_seq_printf(s, "%x", entry->preempt_count >> 4);
else
--
2.43.0
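
The decision the patch adds, condensed into one predicate (a paraphrase of the new preemptible_lazy()/should_resched() logic, not the verbatim kernel code):

/* May this SCHED_OTHER task be preempted right now? */
static inline bool may_preempt_now(void)
{
	/* A hard request (e.g. an RT task waiting) always wins. */
	if (test_thread_flag(TIF_NEED_RESCHED))
		return true;
	/* Inside a migrate-disabled / lock-held region: defer. */
	if (current_thread_info()->preempt_lazy_count)
		return false;
	/* Otherwise honour the lazy (fairness-driven) request. */
	return test_thread_flag(TIF_NEED_RESCHED_LAZY);
}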

View File

@ -0,0 +1,35 @@
From 37719a07083b68dad1449227e9fe66e8e6c9f2b6 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 30 Jun 2020 11:45:14 +0200
Subject: [PATCH 33/62] x86/entry: Use should_resched() in
idtentry_exit_cond_resched()
The TIF_NEED_RESCHED bit is inlined on x86 into the preemption counter.
By using should_resched(0) instead of need_resched() the same check can
be performed, and it uses the same variable as the preempt_count() check
issued just before.
Use should_resched(0) instead of need_resched().
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/entry/common.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index be61332c66b5..97ff5faad4fb 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -386,7 +386,7 @@ void raw_irqentry_exit_cond_resched(void)
rcu_irq_exit_check_preempt();
if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
WARN_ON_ONCE(!on_thread_stack());
- if (need_resched())
+ if (should_resched(0))
preempt_schedule_irq();
}
}
--
2.43.0
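
Background for the substitution (comment sketch):

/* On x86, TIF_NEED_RESCHED is mirrored into the per-CPU
 * __preempt_count, so should_resched(0) is a single read testing
 * "preempt count is zero AND a reschedule is requested" -- the same
 * variable the preemption checks around this point already use.
 * need_resched() would instead consult the thread-info flags. */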

View File

@ -0,0 +1,157 @@
From 65ce5ba8ccb26e6de364e76228e645b2c02b921d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 1 Nov 2012 11:03:47 +0100
Subject: [PATCH 34/62] x86: Support for lazy preemption
Implement the x86 pieces for lazy preempt.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/x86/Kconfig | 1 +
arch/x86/include/asm/preempt.h | 33 +++++++++++++++++++++++++++++-
arch/x86/include/asm/thread_info.h | 7 +++++++
include/linux/entry-common.h | 2 +-
kernel/entry/common.c | 2 +-
5 files changed, 42 insertions(+), 3 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c9bed9c69423..f38bd8a5061e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -251,6 +251,7 @@ config X86
select HAVE_PCI
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
+ select HAVE_PREEMPT_LAZY
select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT
select MMU_GATHER_MERGE_VMAS
select HAVE_POSIX_CPU_TIMERS_TASK_WORK
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 5f6daea1ee24..cd20b4a5719a 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -90,17 +90,48 @@ static __always_inline void __preempt_count_sub(int val)
* a decrement which hits zero means we have no preempt_count and should
* reschedule.
*/
-static __always_inline bool __preempt_count_dec_and_test(void)
+static __always_inline bool ____preempt_count_dec_and_test(void)
{
return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var]));
}
+static __always_inline bool __preempt_count_dec_and_test(void)
+{
+ if (____preempt_count_dec_and_test())
+ return true;
+#ifdef CONFIG_PREEMPT_LAZY
+ if (preempt_count())
+ return false;
+ if (current_thread_info()->preempt_lazy_count)
+ return false;
+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
+#else
+ return false;
+#endif
+}
+
/*
* Returns true when we need to resched and can (barring IRQ state).
*/
static __always_inline bool should_resched(int preempt_offset)
{
+#ifdef CONFIG_PREEMPT_LAZY
+ u32 tmp;
+ tmp = raw_cpu_read_4(__preempt_count);
+ if (tmp == preempt_offset)
+ return true;
+
+ /* preempt count == 0 ? */
+ tmp &= ~PREEMPT_NEED_RESCHED;
+ if (tmp != preempt_offset)
+ return false;
+ /* XXX PREEMPT_LOCK_OFFSET */
+ if (current_thread_info()->preempt_lazy_count)
+ return false;
+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
+#else
return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
+#endif
}
#ifdef CONFIG_PREEMPTION
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index f0cb881c1d69..0da06a9b5f72 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -57,6 +57,8 @@ struct thread_info {
unsigned long flags; /* low level flags */
unsigned long syscall_work; /* SYSCALL_WORK_ flags */
u32 status; /* thread synchronous flags */
+ int preempt_lazy_count; /* 0 => lazy preemptable
+ <0 => BUG */
#ifdef CONFIG_SMP
u32 cpu; /* current CPU */
#endif
@@ -65,6 +67,7 @@ struct thread_info {
#define INIT_THREAD_INFO(tsk) \
{ \
.flags = 0, \
+ .preempt_lazy_count = 0, \
}
#else /* !__ASSEMBLY__ */
@@ -92,6 +95,7 @@ struct thread_info {
#define TIF_NOCPUID 15 /* CPUID is not accessible in userland */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */
+#define TIF_NEED_RESCHED_LAZY 19 /* lazy rescheduling necessary */
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
@@ -115,6 +119,7 @@ struct thread_info {
#define _TIF_NOCPUID (1 << TIF_NOCPUID)
#define _TIF_NOTSC (1 << TIF_NOTSC)
#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE)
@@ -146,6 +151,8 @@ struct thread_info {
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
+#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
+
#define STACK_WARN (THREAD_SIZE/8)
/*
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index d95ab85f96ba..93cc1ae12125 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -59,7 +59,7 @@
#define EXIT_TO_USER_MODE_WORK \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
- _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
+ _TIF_NEED_RESCHED_MASK | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
ARCH_EXIT_TO_USER_MODE_WORK)
/**
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 97ff5faad4fb..c6301e520d47 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -155,7 +155,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
local_irq_enable_exit_to_user(ti_work);
- if (ti_work & _TIF_NEED_RESCHED)
+ if (ti_work & _TIF_NEED_RESCHED_MASK)
schedule();
if (ti_work & _TIF_UPROBE)
--
2.43.0

View File

@ -0,0 +1,48 @@
From d37c604152cbded61a8e107918d3b9950725d897 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 13 Jul 2021 07:52:52 +0200
Subject: [PATCH 35/62] entry: Fix the preempt lazy fallout
Common code needs common defines....
Fixes: f2f9e496208c ("x86: Support for lazy preemption")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/x86/include/asm/thread_info.h | 2 --
include/linux/entry-common.h | 6 ++++++
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 0da06a9b5f72..fd8fb76f324f 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -151,8 +151,6 @@ struct thread_info {
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
-#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
-
#define STACK_WARN (THREAD_SIZE/8)
/*
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 93cc1ae12125..3dc3704a3cdb 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -57,6 +57,12 @@
# define ARCH_EXIT_TO_USER_MODE_WORK (0)
#endif
+#ifdef CONFIG_PREEMPT_LAZY
+# define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
+#else
+# define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED)
+#endif
+
#define EXIT_TO_USER_MODE_WORK \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
_TIF_NEED_RESCHED_MASK | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
--
2.43.0

View File

@ -0,0 +1,136 @@
From f242fbbb3f85d6b9c8c8fc06ddbc83b9ca5a0511 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 31 Oct 2012 12:04:11 +0100
Subject: [PATCH 36/62] arm: Add support for lazy preemption
Implement the arm pieces for lazy preempt.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/arm/Kconfig | 1 +
arch/arm/include/asm/thread_info.h | 6 +++++-
arch/arm/kernel/asm-offsets.c | 1 +
arch/arm/kernel/entry-armv.S | 19 ++++++++++++++++---
arch/arm/kernel/signal.c | 3 ++-
5 files changed, 25 insertions(+), 5 deletions(-)
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 6d5afe2e6ba3..717e596dc13b 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -115,6 +115,7 @@ config ARM
select HAVE_PERF_EVENTS
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
+ select HAVE_PREEMPT_LAZY
select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RSEQ
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 7f092cb55a41..ffcbf8ebed4b 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -62,6 +62,7 @@ struct cpu_context_save {
struct thread_info {
unsigned long flags; /* low level flags */
int preempt_count; /* 0 => preemptable, <0 => bug */
+ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
__u32 cpu; /* cpu */
__u32 cpu_domain; /* cpu domain */
struct cpu_context_save cpu_context; /* cpu context */
@@ -129,6 +130,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
#define TIF_UPROBE 3 /* breakpointed or singlestepping */
#define TIF_NOTIFY_SIGNAL 4 /* signal notifications exist */
+#define TIF_NEED_RESCHED_LAZY 5
#define TIF_USING_IWMMXT 17
#define TIF_MEMDIE 18 /* is terminating due to OOM killer */
@@ -148,6 +150,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
#define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT)
/* Checks for any syscall work in entry-common.S */
@@ -157,7 +160,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
/*
* Change these and you break ASM code in entry-common.S
*/
-#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
+#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \
+ _TIF_SIGPENDING | \
_TIF_NOTIFY_RESUME | _TIF_UPROBE | \
_TIF_NOTIFY_SIGNAL)
diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
index 2c8d76fd7c66..c3bdec7d2df9 100644
--- a/arch/arm/kernel/asm-offsets.c
+++ b/arch/arm/kernel/asm-offsets.c
@@ -43,6 +43,7 @@ int main(void)
BLANK();
DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
+ DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
DEFINE(TI_CPU_DOMAIN, offsetof(struct thread_info, cpu_domain));
DEFINE(TI_CPU_SAVE, offsetof(struct thread_info, cpu_context));
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index c39303e5c234..cfb4660e9fea 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -222,11 +222,18 @@ ENDPROC(__dabt_svc)
#ifdef CONFIG_PREEMPTION
ldr r8, [tsk, #TI_PREEMPT] @ get preempt count
- ldr r0, [tsk, #TI_FLAGS] @ get flags
teq r8, #0 @ if preempt count != 0
+ bne 1f @ return from exception
+ ldr r0, [tsk, #TI_FLAGS] @ get flags
+ tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set
+ blne svc_preempt @ preempt!
+
+ ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count
+ teq r8, #0 @ if preempt lazy count != 0
movne r0, #0 @ force flags to 0
- tst r0, #_TIF_NEED_RESCHED
+ tst r0, #_TIF_NEED_RESCHED_LAZY
blne svc_preempt
+1:
#endif
svc_exit r5, irq = 1 @ return from exception
@@ -241,8 +248,14 @@ ENDPROC(__irq_svc)
1: bl preempt_schedule_irq @ irq en/disable is done inside
ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS
tst r0, #_TIF_NEED_RESCHED
+ bne 1b
+ tst r0, #_TIF_NEED_RESCHED_LAZY
reteq r8 @ go again
- b 1b
+ ldr r0, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count
+ teq r0, #0 @ if preempt lazy count != 0
+ beq 1b
+ ret r8 @ go again
+
#endif
__und_fault:
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index e07f359254c3..b50a3248e79f 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -607,7 +607,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
*/
trace_hardirqs_off();
do {
- if (likely(thread_flags & _TIF_NEED_RESCHED)) {
+ if (likely(thread_flags & (_TIF_NEED_RESCHED |
+ _TIF_NEED_RESCHED_LAZY))) {
schedule();
} else {
if (unlikely(!user_mode(regs)))
--
2.43.0

View File

@ -0,0 +1,117 @@
From 65e31d7b980c1413f19fcb84234387f97b09588f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 1 Nov 2012 10:14:11 +0100
Subject: [PATCH 37/62] powerpc: Add support for lazy preemption
Implement the powerpc pieces for lazy preempt.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/powerpc/Kconfig | 1 +
arch/powerpc/include/asm/thread_info.h | 8 ++++++++
arch/powerpc/kernel/interrupt.c | 8 ++++++--
3 files changed, 15 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 6050e6e10d32..0eff864d6ec3 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -242,6 +242,7 @@ config PPC
select HAVE_PERF_EVENTS_NMI if PPC64
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
+ select HAVE_PREEMPT_LAZY
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE
select HAVE_RSEQ
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index af58f1ed3952..520864de8bb2 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -53,6 +53,8 @@
struct thread_info {
int preempt_count; /* 0 => preemptable,
<0 => BUG */
+ int preempt_lazy_count; /* 0 => preemptable,
+ <0 => BUG */
#ifdef CONFIG_SMP
unsigned int cpu;
#endif
@@ -77,6 +79,7 @@ struct thread_info {
#define INIT_THREAD_INFO(tsk) \
{ \
.preempt_count = INIT_PREEMPT_COUNT, \
+ .preempt_lazy_count = 0, \
.flags = 0, \
}
@@ -102,6 +105,7 @@ void arch_setup_new_exec(void);
#define TIF_PATCH_PENDING 6 /* pending live patching update */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SINGLESTEP 8 /* singlestepping active */
+#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */
#define TIF_SECCOMP 10 /* secure computing */
#define TIF_RESTOREALL 11 /* Restore all regs (implies NOERROR) */
#define TIF_NOERROR 12 /* Force successful syscall return */
@@ -117,6 +121,7 @@ void arch_setup_new_exec(void);
#define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling TIF_NEED_RESCHED */
#define TIF_32BIT 20 /* 32 bit binary */
+
/* as above, but as bit values */
#define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
#define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
@@ -128,6 +133,7 @@ void arch_setup_new_exec(void);
#define _TIF_PATCH_PENDING (1<<TIF_PATCH_PENDING)
#define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT)
#define _TIF_SINGLESTEP (1<<TIF_SINGLESTEP)
+#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
#define _TIF_SECCOMP (1<<TIF_SECCOMP)
#define _TIF_RESTOREALL (1<<TIF_RESTOREALL)
#define _TIF_NOERROR (1<<TIF_NOERROR)
@@ -141,10 +147,12 @@ void arch_setup_new_exec(void);
_TIF_SYSCALL_EMU)
#define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
+ _TIF_NEED_RESCHED_LAZY | \
_TIF_NOTIFY_RESUME | _TIF_UPROBE | \
_TIF_RESTORE_TM | _TIF_PATCH_PENDING | \
_TIF_NOTIFY_SIGNAL)
#define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR)
+#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
/* Bits in local_flags */
/* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index cf770d86c03c..2c454731c250 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -186,7 +186,7 @@ interrupt_exit_user_prepare_main(unsigned long ret, struct pt_regs *regs)
ti_flags = read_thread_flags();
while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
local_irq_enable();
- if (ti_flags & _TIF_NEED_RESCHED) {
+ if (ti_flags & _TIF_NEED_RESCHED_MASK) {
schedule();
} else {
/*
@@ -397,11 +397,15 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs)
/* Returning to a kernel context with local irqs enabled. */
WARN_ON_ONCE(!(regs->msr & MSR_EE));
again:
- if (IS_ENABLED(CONFIG_PREEMPT)) {
+ if (IS_ENABLED(CONFIG_PREEMPTION)) {
/* Return to preemptible kernel context */
if (unlikely(read_thread_flags() & _TIF_NEED_RESCHED)) {
if (preempt_count() == 0)
preempt_schedule_irq();
+ } else if (unlikely(current_thread_info()->flags & _TIF_NEED_RESCHED_LAZY)) {
+ if ((preempt_count() == 0) &&
+ (current_thread_info()->preempt_lazy_count == 0))
+ preempt_schedule_irq();
}
}
--
2.43.0
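
One detail worth noting in the interrupt.c hunk (comment sketch):

/* CONFIG_PREEMPTION is selected by both CONFIG_PREEMPT and
 * CONFIG_PREEMPT_RT, while CONFIG_PREEMPT itself is not set on an
 * RT kernel.  Keeping IS_ENABLED(CONFIG_PREEMPT) here would have
 * silently disabled kernel preemption on interrupt exit for RT. */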

View File

@ -0,0 +1,145 @@
From 12a36ad989490f5ae3ed6a50d764385e8e27e024 Mon Sep 17 00:00:00 2001
From: Anders Roxell <anders.roxell@linaro.org>
Date: Thu, 14 May 2015 17:52:17 +0200
Subject: [PATCH 38/62] arch/arm64: Add lazy preempt support
arm64 is missing support for PREEMPT_RT. The main feature which is
lacking is support for lazy preemption. The arch-specific entry code,
thread information structure definitions, and associated data tables
have to be extended to provide this support. Then the Kconfig file has
to be extended to indicate the support is available, and also to
indicate that support for full RT preemption is now available.
Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/arm64/Kconfig | 1 +
arch/arm64/include/asm/preempt.h | 25 ++++++++++++++++++++++++-
arch/arm64/include/asm/thread_info.h | 8 +++++++-
arch/arm64/kernel/asm-offsets.c | 1 +
arch/arm64/kernel/signal.c | 2 +-
5 files changed, 34 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index ea70eb960565..6e16670a7f43 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -199,6 +199,7 @@ config ARM64
select HAVE_PERF_USER_STACK_DUMP
select HAVE_PREEMPT_DYNAMIC_KEY
select HAVE_REGS_AND_STACK_ACCESS_API
+ select HAVE_PREEMPT_LAZY
select HAVE_POSIX_CPU_TIMERS_TASK_WORK
select HAVE_FUNCTION_ARG_ACCESS_API
select MMU_GATHER_RCU_TABLE_FREE
diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h
index 0159b625cc7f..a5486918e5ee 100644
--- a/arch/arm64/include/asm/preempt.h
+++ b/arch/arm64/include/asm/preempt.h
@@ -71,13 +71,36 @@ static inline bool __preempt_count_dec_and_test(void)
* interrupt occurring between the non-atomic READ_ONCE/WRITE_ONCE
* pair.
*/
- return !pc || !READ_ONCE(ti->preempt_count);
+ if (!pc || !READ_ONCE(ti->preempt_count))
+ return true;
+#ifdef CONFIG_PREEMPT_LAZY
+ if ((pc & ~PREEMPT_NEED_RESCHED))
+ return false;
+ if (current_thread_info()->preempt_lazy_count)
+ return false;
+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
+#else
+ return false;
+#endif
}
static inline bool should_resched(int preempt_offset)
{
+#ifdef CONFIG_PREEMPT_LAZY
+ u64 pc = READ_ONCE(current_thread_info()->preempt_count);
+ if (pc == preempt_offset)
+ return true;
+
+ if ((pc & ~PREEMPT_NEED_RESCHED) != preempt_offset)
+ return false;
+
+ if (current_thread_info()->preempt_lazy_count)
+ return false;
+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
+#else
u64 pc = READ_ONCE(current_thread_info()->preempt_count);
return pc == preempt_offset;
+#endif
}
#ifdef CONFIG_PREEMPTION
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 848739c15de8..4b7148fd5551 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -26,6 +26,7 @@ struct thread_info {
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
u64 ttbr0; /* saved TTBR0_EL1 */
#endif
+ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
union {
u64 preempt_count; /* 0 => preemptible, <0 => bug */
struct {
@@ -68,6 +69,7 @@ int arch_dup_task_struct(struct task_struct *dst,
#define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */
#define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */
#define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */
+#define TIF_NEED_RESCHED_LAZY 7
#define TIF_SYSCALL_TRACE 8 /* syscall trace active */
#define TIF_SYSCALL_AUDIT 9 /* syscall auditing */
#define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */
@@ -100,8 +102,10 @@ int arch_dup_task_struct(struct task_struct *dst,
#define _TIF_SVE (1 << TIF_SVE)
#define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT)
#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
-#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
+#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \
+ _TIF_SIGPENDING | \
_TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
_TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \
_TIF_NOTIFY_SIGNAL)
@@ -110,6 +114,8 @@ int arch_dup_task_struct(struct task_struct *dst,
_TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
_TIF_SYSCALL_EMU)
+#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
+
#ifdef CONFIG_SHADOW_CALL_STACK
#define INIT_SCS \
.scs_base = init_shadow_call_stack, \
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 1197e7679882..e74c0415f67e 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -32,6 +32,7 @@ int main(void)
DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu));
DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags));
DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count));
+ DEFINE(TSK_TI_PREEMPT_LAZY, offsetof(struct task_struct, thread_info.preempt_lazy_count));
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0));
#endif
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 82f4572c8ddf..2a606c7bf025 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -1108,7 +1108,7 @@ static void do_signal(struct pt_regs *regs)
void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags)
{
do {
- if (thread_flags & _TIF_NEED_RESCHED) {
+ if (thread_flags & _TIF_NEED_RESCHED_MASK) {
/* Unmask Debug and SError for the next task */
local_daif_restore(DAIF_PROCCTX_NOIRQ);
--
2.43.0

View File

@ -0,0 +1,41 @@
From f1c0950a69e46ee1d45192bfae026622da60bdc8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 8 Jul 2015 17:14:48 +0200
Subject: [PATCH 39/62] arm: Disable jump-label on PREEMPT_RT.
jump-labels are used to efficiently switch between two possible code
paths. To achieve this, stop_machine() is used to keep the CPU in a
known state while the opcode is modified. The usage of stop_machine()
here leads to large latency spikes which can be observed on PREEMPT_RT.
Jump labels may change their target at runtime and are not restricted
to the debug or "configuration/setup" part of a PREEMPT_RT system where
high latencies could be defined as acceptable.
Disable jump-label support on a PREEMPT_RT system.
[bigeasy: Patch description.]
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lkml.kernel.org/r/20220613182447.112191-2-bigeasy@linutronix.de
---
arch/arm/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 717e596dc13b..f170f29e98ac 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -71,7 +71,7 @@ config ARM
select HARDIRQS_SW_RESEND
select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT
select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
- select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
+ select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT
select HAVE_ARCH_KFENCE if MMU && !XIP_KERNEL
select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL
--
2.43.0
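
What a jump label is, and why patching it hurts RT latency, in a minimal sketch (demo_key and the two functions are illustrative; the API is the real jump-label one):

#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(demo_key);

static void hot_path(void)
{
	/* Compiled as a NOP (or unconditional branch) that is
	 * rewritten in place when the key is toggled. */
	if (static_branch_unlikely(&demo_key))
		; /* rare slow path */
}

static void enable_feature(void)
{
	/* Rewriting the instruction goes through stop_machine() on
	 * ARM, stalling all CPUs -- the latency spike RT avoids by
	 * disabling HAVE_ARCH_JUMP_LABEL. */
	static_branch_enable(&demo_key);
}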

View File

@ -0,0 +1,96 @@
From 802d6978bf4ebdec28d2ba4e715c2e414d9c7d06 Mon Sep 17 00:00:00 2001
From: "Yadi.hu" <yadi.hu@windriver.com>
Date: Wed, 10 Dec 2014 10:32:09 +0800
Subject: [PATCH 40/62] ARM: enable irq in translation/section permission fault
handlers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Probably happens on all ARM, with
CONFIG_PREEMPT_RT
CONFIG_DEBUG_ATOMIC_SLEEP
This simple program....
int main() {
*((char*)0xc0001000) = 0;
};
[ 512.742724] BUG: sleeping function called from invalid context at kernel/rtmutex.c:658
[ 512.743000] in_atomic(): 0, irqs_disabled(): 128, pid: 994, name: a
[ 512.743217] INFO: lockdep is turned off.
[ 512.743360] irq event stamp: 0
[ 512.743482] hardirqs last enabled at (0): [< (null)>] (null)
[ 512.743714] hardirqs last disabled at (0): [<c0426370>] copy_process+0x3b0/0x11c0
[ 512.744013] softirqs last enabled at (0): [<c0426370>] copy_process+0x3b0/0x11c0
[ 512.744303] softirqs last disabled at (0): [< (null)>] (null)
[ 512.744631] [<c041872c>] (unwind_backtrace+0x0/0x104)
[ 512.745001] [<c09af0c4>] (dump_stack+0x20/0x24)
[ 512.745355] [<c0462490>] (__might_sleep+0x1dc/0x1e0)
[ 512.745717] [<c09b6770>] (rt_spin_lock+0x34/0x6c)
[ 512.746073] [<c0441bf0>] (do_force_sig_info+0x34/0xf0)
[ 512.746457] [<c0442668>] (force_sig_info+0x18/0x1c)
[ 512.746829] [<c041d880>] (__do_user_fault+0x9c/0xd8)
[ 512.747185] [<c041d938>] (do_bad_area+0x7c/0x94)
[ 512.747536] [<c041d990>] (do_sect_fault+0x40/0x48)
[ 512.747898] [<c040841c>] (do_DataAbort+0x40/0xa0)
[ 512.748181] Exception stack(0xecaa1fb0 to 0xecaa1ff8)
0xc0000000 belongs to the kernel address space; a user task must not be
allowed to access it. Under this condition, the correct result is that
the test case receives a "segmentation fault" and exits, rather than
producing the splat above.
The root cause is commit 02fe2845d6a8 ("avoid enabling interrupts in
prefetch/data abort handlers"): it removed the IRQ-enable block from the
data abort assembly code and moved it into the page/breakpoint/alignment
fault handlers instead, but did not enable IRQs in the translation/
section permission fault handlers. ARM disables IRQs when it enters
exception/interrupt mode, so if the kernel does not re-enable them they
remain disabled during translation/section permission faults.
We see the above splat because do_force_sig_info is still called with
IRQs off, and that code eventually does a:
spin_lock_irqsave(&t->sighand->siglock, flags);
As this is architecture-independent code, and we've not seen any other
arch need the siglock converted to a raw lock, we can conclude that we
should enable IRQs for the ARM translation/section permission
exceptions.
Signed-off-by: Yadi.hu <yadi.hu@windriver.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/arm/mm/fault.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index b0db85310331..77877dcb54ed 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -400,6 +400,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
if (addr < TASK_SIZE)
return do_page_fault(addr, fsr, regs);
+ if (interrupts_enabled(regs))
+ local_irq_enable();
+
if (user_mode(regs))
goto bad_area;
@@ -470,6 +473,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
static int
do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{
+ if (interrupts_enabled(regs))
+ local_irq_enable();
+
do_bad_area(addr, fsr, regs);
return 0;
}
--
2.43.0

@@ -0,0 +1,48 @@
From 2a89ee21ea5c408b560839cc06bd0c13580fb3a4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 28 Jul 2011 13:32:57 +0200
Subject: [PATCH 41/62] tty/serial/omap: Make the locking RT aware
The lock is a sleeping lock and local_irq_save() is not the
optimisation we are looking for. Redo it to make it work on -RT and
non-RT.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
drivers/tty/serial/omap-serial.c | 12 ++++--------
1 file changed, 4 insertions(+), 8 deletions(-)
diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
index 7d0d2718ef59..aa216fdbcb1d 100644
--- a/drivers/tty/serial/omap-serial.c
+++ b/drivers/tty/serial/omap-serial.c
@@ -1241,13 +1241,10 @@ serial_omap_console_write(struct console *co, const char *s,
unsigned int ier;
int locked = 1;
- local_irq_save(flags);
- if (up->port.sysrq)
- locked = 0;
- else if (oops_in_progress)
- locked = spin_trylock(&up->port.lock);
+ if (up->port.sysrq || oops_in_progress)
+ locked = spin_trylock_irqsave(&up->port.lock, flags);
else
- spin_lock(&up->port.lock);
+ spin_lock_irqsave(&up->port.lock, flags);
/*
* First save the IER then disable the interrupts
@@ -1274,8 +1271,7 @@ serial_omap_console_write(struct console *co, const char *s,
check_modem_status(up);
if (locked)
- spin_unlock(&up->port.lock);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&up->port.lock, flags);
}
static int __init
--
2.43.0

@@ -0,0 +1,59 @@
From 1db3dc8a6af68447b52efbbf8dbb4d210d23d57b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 8 Jan 2013 21:36:51 +0100
Subject: [PATCH 42/62] tty/serial/pl011: Make the locking work on RT
The lock is a sleeping lock and local_irq_save() is not the optimisation
we are looking for. Redo it to make it work on -RT and non-RT.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
drivers/tty/serial/amba-pl011.c | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
index c74eaf2552c3..38eb30b8491b 100644
--- a/drivers/tty/serial/amba-pl011.c
+++ b/drivers/tty/serial/amba-pl011.c
@@ -2316,18 +2316,24 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
{
struct uart_amba_port *uap = amba_ports[co->index];
unsigned int old_cr = 0, new_cr;
- unsigned long flags;
+ unsigned long flags = 0;
int locked = 1;
clk_enable(uap->clk);
- local_irq_save(flags);
+ /*
+ * local_irq_save(flags);
+ *
+ * This local_irq_save() is nonsense. If we come in via sysrq
+ * handling then interrupts are already disabled. Aside of
+ * that the port.sysrq check is racy on SMP regardless.
+ */
if (uap->port.sysrq)
locked = 0;
else if (oops_in_progress)
- locked = spin_trylock(&uap->port.lock);
+ locked = spin_trylock_irqsave(&uap->port.lock, flags);
else
- spin_lock(&uap->port.lock);
+ spin_lock_irqsave(&uap->port.lock, flags);
/*
* First save the CR then disable the interrupts
@@ -2353,8 +2359,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
pl011_write(old_cr, uap, REG_CR);
if (locked)
- spin_unlock(&uap->port.lock);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&uap->port.lock, flags);
clk_disable(uap->clk);
}
--
2.43.0
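
Both serial patches converge on the same recipe; here is a minimal
sketch (hypothetical driver, not from either patch) of an RT-safe
console write: take the port lock with its _irqsave variant in one step
instead of wrapping a sleeping lock inside local_irq_save().

    #include <linux/serial_core.h>

    static void my_console_write(struct uart_port *port, const char *s,
                                 unsigned int count)
    {
            unsigned long flags = 0;
            int locked = 1;

            /* Only trylock in sysrq/oops paths; never spin there. */
            if (port->sysrq || oops_in_progress)
                    locked = spin_trylock_irqsave(&port->lock, flags);
            else
                    spin_lock_irqsave(&port->lock, flags);

            /* ... push 'count' bytes of 's' to the hardware FIFO ... */

            if (locked)
                    spin_unlock_irqrestore(&port->lock, flags);
    }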

@@ -0,0 +1,36 @@
From 47c9956482a592a16b58831ded27e3c0f62ec11d Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 11 Oct 2019 13:14:29 +0200
Subject: [PATCH 43/62] ARM: Allow to enable RT
Allow selecting RT.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/arm/Kconfig | 2 ++
1 file changed, 2 insertions(+)
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index f170f29e98ac..d1f2e062ce0b 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -33,6 +33,7 @@ config ARM
select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE
+ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF
select ARCH_USE_MEMTEST
@@ -115,6 +116,7 @@ config ARM
select HAVE_PERF_EVENTS
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM
select HAVE_PREEMPT_LAZY
select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE
select HAVE_REGS_AND_STACK_ACCESS_API
--
2.43.0

@@ -0,0 +1,28 @@
From c71e1e0561e008bb2fe230ad7022c3e2483cd6c0 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 11 Oct 2019 13:14:35 +0200
Subject: [PATCH 44/62] ARM64: Allow to enable RT
Allow selecting RT.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/arm64/Kconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 6e16670a7f43..0c617e48177c 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -93,6 +93,7 @@ config ARM64
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
select ARCH_SUPPORTS_NUMA_BALANCING
select ARCH_SUPPORTS_PAGE_TABLE_CHECK
+ select ARCH_SUPPORTS_RT
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
select ARCH_WANT_DEFAULT_BPF_JIT
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
--
2.43.0

@@ -0,0 +1,39 @@
From 4d7273bd07600b933e6d25807cd96df04e435cbe Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 26 Jul 2019 11:30:49 +0200
Subject: [PATCH 45/62] powerpc: traps: Use PREEMPT_RT
Add PREEMPT_RT to the backtrace if enabled.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/powerpc/kernel/traps.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 3956f32682c6..8e15205e51ef 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -261,12 +261,17 @@ static char *get_mmu_str(void)
static int __die(const char *str, struct pt_regs *regs, long err)
{
+ const char *pr = "";
+
printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter);
+ if (IS_ENABLED(CONFIG_PREEMPTION))
+ pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT";
+
printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n",
IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE",
PAGE_SIZE / 1024, get_mmu_str(),
- IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
+ pr,
IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "",
debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
--
2.43.0

@@ -0,0 +1,117 @@
From 1d20f49e62250211b43cbe18a087fdf19c313081 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 26 Mar 2019 18:31:54 +0100
Subject: [PATCH 46/62] powerpc/pseries/iommu: Use a locallock instead of
 local_irq_save()
Introduce a locallock to protect the per-CPU variable tce_page. The
function attempts to allocate memory while tce_page is protected (by
disabling interrupts), which is problematic on PREEMPT_RT.
Use local_lock_irqsave()/local_lock_irq() instead of the open-coded
local_irq_save()/local_irq_disable().
Cc: stable-rt@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/powerpc/platforms/pseries/iommu.c | 31 +++++++++++++++++---------
1 file changed, 20 insertions(+), 11 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 97b026130c71..01b3d19be382 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -24,6 +24,7 @@
#include <linux/of.h>
#include <linux/iommu.h>
#include <linux/rculist.h>
+#include <linux/local_lock.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/rtas.h>
@@ -200,7 +201,13 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
return ret;
}
-static DEFINE_PER_CPU(__be64 *, tce_page);
+struct tce_page {
+ __be64 * page;
+ local_lock_t lock;
+};
+static DEFINE_PER_CPU(struct tce_page, tce_page) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
long npages, unsigned long uaddr,
@@ -223,9 +230,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
direction, attrs);
}
- local_irq_save(flags); /* to protect tcep and the page behind it */
+ /* to protect tcep and the page behind it */
+ local_lock_irqsave(&tce_page.lock, flags);
- tcep = __this_cpu_read(tce_page);
+ tcep = __this_cpu_read(tce_page.page);
/* This is safe to do since interrupts are off when we're called
* from iommu_alloc{,_sg}()
@@ -234,12 +242,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
/* If allocation fails, fall back to the loop implementation */
if (!tcep) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&tce_page.lock, flags);
return tce_build_pSeriesLP(tbl->it_index, tcenum,
tceshift,
npages, uaddr, direction, attrs);
}
- __this_cpu_write(tce_page, tcep);
+ __this_cpu_write(tce_page.page, tcep);
}
rpn = __pa(uaddr) >> tceshift;
@@ -269,7 +277,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
tcenum += limit;
} while (npages > 0 && !rc);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&tce_page.lock, flags);
if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
ret = (int)rc;
@@ -454,16 +462,17 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
DMA_BIDIRECTIONAL, 0);
}
- local_irq_disable(); /* to protect tcep and the page behind it */
- tcep = __this_cpu_read(tce_page);
+ /* to protect tcep and the page behind it */
+ local_lock_irq(&tce_page.lock);
+ tcep = __this_cpu_read(tce_page.page);
if (!tcep) {
tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
if (!tcep) {
- local_irq_enable();
+ local_unlock_irq(&tce_page.lock);
return -ENOMEM;
}
- __this_cpu_write(tce_page, tcep);
+ __this_cpu_write(tce_page.page, tcep);
}
proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;
@@ -506,7 +515,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
/* error cleanup: caller will clear whole range */
- local_irq_enable();
+ local_unlock_irq(&tce_page.lock);
return rc;
}
--
2.43.0
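
The conversion above is an instance of a general recipe; a minimal
sketch with hypothetical names: embed a local_lock_t next to the
per-CPU data it protects, so !RT kernels still get an IRQ-off section
while PREEMPT_RT substitutes a per-CPU sleeping lock that lockdep can
track.

    #include <linux/local_lock.h>
    #include <linux/percpu.h>

    struct my_scratch {
            void *page;
            local_lock_t lock;
    };
    static DEFINE_PER_CPU(struct my_scratch, my_scratch) = {
            .lock = INIT_LOCAL_LOCK(lock),
    };

    static void use_scratch(void)
    {
            unsigned long flags;

            local_lock_irqsave(&my_scratch.lock, flags);
            /* this_cpu access to my_scratch.page is now serialized
             * on both RT and !RT kernels */
            local_unlock_irqrestore(&my_scratch.lock, flags);
    }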

@@ -0,0 +1,45 @@
From 34f52991170848510810b486dd6fe9a19cbe4c46 Mon Sep 17 00:00:00 2001
From: Bogdan Purcareata <bogdan.purcareata@freescale.com>
Date: Fri, 24 Apr 2015 15:53:13 +0000
Subject: [PATCH 47/62] powerpc/kvm: Disable in-kernel MPIC emulation for
PREEMPT_RT
While the conversion of the openpic emulation code to a raw_spinlock_t enables
guests to run on RT, there's still a performance issue. For interrupts sent in
directed delivery mode with a multiple CPU mask, the emulated openpic will loop
through all of the VCPUs, and for each VCPU, it calls IRQ_check, which will loop
through all the pending interrupts for that VCPU. This is done while holding the
raw_lock, meaning that in all this time the interrupts and preemption are
disabled on the host Linux. A malicious user app can max out both of these numbers and
cause a DoS.
This temporary fix is sent for two reasons. The first is so that users who want to
use the in-kernel MPIC emulation are aware of the potential latencies, thus
making sure that the hardware MPIC and their usage scenario does not involve
interrupts sent in directed delivery mode, and the number of possible pending
interrupts is kept small. Secondly, this should incentivize the development of a
proper openpic emulation that would be better suited for RT.
Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Bogdan Purcareata <bogdan.purcareata@freescale.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/powerpc/kvm/Kconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index a9f57dad6d91..a0b528d4bb7c 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -225,6 +225,7 @@ config KVM_E500MC
config KVM_MPIC
bool "KVM in-kernel MPIC emulation"
depends on KVM && PPC_E500
+ depends on !PREEMPT_RT
select HAVE_KVM_IRQCHIP
select HAVE_KVM_IRQFD
select HAVE_KVM_IRQ_ROUTING
--
2.43.0

@@ -0,0 +1,36 @@
From d2e9a96e5570459dd886f87bf81c7714fb0a2108 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 26 Mar 2019 18:31:29 +0100
Subject: [PATCH 48/62] powerpc/stackprotector: work around stack-guard init
from atomic
This is invoked from the secondary CPU in atomic context. On x86 we use
the TSC instead. On Power we XOR it against mftb(), so let's use the
stack address as the initial value.
Cc: stable-rt@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/powerpc/include/asm/stackprotector.h | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h
index 1c8460e23583..b1653c160bab 100644
--- a/arch/powerpc/include/asm/stackprotector.h
+++ b/arch/powerpc/include/asm/stackprotector.h
@@ -24,7 +24,11 @@ static __always_inline void boot_init_stack_canary(void)
unsigned long canary;
/* Try to get a semi random initial value. */
+#ifdef CONFIG_PREEMPT_RT
+ canary = (unsigned long)&canary;
+#else
canary = get_random_canary();
+#endif
canary ^= mftb();
canary ^= LINUX_VERSION_CODE;
canary &= CANARY_MASK;
--
2.43.0

@@ -0,0 +1,36 @@
From 93941796079cb4515170adf454f9218adc89856d Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 11 Oct 2019 13:14:41 +0200
Subject: [PATCH 49/62] POWERPC: Allow to enable RT
Allow selecting RT.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/powerpc/Kconfig | 2 ++
1 file changed, 2 insertions(+)
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 0eff864d6ec3..df697d3f68cd 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -151,6 +151,7 @@ config PPC
select ARCH_STACKWALK
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x
+ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF if PPC64
select ARCH_USE_MEMTEST
@@ -245,6 +246,7 @@ config PPC
select HAVE_PREEMPT_LAZY
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE
+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM
select HAVE_RSEQ
select HAVE_SETUP_PER_CPU_AREA if PPC64
select HAVE_SOFTIRQ_ON_OWN_STACK
--
2.43.0

@@ -0,0 +1,54 @@
From b2103f830327ab5d1e6f49134c22d3c5adfb52cc Mon Sep 17 00:00:00 2001
From: Clark Williams <williams@redhat.com>
Date: Sat, 30 Jul 2011 21:55:53 -0500
Subject: [PATCH 50/62] sysfs: Add /sys/kernel/realtime entry
Add a /sys/kernel entry to indicate that the kernel is a
realtime kernel.
Clark says that he needs this for udev rules: udev needs to evaluate
whether it's a PREEMPT_RT kernel a few thousand times, and parsing
uname output is too slow or so.
Are there better solutions? Should it exist and return 0 on !-rt?
Signed-off-by: Clark Williams <williams@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/ksysfs.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 65dba9076f31..ab18048e2186 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -142,6 +142,15 @@ KERNEL_ATTR_RO(vmcoreinfo);
#endif /* CONFIG_CRASH_CORE */
+#if defined(CONFIG_PREEMPT_RT)
+static ssize_t realtime_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", 1);
+}
+KERNEL_ATTR_RO(realtime);
+#endif
+
/* whether file capabilities are enabled */
static ssize_t fscaps_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -232,6 +241,9 @@ static struct attribute * kernel_attrs[] = {
#ifndef CONFIG_TINY_RCU
&rcu_expedited_attr.attr,
&rcu_normal_attr.attr,
+#endif
+#ifdef CONFIG_PREEMPT_RT
+ &realtime_attr.attr,
#endif
NULL
};
--
2.43.0
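
As a usage example (a hedged sketch, not part of the patch): since the
attribute is only compiled in on PREEMPT_RT kernels, a userspace helper
can treat a missing file as "not RT".

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/sys/kernel/realtime", "r");
            int rt = 0;

            /* The file exists (and reads "1") only on PREEMPT_RT. */
            if (f) {
                    if (fscanf(f, "%d", &rt) != 1)
                            rt = 0;
                    fclose(f);
            }
            printf("PREEMPT_RT kernel: %s\n", rt == 1 ? "yes" : "no");
            return 0;
    }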

@@ -0,0 +1,21 @@
From 9c7d6e723acbbc184d1dd04811863378699134fb Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 8 Jul 2011 20:25:16 +0200
Subject: [PATCH 51/62] Add localversion for -RT release
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
localversion-rt | 1 +
1 file changed, 1 insertion(+)
create mode 100644 localversion-rt
diff --git a/localversion-rt b/localversion-rt
new file mode 100644
index 000000000000..045478966e9f
--- /dev/null
+++ b/localversion-rt
@@ -0,0 +1 @@
+-rt7
--
2.43.0

@@ -0,0 +1,20 @@
From 3f783498b292a814f8f364bbfd0efbfc1be6d30f Mon Sep 17 00:00:00 2001
From: Clark Williams <clrkwllms@kernel.org>
Date: Fri, 18 Aug 2023 10:45:35 -0500
Subject: [PATCH 52/62] 'Linux 6.1.46-rt13 REBASE'
Signed-off-by: Clark Williams <clrkwllms@kernel.org>
---
localversion-rt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/localversion-rt b/localversion-rt
index 045478966e9f..9f7d0bdbffb1 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt7
+-rt13
--
2.43.0

@@ -0,0 +1,92 @@
From 53c6a09e670e985d37ca05785a0155ab51b49cf4 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 10 Mar 2023 17:29:05 +0100
Subject: [PATCH 53/62] io-mapping: don't disable preempt on RT in
io_mapping_map_atomic_wc().
io_mapping_map_atomic_wc() disables preemption and pagefaults for
historical reasons. The conversion to io_mapping_map_local_wc(), which
only disables migration, cannot be done wholesale because quite a few
call sites need to be updated to accommodate the changed semantics.
On PREEMPT_RT enabled kernels the io_mapping_map_atomic_wc() semantics are
problematic due to the implicit disabling of preemption which makes it
impossible to acquire 'sleeping' spinlocks within the mapped atomic
sections.
PREEMPT_RT has replaced the preempt_disable() with a migrate_disable()
for more than a decade. It could be argued that this is a justification to do
this unconditionally, but PREEMPT_RT covers only a limited number of
architectures and it disables some functionality which limits the coverage
further.
Limit the replacement to PREEMPT_RT for now. This is also done for
kmap_atomic().
Link: https://lkml.kernel.org/r/20230310162905.O57Pj7hh@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reported-by: Richard Weinberger <richard.weinberger@gmail.com>
Link: https://lore.kernel.org/CAFLxGvw0WMxaMqYqJ5WgvVSbKHq2D2xcXTOgMCpgq9nDC-MWTQ@mail.gmail.com
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
(cherry picked from commit 7eb16f23b9a415f062db22739e59bb144e0b24ab)
Signed-off-by: Clark Williams <clark.williams@gmail.com>
---
include/linux/io-mapping.h | 20 ++++++++++++++++----
1 file changed, 16 insertions(+), 4 deletions(-)
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index 66a774d2710e..b08532b8fba7 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -69,7 +69,10 @@ io_mapping_map_atomic_wc(struct io_mapping *mapping,
BUG_ON(offset >= mapping->size);
phys_addr = mapping->base + offset;
- preempt_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
+ else
+ migrate_disable();
pagefault_disable();
return __iomap_local_pfn_prot(PHYS_PFN(phys_addr), mapping->prot);
}
@@ -79,7 +82,10 @@ io_mapping_unmap_atomic(void __iomem *vaddr)
{
kunmap_local_indexed((void __force *)vaddr);
pagefault_enable();
- preempt_enable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
+ else
+ migrate_enable();
}
static inline void __iomem *
@@ -162,7 +168,10 @@ static inline void __iomem *
io_mapping_map_atomic_wc(struct io_mapping *mapping,
unsigned long offset)
{
- preempt_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
+ else
+ migrate_disable();
pagefault_disable();
return io_mapping_map_wc(mapping, offset, PAGE_SIZE);
}
@@ -172,7 +181,10 @@ io_mapping_unmap_atomic(void __iomem *vaddr)
{
io_mapping_unmap(vaddr);
pagefault_enable();
- preempt_enable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
+ else
+ migrate_enable();
}
static inline void __iomem *
--
2.43.0
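
The enter/exit pairs above all follow one rule; a minimal standalone
sketch (hypothetical helper names) of that rule: keep preempt_disable()
on !RT, but on PREEMPT_RT only disable migration so that sleeping
spinlocks remain legal inside the mapped section.

    #include <linux/preempt.h>
    #include <linux/sched.h>
    #include <linux/uaccess.h>

    static inline void my_enter_atomic_mapping(void)
    {
            if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                    preempt_disable();
            else
                    migrate_disable();  /* pinned to this CPU, still preemptible */
            pagefault_disable();
    }

    static inline void my_exit_atomic_mapping(void)
    {
            pagefault_enable();
            if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                    preempt_enable();
            else
                    migrate_enable();
    }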

@@ -0,0 +1,62 @@
From 8785dde5198dc91cbb518044e1c6d301ef9a9857 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 21 Mar 2023 17:11:40 +0100
Subject: [PATCH 54/62] locking/rwbase: Mitigate indefinite writer starvation
On PREEMPT_RT, rw_semaphore and rwlock_t locks are unfair to writers.
Readers can keep acquiring the lock unless the writer has fully acquired
the lock, which might never happen if there is always a reader in the
critical section owning the lock.
Mel Gorman reported that since LTP-20220121 the dio_truncate test case
went from having 1 reader to having 16 readers and that number of readers
is sufficient to prevent the down_write ever succeeding while readers
exist. Eventually the test is killed after 30 minutes as a failure.
Mel proposed a timeout to limit how long a writer can be blocked until
the reader is forced into the slowpath.
Thomas argued that there is no added value by providing this timeout. From
a PREEMPT_RT point of view, there are no critical rw_semaphore or rwlock_t
locks left where the reader must be preferred.
Mitigate indefinite writer starvation by forcing the READER into the
slowpath once the WRITER attempts to acquire the lock.
Reported-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Link: https://lore.kernel.org/877cwbq4cq.ffs@tglx
Link: https://lore.kernel.org/r/20230321161140.HMcQEhHb@linutronix.de
Cc: Linus Torvalds <torvalds@linux-foundation.org>
(cherry picked from commit 286deb7ec03d941664ac3ffaff58814b454adf65)
Signed-off-by: Clark Williams <clark.williams@gmail.com>
---
kernel/locking/rwbase_rt.c | 9 ---------
1 file changed, 9 deletions(-)
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index c201aadb9301..25ec0239477c 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -72,15 +72,6 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
int ret;
raw_spin_lock_irq(&rtm->wait_lock);
- /*
- * Allow readers, as long as the writer has not completely
- * acquired the semaphore for write.
- */
- if (atomic_read(&rwb->readers) != WRITER_BIAS) {
- atomic_inc(&rwb->readers);
- raw_spin_unlock_irq(&rtm->wait_lock);
- return 0;
- }
/*
* Call into the slow lock path with the rtmutex->wait_lock
--
2.43.0

@@ -0,0 +1,107 @@
From fcdb9b29c6d58895a386ac23229564fad2c316b5 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Mon, 8 May 2023 08:17:44 +0200
Subject: [PATCH 55/62] revert: "softirq: Let ksoftirqd do its job"
Due to the mentioned commit, when the ksoftirqd processes take charge
of softirq processing, the system can experience high latencies.
In the past a few workarounds have been implemented for specific
side-effects of the above:
commit 1ff688209e2e ("watchdog: core: make sure the watchdog_worker is not deferred")
commit 8d5755b3f77b ("watchdog: softdog: fire watchdog even if softirqs do not get to run")
commit 217f69743681 ("net: busy-poll: allow preemption in sk_busy_loop()")
commit 3c53776e29f8 ("Mark HI and TASKLET softirq synchronous")
but the latency problem still exists in real-life workloads, see the
link below.
The reverted commit intended to solve a live-lock scenario that can now
be addressed with the NAPI threaded mode, introduced with commit
29863d41bb6e ("net: implement threaded-able napi poll loop support"),
and nowadays in a pretty stable state.
While a complete solution to put softirq processing under nice resource
control would be preferable, that has proven to be a very hard task. In
the short term, remove the main pain point, and also simplify a bit the
current softirq implementation.
Note that this change also reverts commit 3c53776e29f8 ("Mark HI and
TASKLET softirq synchronous") and commit 1342d8080f61 ("softirq: Don't
skip softirq execution when softirq thread is parking"), which are
direct follow-ups of the feature commit. A single change is preferred to
avoid known bad intermediate states introduced by a patch series
reverting them individually.
Link: https://lore.kernel.org/netdev/305d7742212cbe98621b16be782b0562f1012cb6.camel@redhat.com/
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Tested-by: Jason Xing <kerneljasonxing@gmail.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lore.kernel.org/r/57e66b364f1b6f09c9bc0316742c3b14f4ce83bd.1683526542.git.pabeni@redhat.com
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
(cherry picked from commit b8a04a538ed4755dc97c403ee3b8dd882955c98c)
Signed-off-by: Clark Williams <clark.williams@gmail.com>
---
kernel/softirq.c | 22 ++--------------------
1 file changed, 2 insertions(+), 20 deletions(-)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 82f3e68fbe22..af9e879bbbf7 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -80,21 +80,6 @@ static void wakeup_softirqd(void)
wake_up_process(tsk);
}
-/*
- * If ksoftirqd is scheduled, we do not want to process pending softirqs
- * right now. Let ksoftirqd handle this at its own rate, to get fairness,
- * unless we're doing some of the synchronous softirqs.
- */
-#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ))
-static bool ksoftirqd_running(unsigned long pending)
-{
- struct task_struct *tsk = __this_cpu_read(ksoftirqd);
-
- if (pending & SOFTIRQ_NOW_MASK)
- return false;
- return tsk && task_is_running(tsk) && !__kthread_should_park(tsk);
-}
-
#ifdef CONFIG_TRACE_IRQFLAGS
DEFINE_PER_CPU(int, hardirqs_enabled);
DEFINE_PER_CPU(int, hardirq_context);
@@ -236,7 +221,7 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
goto out;
pending = local_softirq_pending();
- if (!pending || ksoftirqd_running(pending))
+ if (!pending)
goto out;
/*
@@ -432,9 +417,6 @@ static inline bool should_wake_ksoftirqd(void)
static inline void invoke_softirq(void)
{
- if (ksoftirqd_running(local_softirq_pending()))
- return;
-
if (!force_irqthreads() || !__this_cpu_read(ksoftirqd)) {
#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
/*
@@ -468,7 +450,7 @@ asmlinkage __visible void do_softirq(void)
pending = local_softirq_pending();
- if (pending && !ksoftirqd_running(pending))
+ if (pending)
do_softirq_own_stack();
local_irq_restore(flags);
--
2.43.0

@@ -0,0 +1,176 @@
From c082e5d28e56252dca01b53c553bba5cd152fec1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 25 Apr 2023 17:03:13 +0200
Subject: [PATCH 56/62] debugobjects,locking: Annotate debug_object_fill_pool()
wait type violation
There is an explicit wait-type violation in debug_object_fill_pool()
for PREEMPT_RT=n kernels which allows them to more easily fill the
object pool and reduce the chance of allocation failures.
Lockdep's wait-type checks are designed to check the PREEMPT_RT
locking rules even for PREEMPT_RT=n kernels and object to this, so
create a lockdep annotation to allow this to stand.
Specifically, create a 'lock' type that overrides the inner wait-type
while it is held -- allowing one to temporarily raise it, such that
the violation is hidden.
Reported-by: Vlastimil Babka <vbabka@suse.cz>
Reported-by: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Qi Zheng <zhengqi.arch@bytedance.com>
Link: https://lkml.kernel.org/r/20230429100614.GA1489784@hirez.programming.kicks-ass.net
(cherry picked from commit 0cce06ba859a515bd06224085d3addb870608b6d)
Signed-off-by: Clark Williams <clark.williams@gmail.com>
---
include/linux/lockdep.h | 14 ++++++++++++++
include/linux/lockdep_types.h | 1 +
kernel/locking/lockdep.c | 28 +++++++++++++++++++++-------
lib/debugobjects.c | 15 +++++++++++++--
4 files changed, 49 insertions(+), 9 deletions(-)
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 1023f349af71..a3329fb49b33 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -339,6 +339,16 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);
#define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c))
#define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c))
+/*
+ * Must use lock_map_aquire_try() with override maps to avoid
+ * lockdep thinking they participate in the block chain.
+ */
+#define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \
+ struct lockdep_map _name = { \
+ .name = #_name "-wait-type-override", \
+ .wait_type_inner = _wait_type, \
+ .lock_type = LD_LOCK_WAIT_OVERRIDE, }
+
#else /* !CONFIG_LOCKDEP */
static inline void lockdep_init_task(struct task_struct *task)
@@ -427,6 +437,9 @@ extern int lockdep_is_held(const void *);
#define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0)
#define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0)
+#define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \
+ struct lockdep_map __maybe_unused _name = {}
+
#endif /* !LOCKDEP */
enum xhlock_context_t {
@@ -551,6 +564,7 @@ do { \
#define rwsem_release(l, i) lock_release(l, i)
#define lock_map_acquire(l) lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_)
+#define lock_map_acquire_try(l) lock_acquire_exclusive(l, 0, 1, NULL, _THIS_IP_)
#define lock_map_acquire_read(l) lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_)
#define lock_map_acquire_tryread(l) lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_)
#define lock_map_release(l) lock_release(l, _THIS_IP_)
diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
index d22430840b53..59f4fb1626ea 100644
--- a/include/linux/lockdep_types.h
+++ b/include/linux/lockdep_types.h
@@ -33,6 +33,7 @@ enum lockdep_wait_type {
enum lockdep_lock_type {
LD_LOCK_NORMAL = 0, /* normal, catch all */
LD_LOCK_PERCPU, /* percpu */
+ LD_LOCK_WAIT_OVERRIDE, /* annotation */
LD_LOCK_MAX,
};
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 3b38303ed27b..a046e03c7ead 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2245,6 +2245,9 @@ static inline bool usage_match(struct lock_list *entry, void *mask)
static inline bool usage_skip(struct lock_list *entry, void *mask)
{
+ if (entry->class->lock_type == LD_LOCK_NORMAL)
+ return false;
+
/*
* Skip local_lock() for irq inversion detection.
*
@@ -2271,14 +2274,16 @@ static inline bool usage_skip(struct lock_list *entry, void *mask)
* As a result, we will skip local_lock(), when we search for irq
* inversion bugs.
*/
- if (entry->class->lock_type == LD_LOCK_PERCPU) {
- if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
- return false;
+ if (entry->class->lock_type == LD_LOCK_PERCPU &&
+ DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
+ return false;
- return true;
- }
+ /*
+ * Skip WAIT_OVERRIDE for irq inversion detection -- it's not actually
+ * a lock and only used to override the wait_type.
+ */
- return false;
+ return true;
}
/*
@@ -4745,7 +4750,8 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
for (; depth < curr->lockdep_depth; depth++) {
struct held_lock *prev = curr->held_locks + depth;
- u8 prev_inner = hlock_class(prev)->wait_type_inner;
+ struct lock_class *class = hlock_class(prev);
+ u8 prev_inner = class->wait_type_inner;
if (prev_inner) {
/*
@@ -4755,6 +4761,14 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
* Also due to trylocks.
*/
curr_inner = min(curr_inner, prev_inner);
+
+ /*
+ * Allow override for annotations -- this is typically
+ * only valid/needed for code that only exists when
+ * CONFIG_PREEMPT_RT=n.
+ */
+ if (unlikely(class->lock_type == LD_LOCK_WAIT_OVERRIDE))
+ curr_inner = prev_inner;
}
}
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index dacb80c22c4f..3c9e00e207dc 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -600,10 +600,21 @@ static void debug_objects_fill_pool(void)
{
/*
* On RT enabled kernels the pool refill must happen in preemptible
- * context:
+ * context -- for !RT kernels we rely on the fact that spinlock_t and
+ * raw_spinlock_t are basically the same type and this lock-type
+ * inversion works just fine.
*/
- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
+ /*
+ * Annotate away the spinlock_t inside raw_spinlock_t warning
+ * by temporarily raising the wait-type to WAIT_SLEEP, matching
+ * the preemptible() condition above.
+ */
+ static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_SLEEP);
+ lock_map_acquire_try(&fill_pool_map);
fill_pool();
+ lock_map_release(&fill_pool_map);
+ }
}
static void
--
2.43.0
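
A minimal sketch (hypothetical names) of the annotation this patch
introduces: the override map is not a real lock; "holding" it merely
raises the wait-type lockdep assumes for the enclosed region.

    #include <linux/lockdep.h>

    static void my_fill_path(void)
    {
            /* Not a lock: only overrides the wait-type while held. */
            static DEFINE_WAIT_OVERRIDE_MAP(my_map, LD_WAIT_SLEEP);

            lock_map_acquire_try(&my_map);
            /* code that lockdep would otherwise flag, but which is
             * only reached when sleeping is actually permitted */
            lock_map_release(&my_map);
    }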

@@ -0,0 +1,51 @@
From 5c27e6fdf46d68180a46fdf7944aa7e4668680c3 Mon Sep 17 00:00:00 2001
From: Wander Lairson Costa <wander@redhat.com>
Date: Wed, 14 Jun 2023 09:23:22 -0300
Subject: [PATCH 57/62] sched: avoid false lockdep splat in put_task_struct()
In put_task_struct(), a spin_lock is indirectly acquired in the stock
kernel. When running the kernel in real-time (RT) configuration, the
operation is dispatched to a preemptible context call to ensure
guaranteed preemption. However, if PROVE_RAW_LOCK_NESTING is enabled
and __put_task_struct() is called while holding a raw_spinlock, lockdep
incorrectly reports an "Invalid lock context" in the stock kernel.
This false splat occurs because lockdep is unaware of the different
route taken under RT. To address this issue, override the inner wait
type to prevent the false lockdep splat.
Signed-off-by: Wander Lairson Costa <wander@redhat.com>
Suggested-by: Oleg Nesterov <oleg@redhat.com>
Suggested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Luis Goncalves <lgoncalv@redhat.com>
Link: https://lore.kernel.org/r/20230614122323.37957-3-wander@redhat.com
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
(cherry picked from commit a5e446e728e89d5f5c5e427cc919bc7813c64c28)
Signed-off-by: Clark Williams <clark.williams@gmail.com>
---
include/linux/sched/task.h | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 7291fb6399d2..de7ebd2bf3ba 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -141,8 +141,12 @@ static inline void put_task_struct(struct task_struct *t)
*/
if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible())
call_rcu(&t->rcu, __put_task_struct_rcu_cb);
- else
+ else {
+ static DEFINE_WAIT_OVERRIDE_MAP(put_task_map, LD_WAIT_SLEEP);
+ lock_map_acquire_try(&put_task_map);
__put_task_struct(t);
+ lock_map_release(&put_task_map);
+ }
}
static inline void put_task_struct_many(struct task_struct *t, int nr)
--
2.43.0

@@ -0,0 +1,96 @@
From 9512a9467dec62e03f2df4f15af9a38332b8de58 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 23 Jun 2023 22:15:17 +0200
Subject: [PATCH 58/62] mm/page_alloc: Use write_seqlock_irqsave() instead
write_seqlock() + local_irq_save().
__build_all_zonelists() acquires zonelist_update_seq by first disabling
interrupts via local_irq_save() and then acquiring the seqlock with
write_seqlock(). This is troublesome and leads to problems on
PREEMPT_RT. The problem is that the inner spinlock_t becomes a sleeping
lock on PREEMPT_RT and must not be acquired with disabled interrupts.
The API provides write_seqlock_irqsave() which does the right thing in
one step.
printk_deferred_enter() has to be invoked in non-migrate-able context to
ensure that deferred printing is enabled and disabled on the same CPU.
This is the case after zonelist_update_seq has been acquired.
There was discussion on the first submission that the order should be:

    local_irq_disable();
    printk_deferred_enter();
    write_seqlock();
to avoid pitfalls like having an unaccounted printk() coming from
write_seqlock_irqsave() before printk_deferred_enter() is invoked. The
only origin of such a printk() can be a lockdep splat because the
lockdep annotation happens after the sequence count is incremented.
This is exceptional and subject to change.
It was also pointed out that PREEMPT_RT can be affected by the printk
problem since its write_seqlock_irqsave() does not really disable
interrupts. This isn't the case because PREEMPT_RT's printk
implementation differs from the mainline implementation in two important
aspects:
- Printing happens in dedicated threads and not during the
invocation of printk().
- In emergency cases where synchronous printing is used, a different
driver is used which does not use tty_port::lock.
Acquire zonelist_update_seq with write_seqlock_irqsave() and then defer
printk output.
Fixes: 1007843a91909 ("mm/page_alloc: fix potential deadlock on zonelist_update_seq seqlock")
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Link: https://lore.kernel.org/r/20230623201517.yw286Knb@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
(cherry picked from commit 4d1139baae8bc4fff3728d1d204bdb04c13dbe10)
Signed-off-by: Clark Williams <clark.williams@gmail.com>
---
mm/page_alloc.c | 15 ++++++---------
1 file changed, 6 insertions(+), 9 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4583f8a42d91..835b69a64f4f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6588,19 +6588,17 @@ static void __build_all_zonelists(void *data)
unsigned long flags;
/*
- * Explicitly disable this CPU's interrupts before taking seqlock
- * to prevent any IRQ handler from calling into the page allocator
- * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock.
+ * The zonelist_update_seq must be acquired with irqsave because the
+ * reader can be invoked from IRQ with GFP_ATOMIC.
*/
- local_irq_save(flags);
+ write_seqlock_irqsave(&zonelist_update_seq, flags);
/*
- * Explicitly disable this CPU's synchronous printk() before taking
- * seqlock to prevent any printk() from trying to hold port->lock, for
+ * Also disable synchronous printk() to prevent any printk() from
+ * trying to hold port->lock, for
* tty_insert_flip_string_and_push_buffer() on other CPU might be
* calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
*/
printk_deferred_enter();
- write_seqlock(&zonelist_update_seq);
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
@@ -6637,9 +6635,8 @@ static void __build_all_zonelists(void *data)
#endif
}
- write_sequnlock(&zonelist_update_seq);
printk_deferred_exit();
- local_irq_restore(flags);
+ write_sequnlock_irqrestore(&zonelist_update_seq, flags);
}
static noinline void __init
--
2.43.0
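
A minimal sketch (hypothetical data) of the rule the patch applies: a
seqlock writer that must exclude IRQ-context readers should use the
combined _irqsave variant rather than open-coding local_irq_save()
around write_seqlock(), so PREEMPT_RT can keep the inner lock
sleepable.

    #include <linux/seqlock.h>

    static DEFINE_SEQLOCK(my_seq);

    static void my_update(void)
    {
            unsigned long flags;

            /* One step: on !RT this disables IRQs and takes the lock;
             * on RT the inner lock stays a sleeping lock. */
            write_seqlock_irqsave(&my_seq, flags);
            /* ... update the data covered by my_seq ... */
            write_sequnlock_irqrestore(&my_seq, flags);
    }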

@@ -0,0 +1,119 @@
From 05999b640eb04be872e5491a040701fcddc73349 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 14 Jun 2023 10:34:30 +0200
Subject: [PATCH 59/62] bpf: Remove in_atomic() from bpf_link_put().
bpf_free_inode() is invoked as an RCU callback. Usually RCU callbacks
are invoked within softirq context. By setting the rcutree.use_softirq=0
boot option, the RCU callbacks are instead invoked in a per-CPU kthread
with bottom halves disabled, which implies an RCU read section.
On PREEMPT_RT the context remains fully preemptible. The RCU read
section however does not allow schedule() invocation. The latter happens
in mutex_lock() performed by bpf_trampoline_unlink_prog(), originating
from bpf_link_put().
It was pointed out that the bpf_link_put() invocation should not be
delayed if originated from close(). It was also pointed out that other
invocations from within a syscall should also avoid the workqueue.
Everyone else should use workqueue by default to remain safe in the
future (while auditing the code, every caller was preemptible except for
the RCU case).
Let bpf_link_put() use the worker unconditionally. Add
bpf_link_put_direct() which will directly free the resources and is used
by close() and from within __sys_bpf().
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230614083430.oENawF8f@linutronix.de
(cherry picked from commit ab5d47bd41b1db82c295b0e751e2b822b43a4b5a)
Signed-off-by: Clark Williams <clark.williams@gmail.com>
---
kernel/bpf/syscall.c | 29 ++++++++++++++++-------------
1 file changed, 16 insertions(+), 13 deletions(-)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c0915e2424f1..f8ba6e0a5c08 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2732,28 +2732,31 @@ static void bpf_link_put_deferred(struct work_struct *work)
bpf_link_free(link);
}
-/* bpf_link_put can be called from atomic context, but ensures that resources
- * are freed from process context
+/* bpf_link_put might be called from atomic context. It needs to be called
+ * from sleepable context in order to acquire sleeping locks during the process.
*/
void bpf_link_put(struct bpf_link *link)
{
if (!atomic64_dec_and_test(&link->refcnt))
return;
- if (in_atomic()) {
- INIT_WORK(&link->work, bpf_link_put_deferred);
- schedule_work(&link->work);
- } else {
- bpf_link_free(link);
- }
+ INIT_WORK(&link->work, bpf_link_put_deferred);
+ schedule_work(&link->work);
}
EXPORT_SYMBOL(bpf_link_put);
+static void bpf_link_put_direct(struct bpf_link *link)
+{
+ if (!atomic64_dec_and_test(&link->refcnt))
+ return;
+ bpf_link_free(link);
+}
+
static int bpf_link_release(struct inode *inode, struct file *filp)
{
struct bpf_link *link = filp->private_data;
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return 0;
}
@@ -4674,7 +4677,7 @@ static int link_update(union bpf_attr *attr)
if (ret)
bpf_prog_put(new_prog);
out_put_link:
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return ret;
}
@@ -4697,7 +4700,7 @@ static int link_detach(union bpf_attr *attr)
else
ret = -EOPNOTSUPP;
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return ret;
}
@@ -4767,7 +4770,7 @@ static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
fd = bpf_link_new_fd(link);
if (fd < 0)
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return fd;
}
@@ -4844,7 +4847,7 @@ static int bpf_iter_create(union bpf_attr *attr)
return PTR_ERR(link);
err = bpf_iter_new_fd(link);
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return err;
}
--
2.43.0
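
The put-path split generalizes; a minimal sketch with hypothetical
names: release paths that may run in atomic context punt teardown to a
workqueue unconditionally, while callers known to be sleepable get a
direct variant.

    #include <linux/atomic.h>
    #include <linux/slab.h>
    #include <linux/workqueue.h>

    struct my_obj {
            atomic64_t refcnt;
            struct work_struct work;
    };

    static void my_obj_free_deferred(struct work_struct *work)
    {
            struct my_obj *obj = container_of(work, struct my_obj, work);

            kfree(obj);     /* may take sleeping locks on RT */
    }

    /* Safe from any context. */
    static void my_obj_put(struct my_obj *obj)
    {
            if (!atomic64_dec_and_test(&obj->refcnt))
                    return;
            INIT_WORK(&obj->work, my_obj_free_deferred);
            schedule_work(&obj->work);
    }

    /* Only for callers that are certainly sleepable, e.g. close(). */
    static void my_obj_put_direct(struct my_obj *obj)
    {
            if (!atomic64_dec_and_test(&obj->refcnt))
                    return;
            kfree(obj);
    }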

@@ -0,0 +1,114 @@
From a0d2c56749857956cb8ef1ccf2d982e2c1770f08 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 1 Jun 2023 20:58:47 +0200
Subject: [PATCH 60/62] posix-timers: Ensure timer ID search-loop limit is
valid
posix_timer_add() tries to allocate a posix timer ID by starting from the
cached ID which was stored by the last successful allocation.
This is done in a loop searching the ID space for a free slot one by
one. The loop has to terminate when the search wrapped around to the
starting point.
But that's racy vs. establishing the starting point. That is read out
lockless, which leads to the following problem:
    CPU0                                CPU1

    posix_timer_add()
       start = sig->posix_timer_id;
       lock(hash_lock);
       ...                              posix_timer_add()
       if (++sig->posix_timer_id < 0)
                                           start = sig->posix_timer_id;
       sig->posix_timer_id = 0;
So CPU1 can observe a negative start value, i.e. -1, and the loop break
never happens because the condition can never be true:

    if (sig->posix_timer_id == start)
        break;
While this is unlikely to ever turn into an endless loop as the ID space is
huge (INT_MAX), the racy read of the start value caught the attention of
KCSAN and Dmitry unearthed that incorrectness.
Rewrite it so that all id operations are under the hash lock.
Reported-by: syzbot+5c54bd3eb218bb595aa9@syzkaller.appspotmail.com
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/87bkhzdn6g.ffs@tglx
(cherry picked from commit 8ce8849dd1e78dadcee0ec9acbd259d239b7069f)
Signed-off-by: Clark Williams <clark.williams@gmail.com>
---
include/linux/sched/signal.h | 2 +-
kernel/time/posix-timers.c | 31 ++++++++++++++++++-------------
2 files changed, 19 insertions(+), 14 deletions(-)
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 20099268fa25..669e8cff40c7 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -135,7 +135,7 @@ struct signal_struct {
#ifdef CONFIG_POSIX_TIMERS
/* POSIX.1b Interval Timers */
- int posix_timer_id;
+ unsigned int next_posix_timer_id;
struct list_head posix_timers;
/* ITIMER_REAL timer for the process */
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index ed3c4a954398..2d6cf93ca370 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -140,25 +140,30 @@ static struct k_itimer *posix_timer_by_id(timer_t id)
static int posix_timer_add(struct k_itimer *timer)
{
struct signal_struct *sig = current->signal;
- int first_free_id = sig->posix_timer_id;
struct hlist_head *head;
- int ret = -ENOENT;
+ unsigned int cnt, id;
- do {
+ /*
+ * FIXME: Replace this by a per signal struct xarray once there is
+ * a plan to handle the resulting CRIU regression gracefully.
+ */
+ for (cnt = 0; cnt <= INT_MAX; cnt++) {
spin_lock(&hash_lock);
- head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
- if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
+ id = sig->next_posix_timer_id;
+
+ /* Write the next ID back. Clamp it to the positive space */
+ sig->next_posix_timer_id = (id + 1) & INT_MAX;
+
+ head = &posix_timers_hashtable[hash(sig, id)];
+ if (!__posix_timers_find(head, sig, id)) {
hlist_add_head_rcu(&timer->t_hash, head);
- ret = sig->posix_timer_id;
+ spin_unlock(&hash_lock);
+ return id;
}
- if (++sig->posix_timer_id < 0)
- sig->posix_timer_id = 0;
- if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
- /* Loop over all possible ids completed */
- ret = -EAGAIN;
spin_unlock(&hash_lock);
- } while (ret == -ENOENT);
- return ret;
+ }
+ /* POSIX return code when no timer ID could be allocated */
+ return -EAGAIN;
}
static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
--
2.43.0
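
A quick standalone check (userspace, not from the patch) of the
clamping arithmetic '(id + 1) & INT_MAX' used above: INT_MAX is
0x7fffffff, so the mask clears the sign bit and wraps INT_MAX + 1 back
to 0, keeping the next ID in the positive space.

    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int id = INT_MAX;

            /* wrap-around case: 2147483647 -> 0 */
            printf("%u -> %u\n", id, (id + 1) & INT_MAX);
            /* ordinary case: 41 -> 42 */
            printf("%u -> %u\n", 41u, (41u + 1) & INT_MAX);
            return 0;
    }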

@@ -0,0 +1,109 @@
From 1d651fe6c67cb3b355cc228f75289657496520ff Mon Sep 17 00:00:00 2001
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Date: Fri, 18 Aug 2023 22:45:25 -0400
Subject: [PATCH 61/62] drm/i915: Do not disable preemption for resets
[commit 40cd2835ced288789a685aa4aa7bc04b492dcd45 in linux-rt-devel]
Commit ade8a0f59844 ("drm/i915: Make all GPU resets atomic") added a
preempt disable section over the hardware reset callback to prepare the
driver for being able to reset from atomic contexts.
In retrospect I can see that the work item at the time was about
removing the struct mutex from the reset path. The code base also
briefly entertained the idea of doing the reset under stop_machine in
order to serialize userspace mmap and a temporary glitch in the fence
registers (see
eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex"),
but that never materialized and was soon removed in 2caffbf11762
("drm/i915: Revoke mmaps and prevent access to fence registers across
reset") and replaced with a SRCU based solution.
As such, as far as I can see, today we still have a requirement that
resets must not sleep (invoked from submission tasklets), but no need to
support invoking them from a truly atomic context.
Given that the preemption section is problematic on RT kernels, since the
uncore lock becomes a sleeping lock and so is invalid in such section,
lets try and remove it. Potential downside is that our short waits on GPU
to complete the reset may get extended if CPU scheduling interferes, but
in practice that probably isn't a deal breaker.
In terms of mechanics, since the preemption disabled block is being
removed we just need to replace a few of the wait_for_atomic macros with
busy looping versions which will work (and not complain) when called from
non-atomic sections.
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris.p.wilson@intel.com>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lore.kernel.org/r/20230705093025.3689748-1-tvrtko.ursulin@linux.intel.com
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
[PG: backport from v6.4-rt ; minor context fixup caused by b7d70b8b06ed]
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Clark Williams <williams@redhat.com>
---
drivers/gpu/drm/i915/gt/intel_reset.c | 12 +++++-------
1 file changed, 5 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index 10b930eaa8cb..6108a449cd19 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -174,13 +174,13 @@ static int i915_do_reset(struct intel_gt *gt,
/* Assert reset for at least 20 usec, and wait for acknowledgement. */
pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
udelay(50);
- err = wait_for_atomic(i915_in_reset(pdev), 50);
+ err = _wait_for_atomic(i915_in_reset(pdev), 50, 0);
/* Clear the reset request. */
pci_write_config_byte(pdev, I915_GDRST, 0);
udelay(50);
if (!err)
- err = wait_for_atomic(!i915_in_reset(pdev), 50);
+ err = _wait_for_atomic(!i915_in_reset(pdev), 50, 0);
return err;
}
@@ -200,7 +200,7 @@ static int g33_do_reset(struct intel_gt *gt,
struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev);
pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
- return wait_for_atomic(g4x_reset_complete(pdev), 50);
+ return _wait_for_atomic(g4x_reset_complete(pdev), 50, 0);
}
static int g4x_do_reset(struct intel_gt *gt,
@@ -217,7 +217,7 @@ static int g4x_do_reset(struct intel_gt *gt,
pci_write_config_byte(pdev, I915_GDRST,
GRDOM_MEDIA | GRDOM_RESET_ENABLE);
- ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
+ ret = _wait_for_atomic(g4x_reset_complete(pdev), 50, 0);
if (ret) {
GT_TRACE(gt, "Wait for media reset failed\n");
goto out;
@@ -225,7 +225,7 @@ static int g4x_do_reset(struct intel_gt *gt,
pci_write_config_byte(pdev, I915_GDRST,
GRDOM_RENDER | GRDOM_RESET_ENABLE);
- ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
+ ret = _wait_for_atomic(g4x_reset_complete(pdev), 50, 0);
if (ret) {
GT_TRACE(gt, "Wait for render reset failed\n");
goto out;
@@ -718,9 +718,7 @@ int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask)
intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) {
GT_TRACE(gt, "engine_mask=%x\n", engine_mask);
- preempt_disable();
ret = reset(gt, engine_mask, retry);
- preempt_enable();
}
intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
--
2.43.0

@@ -0,0 +1,20 @@
From 8aa6a280fc011cccf7cfcc0f5942e3ec6bdd73b4 Mon Sep 17 00:00:00 2001
From: Clark Williams <clark.williams@gmail.com>
Date: Thu, 28 Dec 2023 23:45:11 -0600
Subject: [PATCH 62/62] Linux 6.1.69-rt21 REBASE
Signed-off-by: Clark Williams <clark.williams@gmail.com>
---
localversion-rt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/localversion-rt b/localversion-rt
index 9f7d0bdbffb1..6c6cde1c29e3 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt13
+-rt21
--
2.43.0

@@ -1,3 +0,0 @@
@pipewire - rtprio 95
@pipewire - nice -19
@pipewire - memlock 4194304

@@ -128,7 +128,7 @@ bluez_monitor.rules = {
--["resample.quality"] = 4,
--["channelmix.normalize"] = false,
--["channelmix.mix-lfe"] = false,
--["session.suspend-timeout-seconds"] = 5, -- 0 disables suspend
["session.suspend-timeout-seconds"] = 0, -- 0 disables suspend
--["monitor.channel-volumes"] = false,
-- Media source role, "input" or "playback"

@@ -34,7 +34,7 @@ bluez_midi_monitor.rules = {
--["priority.driver"] = 100,
--["priority.session"] = 100,
--["node.pause-on-idle"] = false,
--["session.suspend-timeout-seconds"] = 5, -- 0 disables suspend
["session.suspend-timeout-seconds"] = 0, -- 0 disables suspend
--["monitor.channel-volumes"] = false,
--["node.latency-offset-msec"] = -10, -- delay (<0) input to reduce jitter
},