iommu/arm-smmu-v3: Reduce contention during command-queue insertion
The SMMU command queue is a bottleneck in large systems, thanks to the
spin_lock which serialises accesses from all CPUs to the single queue
supported by the hardware.

Attempt to improve this situation by moving to a new algorithm for
inserting commands into the queue, which is lock-free on the fast-path.

Tested-by: Ganapatrao Kulkarni <gkulkarni@marvell.com>
Signed-off-by: Will Deacon <will@kernel.org>
parent 7c288a5b27
commit 587e6c10a7
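Not part of the commit itself: below is a minimal user-space sketch, using C11 atomics, of the space-reservation step (step 1 of the algorithm described in the patch), which the real code implements with cmpxchg_relaxed() on the packed prod/cons word (llq.val). All toy_* names, the 16-entry queue size and the main() driver are invented for illustration; the real code also packs cons into the same 64-bit word, carries an owner flag in prod and tracks written entries in a valid bitmap, none of which is modelled here.

/*
 * Illustrative only -- not the kernel's arm-smmu-v3 implementation.
 * CPUs claim slots in a shared ring with a compare-exchange loop
 * instead of a spinlock.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_N_SHIFT	4				/* 16-entry toy queue */
#define IDX(p)		((p) & ((1u << MAX_N_SHIFT) - 1))
#define WRP(p)		((p) & (1u << MAX_N_SHIFT))	/* wrap bit, as in Q_WRP() */

struct toy_llq {
	_Atomic uint32_t prod;				/* shared producer pointer */
	uint32_t cons;					/* advanced by the consumer */
};

/* Reserve n consecutive slots; returns the old prod value (our base). */
static uint32_t toy_reserve(struct toy_llq *q, uint32_t n)
{
	uint32_t old = atomic_load_explicit(&q->prod, memory_order_relaxed);
	uint32_t new;

	do {
		/* wrap-aware increment, mirroring queue_inc_prod_n() */
		new = (WRP(old) | IDX(old)) + n;
		new = WRP(new) | IDX(new);
	} while (!atomic_compare_exchange_weak_explicit(&q->prod, &old, new,
							memory_order_relaxed,
							memory_order_relaxed));
	return old;	/* commands are then written at IDX(old) onwards */
}

int main(void)
{
	struct toy_llq q = { .cons = 0 };

	atomic_init(&q.prod, 0);
	uint32_t a = toy_reserve(&q, 3);
	uint32_t b = toy_reserve(&q, 2);
	printf("first batch at idx %u, second at idx %u\n",
	       (unsigned)IDX(a), (unsigned)IDX(b));
	return 0;
}

The point of the compare-exchange loop is that a CPU which loses the race simply retries with the updated prod value, so no CPU ever blocks another while claiming space; queue-full handling, ownership hand-off and the valid bitmap in the patch below build on top of this.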
@@ -183,7 +183,7 @@

 #define Q_IDX(llq, p)			((p) & ((1 << (llq)->max_n_shift) - 1))
 #define Q_WRP(llq, p)			((p) & (1 << (llq)->max_n_shift))
-#define Q_OVERFLOW_FLAG			(1 << 31)
+#define Q_OVERFLOW_FLAG			(1U << 31)
 #define Q_OVF(p)			((p) & Q_OVERFLOW_FLAG)
 #define Q_ENT(q, p)			((q)->base +			\
 					 Q_IDX(&((q)->llq), p) *	\
@@ -307,6 +307,8 @@
 #define CMDQ_ERR_CERROR_ABT_IDX		2
 #define CMDQ_ERR_CERROR_ATC_INV_IDX	3

+#define CMDQ_PROD_OWNED_FLAG		Q_OVERFLOW_FLAG
+
 #define CMDQ_0_OP			GENMASK_ULL(7, 0)
 #define CMDQ_0_SSV			(1UL << 11)

@@ -369,9 +371,8 @@
 #define PRIQ_1_ADDR_MASK		GENMASK_ULL(63, 12)

 /* High-level queue structures */
-#define ARM_SMMU_POLL_TIMEOUT_US	100
-#define ARM_SMMU_CMDQ_SYNC_TIMEOUT_US	1000000 /* 1s! */
-#define ARM_SMMU_CMDQ_SYNC_SPIN_COUNT	10
+#define ARM_SMMU_POLL_TIMEOUT_US	1000000 /* 1s! */
+#define ARM_SMMU_POLL_SPIN_COUNT	10

 #define MSI_IOVA_BASE			0x8000000
 #define MSI_IOVA_LENGTH			0x100000
@@ -473,15 +474,24 @@ struct arm_smmu_cmdq_ent {

 		#define CMDQ_OP_CMD_SYNC	0x46
 		struct {
-			u32			msidata;
 			u64			msiaddr;
 		} sync;
 	};
 };

 struct arm_smmu_ll_queue {
-	u32				prod;
-	u32				cons;
+	union {
+		u64			val;
+		struct {
+			u32		prod;
+			u32		cons;
+		};
+		struct {
+			atomic_t	prod;
+			atomic_t	cons;
+		} atomic;
+		u8			__pad[SMP_CACHE_BYTES];
+	} ____cacheline_aligned_in_smp;
 	u32				max_n_shift;
 };

@@ -499,9 +509,18 @@ struct arm_smmu_queue {
 	u32 __iomem			*cons_reg;
 };

+struct arm_smmu_queue_poll {
+	ktime_t				timeout;
+	unsigned int			delay;
+	unsigned int			spin_cnt;
+	bool				wfe;
+};
+
 struct arm_smmu_cmdq {
 	struct arm_smmu_queue		q;
-	spinlock_t			lock;
+	atomic_long_t			*valid_map;
+	atomic_t			owner_prod;
+	atomic_t			lock;
 };

 struct arm_smmu_evtq {
@@ -581,8 +600,6 @@ struct arm_smmu_device {

 	int				gerr_irq;
 	int				combined_irq;
-	u32				sync_nr;
-	u8				prev_cmd_opcode;

 	unsigned long			ias; /* IPA */
 	unsigned long			oas; /* PA */
@@ -601,12 +618,6 @@ struct arm_smmu_device {

 	struct arm_smmu_strtab_cfg	strtab_cfg;

-	/* Hi16xx adds an extra 32 bits of goodness to its MSI payload */
-	union {
-		u32			sync_count;
-		u64			padding;
-	};
-
 	/* IOMMU core code handle */
 	struct iommu_device		iommu;
 };
@@ -690,6 +701,21 @@ static void parse_driver_options(struct arm_smmu_device *smmu)
 }

 /* Low-level queue manipulation functions */
+static bool queue_has_space(struct arm_smmu_ll_queue *q, u32 n)
+{
+	u32 space, prod, cons;
+
+	prod = Q_IDX(q, q->prod);
+	cons = Q_IDX(q, q->cons);
+
+	if (Q_WRP(q, q->prod) == Q_WRP(q, q->cons))
+		space = (1 << q->max_n_shift) - (prod - cons);
+	else
+		space = cons - prod;
+
+	return space >= n;
+}
+
 static bool queue_full(struct arm_smmu_ll_queue *q)
 {
 	return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) &&
@@ -702,9 +728,12 @@ static bool queue_empty(struct arm_smmu_ll_queue *q)
 	       Q_WRP(q, q->prod) == Q_WRP(q, q->cons);
 }

-static void queue_sync_cons_in(struct arm_smmu_queue *q)
+static bool queue_consumed(struct arm_smmu_ll_queue *q, u32 prod)
 {
-	q->llq.cons = readl_relaxed(q->cons_reg);
+	return ((Q_WRP(q, q->cons) == Q_WRP(q, prod)) &&
+		(Q_IDX(q, q->cons) > Q_IDX(q, prod))) ||
+	       ((Q_WRP(q, q->cons) != Q_WRP(q, prod)) &&
+		(Q_IDX(q, q->cons) <= Q_IDX(q, prod)));
 }

 static void queue_sync_cons_out(struct arm_smmu_queue *q)
@@ -735,46 +764,34 @@ static int queue_sync_prod_in(struct arm_smmu_queue *q)
 	return ret;
 }

-static void queue_sync_prod_out(struct arm_smmu_queue *q)
+static u32 queue_inc_prod_n(struct arm_smmu_ll_queue *q, int n)
 {
-	writel(q->llq.prod, q->prod_reg);
+	u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + n;
+	return Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
 }

-static void queue_inc_prod(struct arm_smmu_ll_queue *q)
+static void queue_poll_init(struct arm_smmu_device *smmu,
+			    struct arm_smmu_queue_poll *qp)
 {
-	u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1;
-	q->prod = Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
+	qp->delay = 1;
+	qp->spin_cnt = 0;
+	qp->wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
+	qp->timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US);
 }

-/*
- * Wait for the SMMU to consume items. If sync is true, wait until the queue
- * is empty. Otherwise, wait until there is at least one free slot.
- */
-static int queue_poll_cons(struct arm_smmu_queue *q, bool sync, bool wfe)
+static int queue_poll(struct arm_smmu_queue_poll *qp)
 {
-	ktime_t timeout;
-	unsigned int delay = 1, spin_cnt = 0;
+	if (ktime_compare(ktime_get(), qp->timeout) > 0)
+		return -ETIMEDOUT;

-	/* Wait longer if it's a CMD_SYNC */
-	timeout = ktime_add_us(ktime_get(), sync ?
-					    ARM_SMMU_CMDQ_SYNC_TIMEOUT_US :
-					    ARM_SMMU_POLL_TIMEOUT_US);
-
-	while (queue_sync_cons_in(q),
-	      (sync ? !queue_empty(&q->llq) : queue_full(&q->llq))) {
-		if (ktime_compare(ktime_get(), timeout) > 0)
-			return -ETIMEDOUT;
-
-		if (wfe) {
-			wfe();
-		} else if (++spin_cnt < ARM_SMMU_CMDQ_SYNC_SPIN_COUNT) {
-			cpu_relax();
-			continue;
-		} else {
-			udelay(delay);
-			delay *= 2;
-			spin_cnt = 0;
-		}
+	if (qp->wfe) {
+		wfe();
+	} else if (++qp->spin_cnt < ARM_SMMU_POLL_SPIN_COUNT) {
+		cpu_relax();
+	} else {
+		udelay(qp->delay);
+		qp->delay *= 2;
+		qp->spin_cnt = 0;
 	}

 	return 0;
@@ -788,17 +805,6 @@ static void queue_write(__le64 *dst, u64 *src, size_t n_dwords)
 		*dst++ = cpu_to_le64(*src++);
 }

-static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
-{
-	if (queue_full(&q->llq))
-		return -ENOSPC;
-
-	queue_write(Q_ENT(q, q->llq.prod), ent, q->ent_dwords);
-	queue_inc_prod(&q->llq);
-	queue_sync_prod_out(q);
-	return 0;
-}
-
 static void queue_read(__le64 *dst, u64 *src, size_t n_dwords)
 {
 	int i;
@@ -881,20 +887,14 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
 		cmd[1] |= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp);
 		break;
 	case CMDQ_OP_CMD_SYNC:
-		if (ent->sync.msiaddr)
+		if (ent->sync.msiaddr) {
 			cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_IRQ);
-		else
+			cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
+		} else {
 			cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV);
+		}
 		cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH);
 		cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIATTR, ARM_SMMU_MEMATTR_OIWB);
-		/*
-		 * Commands are written little-endian, but we want the SMMU to
-		 * receive MSIData, and thus write it back to memory, in CPU
-		 * byte order, so big-endian needs an extra byteswap here.
-		 */
-		cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIDATA,
-				     cpu_to_le32(ent->sync.msidata));
-		cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
 		break;
 	default:
 		return -ENOENT;
@@ -903,6 +903,27 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
 	return 0;
 }

+static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu,
+					 u32 prod)
+{
+	struct arm_smmu_queue *q = &smmu->cmdq.q;
+	struct arm_smmu_cmdq_ent ent = {
+		.opcode = CMDQ_OP_CMD_SYNC,
+	};
+
+	/*
+	 * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI
+	 * payload, so the write will zero the entire command on that platform.
+	 */
+	if (smmu->features & ARM_SMMU_FEAT_MSI &&
+	    smmu->features & ARM_SMMU_FEAT_COHERENCY) {
+		ent.sync.msiaddr = q->base_dma + Q_IDX(&q->llq, prod) *
+				   q->ent_dwords * 8;
+	}
+
+	arm_smmu_cmdq_build_cmd(cmd, &ent);
+}
+
 static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
 {
 	static const char *cerror_str[] = {
@@ -961,109 +982,440 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
 	queue_write(Q_ENT(q, cons), cmd, q->ent_dwords);
 }

-static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd)
+/*
+ * Command queue locking.
+ * This is a form of bastardised rwlock with the following major changes:
+ *
+ * - The only LOCK routines are exclusive_trylock() and shared_lock().
+ *   Neither have barrier semantics, and instead provide only a control
+ *   dependency.
+ *
+ * - The UNLOCK routines are supplemented with shared_tryunlock(), which
+ *   fails if the caller appears to be the last lock holder (yes, this is
+ *   racy). All successful UNLOCK routines have RELEASE semantics.
+ */
+static void arm_smmu_cmdq_shared_lock(struct arm_smmu_cmdq *cmdq)
 {
-	struct arm_smmu_queue *q = &smmu->cmdq.q;
-	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
+	int val;

-	smmu->prev_cmd_opcode = FIELD_GET(CMDQ_0_OP, cmd[0]);
+	/*
+	 * We can try to avoid the cmpxchg() loop by simply incrementing the
+	 * lock counter. When held in exclusive state, the lock counter is set
+	 * to INT_MIN so these increments won't hurt as the value will remain
+	 * negative.
+	 */
+	if (atomic_fetch_inc_relaxed(&cmdq->lock) >= 0)
+		return;

-	while (queue_insert_raw(q, cmd) == -ENOSPC) {
-		if (queue_poll_cons(q, false, wfe))
-			dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
+	do {
+		val = atomic_cond_read_relaxed(&cmdq->lock, VAL >= 0);
+	} while (atomic_cmpxchg_relaxed(&cmdq->lock, val, val + 1) != val);
+}
+
+static void arm_smmu_cmdq_shared_unlock(struct arm_smmu_cmdq *cmdq)
+{
+	(void)atomic_dec_return_release(&cmdq->lock);
+}
+
+static bool arm_smmu_cmdq_shared_tryunlock(struct arm_smmu_cmdq *cmdq)
+{
+	if (atomic_read(&cmdq->lock) == 1)
+		return false;
+
+	arm_smmu_cmdq_shared_unlock(cmdq);
+	return true;
+}
+
+#define arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags)		\
+({									\
+	bool __ret;							\
+	local_irq_save(flags);						\
+	__ret = !atomic_cmpxchg_relaxed(&cmdq->lock, 0, INT_MIN);	\
+	if (!__ret)							\
+		local_irq_restore(flags);				\
+	__ret;								\
+})
+
+#define arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags)		\
+({									\
+	atomic_set_release(&cmdq->lock, 0);				\
+	local_irq_restore(flags);					\
+})
+
+
+/*
+ * Command queue insertion.
+ * This is made fiddly by our attempts to achieve some sort of scalability
+ * since there is one queue shared amongst all of the CPUs in the system. If
+ * you like mixed-size concurrency, dependency ordering and relaxed atomics,
+ * then you'll *love* this monstrosity.
+ *
+ * The basic idea is to split the queue up into ranges of commands that are
+ * owned by a given CPU; the owner may not have written all of the commands
+ * itself, but is responsible for advancing the hardware prod pointer when
+ * the time comes. The algorithm is roughly:
+ *
+ *	1. Allocate some space in the queue. At this point we also discover
+ *	   whether the head of the queue is currently owned by another CPU,
+ *	   or whether we are the owner.
+ *
+ *	2. Write our commands into our allocated slots in the queue.
+ *
+ *	3. Mark our slots as valid in arm_smmu_cmdq.valid_map.
+ *
+ *	4. If we are an owner:
+ *		a. Wait for the previous owner to finish.
+ *		b. Mark the queue head as unowned, which tells us the range
+ *		   that we are responsible for publishing.
+ *		c. Wait for all commands in our owned range to become valid.
+ *		d. Advance the hardware prod pointer.
+ *		e. Tell the next owner we've finished.
+ *
+ *	5. If we are inserting a CMD_SYNC (we may or may not have been an
+ *	   owner), then we need to stick around until it has completed:
+ *		a. If we have MSIs, the SMMU can write back into the CMD_SYNC
+ *		   to clear the first 4 bytes.
+ *		b. Otherwise, we spin waiting for the hardware cons pointer to
+ *		   advance past our command.
+ *
+ * The devil is in the details, particularly the use of locking for handling
+ * SYNC completion and freeing up space in the queue before we think that it is
+ * full.
+ */
+static void __arm_smmu_cmdq_poll_set_valid_map(struct arm_smmu_cmdq *cmdq,
+					       u32 sprod, u32 eprod, bool set)
+{
+	u32 swidx, sbidx, ewidx, ebidx;
+	struct arm_smmu_ll_queue llq = {
+		.max_n_shift	= cmdq->q.llq.max_n_shift,
+		.prod		= sprod,
+	};
+
+	ewidx = BIT_WORD(Q_IDX(&llq, eprod));
+	ebidx = Q_IDX(&llq, eprod) % BITS_PER_LONG;
+
+	while (llq.prod != eprod) {
+		unsigned long mask;
+		atomic_long_t *ptr;
+		u32 limit = BITS_PER_LONG;
+
+		swidx = BIT_WORD(Q_IDX(&llq, llq.prod));
+		sbidx = Q_IDX(&llq, llq.prod) % BITS_PER_LONG;
+
+		ptr = &cmdq->valid_map[swidx];
+
+		if ((swidx == ewidx) && (sbidx < ebidx))
+			limit = ebidx;
+
+		mask = GENMASK(limit - 1, sbidx);
+
+		/*
+		 * The valid bit is the inverse of the wrap bit. This means
+		 * that a zero-initialised queue is invalid and, after marking
+		 * all entries as valid, they become invalid again when we
+		 * wrap.
+		 */
+		if (set) {
+			atomic_long_xor(mask, ptr);
+		} else { /* Poll */
+			unsigned long valid;
+
+			valid = (ULONG_MAX + !!Q_WRP(&llq, llq.prod)) & mask;
+			atomic_long_cond_read_relaxed(ptr, (VAL & mask) == valid);
+		}
+
+		llq.prod = queue_inc_prod_n(&llq, limit - sbidx);
 	}
 }

-static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
-				    struct arm_smmu_cmdq_ent *ent)
+/* Mark all entries in the range [sprod, eprod) as valid */
+static void arm_smmu_cmdq_set_valid_map(struct arm_smmu_cmdq *cmdq,
+					u32 sprod, u32 eprod)
+{
+	__arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, true);
+}
+
+/* Wait for all entries in the range [sprod, eprod) to become valid */
+static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq,
+					 u32 sprod, u32 eprod)
+{
+	__arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, false);
+}
+
+/* Wait for the command queue to become non-full */
+static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
+					     struct arm_smmu_ll_queue *llq)
+{
+	unsigned long flags;
+	struct arm_smmu_queue_poll qp;
+	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+	int ret = 0;
+
+	/*
+	 * Try to update our copy of cons by grabbing exclusive cmdq access. If
+	 * that fails, spin until somebody else updates it for us.
+	 */
+	if (arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags)) {
+		WRITE_ONCE(cmdq->q.llq.cons, readl_relaxed(cmdq->q.cons_reg));
+		arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags);
+		llq->val = READ_ONCE(cmdq->q.llq.val);
+		return 0;
+	}
+
+	queue_poll_init(smmu, &qp);
+	do {
+		llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
+		if (!queue_full(llq))
+			break;
+
+		ret = queue_poll(&qp);
+	} while (!ret);
+
+	return ret;
+}
+
+/*
+ * Wait until the SMMU signals a CMD_SYNC completion MSI.
+ * Must be called with the cmdq lock held in some capacity.
+ */
+static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
+					  struct arm_smmu_ll_queue *llq)
+{
+	int ret = 0;
+	struct arm_smmu_queue_poll qp;
+	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+	u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod));
+
+	queue_poll_init(smmu, &qp);
+
+	/*
+	 * The MSI won't generate an event, since it's being written back
+	 * into the command queue.
+	 */
+	qp.wfe = false;
+	smp_cond_load_relaxed(cmd, !VAL || (ret = queue_poll(&qp)));
+	llq->cons = ret ? llq->prod : queue_inc_prod_n(llq, 1);
+	return ret;
+}
+
+/*
+ * Wait until the SMMU cons index passes llq->prod.
+ * Must be called with the cmdq lock held in some capacity.
+ */
+static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
+					       struct arm_smmu_ll_queue *llq)
+{
+	struct arm_smmu_queue_poll qp;
+	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+	u32 prod = llq->prod;
+	int ret = 0;
+
+	queue_poll_init(smmu, &qp);
+	llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
+	do {
+		if (queue_consumed(llq, prod))
+			break;
+
+		ret = queue_poll(&qp);
+
+		/*
+		 * This needs to be a readl() so that our subsequent call
+		 * to arm_smmu_cmdq_shared_tryunlock() can fail accurately.
+		 *
+		 * Specifically, we need to ensure that we observe all
+		 * shared_lock()s by other CMD_SYNCs that share our owner,
+		 * so that a failing call to tryunlock() means that we're
+		 * the last one out and therefore we can safely advance
+		 * cmdq->q.llq.cons. Roughly speaking:
+		 *
+		 * CPU 0		CPU1			CPU2 (us)
+		 *
+		 * if (sync)
+		 *	shared_lock();
+		 *
+		 * dma_wmb();
+		 * set_valid_map();
+		 *
+		 *			if (owner) {
+		 *				poll_valid_map();
+		 *				<control dependency>
+		 *				writel(prod_reg);
+		 *
+		 *						readl(cons_reg);
+		 *						tryunlock();
+		 *
+		 * Requires us to see CPU 0's shared_lock() acquisition.
+		 */
+		llq->cons = readl(cmdq->q.cons_reg);
+	} while (!ret);
+
+	return ret;
+}
+
+static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu,
+					 struct arm_smmu_ll_queue *llq)
+{
+	if (smmu->features & ARM_SMMU_FEAT_MSI &&
+	    smmu->features & ARM_SMMU_FEAT_COHERENCY)
+		return __arm_smmu_cmdq_poll_until_msi(smmu, llq);
+
+	return __arm_smmu_cmdq_poll_until_consumed(smmu, llq);
+}
+
+static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
+					u32 prod, int n)
+{
+	int i;
+	struct arm_smmu_ll_queue llq = {
+		.max_n_shift	= cmdq->q.llq.max_n_shift,
+		.prod		= prod,
+	};
+
+	for (i = 0; i < n; ++i) {
+		u64 *cmd = &cmds[i * CMDQ_ENT_DWORDS];
+
+		prod = queue_inc_prod_n(&llq, i);
+		queue_write(Q_ENT(&cmdq->q, prod), cmd, CMDQ_ENT_DWORDS);
+	}
+}
+
+static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
+				       u64 *cmds, int n, bool sync)
+{
+	u64 cmd_sync[CMDQ_ENT_DWORDS];
+	u32 prod;
+	unsigned long flags;
+	bool owner;
+	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+	struct arm_smmu_ll_queue llq = {
+		.max_n_shift = cmdq->q.llq.max_n_shift,
+	}, head = llq;
+	int ret = 0;
+
+	/* 1. Allocate some space in the queue */
+	local_irq_save(flags);
+	llq.val = READ_ONCE(cmdq->q.llq.val);
+	do {
+		u64 old;
+
+		while (!queue_has_space(&llq, n + sync)) {
+			local_irq_restore(flags);
+			if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
+				dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
+			local_irq_save(flags);
+		}
+
+		head.cons = llq.cons;
+		head.prod = queue_inc_prod_n(&llq, n + sync) |
+					     CMDQ_PROD_OWNED_FLAG;
+
+		old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
+		if (old == llq.val)
+			break;
+
+		llq.val = old;
+	} while (1);
+	owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG);
+	head.prod &= ~CMDQ_PROD_OWNED_FLAG;
+	llq.prod &= ~CMDQ_PROD_OWNED_FLAG;
+
+	/*
+	 * 2. Write our commands into the queue
+	 * Dependency ordering from the cmpxchg() loop above.
+	 */
+	arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n);
+	if (sync) {
+		prod = queue_inc_prod_n(&llq, n);
+		arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod);
+		queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS);
+
+		/*
+		 * In order to determine completion of our CMD_SYNC, we must
+		 * ensure that the queue can't wrap twice without us noticing.
+		 * We achieve that by taking the cmdq lock as shared before
+		 * marking our slot as valid.
+		 */
+		arm_smmu_cmdq_shared_lock(cmdq);
+	}
+
+	/* 3. Mark our slots as valid, ensuring commands are visible first */
+	dma_wmb();
+	arm_smmu_cmdq_set_valid_map(cmdq, llq.prod, head.prod);
+
+	/* 4. If we are the owner, take control of the SMMU hardware */
+	if (owner) {
+		/* a. Wait for previous owner to finish */
+		atomic_cond_read_relaxed(&cmdq->owner_prod, VAL == llq.prod);
+
+		/* b. Stop gathering work by clearing the owned flag */
+		prod = atomic_fetch_andnot_relaxed(CMDQ_PROD_OWNED_FLAG,
+						   &cmdq->q.llq.atomic.prod);
+		prod &= ~CMDQ_PROD_OWNED_FLAG;
+
+		/*
+		 * c. Wait for any gathered work to be written to the queue.
+		 * Note that we read our own entries so that we have the control
+		 * dependency required by (d).
+		 */
+		arm_smmu_cmdq_poll_valid_map(cmdq, llq.prod, prod);
+
+		/*
+		 * d. Advance the hardware prod pointer
+		 * Control dependency ordering from the entries becoming valid.
+		 */
+		writel_relaxed(prod, cmdq->q.prod_reg);
+
+		/*
+		 * e. Tell the next owner we're done
+		 * Make sure we've updated the hardware first, so that we don't
+		 * race to update prod and potentially move it backwards.
+		 */
+		atomic_set_release(&cmdq->owner_prod, prod);
+	}
+
+	/* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */
+	if (sync) {
+		llq.prod = queue_inc_prod_n(&llq, n);
+		ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq);
+		if (ret) {
+			dev_err_ratelimited(smmu->dev,
+					    "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n",
+					    llq.prod,
+					    readl_relaxed(cmdq->q.prod_reg),
+					    readl_relaxed(cmdq->q.cons_reg));
+		}
+
+		/*
+		 * Try to unlock the cmq lock. This will fail if we're the last
+		 * reader, in which case we can safely update cmdq->q.llq.cons
+		 */
+		if (!arm_smmu_cmdq_shared_tryunlock(cmdq)) {
+			WRITE_ONCE(cmdq->q.llq.cons, llq.cons);
+			arm_smmu_cmdq_shared_unlock(cmdq);
+		}
+	}
+
+	local_irq_restore(flags);
+	return ret;
+}
+
+static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
+				   struct arm_smmu_cmdq_ent *ent)
 {
 	u64 cmd[CMDQ_ENT_DWORDS];
-	unsigned long flags;

 	if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
 		dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
 			 ent->opcode);
-		return;
+		return -EINVAL;
 	}

-	spin_lock_irqsave(&smmu->cmdq.lock, flags);
-	arm_smmu_cmdq_insert_cmd(smmu, cmd);
-	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
-}
-
-/*
- * The difference between val and sync_idx is bounded by the maximum size of
- * a queue at 2^20 entries, so 32 bits is plenty for wrap-safe arithmetic.
- */
-static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
-{
-	ktime_t timeout;
-	u32 val;
-
-	timeout = ktime_add_us(ktime_get(), ARM_SMMU_CMDQ_SYNC_TIMEOUT_US);
-	val = smp_cond_load_acquire(&smmu->sync_count,
-				    (int)(VAL - sync_idx) >= 0 ||
-				    !ktime_before(ktime_get(), timeout));
-
-	return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0;
-}
-
-static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
-{
-	u64 cmd[CMDQ_ENT_DWORDS];
-	unsigned long flags;
-	struct arm_smmu_cmdq_ent ent = {
-		.opcode = CMDQ_OP_CMD_SYNC,
-		.sync	= {
-			.msiaddr = virt_to_phys(&smmu->sync_count),
-		},
-	};
-
-	spin_lock_irqsave(&smmu->cmdq.lock, flags);
-
-	/* Piggy-back on the previous command if it's a SYNC */
-	if (smmu->prev_cmd_opcode == CMDQ_OP_CMD_SYNC) {
-		ent.sync.msidata = smmu->sync_nr;
-	} else {
-		ent.sync.msidata = ++smmu->sync_nr;
-		arm_smmu_cmdq_build_cmd(cmd, &ent);
-		arm_smmu_cmdq_insert_cmd(smmu, cmd);
-	}
-
-	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
-
-	return __arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);
-}
-
-static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
-{
-	u64 cmd[CMDQ_ENT_DWORDS];
-	unsigned long flags;
-	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
-	struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
-	int ret;
-
-	arm_smmu_cmdq_build_cmd(cmd, &ent);
-
-	spin_lock_irqsave(&smmu->cmdq.lock, flags);
-	arm_smmu_cmdq_insert_cmd(smmu, cmd);
-	ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
-	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
-
-	return ret;
+	return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
 }

 static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
 {
-	int ret;
-	bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
-		   (smmu->features & ARM_SMMU_FEAT_COHERENCY);
-
-	ret = msi ? __arm_smmu_cmdq_issue_sync_msi(smmu)
-		  : __arm_smmu_cmdq_issue_sync(smmu);
-	if (ret)
-		dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n");
-	return ret;
+	return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
 }

 /* Context descriptor manipulation functions */
@@ -1580,9 +1932,9 @@ static void arm_smmu_tlb_inv_context(void *cookie)
 	/*
 	 * NOTE: when io-pgtable is in non-strict mode, we may get here with
 	 * PTEs previously cleared by unmaps on the current CPU not yet visible
-	 * to the SMMU. We are relying on the DSB implicit in
-	 * queue_sync_prod_out() to guarantee those are observed before the
-	 * TLBI. Do be careful, 007.
+	 * to the SMMU. We are relying on the dma_wmb() implicit during cmd
+	 * insertion to guarantee those are observed before the TLBI. Do be
+	 * careful, 007.
 	 */
 	arm_smmu_cmdq_issue_cmd(smmu, &cmd);
 	arm_smmu_cmdq_issue_sync(smmu);
@@ -2359,18 +2711,49 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
 	return 0;
 }

+static void arm_smmu_cmdq_free_bitmap(void *data)
+{
+	unsigned long *bitmap = data;
+	bitmap_free(bitmap);
+}
+
+static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
+{
+	int ret = 0;
+	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+	unsigned int nents = 1 << cmdq->q.llq.max_n_shift;
+	atomic_long_t *bitmap;
+
+	atomic_set(&cmdq->owner_prod, 0);
+	atomic_set(&cmdq->lock, 0);
+
+	bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL);
+	if (!bitmap) {
+		dev_err(smmu->dev, "failed to allocate cmdq bitmap\n");
+		ret = -ENOMEM;
+	} else {
+		cmdq->valid_map = bitmap;
+		devm_add_action(smmu->dev, arm_smmu_cmdq_free_bitmap, bitmap);
+	}
+
+	return ret;
+}
+
 static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
 {
 	int ret;

 	/* cmdq */
-	spin_lock_init(&smmu->cmdq.lock);
 	ret = arm_smmu_init_one_queue(smmu, &smmu->cmdq.q, ARM_SMMU_CMDQ_PROD,
 				      ARM_SMMU_CMDQ_CONS, CMDQ_ENT_DWORDS,
 				      "cmdq");
 	if (ret)
 		return ret;

+	ret = arm_smmu_cmdq_init(smmu);
+	if (ret)
+		return ret;
+
 	/* evtq */
 	ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD,
 				      ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS,
@@ -2951,9 +3334,15 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
 	/* Queue sizes, capped to ensure natural alignment */
 	smmu->cmdq.q.llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT,
 					     FIELD_GET(IDR1_CMDQS, reg));
-	if (!smmu->cmdq.q.llq.max_n_shift) {
-		/* Odd alignment restrictions on the base, so ignore for now */
-		dev_err(smmu->dev, "unit-length command queue not supported\n");
+	if (smmu->cmdq.q.llq.max_n_shift < ilog2(BITS_PER_LONG)) {
+		/*
+		 * The cmdq valid_map relies on the total number of entries
+		 * being a multiple of BITS_PER_LONG. There's also no way
+		 * we can handle the weird alignment restrictions on the
+		 * base pointer for a unit-length queue.
+		 */
+		dev_err(smmu->dev, "command queue size < %d entries not supported\n",
+			BITS_PER_LONG);
 		return -ENXIO;
 	}
