2 files changed, 319 insertions, 1 deletions
diff --git a/arch/riscv/lib/crc-clmul-template.h b/arch/riscv/lib/crc-clmul-template.h
new file mode 100644
index 000000000000..77187e7f1762
--- /dev/null
+++ b/arch/riscv/lib/crc-clmul-template.h
@@ -0,0 +1,265 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Copyright 2025 Google LLC */
+
+/*
+ * This file is a "template" that generates a CRC function optimized using the
+ * RISC-V Zbc (scalar carryless multiplication) extension.  The includer of this
+ * file must define the following parameters to specify the type of CRC:
+ *
+ *	crc_t: the data type of the CRC, e.g. u32 for a 32-bit CRC
+ *	LSB_CRC: 0 for a msb (most-significant-bit) first CRC, i.e. natural
+ *		 mapping between bits and polynomial coefficients
+ *	         1 for a lsb (least-significant-bit) first CRC, i.e. reflected
+ *	         mapping between bits and polynomial coefficients
+ */
+
+#include <asm/byteorder.h>
+#include <linux/minmax.h>
+
+#define CRC_BITS	(8 * sizeof(crc_t))	/* a.k.a. 'n' */
+
+static inline unsigned long clmul(unsigned long a, unsigned long b)
+{
+	unsigned long res;
+
+	asm(".option push\n"
+	    ".option arch,+zbc\n"
+	    "clmul %0, %1, %2\n"
+	    ".option pop\n"
+	    : "=r" (res) : "r" (a), "r" (b));
+	return res;
+}
+
+static inline unsigned long clmulh(unsigned long a, unsigned long b)
+{
+	unsigned long res;
+
+	asm(".option push\n"
+	    ".option arch,+zbc\n"
+	    "clmulh %0, %1, %2\n"
+	    ".option pop\n"
+	    : "=r" (res) : "r" (a), "r" (b));
+	return res;
+}
+
+static inline unsigned long clmulr(unsigned long a, unsigned long b)
+{
+	unsigned long res;
+
+	asm(".option push\n"
+	    ".option arch,+zbc\n"
+	    "clmulr %0, %1, %2\n"
+	    ".option pop\n"
+	    : "=r" (res) : "r" (a), "r" (b));
+	return res;
+}
+
+/*
+ * crc_load_long() loads one "unsigned long" of aligned data bytes, producing a
+ * polynomial whose bit order matches the CRC's bit order.
+ */
+#ifdef CONFIG_64BIT
+#  if LSB_CRC
+#    define crc_load_long(x)	le64_to_cpup(x)
+#  else
+#    define crc_load_long(x)	be64_to_cpup(x)
+#  endif
+#else
+#  if LSB_CRC
+#    define crc_load_long(x)	le32_to_cpup(x)
+#  else
+#    define crc_load_long(x)	be32_to_cpup(x)
+#  endif
+#endif
+
+/* XOR @crc into the end of @msgpoly that represents the high-order terms. */
+static inline unsigned long
+crc_clmul_prep(crc_t crc, unsigned long msgpoly)
+{
+#if LSB_CRC
+	return msgpoly ^ crc;
+#else
+	return msgpoly ^ ((unsigned long)crc << (BITS_PER_LONG - CRC_BITS));
+#endif
+}
+
+/*
+ * Multiply the long-sized @msgpoly by x^n (a.k.a. x^CRC_BITS) and reduce it
+ * modulo the generator polynomial G.  This gives the CRC of @msgpoly.
+ */
+static inline crc_t
+crc_clmul_long(unsigned long msgpoly, const struct crc_clmul_consts *consts)
+{
+	unsigned long tmp;
+
+	/*
+	 * First step of Barrett reduction with integrated multiplication by
+	 * x^n: calculate floor((msgpoly * x^n) / G).  This is the value by
+	 * which G needs to be multiplied to cancel out the x^n and higher terms
+	 * of msgpoly * x^n.  Do it using the following formula:
+	 *
+	 * msb-first:
+	 *    floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G)) / x^(BITS_PER_LONG-1))
+	 * lsb-first:
+	 *    floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G) * x) / x^BITS_PER_LONG)
+	 *
+	 * barrett_reduction_const_1 contains floor(x^(BITS_PER_LONG-1+n) / G),
+	 * which fits a long exactly.  Using any lower power of x there would
+	 * not carry enough precision through the calculation, while using any
+	 * higher power of x would require extra instructions to handle a wider
+	 * multiplication.  In the msb-first case, using this power of x results
+	 * in needing a floored division by x^(BITS_PER_LONG-1), which matches
+	 * what clmulr produces.  In the lsb-first case, a factor of x gets
+	 * implicitly introduced by each carryless multiplication (shown as
+	 * '* x' above), and the floored division instead needs to be by
+	 * x^BITS_PER_LONG which matches what clmul produces.
+	 */
+#if LSB_CRC
+	tmp = clmul(msgpoly, consts->barrett_reduction_const_1);
+#else
+	tmp = clmulr(msgpoly, consts->barrett_reduction_const_1);
+#endif
+
+	/*
+	 * Second step of Barrett reduction:
+	 *
+	 *    crc := (msgpoly * x^n) + (G * floor((msgpoly * x^n) / G))
+	 *
+	 * This reduces (msgpoly * x^n) modulo G by adding the appropriate
+	 * multiple of G to it.  The result uses only the x^0..x^(n-1) terms.
+	 * HOWEVER, since the unreduced value (msgpoly * x^n) is zero in those
+	 * terms in the first place, it is more efficient to do the equivalent:
+	 *
+	 *    crc := ((G - x^n) * floor((msgpoly * x^n) / G)) mod x^n
+	 *
+	 * In the lsb-first case further modify it to the following which avoids
+	 * a shift, as the crc ends up in the physically low n bits from clmulr:
+	 *
+	 *    product := ((G - x^n) * x^(BITS_PER_LONG - n)) * floor((msgpoly * x^n) / G) * x
+	 *    crc := floor(product / x^(BITS_PER_LONG + 1 - n)) mod x^n
+	 *
+	 * barrett_reduction_const_2 contains the constant multiplier (G - x^n)
+	 * or (G - x^n) * x^(BITS_PER_LONG - n) from the formulas above.  The
+	 * cast of the result to crc_t is essential, as it applies the mod x^n!
+	 */
+#if LSB_CRC
+	return clmulr(tmp, consts->barrett_reduction_const_2);
+#else
+	return clmul(tmp, consts->barrett_reduction_const_2);
+#endif
+}
+
+/* Update @crc with the data from @msgpoly. */
+static inline crc_t
+crc_clmul_update_long(crc_t crc, unsigned long msgpoly,
+		      const struct crc_clmul_consts *consts)
+{
+	return crc_clmul_long(crc_clmul_prep(crc, msgpoly), consts);
+}
+
+/* Update @crc with 1 <= @len < sizeof(unsigned long) bytes of data. */
+static inline crc_t
+crc_clmul_update_partial(crc_t crc, const u8 *p, size_t len,
+			 const struct crc_clmul_consts *consts)
+{
+	unsigned long msgpoly;
+	size_t i;
+
+#if LSB_CRC
+	msgpoly = (unsigned long)p[0] << (BITS_PER_LONG - 8);
+	for (i = 1; i < len; i++)
+		msgpoly = (msgpoly >> 8) ^ ((unsigned long)p[i] << (BITS_PER_LONG - 8));
+#else
+	msgpoly = p[0];
+	for (i = 1; i < len; i++)
+		msgpoly = (msgpoly << 8) ^ p[i];
+#endif
+
+	if (len >= sizeof(crc_t)) {
+	#if LSB_CRC
+		msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
+	#else
+		msgpoly ^= (unsigned long)crc << (8*len - CRC_BITS);
+	#endif
+		return crc_clmul_long(msgpoly, consts);
+	}
+#if LSB_CRC
+	msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
+	return crc_clmul_long(msgpoly, consts) ^ (crc >> (8*len));
+#else
+	msgpoly ^= crc >> (CRC_BITS - 8*len);
+	return crc_clmul_long(msgpoly, consts) ^ (crc << (8*len));
+#endif
+}
+
+static inline crc_t
+crc_clmul(crc_t crc, const void *p, size_t len,
+	  const struct crc_clmul_consts *consts)
+{
+	size_t align;
+
+	/* This implementation assumes that the CRC fits in an unsigned long. */
+	BUILD_BUG_ON(sizeof(crc_t) > sizeof(unsigned long));
+
+	/* If the buffer is not long-aligned, align it. */
+	align = (unsigned long)p % sizeof(unsigned long);
+	if (align && len) {
+		align = min(sizeof(unsigned long) - align, len);
+		crc = crc_clmul_update_partial(crc, p, align, consts);
+		p += align;
+		len -= align;
+	}
+
+	if (len >= 4 * sizeof(unsigned long)) {
+		unsigned long m0, m1;
+
+		m0 = crc_clmul_prep(crc, crc_load_long(p));
+		m1 = crc_load_long(p + sizeof(unsigned long));
+		p += 2 * sizeof(unsigned long);
+		len -= 2 * sizeof(unsigned long);
+		/*
+		 * Main loop.  Each iteration starts with a message polynomial
+		 * (x^BITS_PER_LONG)*m0 + m1, then logically extends it by two
+		 * more longs of data to form x^(3*BITS_PER_LONG)*m0 +
+		 * x^(2*BITS_PER_LONG)*m1 + x^BITS_PER_LONG*m2 + m3, then
+		 * "folds" that back into a congruent (modulo G) value that uses
+		 * just m0 and m1 again.  This is done by multiplying m0 by the
+		 * precomputed constant (x^(3*BITS_PER_LONG) mod G) and m1 by
+		 * the precomputed constant (x^(2*BITS_PER_LONG) mod G), then
+		 * adding the results to m2 and m3 as appropriate.  Each such
+		 * multiplication produces a result twice the length of a long,
+		 * which in RISC-V is two instructions clmul and clmulh.
+		 *
+		 * This could be changed to fold across more than 2 longs at a
+		 * time if there is a CPU that can take advantage of it.
+		 */
+		do {
+			unsigned long p0, p1, p2, p3;
+
+			p0 = clmulh(m0, consts->fold_across_2_longs_const_hi);
+			p1 = clmul(m0, consts->fold_across_2_longs_const_hi);
+			p2 = clmulh(m1, consts->fold_across_2_longs_const_lo);
+			p3 = clmul(m1, consts->fold_across_2_longs_const_lo);
+			m0 = (LSB_CRC ? p1 ^ p3 : p0 ^ p2) ^ crc_load_long(p);
+			m1 = (LSB_CRC ? p0 ^ p2 : p1 ^ p3) ^
+			     crc_load_long(p + sizeof(unsigned long));
+
+			p += 2 * sizeof(unsigned long);
+			len -= 2 * sizeof(unsigned long);
+		} while (len >= 2 * sizeof(unsigned long));
+
+		crc = crc_clmul_long(m0, consts);
+		crc = crc_clmul_update_long(crc, m1, consts);
+	}
+
+	while (len >= sizeof(unsigned long)) {
+		crc = crc_clmul_update_long(crc, crc_load_long(p), consts);
+		p += sizeof(unsigned long);
+		len -= sizeof(unsigned long);
+	}
+
+	if (len)
+		crc = crc_clmul_update_partial(crc, p, len, consts);
+
+	return crc;
+}
diff --git a/scripts/gen-crc-consts.py b/scripts/gen-crc-consts.py
index aa678a50897d..f9b44fc3a03f 100755
--- a/scripts/gen-crc-consts.py
+++ b/scripts/gen-crc-consts.py
@@ -105,6 +105,57 @@ def gen_slicebyN_tables(variants, n):
             print(f'\t{s}')
         print('};')
 
+def print_riscv_const(v, bits_per_long, name, val, desc):
+    print(f'\t.{name} = {fmt_poly(v, val, bits_per_long)}, /* {desc} */')
+
+def do_gen_riscv_clmul_consts(v, bits_per_long):
+    (G, n, lsb) = (v.G, v.bits, v.lsb)
+
+    pow_of_x = 3 * bits_per_long - (1 if lsb else 0)
+    print_riscv_const(v, bits_per_long, 'fold_across_2_longs_const_hi',
+                      reduce(1 << pow_of_x, G), f'x^{pow_of_x} mod G')
+    pow_of_x = 2 * bits_per_long - (1 if lsb else 0)
+    print_riscv_const(v, bits_per_long, 'fold_across_2_longs_const_lo',
+                      reduce(1 << pow_of_x, G), f'x^{pow_of_x} mod G')
+
+    pow_of_x = bits_per_long - 1 + n
+    print_riscv_const(v, bits_per_long, 'barrett_reduction_const_1',
+                      div(1 << pow_of_x, G), f'floor(x^{pow_of_x} / G)')
+
+    val = G - (1 << n)
+    desc = f'G - x^{n}'
+    if lsb:
+        val <<= bits_per_long - n
+        desc = f'({desc}) * x^{bits_per_long - n}'
+    print_riscv_const(v, bits_per_long, 'barrett_reduction_const_2', val, desc)
+
+def gen_riscv_clmul_consts(variants):
+    print('')
+    print('struct crc_clmul_consts {');
+    print('\tunsigned long fold_across_2_longs_const_hi;');
+    print('\tunsigned long fold_across_2_longs_const_lo;');
+    print('\tunsigned long barrett_reduction_const_1;');
+    print('\tunsigned long barrett_reduction_const_2;');
+    print('};');
+    for v in variants:
+        print('');
+        if v.bits > 32:
+            print_header(v, 'Constants')
+            print('#ifdef CONFIG_64BIT')
+            print(f'static const struct crc_clmul_consts {v.name}_consts __maybe_unused = {{')
+            do_gen_riscv_clmul_consts(v, 64)
+            print('};')
+            print('#endif')
+        else:
+            print_header(v, 'Constants')
+            print(f'static const struct crc_clmul_consts {v.name}_consts __maybe_unused = {{')
+            print('#ifdef CONFIG_64BIT')
+            do_gen_riscv_clmul_consts(v, 64)
+            print('#else')
+            do_gen_riscv_clmul_consts(v, 32)
+            print('#endif')
+            print('};')
+
 # Generate constants for carryless multiplication based CRC computation.
 def gen_x86_pclmul_consts(variants):
     # These are the distances, in bits, to generate folding constants for.
@@ -213,7 +264,7 @@ def parse_crc_variants(vars_string):
 
 if len(sys.argv) != 3:
     sys.stderr.write(f'Usage: {sys.argv[0]} CONSTS_TYPE[,CONSTS_TYPE]... CRC_VARIANT[,CRC_VARIANT]...\n')
-    sys.stderr.write('  CONSTS_TYPE can be sliceby[1-8] or x86_pclmul\n')
+    sys.stderr.write('  CONSTS_TYPE can be sliceby[1-8], riscv_clmul, or x86_pclmul\n')
     sys.stderr.write('  CRC_VARIANT is crc${num_bits}_${bit_order}_${generator_poly_as_hex}\n')
     sys.stderr.write('     E.g. crc16_msb_0x8bb7 or crc32_lsb_0xedb88320\n')
     sys.stderr.write('     Polynomial must use the given bit_order and exclude x^{num_bits}\n')
@@ -232,6 +283,8 @@ variants = parse_crc_variants(sys.argv[2])
 for consts_type in consts_types:
     if consts_type.startswith('sliceby'):
         gen_slicebyN_tables(variants, int(consts_type.removeprefix('sliceby')))
+    elif consts_type == 'riscv_clmul':
+        gen_riscv_clmul_consts(variants)
     elif consts_type == 'x86_pclmul':
         gen_x86_pclmul_consts(variants)
     else: