author    Eric Biggers <ebiggers@kernel.org>  2025-09-15 11:08:14 -0500
committer Eric Biggers <ebiggers@kernel.org>  2025-09-17 13:09:39 -0500
commit    4ca24d6abbca5df76c4b189dd94fb055613de297
tree      cfcd3f1d5efc964c0548e3b4ab4d9d84d540c232
parent    f0883b9c395ecdf7e66a58b6027fd35056cf152c
lib/crypto: sha256: Add support for 2-way interleaved hashing
Many arm64 and x86_64 CPUs can compute two SHA-256 hashes in nearly the same time as one, if the instructions are interleaved. This is because SHA-256 is serialized block-by-block, and two interleaved hashes take much better advantage of the CPU's instruction-level parallelism.

Meanwhile, a very common use case for SHA-256 hashing in the Linux kernel is dm-verity and fs-verity. Both use a Merkle tree that has a fixed block size, usually 4096 bytes with an empty or 32-byte salt prepended. Usually, many blocks need to be hashed at a time. This is an ideal scenario for 2-way interleaved hashing.

To enable this optimization, add a new function sha256_finup_2x() to the SHA-256 library API. It computes the hashes of two equal-length messages, starting from a common initial context. For now it always falls back to sequential processing; later patches will wire up arm64 and x86_64 optimized implementations.

Note that the interleaving factor could in principle be higher than 2x. However, that runs into many practical difficulties and CPU throughput limitations. Thus, both implementations I'm adding are 2x, and in the interest of using the simplest solution, the API matches that.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250915160819.140019-2-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
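To make the intended use case concrete, here is a minimal sketch of how a dm-verity/fs-verity-style caller could hash two sibling Merkle-tree blocks with the new API. The function name, the 4096-byte block size macro, and the salt handling are illustrative assumptions, not part of this patch:

#include <crypto/sha2.h>

#define VERITY_BLOCK_SIZE 4096	/* illustrative; the common Merkle block size */

/* Sketch: hash two sibling Merkle-tree blocks from one salted context. */
static void hash_sibling_blocks(const u8 *salt, size_t salt_len,
				const u8 *block1, const u8 *block2,
				u8 out1[SHA256_DIGEST_SIZE],
				u8 out2[SHA256_DIGEST_SIZE])
{
	struct sha256_ctx ctx;

	/* Absorb the common prefix (the salt) once. */
	sha256_init(&ctx);
	sha256_update(&ctx, salt, salt_len);

	/*
	 * Finish both messages from the shared context.  On CPUs with an
	 * interleaved implementation this runs both hashes at nearly the
	 * cost of one; elsewhere it falls back to sequential hashing.
	 */
	sha256_finup_2x(&ctx, block1, block2, VERITY_BLOCK_SIZE, out1, out2);
}

Because @ctx is const, the same salted context can be reused for every pair of blocks in the tree.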
-rw-r--r--   include/crypto/sha2.h   28
-rw-r--r--   lib/crypto/sha256.c     71
2 files changed, 94 insertions, 5 deletions
diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h
index 15e461e568cc..e5dafb935cc8 100644
--- a/include/crypto/sha2.h
+++ b/include/crypto/sha2.h
@@ -376,6 +376,34 @@ void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]);
void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]);
/**
+ * sha256_finup_2x() - Compute two SHA-256 digests from a common initial
+ * context. On some CPUs, this is faster than sequentially
+ * computing each digest.
+ * @ctx: an optional initial context, which may have already processed data. If
+ * NULL, a default initial context is used (equivalent to sha256_init()).
+ * @data1: data for the first message
+ * @data2: data for the second message
+ * @len: the length of each of @data1 and @data2, in bytes
+ * @out1: (output) the first SHA-256 message digest
+ * @out2: (output) the second SHA-256 message digest
+ *
+ * Context: Any context.
+ */
+void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1,
+ const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE]);
+
+/**
+ * sha256_finup_2x_is_optimized() - Check if sha256_finup_2x() is using a real
+ * interleaved implementation, as opposed to a
+ * sequential fallback
+ * Return: true if optimized
+ *
+ * Context: Any context.
+ */
+bool sha256_finup_2x_is_optimized(void);
+
+/**
* struct hmac_sha256_key - Prepared key for HMAC-SHA256
* @key: private
*/
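As the kernel-doc above notes, a NULL @ctx is equivalent to starting from sha256_init(). A short hypothetical sketch of the unsalted case, with sha256_finup_2x_is_optimized() used purely as a diagnostic:

#include <crypto/sha2.h>
#include <linux/printk.h>

/* Sketch: a NULL ctx behaves like a freshly sha256_init()ed context. */
static void hash_two_buffers(const u8 *data1, const u8 *data2, size_t len,
			     u8 out1[SHA256_DIGEST_SIZE],
			     u8 out2[SHA256_DIGEST_SIZE])
{
	if (!sha256_finup_2x_is_optimized())
		pr_debug("sha256_finup_2x: sequential fallback in use\n");

	/* Equivalent to sha256(data1, len, out1) and sha256(data2, len, out2). */
	sha256_finup_2x(NULL, data1, data2, len, out1, out2);
}

A caller could also consult sha256_finup_2x_is_optimized() up front to decide whether restructuring its work into pairs of blocks is worthwhile.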
diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c
index 8fa15165d23e..881b935418ce 100644
--- a/lib/crypto/sha256.c
+++ b/lib/crypto/sha256.c
@@ -25,13 +25,20 @@ static const struct sha256_block_state sha224_iv = {
},
};
-static const struct sha256_block_state sha256_iv = {
- .h = {
- SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
- SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
+static const struct sha256_ctx initial_sha256_ctx = {
+ .ctx = {
+ .state = {
+ .h = {
+ SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
+ SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
+ },
+ },
+ .bytecount = 0,
},
};
+#define sha256_iv (initial_sha256_ctx.ctx.state)
+
static const u32 sha256_K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@@ -261,8 +268,62 @@ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE])
}
EXPORT_SYMBOL(sha256);
-/* pre-boot environment (as indicated by __DISABLE_EXPORTS) doesn't need HMAC */
+/*
+ * Pre-boot environment (as indicated by __DISABLE_EXPORTS being defined)
+ * doesn't need either HMAC support or interleaved hashing support
+ */
#ifndef __DISABLE_EXPORTS
+
+#ifndef sha256_finup_2x_arch
+static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, size_t len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE])
+{
+ return false;
+}
+static bool sha256_finup_2x_is_optimized_arch(void)
+{
+ return false;
+}
+#endif
+
+/* Sequential fallback implementation of sha256_finup_2x() */
+static noinline_for_stack void sha256_finup_2x_sequential(
+ const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2,
+ size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE])
+{
+ struct __sha256_ctx mut_ctx;
+
+ mut_ctx = *ctx;
+ __sha256_update(&mut_ctx, data1, len);
+ __sha256_final(&mut_ctx, out1, SHA256_DIGEST_SIZE);
+
+ mut_ctx = *ctx;
+ __sha256_update(&mut_ctx, data2, len);
+ __sha256_final(&mut_ctx, out2, SHA256_DIGEST_SIZE);
+}
+
+void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1,
+ const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE])
+{
+ if (ctx == NULL)
+ ctx = &initial_sha256_ctx;
+
+ if (likely(sha256_finup_2x_arch(&ctx->ctx, data1, data2, len, out1,
+ out2)))
+ return;
+ sha256_finup_2x_sequential(&ctx->ctx, data1, data2, len, out1, out2);
+}
+EXPORT_SYMBOL_GPL(sha256_finup_2x);
+
+bool sha256_finup_2x_is_optimized(void)
+{
+ return sha256_finup_2x_is_optimized_arch();
+}
+EXPORT_SYMBOL_GPL(sha256_finup_2x_is_optimized);
+
static void __hmac_sha256_preparekey(struct sha256_block_state *istate,
struct sha256_block_state *ostate,
const u8 *raw_key, size_t raw_key_len,
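The single #ifndef sha256_finup_2x_arch block above guards both stubs, so an architecture overrides them by supplying both functions and defining the sha256_finup_2x_arch macro before that point. A hypothetical sketch of that shape (the real arm64/x86_64 hooks arrive in later patches; have_sha256_insns and sha256_finup_2x_simd are made-up names):

#include <crypto/sha2.h>
#include <linux/jump_label.h>

/* Hypothetical CPU-feature key; a real port would enable it at init time. */
static DEFINE_STATIC_KEY_FALSE(have_sha256_insns);

/* Hypothetical 2-way interleaved SIMD kernel provided by the arch. */
void sha256_finup_2x_simd(const struct __sha256_ctx *ctx, const u8 *data1,
			  const u8 *data2, size_t len, u8 *out1, u8 *out2);

static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
				 const u8 *data1, const u8 *data2, size_t len,
				 u8 out1[SHA256_DIGEST_SIZE],
				 u8 out2[SHA256_DIGEST_SIZE])
{
	if (!static_branch_likely(&have_sha256_insns))
		return false;	/* caller falls back to the sequential path */

	sha256_finup_2x_simd(ctx, data1, data2, len, out1, out2);
	return true;
}
/* Defining the macro compiles out both fallback stubs. */
#define sha256_finup_2x_arch sha256_finup_2x_arch

static bool sha256_finup_2x_is_optimized_arch(void)
{
	return static_key_enabled(&have_sha256_insns);
}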