From 58414bbb58a8b49af20e3accae56f6f8344b2424 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 14 Sep 2022 19:07:27 -0700 Subject: lib/find_bit: introduce FIND_FIRST_BIT() macro Now that we have many flavors of find_first_bit(), and expect even more, it's better to have one macro that generates optimal code for all and makes maintaining of slightly different functions simpler. The logic common to all versions is moved to the new macro, and all the flavors are generated by providing an FETCH macro-parameter, like in this example: #define FIND_FIRST_BIT(FETCH, MUNGE, size) ... find_first_ornot_and_bit(addr1, addr2, addr3, size) { return FIND_FIRST_BIT(addr1[idx] | ~addr2[idx] & addr3[idx], /* nop */, size); } The FETCH may be of any complexity, as soon as it only refers the bitmap(s) and an iterator idx. MUNGE is here to support _le code generation for BE builds. May be empty. Suggested-by: Linus Torvalds Reviewed-by: Valentin Schneider Signed-off-by: Yury Norov --- lib/find_bit.c | 49 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 25 deletions(-) (limited to 'lib/find_bit.c') diff --git a/lib/find_bit.c b/lib/find_bit.c index 1b8e4b2a9cba..894b656f6836 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -19,6 +19,27 @@ #include #include +/* + * Common helper for find_bit() function family + * @FETCH: The expression that fetches and pre-processes each word of bitmap(s) + * @MUNGE: The expression that post-processes a word containing found bit (may be empty) + * @size: The bitmap size in bits + */ +#define FIND_FIRST_BIT(FETCH, MUNGE, size) \ +({ \ + unsigned long idx, val, sz = (size); \ + \ + for (idx = 0; idx * BITS_PER_LONG < sz; idx++) { \ + val = (FETCH); \ + if (val) { \ + sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(val)), sz); \ + break; \ + } \ + } \ + \ + sz; \ +}) + #if !defined(find_next_bit) || !defined(find_next_zero_bit) || \ !defined(find_next_bit_le) || !defined(find_next_zero_bit_le) || \ !defined(find_next_and_bit) @@ -77,14 +98,7 @@ EXPORT_SYMBOL(_find_next_bit); */ unsigned long _find_first_bit(const unsigned long *addr, unsigned long size) { - unsigned long idx; - - for (idx = 0; idx * BITS_PER_LONG < size; idx++) { - if (addr[idx]) - return min(idx * BITS_PER_LONG + __ffs(addr[idx]), size); - } - - return size; + return FIND_FIRST_BIT(addr[idx], /* nop */, size); } EXPORT_SYMBOL(_find_first_bit); #endif @@ -97,15 +111,7 @@ unsigned long _find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size) { - unsigned long idx, val; - - for (idx = 0; idx * BITS_PER_LONG < size; idx++) { - val = addr1[idx] & addr2[idx]; - if (val) - return min(idx * BITS_PER_LONG + __ffs(val), size); - } - - return size; + return FIND_FIRST_BIT(addr1[idx] & addr2[idx], /* nop */, size); } EXPORT_SYMBOL(_find_first_and_bit); #endif @@ -116,14 +122,7 @@ EXPORT_SYMBOL(_find_first_and_bit); */ unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size) { - unsigned long idx; - - for (idx = 0; idx * BITS_PER_LONG < size; idx++) { - if (addr[idx] != ~0UL) - return min(idx * BITS_PER_LONG + ffz(addr[idx]), size); - } - - return size; + return FIND_FIRST_BIT(~addr[idx], /* nop */, size); } EXPORT_SYMBOL(_find_first_zero_bit); #endif -- cgit From 14a99e130f2758bc826a7e7a8bdf6f7400b54f0f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 14 Sep 2022 19:07:28 -0700 Subject: lib/find_bit: create find_first_zero_bit_le() find_first_zero_bit_le() is an alias to find_next_zero_bit_le(), despite that 'next' is known to be slower than 'first' version. Now that we have common FIND_FIRST_BIT() macro helper, it's trivial to implement find_first_zero_bit_le() as a real function. Reviewed-by: Valentin Schneider Signed-off-by: Yury Norov --- lib/find_bit.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'lib/find_bit.c') diff --git a/lib/find_bit.c b/lib/find_bit.c index 894b656f6836..61ec2992938b 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -160,3 +160,19 @@ unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr, return offset; } EXPORT_SYMBOL(find_next_clump8); + +#ifdef __BIG_ENDIAN + +#ifndef find_first_zero_bit_le +/* + * Find the first cleared bit in an LE memory region. + */ +unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size) +{ + return FIND_FIRST_BIT(~addr[idx], swab, size); +} +EXPORT_SYMBOL(_find_first_zero_bit_le); + +#endif + +#endif /* __BIG_ENDIAN */ -- cgit From e79864f3164c573afce09ec4b72b75ebe061c14d Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 14 Sep 2022 19:07:29 -0700 Subject: lib/find_bit: optimize find_next_bit() functions Over the past couple years, the function _find_next_bit() was extended with parameters that modify its behavior to implement and- zero- and le- flavors. The parameters are passed at compile time, but current design prevents a compiler from optimizing out the conditionals. As find_next_bit() API grows, I expect that more parameters will be added. Current design would require more conditional code in _find_next_bit(), which would bloat the helper even more and make it barely readable. This patch replaces _find_next_bit() with a macro FIND_NEXT_BIT, and adds a set of wrappers, so that the compile-time optimizations become possible. The common logic is moved to the new macro, and all flavors may be generated by providing a FETCH macro parameter, like in this example: #define FIND_NEXT_BIT(FETCH, MUNGE, size, start) ... find_next_xornot_and_bit(addr1, addr2, addr3, size, start) { return FIND_NEXT_BIT(addr1[idx] ^ ~addr2[idx] & addr3[idx], /* nop */, size, start); } The FETCH may be of any complexity, as soon as it only refers the bitmap(s) and an iterator idx. MUNGE is here to support _le code generation for BE builds. May be empty. I ran find_bit_benchmark 16 times on top of 6.0-rc2 and 16 times on top of 6.0-rc2 + this series. The results for kvm/x86_64 are: v6.0-rc2 Optimized Difference Z-score Random dense bitmap ns ns ns % find_next_bit: 787735 670546 117189 14.9 3.97 find_next_zero_bit: 777492 664208 113284 14.6 10.51 find_last_bit: 830925 687573 143352 17.3 2.35 find_first_bit: 3874366 3306635 567731 14.7 1.84 find_first_and_bit: 40677125 37739887 2937238 7.2 1.36 find_next_and_bit: 347865 304456 43409 12.5 1.35 Random sparse bitmap find_next_bit: 19816 14021 5795 29.2 6.10 find_next_zero_bit: 1318901 1223794 95107 7.2 1.41 find_last_bit: 14573 13514 1059 7.3 6.92 find_first_bit: 1313321 1249024 64297 4.9 1.53 find_first_and_bit: 8921 8098 823 9.2 4.56 find_next_and_bit: 9796 7176 2620 26.7 5.39 Where the statistics is significant (z-score > 3), the improvement is ~15%. According to the bloat-o-meter, the Image size is 10-11K less: x86_64/defconfig: add/remove: 32/14 grow/shrink: 61/782 up/down: 6344/-16521 (-10177) arm64/defconfig: add/remove: 3/2 grow/shrink: 50/714 up/down: 608/-11556 (-10948) Suggested-by: Linus Torvalds Signed-off-by: Yury Norov --- lib/find_bit.c | 119 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 70 insertions(+), 49 deletions(-) (limited to 'lib/find_bit.c') diff --git a/lib/find_bit.c b/lib/find_bit.c index 61ec2992938b..d00ee23ab657 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -40,57 +40,33 @@ sz; \ }) -#if !defined(find_next_bit) || !defined(find_next_zero_bit) || \ - !defined(find_next_bit_le) || !defined(find_next_zero_bit_le) || \ - !defined(find_next_and_bit) /* - * This is a common helper function for find_next_bit, find_next_zero_bit, and - * find_next_and_bit. The differences are: - * - The "invert" argument, which is XORed with each fetched word before - * searching it for one bits. - * - The optional "addr2", which is anded with "addr1" if present. + * Common helper for find_next_bit() function family + * @FETCH: The expression that fetches and pre-processes each word of bitmap(s) + * @MUNGE: The expression that post-processes a word containing found bit (may be empty) + * @size: The bitmap size in bits + * @start: The bitnumber to start searching at */ -unsigned long _find_next_bit(const unsigned long *addr1, - const unsigned long *addr2, unsigned long nbits, - unsigned long start, unsigned long invert, unsigned long le) -{ - unsigned long tmp, mask; - - if (unlikely(start >= nbits)) - return nbits; - - tmp = addr1[start / BITS_PER_LONG]; - if (addr2) - tmp &= addr2[start / BITS_PER_LONG]; - tmp ^= invert; - - /* Handle 1st word. */ - mask = BITMAP_FIRST_WORD_MASK(start); - if (le) - mask = swab(mask); - - tmp &= mask; - - start = round_down(start, BITS_PER_LONG); - - while (!tmp) { - start += BITS_PER_LONG; - if (start >= nbits) - return nbits; - - tmp = addr1[start / BITS_PER_LONG]; - if (addr2) - tmp &= addr2[start / BITS_PER_LONG]; - tmp ^= invert; - } - - if (le) - tmp = swab(tmp); - - return min(start + __ffs(tmp), nbits); -} -EXPORT_SYMBOL(_find_next_bit); -#endif +#define FIND_NEXT_BIT(FETCH, MUNGE, size, start) \ +({ \ + unsigned long mask, idx, tmp, sz = (size), __start = (start); \ + \ + if (unlikely(__start >= sz)) \ + goto out; \ + \ + mask = MUNGE(BITMAP_FIRST_WORD_MASK(__start)); \ + idx = __start / BITS_PER_LONG; \ + \ + for (tmp = (FETCH) & mask; !tmp; tmp = (FETCH)) { \ + if ((idx + 1) * BITS_PER_LONG >= sz) \ + goto out; \ + idx++; \ + } \ + \ + sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(tmp)), sz); \ +out: \ + sz; \ +}) #ifndef find_first_bit /* @@ -127,6 +103,32 @@ unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size EXPORT_SYMBOL(_find_first_zero_bit); #endif +#ifndef find_next_bit +unsigned long _find_next_bit(const unsigned long *addr, unsigned long nbits, unsigned long start) +{ + return FIND_NEXT_BIT(addr[idx], /* nop */, nbits, start); +} +EXPORT_SYMBOL(_find_next_bit); +#endif + +#ifndef find_next_and_bit +unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long nbits, unsigned long start) +{ + return FIND_NEXT_BIT(addr1[idx] & addr2[idx], /* nop */, nbits, start); +} +EXPORT_SYMBOL(_find_next_and_bit); +#endif + +#ifndef find_next_zero_bit +unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, + unsigned long start) +{ + return FIND_NEXT_BIT(~addr[idx], /* nop */, nbits, start); +} +EXPORT_SYMBOL(_find_next_zero_bit); +#endif + #ifndef find_last_bit unsigned long _find_last_bit(const unsigned long *addr, unsigned long size) { @@ -175,4 +177,23 @@ EXPORT_SYMBOL(_find_first_zero_bit_le); #endif +#ifndef find_next_zero_bit_le +unsigned long _find_next_zero_bit_le(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + return FIND_NEXT_BIT(~addr[idx], swab, size, offset); +} +EXPORT_SYMBOL(_find_next_zero_bit_le); +#endif + +#ifndef find_next_bit_le +unsigned long _find_next_bit_le(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + return FIND_NEXT_BIT(addr[idx], swab, size, offset); +} +EXPORT_SYMBOL(_find_next_bit_le); + +#endif + #endif /* __BIG_ENDIAN */ -- cgit From 3cea8d475327756066e2a54f0b651bb7284dd448 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 17 Sep 2022 20:07:13 -0700 Subject: lib: add find_nth{,_and,_andnot}_bit() Kernel lacks for a function that searches for Nth bit in a bitmap. Usually people do it like this: for_each_set_bit(bit, mask, size) if (n-- == 0) return bit; We can do it more efficiently, if we: 1. find a word containing Nth bit, using hweight(); and 2. find the bit, using a helper fns(), that works similarly to __ffs() and ffz(). fns() is implemented as a simple loop. For x86_64, there's PDEP instruction to do that: ret = clz(pdep(1 << idx, num)). However, for large bitmaps the most of improvement comes from using hweight(), so I kept fns() simple. New find_nth_bit() is ~70 times faster on x86_64/kvm in find_bit benchmark: find_nth_bit: 7154190 ns, 16411 iterations for_each_bit: 505493126 ns, 16315 iterations With all that, a family of 3 new functions is added, and used where appropriate in the following patches. Signed-off-by: Yury Norov --- lib/find_bit.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'lib/find_bit.c') diff --git a/lib/find_bit.c b/lib/find_bit.c index d00ee23ab657..25609974cbe4 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -68,6 +68,30 @@ out: \ sz; \ }) +#define FIND_NTH_BIT(FETCH, size, num) \ +({ \ + unsigned long sz = (size), nr = (num), idx, w, tmp; \ + \ + for (idx = 0; (idx + 1) * BITS_PER_LONG <= sz; idx++) { \ + if (idx * BITS_PER_LONG + nr >= sz) \ + goto out; \ + \ + tmp = (FETCH); \ + w = hweight_long(tmp); \ + if (w > nr) \ + goto found; \ + \ + nr -= w; \ + } \ + \ + if (sz % BITS_PER_LONG) \ + tmp = (FETCH) & BITMAP_LAST_WORD_MASK(sz); \ +found: \ + sz = min(idx * BITS_PER_LONG + fns(tmp, nr), sz); \ +out: \ + sz; \ +}) + #ifndef find_first_bit /* * Find the first set bit in a memory region. @@ -111,6 +135,26 @@ unsigned long _find_next_bit(const unsigned long *addr, unsigned long nbits, uns EXPORT_SYMBOL(_find_next_bit); #endif +unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n) +{ + return FIND_NTH_BIT(addr[idx], size, n); +} +EXPORT_SYMBOL(__find_nth_bit); + +unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long size, unsigned long n) +{ + return FIND_NTH_BIT(addr1[idx] & addr2[idx], size, n); +} +EXPORT_SYMBOL(__find_nth_and_bit); + +unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long size, unsigned long n) +{ + return FIND_NTH_BIT(addr1[idx] & ~addr2[idx], size, n); +} +EXPORT_SYMBOL(__find_nth_andnot_bit); + #ifndef find_next_and_bit unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start) -- cgit From 90d482908eedd56f01a707325aa541cf9c40f936 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 3 Oct 2022 16:34:17 +0100 Subject: lib/find_bit: Introduce find_next_andnot_bit() In preparation of introducing for_each_cpu_andnot(), add a variant of find_next_bit() that negate the bits in @addr2 when ANDing them with the bits in @addr1. Signed-off-by: Valentin Schneider --- lib/find_bit.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'lib/find_bit.c') diff --git a/lib/find_bit.c b/lib/find_bit.c index 25609974cbe4..18bc0a7ac8ee 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -164,6 +164,15 @@ unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long EXPORT_SYMBOL(_find_next_and_bit); #endif +#ifndef find_next_andnot_bit +unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long nbits, unsigned long start) +{ + return FIND_NEXT_BIT(addr1[idx] & ~addr2[idx], /* nop */, nbits, start); +} +EXPORT_SYMBOL(_find_next_andnot_bit); +#endif + #ifndef find_next_zero_bit unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, unsigned long start) -- cgit