riscv: more efficient clz and ctz

For RISC-V platforms that do not provide machine instructions to count
leading and trailing zeros, this commit includes more efficient library
functions. For verification, we expose the bodies of the functions to
the proofs.

Kernel config options `CLZ_BUILTIN` and `CTZ_BUILTIN` allow selection of
whether compiler builtin functions should be used. These are only
supported on platforms where the builtin compiles to inline assembly. By
default, the options are on for all platforms except RISC-V.

Signed-off-by: Matthew Brecknell <Matthew.Brecknell@data61.csiro.au>
This commit is contained in:
Matthew Brecknell
2020-08-03 23:22:02 +10:00
parent 2a0e5a2a1f
commit 9ec5df5fa8
6 changed files with 462 additions and 59 deletions

View File

@@ -462,4 +462,46 @@ config_string(
DEPENDS "KernelIsMCS" UNDEF_DISABLED DEPENDS "KernelIsMCS" UNDEF_DISABLED
) )
# Optional library implementations of the libgcc bit-counting helpers
# (__clzdi2, __clzti2, __ctzdi2, __ctzti2). These are only needed on
# platforms whose compiler builtins cannot be lowered to a machine
# instruction (per the commit description, currently RISC-V).
# Fallback for __builtin_clzl when there is no CLZ instruction.
config_option(
KernelClzlImpl CLZL_IMPL
"Define a __clzdi2 function to count leading zeros for unsigned long arguments. \
Only needed on platforms which lack a builtin instruction."
DEFAULT OFF
)
# Fallback for __builtin_clzll when there is no CLZ instruction.
config_option(
KernelClzllImpl CLZLL_IMPL
"Define a __clzti2 function to count leading zeros for unsigned long long arguments. \
Only needed on platforms which lack a builtin instruction."
DEFAULT OFF
)
# Fallback for __builtin_ctzl when there is no CTZ instruction.
config_option(
KernelCtzlImpl CTZL_IMPL
"Define a __ctzdi2 function to count trailing zeros for unsigned long arguments. \
Only needed on platforms which lack a builtin instruction."
DEFAULT OFF
)
# Fallback for __builtin_ctzll when there is no CTZ instruction.
config_option(
KernelCtzllImpl CTZLL_IMPL
"Define a __ctzti2 function to count trailing zeros for unsigned long long arguments. \
Only needed on platforms which lack a builtin instruction."
DEFAULT OFF
)
# Verification switch: make clzl/clzll call the library implementations
# instead of the (unverifiable) compiler builtins.
config_option(
KernelClzNoBuiltin CLZ_NO_BUILTIN
"Expose implementations of clzl and clzll to verification by avoiding the use \
of __builtin_clzl and __builtin_clzll."
DEFAULT OFF
)
# Verification switch: make ctzl/ctzll call the library implementations
# instead of the (unverifiable) compiler builtins.
config_option(
KernelCtzNoBuiltin CTZ_NO_BUILTIN
"Expose implementations of ctzl and ctzll to verification by avoiding the use \
of __builtin_ctzl and __builtin_ctzll."
DEFAULT OFF
)
add_config_library(kernel "${configure_string}") add_config_library(kernel "${configure_string}")

2
configs/RISCV64_MCS_verified.cmake Executable file → Normal file
View File

@@ -26,4 +26,6 @@ set(KernelRootCNodeSizeBits 19 CACHE STRING "")
set(KernelMaxNumBootinfoUntypedCaps 50 CACHE STRING "") set(KernelMaxNumBootinfoUntypedCaps 50 CACHE STRING "")
set(KernelIsMCS ON CACHE BOOL "") set(KernelIsMCS ON CACHE BOOL "")
set(KernelStaticMaxBudgetUs "(60 * 60 * MS_IN_S * US_IN_MS)" CACHE STRING "") set(KernelStaticMaxBudgetUs "(60 * 60 * MS_IN_S * US_IN_MS)" CACHE STRING "")
set(KernelClzNoBuiltin ON CACHE BOOL "")
set(KernelCtzNoBuiltin ON CACHE BOOL "")
include(${CMAKE_CURRENT_LIST_DIR}/seL4Config.cmake) include(${CMAKE_CURRENT_LIST_DIR}/seL4Config.cmake)

2
configs/RISCV64_verified.cmake Executable file → Normal file
View File

@@ -24,4 +24,6 @@ set(KernelNumDomains 16 CACHE STRING "")
set(KernelMaxNumBootinfoUntypedCap 166 CACHE STRING "") set(KernelMaxNumBootinfoUntypedCap 166 CACHE STRING "")
set(KernelRootCNodeSizeBits 19 CACHE STRING "") set(KernelRootCNodeSizeBits 19 CACHE STRING "")
set(KernelMaxNumBootinfoUntypedCaps 50 CACHE STRING "") set(KernelMaxNumBootinfoUntypedCaps 50 CACHE STRING "")
set(KernelClzNoBuiltin ON CACHE BOOL "")
set(KernelCtzNoBuiltin ON CACHE BOOL "")
include(${CMAKE_CURRENT_LIST_DIR}/seL4Config.cmake) include(${CMAKE_CURRENT_LIST_DIR}/seL4Config.cmake)

View File

@@ -98,60 +98,137 @@ int PURE strncmp(const char *s1, const char *s2, int n);
long CONST char_to_long(char c); long CONST char_to_long(char c);
long PURE str_to_long(const char *str); long PURE str_to_long(const char *str);
// Library functions for counting leading/trailing zeros.
// GCC's builtins will emit calls to these functions when the platform
// does not provide suitable inline assembly.
// We only emit function definitions if CONFIG_CLZL_IMPL etc are set.
NO_INLINE CONST int __clzdi2(unsigned long x);
NO_INLINE CONST int __clzti2(unsigned long long x);
NO_INLINE CONST int __ctzdi2(unsigned long x);
NO_INLINE CONST int __ctzti2(unsigned long long x);
int __builtin_clzl(unsigned long x); // Used for compile-time constants, so should always use the builtin.
int __builtin_ctzl(unsigned long x); #define CTZL(x) __builtin_ctzl(x)
#ifdef CONFIG_ARCH_RISCV // Count leading zeros.
uint32_t __clzsi2(uint32_t x); // The CONFIG_CLZ_NO_BUILTIN macro may be used to expose the library function
uint32_t __ctzsi2(uint32_t x); // to the C parser for verification.
uint32_t __clzdi2(uint64_t x); #ifndef CONFIG_CLZ_NO_BUILTIN
uint32_t __ctzdi2(uint64_t x); // If we use a compiler builtin, we cannot verify it, so we use the following
#endif // annotations to hide the function body from the proofs, and axiomatise its
// behaviour.
// On the other hand, if we use our own implementation instead of the builtin,
// then we want to expose that implementation to the proofs, and therefore hide
// these annotations.
/** MODIFIES: */ /** MODIFIES: */
/** DONT_TRANSLATE */ /** DONT_TRANSLATE */
/** FNSPEC clzl_spec: /** FNSPEC clzl_spec:
"\<forall>s. \<Gamma> \<turnstile> "\<forall>s. \<Gamma> \<turnstile>
{\<sigma>. s = \<sigma> \<and> x_' s \<noteq> 0 } {\<sigma>. s = \<sigma> \<and> x___unsigned_long_' s \<noteq> 0 }
\<acute>ret__long :== PROC clzl(\<acute>x) \<acute>ret__long :== PROC clzl(\<acute>x)
\<lbrace> \<acute>ret__long = of_nat (word_clz (x_' s)) \<rbrace>" \<lbrace> \<acute>ret__long = of_nat (word_clz (x___unsigned_long_' s)) \<rbrace>"
*/ */
#endif
static inline long static inline long
CONST clzl(unsigned long x) CONST clzl(unsigned long x)
{ {
#ifdef CONFIG_CLZ_NO_BUILTIN
return __clzdi2(x);
#else
return __builtin_clzl(x); return __builtin_clzl(x);
#endif
} }
#ifndef CONFIG_CLZ_NO_BUILTIN
// See comments on clzl.
/** MODIFIES: */
/** DONT_TRANSLATE */
/** FNSPEC clzll_spec:
"\<forall>s. \<Gamma> \<turnstile>
{\<sigma>. s = \<sigma> \<and> x___unsigned_longlong_' s \<noteq> 0 }
\<acute>ret__longlong :== PROC clzll(\<acute>x)
\<lbrace> \<acute>ret__longlong = of_nat (word_clz (x___unsigned_longlong_' s)) \<rbrace>"
*/
#endif
static inline long long
CONST clzll(unsigned long long x)
{
#ifdef CONFIG_CLZ_NO_BUILTIN
return __clzti2(x);
#else
return __builtin_clzll(x);
#endif
}
// Count trailing zeros.
#ifndef CONFIG_CTZ_NO_BUILTIN
// See comments on clzl.
/** MODIFIES: */ /** MODIFIES: */
/** DONT_TRANSLATE */ /** DONT_TRANSLATE */
/** FNSPEC ctzl_spec: /** FNSPEC ctzl_spec:
"\<forall>s. \<Gamma> \<turnstile> "\<forall>s. \<Gamma> \<turnstile>
{\<sigma>. s = \<sigma> \<and> x_' s \<noteq> 0 } {\<sigma>. s = \<sigma> \<and> x___unsigned_long_' s \<noteq> 0 }
\<acute>ret__long :== PROC ctzl(\<acute>x) \<acute>ret__long :== PROC ctzl(\<acute>x)
\<lbrace> \<acute>ret__long = of_nat (word_ctz (x_' s)) \<rbrace>" \<lbrace> \<acute>ret__long = of_nat (word_ctz (x___unsigned_long_' s)) \<rbrace>"
*/ */
#endif
static inline long static inline long
CONST ctzl(unsigned long x) CONST ctzl(unsigned long x)
{ {
#ifdef CONFIG_CTZ_NO_BUILTIN
// If there is a builtin CLZ, but no builtin CTZ, then CTZ will be implemented
// using the builtin CLZ, rather than the long-form implementation.
// This is typically the fastest way to calculate ctzl on such platforms.
#ifdef CONFIG_CLZ_NO_BUILTIN
// Here, there are no builtins we can use, so call the library function.
return __ctzdi2(x);
#else
// Here, we have __builtin_clzl, but no __builtin_ctzl.
if (unlikely(x == 0)) {
return 8 * sizeof(unsigned long);
}
// -x = ~x + 1, so (x & -x) isolates the least significant 1-bit of x,
// allowing ctzl to be calculated from clzl and the word size.
return 8 * sizeof(unsigned long) - 1 - __builtin_clzl(x & -x);
#endif
#else
// Here, we have __builtin_ctzl.
return __builtin_ctzl(x); return __builtin_ctzl(x);
#endif
} }
#define CTZL(x) __builtin_ctzl(x) #ifndef CONFIG_CTZ_NO_BUILTIN
// See comments on clzl.
/** MODIFIES: */
/** DONT_TRANSLATE */
/** FNSPEC ctzll_spec:
"\<forall>s. \<Gamma> \<turnstile>
{\<sigma>. s = \<sigma> \<and> x___unsigned_longlong_' s \<noteq> 0 }
\<acute>ret__longlong :== PROC ctzll(\<acute>x)
\<lbrace> \<acute>ret__longlong = of_nat (word_ctz (x___unsigned_longlong_' s)) \<rbrace>"
*/
#endif
static inline long long
CONST ctzll(unsigned long long x)
{
#ifdef CONFIG_CTZ_NO_BUILTIN
// See comments on ctzl.
#ifdef CONFIG_CLZ_NO_BUILTIN
return __ctzti2(x);
#else
if (unlikely(x == 0)) {
return 8 * sizeof(unsigned long long);
}
// See comments on ctzl.
return 8 * sizeof(unsigned long long) - 1 - __builtin_clzll(x & -x);
#endif
#else
return __builtin_ctzll(x);
#endif
}
int __builtin_popcountl(unsigned long x); int __builtin_popcountl(unsigned long x);
/** DONT_TRANSLATE */
/** FNSPEC clzll_spec:
"\<forall>s. \<Gamma> \<turnstile>
{\<sigma>. s = \<sigma> \<and> x_' s \<noteq> 0 }
\<acute>ret__longlong :== PROC clzll(\<acute>x)
\<lbrace> \<acute>ret__longlong = of_nat (word_clz (x_' s)) \<rbrace>"
*/
static inline long long CONST clzll(unsigned long long x)
{
return __builtin_clzll(x);
}
/** DONT_TRANSLATE */ /** DONT_TRANSLATE */
static inline long static inline long
CONST popcountl(unsigned long mask) CONST popcountl(unsigned long mask)

View File

@@ -27,6 +27,16 @@ config_option(
DEPENDS "KernelArchRiscV" DEPENDS "KernelArchRiscV"
) )
# Until RISC-V has instructions to count leading/trailing zeros, we provide
# library implementations.
# In the verified configurations, we additionally define KernelClzNoBuiltin and
# KernelCtzNoBuiltin to expose the library implementations to verification.
# However, since the NoBuiltin options force the use of the library functions
# even when the platform has suitable inline assembly, we do not make these the
# default.
set(KernelClzlImpl ON CACHE BOOL "")
set(KernelCtzlImpl ON CACHE BOOL "")
if(KernelSel4ArchRiscV32) if(KernelSel4ArchRiscV32)
set(KernelPTLevels 2 CACHE STRING "" FORCE) set(KernelPTLevels 2 CACHE STRING "" FORCE)
endif() endif()

View File

@@ -132,44 +132,314 @@ long PURE str_to_long(const char *str)
return val; return val;
} }
#ifdef CONFIG_ARCH_RISCV // The following implementations of CLZ (count leading zeros) and CTZ (count
uint32_t __clzsi2(uint32_t x) // trailing zeros) perform a binary search for the first 1 bit from the
{ // beginning (resp. end) of the input. Initially, the focus is the whole input.
uint32_t count = 0; // Then, each iteration determines whether there are any 1 bits set in the
while (!(x & 0x80000000U) && count < 34) { // upper (resp. lower) half of the current focus. If there are (resp. are not),
x <<= 1; // then the upper half is shifted into the lower half. Either way, the lower
count++; // half of the current focus becomes the new focus for the next iteration.
} // After enough iterations (6 for 64-bit inputs, 5 for 32-bit inputs), the
return count; // focus is reduced to a single bit, and the total number of bits shifted can
} // be used to determine the number of zeros before (resp. after) the first 1
// bit.
//
// Although the details vary, the general approach is used in several library
// implementations, including in LLVM and GCC. Wikipedia has some references:
// https://en.wikipedia.org/wiki/Find_first_set
//
// The current implementation avoids branching. The test that determines
// whether the upper (resp. lower) half contains any ones directly produces a
// number which can be used for an unconditional shift. If the upper (resp.
// lower) half is all zeros, the test produces a zero, and the shift is a
// no-op. A branchless implementation has the disadvantage that it requires
// more instructions to execute than one which branches, but the advantage is
// that none will be mispredicted branches. Whether this is a good tradeoff
// depends on the branch predictability and the architecture's pipeline depth.
// The most critical use of clzl in the kernel is in the scheduler priority
// queue. In the absence of a concrete application and hardware implementation
// to evaluate the tradeoff, we somewhat arbitrarily choose a branchless
// implementation. In any case, the compiler might convert this to a branching
// binary.
uint32_t __ctzsi2(uint32_t x) // Check some assumptions made by the CLZ and CTZ library functions:
{ compile_assert(clz_ulong_32_or_64, sizeof(unsigned long) == 4 || sizeof(unsigned long) == 8);
uint32_t count = 0; compile_assert(clz_ullong_64, sizeof(unsigned long long) == 8);
while (!(x & 0x000000001) && count <= 32) {
x >>= 1;
count++;
}
return count;
}
uint32_t __clzdi2(uint64_t x) #ifdef CONFIG_CLZL_IMPL
// Count leading zeros.
// This implementation contains no branches. If the architecture provides an
// instruction to set a register to a boolean value on a comparison, then the
// binary might also avoid branching. A branchless implementation might be
// preferable on architectures with deep pipelines, or when the maximum
// priority of runnable threads frequently varies. However, note that the
// compiler may choose to convert this to a branching implementation.
static CONST inline unsigned clz32(uint32_t x)
{ {
uint32_t count = 0; // Compiler builtins typically return int, but we use unsigned internally
while (!(x & 0x8000000000000000U) && count < 65) { // to reduce the number of guards we see in the proofs.
x <<= 1; unsigned count = 32;
count++; uint32_t mask = UINT32_MAX;
}
return count;
}
uint32_t __ctzdi2(uint64_t x) // Each iteration i (counting backwards) considers the least significant
{ // 2^(i+1) bits of x as the current focus. At the first iteration, the
uint32_t count = 0; // focus is the whole input. Each iteration assumes x contains no 1 bits
while (!(x & 0x00000000000000001) && count <= 64) { // outside its focus. The iteration contains a test which determines
x >>= 1; // whether there are any 1 bits in the upper half (2^i bits) of the focus,
count++; // setting `bits` to 2^i if there are, or zero if not. Shifting by `bits`
// then narrows the focus to the lower 2^i bits and satisfies the
// assumption for the next iteration. After the final iteration, the focus
// is just the least significant bit, and the most significant 1 bit of the
// original input (if any) has been shifted into this position. The leading
// zero count can be determined from the total shift.
//
// The iterations are given a very regular structure to facilitate proofs,
// while also generating reasonably efficient binary code.
// The `if (1)` blocks make it easier to reason by chunks in the proofs.
if (1) {
// iteration 4
mask >>= (1 << 4); // 0x0000ffff
unsigned bits = ((unsigned)(mask < x)) << 4; // [0, 16]
x >>= bits; // <= 0x0000ffff
count -= bits; // [16, 32]
} }
if (1) {
// iteration 3
mask >>= (1 << 3); // 0x000000ff
unsigned bits = ((unsigned)(mask < x)) << 3; // [0, 8]
x >>= bits; // <= 0x000000ff
count -= bits; // [8, 16, 24, 32]
}
if (1) {
// iteration 2
mask >>= (1 << 2); // 0x0000000f
unsigned bits = ((unsigned)(mask < x)) << 2; // [0, 4]
x >>= bits; // <= 0x0000000f
count -= bits; // [4, 8, 12, ..., 32]
}
if (1) {
// iteration 1
mask >>= (1 << 1); // 0x00000003
unsigned bits = ((unsigned)(mask < x)) << 1; // [0, 2]
x >>= bits; // <= 0x00000003
count -= bits; // [2, 4, 6, ..., 32]
}
if (1) {
// iteration 0
mask >>= (1 << 0); // 0x00000001
unsigned bits = ((unsigned)(mask < x)) << 0; // [0, 1]
x >>= bits; // <= 0x00000001
count -= bits; // [1, 2, 3, ..., 32]
}
// If the original input was zero, there will have been no shifts, so this
// gives a result of 32. Otherwise, x is now exactly 1, so subtracting from
// count gives a result from [0, 1, 2, ..., 31].
return count - x;
}
#endif
#if defined(CONFIG_CLZL_IMPL) || defined (CONFIG_CLZLL_IMPL)
// Count leading zeros in a 64-bit word.
// Same branchless binary search as clz32, with one extra iteration for the
// wider input; see the comments on clz32 for a full explanation.
// Returns 64 when x == 0 (no shifts occur and x stays 0).
// NOTE(review): the `if (1)` blocks and fixed statement shape exist to
// support the verification proofs — do not "simplify" them.
static CONST inline unsigned clz64(uint64_t x)
{
unsigned count = 64;
uint64_t mask = UINT64_MAX;
// Although we could implement this using clz32, we spell out the
// iterations in full for slightly better code generation at low
// optimisation levels, and to allow us to reuse the proof machinery we
// developed for clz32.
if (1) {
// iteration 5
mask >>= (1 << 5); // 0x00000000ffffffff
unsigned bits = ((unsigned)(mask < x)) << 5; // [0, 32]
x >>= bits; // <= 0x00000000ffffffff
count -= bits; // [32, 64]
}
if (1) {
// iteration 4
mask >>= (1 << 4); // 0x000000000000ffff
unsigned bits = ((unsigned)(mask < x)) << 4; // [0, 16]
x >>= bits; // <= 0x000000000000ffff
count -= bits; // [16, 32, 48, 64]
}
if (1) {
// iteration 3
mask >>= (1 << 3); // 0x00000000000000ff
unsigned bits = ((unsigned)(mask < x)) << 3; // [0, 8]
x >>= bits; // <= 0x00000000000000ff
count -= bits; // [8, 16, 24, ..., 64]
}
if (1) {
// iteration 2
mask >>= (1 << 2); // 0x000000000000000f
unsigned bits = ((unsigned)(mask < x)) << 2; // [0, 4]
x >>= bits; // <= 0x000000000000000f
count -= bits; // [4, 8, 12, ..., 64]
}
if (1) {
// iteration 1
mask >>= (1 << 1); // 0x0000000000000003
unsigned bits = ((unsigned)(mask < x)) << 1; // [0, 2]
x >>= bits; // <= 0x0000000000000003
count -= bits; // [2, 4, 6, ..., 64]
}
if (1) {
// iteration 0
mask >>= (1 << 0); // 0x0000000000000001
unsigned bits = ((unsigned)(mask < x)) << 0; // [0, 1]
x >>= bits; // <= 0x0000000000000001
count -= bits; // [1, 2, 3, ..., 64]
}
// As in clz32: zero input means no shifts, giving 64; otherwise x is now
// exactly 1, so subtracting it yields a result in [0, 63].
return count - x;
}
#endif
#ifdef CONFIG_CLZL_IMPL
// __clzdi2: count leading zeros in an `unsigned long`.
// The compiler emits calls to this libgcc-style routine when
// __builtin_clzl has no machine instruction to lower to.
// Dispatches on the width of `unsigned long` (32 or 64 bits); the
// condition is a compile-time constant, so the dead branch is discarded.
int __clzdi2(unsigned long x)
{
    if (sizeof(unsigned long) == 4) {
        return clz32(x);
    } else {
        return clz64(x);
    }
}
#endif
#ifdef CONFIG_CLZLL_IMPL
// __clzti2: count leading zeros in an `unsigned long long`, which the
// compile-time assertions in this file require to be 64 bits wide, so
// this simply defers to the 64-bit implementation.
int __clzti2(unsigned long long x)
{
    unsigned zeros = clz64(x);
    return zeros;
}
#endif
#ifdef CONFIG_CTZL_IMPL
// Count trailing zeros.
// See comments on clz32.
// Count trailing zeros in a 32-bit word using a branchless binary search;
// see the comments on clz32 for the rationale. Returns 32 when x == 0.
// (This span also repairs two artifacts of the diff scrape: the duplicated
// closing lines `return count; return count;` / `} }`, and a typo in the
// algorithm comment.)
static CONST inline unsigned ctz32(uint32_t x)
{
    unsigned count = (x == 0);
    uint32_t mask = UINT32_MAX;
    // Each iteration i (counting backwards) considers the least significant
    // 2^(i+1) bits of x as the current focus. At the first iteration, the
    // focus is the whole input. The iteration contains a test which determines
    // whether there are any 1 bits in the lower half (2^i bits) of the focus,
    // setting `bits` to zero if there are, or 2^i if not. Shifting by `bits`
    // then narrows the focus to the lower 2^i bits for the next iteration.
    // After the final iteration, the focus is just the least significant bit,
    // and the least significant 1 bit of the original input (if any) has been
    // shifted into this position. The trailing zero count can be determined
    // from the total shift.
    //
    // If the initial input is zero, every iteration causes a shift, for a
    // total shift count of 31, so in that case, we add one for a total count
    // of 32. In the comments, xi is the initial value of x.
    //
    // The iterations are given a very regular structure to facilitate proofs,
    // while also generating reasonably efficient binary code.
    if (1) {
        // iteration 4
        mask >>= (1 << 4); // 0x0000ffff
        unsigned bits = ((unsigned)((x & mask) == 0)) << 4; // [0, 16]
        x >>= bits; // xi != 0 --> x & 0x0000ffff != 0
        count += bits; // if xi != 0 then [0, 16] else 17
    }
    if (1) {
        // iteration 3
        mask >>= (1 << 3); // 0x000000ff
        unsigned bits = ((unsigned)((x & mask) == 0)) << 3; // [0, 8]
        x >>= bits; // xi != 0 --> x & 0x000000ff != 0
        count += bits; // if xi != 0 then [0, 8, 16, 24] else 25
    }
    if (1) {
        // iteration 2
        mask >>= (1 << 2); // 0x0000000f
        unsigned bits = ((unsigned)((x & mask) == 0)) << 2; // [0, 4]
        x >>= bits; // xi != 0 --> x & 0x0000000f != 0
        count += bits; // if xi != 0 then [0, 4, 8, ..., 28] else 29
    }
    if (1) {
        // iteration 1
        mask >>= (1 << 1); // 0x00000003
        unsigned bits = ((unsigned)((x & mask) == 0)) << 1; // [0, 2]
        x >>= bits; // xi != 0 --> x & 0x00000003 != 0
        count += bits; // if xi != 0 then [0, 2, 4, ..., 30] else 31
    }
    if (1) {
        // iteration 0
        mask >>= (1 << 0); // 0x00000001
        unsigned bits = ((unsigned)((x & mask) == 0)) << 0; // [0, 1]
        x >>= bits; // xi != 0 --> x & 0x00000001 != 0
        count += bits; // if xi != 0 then [0, 1, 2, ..., 31] else 32
    }
    return count;
}
#endif /* CONFIG_ARCH_RISCV */ #endif
#if defined(CONFIG_CTZL_IMPL) || defined (CONFIG_CTZLL_IMPL)
// Count trailing zeros in a 64-bit word.
// Same branchless binary search as ctz32, with one extra iteration for the
// wider input; see the comments on ctz32 for a full explanation.
// Returns 64 when x == 0 (63 bits of shifting plus the initial 1).
// NOTE(review): the `if (1)` blocks and fixed statement shape exist to
// support the verification proofs — do not "simplify" them.
static CONST inline unsigned ctz64(uint64_t x)
{
unsigned count = (x == 0);
uint64_t mask = UINT64_MAX;
if (1) {
// iteration 5
mask >>= (1 << 5); // 0x00000000ffffffff
unsigned bits = ((unsigned)((x & mask) == 0)) << 5; // [0, 32]
x >>= bits; // xi != 0 --> x & 0x00000000ffffffff != 0
count += bits; // if xi != 0 then [0, 32] else 33
}
if (1) {
// iteration 4
mask >>= (1 << 4); // 0x000000000000ffff
unsigned bits = ((unsigned)((x & mask) == 0)) << 4; // [0, 16]
x >>= bits; // xi != 0 --> x & 0x000000000000ffff != 0
count += bits; // if xi != 0 then [0, 16, 32, 48] else 49
}
if (1) {
// iteration 3
mask >>= (1 << 3); // 0x00000000000000ff
unsigned bits = ((unsigned)((x & mask) == 0)) << 3; // [0, 8]
x >>= bits; // xi != 0 --> x & 0x00000000000000ff != 0
count += bits; // if xi != 0 then [0, 8, 16, ..., 56] else 57
}
if (1) {
// iteration 2
mask >>= (1 << 2); // 0x000000000000000f
unsigned bits = ((unsigned)((x & mask) == 0)) << 2; // [0, 4]
x >>= bits; // xi != 0 --> x & 0x000000000000000f != 0
count += bits; // if xi != 0 then [0, 4, 8, ..., 60] else 61
}
if (1) {
// iteration 1
mask >>= (1 << 1); // 0x0000000000000003
unsigned bits = ((unsigned)((x & mask) == 0)) << 1; // [0, 2]
x >>= bits; // xi != 0 --> x & 0x0000000000000003 != 0
count += bits; // if xi != 0 then [0, 2, 4, ..., 62] else 63
}
if (1) {
// iteration 0
mask >>= (1 << 0); // 0x0000000000000001
unsigned bits = ((unsigned)((x & mask) == 0)) << 0; // [0, 1]
x >>= bits; // xi != 0 --> x & 0x0000000000000001 != 0
count += bits; // if xi != 0 then [0, 1, 2, ..., 63] else 64
}
return count;
}
#endif
#ifdef CONFIG_CTZL_IMPL
// __ctzdi2: count trailing zeros in an `unsigned long`.
// The compiler emits calls to this libgcc-style routine when
// __builtin_ctzl has no machine instruction to lower to.
// Dispatches on the width of `unsigned long` (32 or 64 bits); the
// condition is a compile-time constant, so the dead branch is discarded.
int __ctzdi2(unsigned long x)
{
    if (sizeof(unsigned long) == 4) {
        return ctz32(x);
    } else {
        return ctz64(x);
    }
}
#endif
#ifdef CONFIG_CTZLL_IMPL
// __ctzti2: count trailing zeros in an `unsigned long long`, which the
// compile-time assertions in this file require to be 64 bits wide, so
// this simply defers to the 64-bit implementation.
int __ctzti2(unsigned long long x)
{
    unsigned zeros = ctz64(x);
    return zeros;
}
#endif