diff --git a/config.cmake b/config.cmake index 25cd5d25e..88542a0b5 100644 --- a/config.cmake +++ b/config.cmake @@ -462,4 +462,46 @@ config_string( DEPENDS "KernelIsMCS" UNDEF_DISABLED ) +config_option( + KernelClzlImpl CLZL_IMPL + "Define a __clzdi2 function to count leading zeros for unsigned long arguments. \ + Only needed on platforms which lack a builtin instruction." + DEFAULT OFF +) + +config_option( + KernelClzllImpl CLZLL_IMPL + "Define a __clzti2 function to count leading zeros for unsigned long long arguments. \ + Only needed on platforms which lack a builtin instruction." + DEFAULT OFF +) + +config_option( + KernelCtzlImpl CTZL_IMPL + "Define a __ctzdi2 function to count trailing zeros for unsigned long arguments. \ + Only needed on platforms which lack a builtin instruction." + DEFAULT OFF +) + +config_option( + KernelCtzllImpl CTZLL_IMPL + "Define a __ctzti2 function to count trailing zeros for unsigned long long arguments. \ + Only needed on platforms which lack a builtin instruction." + DEFAULT OFF +) + +config_option( + KernelClzNoBuiltin CLZ_NO_BUILTIN + "Expose implementations of clzl and clzll to verification by avoiding the use \ + of __builtin_clzl and __builtin_clzll." + DEFAULT OFF +) + +config_option( + KernelCtzNoBuiltin CTZ_NO_BUILTIN + "Expose implementations of ctzl and ctzll to verification by avoiding the use \ + of __builtin_ctzl and __builtin_ctzll." 
+ DEFAULT OFF +) + add_config_library(kernel "${configure_string}") diff --git a/configs/RISCV64_MCS_verified.cmake b/configs/RISCV64_MCS_verified.cmake old mode 100755 new mode 100644 index a945fcc3e..856fc8fcd --- a/configs/RISCV64_MCS_verified.cmake +++ b/configs/RISCV64_MCS_verified.cmake @@ -26,4 +26,6 @@ set(KernelRootCNodeSizeBits 19 CACHE STRING "") set(KernelMaxNumBootinfoUntypedCaps 50 CACHE STRING "") set(KernelIsMCS ON CACHE BOOL "") set(KernelStaticMaxBudgetUs "(60 * 60 * MS_IN_S * US_IN_MS)" CACHE STRING "") +set(KernelClzNoBuiltin ON CACHE BOOL "") +set(KernelCtzNoBuiltin ON CACHE BOOL "") include(${CMAKE_CURRENT_LIST_DIR}/seL4Config.cmake) diff --git a/configs/RISCV64_verified.cmake b/configs/RISCV64_verified.cmake old mode 100755 new mode 100644 index 555aa1bcb..a1778b79e --- a/configs/RISCV64_verified.cmake +++ b/configs/RISCV64_verified.cmake @@ -24,4 +24,6 @@ set(KernelNumDomains 16 CACHE STRING "") set(KernelMaxNumBootinfoUntypedCap 166 CACHE STRING "") set(KernelRootCNodeSizeBits 19 CACHE STRING "") set(KernelMaxNumBootinfoUntypedCaps 50 CACHE STRING "") +set(KernelClzNoBuiltin ON CACHE BOOL "") +set(KernelCtzNoBuiltin ON CACHE BOOL "") include(${CMAKE_CURRENT_LIST_DIR}/seL4Config.cmake) diff --git a/include/util.h b/include/util.h index faa6bff47..e708d3040 100644 --- a/include/util.h +++ b/include/util.h @@ -98,60 +98,137 @@ int PURE strncmp(const char *s1, const char *s2, int n); long CONST char_to_long(char c); long PURE str_to_long(const char *str); +// Library functions for counting leading/trailing zeros. +// GCC's builtins will emit calls to these functions when the platform +// does not provide suitable inline assembly. +// We only emit function definitions if CONFIG_CLZL_IMPL etc are set. 
+NO_INLINE CONST int __clzdi2(unsigned long x); +NO_INLINE CONST int __clzti2(unsigned long long x); +NO_INLINE CONST int __ctzdi2(unsigned long x); +NO_INLINE CONST int __ctzti2(unsigned long long x); -int __builtin_clzl(unsigned long x); -int __builtin_ctzl(unsigned long x); +// Used for compile-time constants, so should always use the builtin. +#define CTZL(x) __builtin_ctzl(x) -#ifdef CONFIG_ARCH_RISCV -uint32_t __clzsi2(uint32_t x); -uint32_t __ctzsi2(uint32_t x); -uint32_t __clzdi2(uint64_t x); -uint32_t __ctzdi2(uint64_t x); -#endif +// Count leading zeros. +// The CONFIG_CLZ_NO_BUILTIN macro may be used to expose the library function +// to the C parser for verification. +#ifndef CONFIG_CLZ_NO_BUILTIN +// If we use a compiler builtin, we cannot verify it, so we use the following +// annotations to hide the function body from the proofs, and axiomatise its +// behaviour. +// On the other hand, if we use our own implementation instead of the builtin, +// then we want to expose that implementation to the proofs, and therefore hide +// these annotations. /** MODIFIES: */ /** DONT_TRANSLATE */ /** FNSPEC clzl_spec: "\s. \ \ - {\. s = \ \ x_' s \ 0 } + {\. s = \ \ x___unsigned_long_' s \ 0 } \ret__long :== PROC clzl(\x) - \ \ret__long = of_nat (word_clz (x_' s)) \" + \ \ret__long = of_nat (word_clz (x___unsigned_long_' s)) \" */ +#endif static inline long CONST clzl(unsigned long x) { +#ifdef CONFIG_CLZ_NO_BUILTIN + return __clzdi2(x); +#else return __builtin_clzl(x); +#endif } +#ifndef CONFIG_CLZ_NO_BUILTIN +// See comments on clzl. +/** MODIFIES: */ +/** DONT_TRANSLATE */ +/** FNSPEC clzll_spec: + "\s. \ \ + {\. s = \ \ x___unsigned_longlong_' s \ 0 } + \ret__longlong :== PROC clzll(\x) + \ \ret__longlong = of_nat (word_clz (x___unsigned_longlong_' s)) \" +*/ +#endif +static inline long long +CONST clzll(unsigned long long x) +{ +#ifdef CONFIG_CLZ_NO_BUILTIN + return __clzti2(x); +#else + return __builtin_clzll(x); +#endif +} + +// Count trailing zeros. 
+#ifndef CONFIG_CTZ_NO_BUILTIN +// See comments on clzl. /** MODIFIES: */ /** DONT_TRANSLATE */ /** FNSPEC ctzl_spec: "\s. \ \ - {\. s = \ \ x_' s \ 0 } + {\. s = \ \ x___unsigned_long_' s \ 0 } \ret__long :== PROC ctzl(\x) - \ \ret__long = of_nat (word_ctz (x_' s)) \" + \ \ret__long = of_nat (word_ctz (x___unsigned_long_' s)) \" */ +#endif static inline long CONST ctzl(unsigned long x) { +#ifdef CONFIG_CTZ_NO_BUILTIN +// If there is a builtin CLZ, but no builtin CTZ, then CTZ will be implemented +// using the builtin CLZ, rather than the long-form implementation. +// This is typically the fastest way to calculate ctzl on such platforms. +#ifdef CONFIG_CLZ_NO_BUILTIN + // Here, there are no builtins we can use, so call the library function. + return __ctzdi2(x); +#else + // Here, we have __builtin_clzl, but no __builtin_ctzl. + if (unlikely(x == 0)) { + return 8 * sizeof(unsigned long); + } + // -x = ~x + 1, so (x & -x) isolates the least significant 1-bit of x, + // allowing ctzl to be calculated from clzl and the word size. + return 8 * sizeof(unsigned long) - 1 - __builtin_clzl(x & -x); +#endif +#else + // Here, we have __builtin_ctzl. return __builtin_ctzl(x); +#endif } -#define CTZL(x) __builtin_ctzl(x) +#ifndef CONFIG_CTZ_NO_BUILTIN +// See comments on clzl. +/** MODIFIES: */ +/** DONT_TRANSLATE */ +/** FNSPEC ctzll_spec: + "\s. \ \ + {\. s = \ \ x___unsigned_longlong_' s \ 0 } + \ret__longlong :== PROC ctzll(\x) + \ \ret__longlong = of_nat (word_ctz (x___unsigned_longlong_' s)) \" +*/ +#endif +static inline long long +CONST ctzll(unsigned long long x) +{ +#ifdef CONFIG_CTZ_NO_BUILTIN +// See comments on ctzl. +#ifdef CONFIG_CLZ_NO_BUILTIN + return __ctzti2(x); +#else + if (unlikely(x == 0)) { + return 8 * sizeof(unsigned long long); + } + // See comments on ctzl. 
+ return 8 * sizeof(unsigned long long) - 1 - __builtin_clzll(x & -x); +#endif +#else + return __builtin_ctzll(x); +#endif +} int __builtin_popcountl(unsigned long x); -/** DONT_TRANSLATE */ -/** FNSPEC clzll_spec: - "\s. \ \ - {\. s = \ \ x_' s \ 0 } - \ret__longlong :== PROC clzll(\x) - \ \ret__longlong = of_nat (word_clz (x_' s)) \" -*/ -static inline long long CONST clzll(unsigned long long x) -{ - return __builtin_clzll(x); -} - /** DONT_TRANSLATE */ static inline long CONST popcountl(unsigned long mask) diff --git a/src/arch/riscv/config.cmake b/src/arch/riscv/config.cmake index 883e250e9..122de4484 100644 --- a/src/arch/riscv/config.cmake +++ b/src/arch/riscv/config.cmake @@ -27,6 +27,16 @@ config_option( DEPENDS "KernelArchRiscV" ) +# Until RISC-V has instructions to count leading/trailing zeros, we provide +# library implementations. +# In the verified configurations, we additionally define KernelClzNoBuiltin and +# KernelCtzNoBuiltin to expose the library implementations to verification. +# However, since the NoBuiltin options force the use of the library functions +# even when the platform has suitable inline assembly, we do not make these the +# default. +set(KernelClzlImpl ON CACHE BOOL "") +set(KernelCtzlImpl ON CACHE BOOL "") + if(KernelSel4ArchRiscV32) set(KernelPTLevels 2 CACHE STRING "" FORCE) endif() diff --git a/src/util.c b/src/util.c index af42d8ab2..73e9eba6c 100644 --- a/src/util.c +++ b/src/util.c @@ -132,44 +132,314 @@ long PURE str_to_long(const char *str) return val; } -#ifdef CONFIG_ARCH_RISCV -uint32_t __clzsi2(uint32_t x) -{ - uint32_t count = 0; - while (!(x & 0x80000000U) && count < 34) { - x <<= 1; - count++; - } - return count; -} +// The following implementations of CLZ (count leading zeros) and CTZ (count +// trailing zeros) perform a binary search for the first 1 bit from the +// beginning (resp. end) of the input. Initially, the focus is the whole input.
+// Then, each iteration determines whether there are any 1 bits set in the +// upper (resp. lower) half of the current focus. If there are (resp. are not), +// then the upper half is shifted into the lower half. Either way, the lower +// half of the current focus becomes the new focus for the next iteration. +// After enough iterations (6 for 64-bit inputs, 5 for 32-bit inputs), the +// focus is reduced to a single bit, and the total number of bits shifted can +// be used to determine the number of zeros before (resp. after) the first 1 +// bit. +// +// Although the details vary, the general approach is used in several library +// implementations, including in LLVM and GCC. Wikipedia has some references: +// https://en.wikipedia.org/wiki/Find_first_set +// +// The current implementation avoids branching. The test that determines +// whether the upper (resp. lower) half contains any ones directly produces a +// number which can be used for an unconditional shift. If the upper (resp. +// lower) half is all zeros, the test produces a zero, and the shift is a +// no-op. A branchless implementation has the disadvantage that it requires +// more instructions to execute than one which branches, but the advantage is +// that none will be mispredicted branches. Whether this is a good tradeoff +// depends on the branch predictability and the architecture's pipeline depth. +// The most critical use of clzl in the kernel is in the scheduler priority +// queue. In the absence of a concrete application and hardware implementation +// to evaluate the tradeoff, we somewhat arbitrarily choose a branchless +// implementation. In any case, the compiler might convert this to a branching +// binary. 
-uint32_t __ctzsi2(uint32_t x) -{ - uint32_t count = 0; - while (!(x & 0x000000001) && count <= 32) { - x >>= 1; - count++; - } - return count; -} +// Check some assumptions made by the CLZ and CTZ library functions: +compile_assert(clz_ulong_32_or_64, sizeof(unsigned long) == 4 || sizeof(unsigned long) == 8); +compile_assert(clz_ullong_64, sizeof(unsigned long long) == 8); -uint32_t __clzdi2(uint64_t x) +#ifdef CONFIG_CLZL_IMPL +// Count leading zeros. +// This implementation contains no branches. If the architecture provides an +// instruction to set a register to a boolean value on a comparison, then the +// binary might also avoid branching. A branchless implementation might be +// preferable on architectures with deep pipelines, or when the maximum +// priority of runnable threads frequently varies. However, note that the +// compiler may choose to convert this to a branching implementation. +static CONST inline unsigned clz32(uint32_t x) { - uint32_t count = 0; - while (!(x & 0x8000000000000000U) && count < 65) { - x <<= 1; - count++; - } - return count; -} + // Compiler builtins typically return int, but we use unsigned internally + // to reduce the number of guards we see in the proofs. + unsigned count = 32; + uint32_t mask = UINT32_MAX; -uint32_t __ctzdi2(uint64_t x) -{ - uint32_t count = 0; - while (!(x & 0x00000000000000001) && count <= 64) { - x >>= 1; - count++; + // Each iteration i (counting backwards) considers the least significant + // 2^(i+1) bits of x as the current focus. At the first iteration, the + // focus is the whole input. Each iteration assumes x contains no 1 bits + // outside its focus. The iteration contains a test which determines + // whether there are any 1 bits in the upper half (2^i bits) of the focus, + // setting `bits` to 2^i if there are, or zero if not. Shifting by `bits` + // then narrows the focus to the lower 2^i bits and satisfies the + // assumption for the next iteration. 
After the final iteration, the focus + // is just the least significant bit, and the most significant 1 bit of the + // original input (if any) has been shifted into this position. The leading + // zero count can be determined from the total shift. + // + // The iterations are given a very regular structure to facilitate proofs, + // while also generating reasonably efficient binary code. + + // The `if (1)` blocks make it easier to reason by chunks in the proofs. + if (1) { + // iteration 4 + mask >>= (1 << 4); // 0x0000ffff + unsigned bits = ((unsigned)(mask < x)) << 4; // [0, 16] + x >>= bits; // <= 0x0000ffff + count -= bits; // [16, 32] + } + if (1) { + // iteration 3 + mask >>= (1 << 3); // 0x000000ff + unsigned bits = ((unsigned)(mask < x)) << 3; // [0, 8] + x >>= bits; // <= 0x000000ff + count -= bits; // [8, 16, 24, 32] + } + if (1) { + // iteration 2 + mask >>= (1 << 2); // 0x0000000f + unsigned bits = ((unsigned)(mask < x)) << 2; // [0, 4] + x >>= bits; // <= 0x0000000f + count -= bits; // [4, 8, 12, ..., 32] + } + if (1) { + // iteration 1 + mask >>= (1 << 1); // 0x00000003 + unsigned bits = ((unsigned)(mask < x)) << 1; // [0, 2] + x >>= bits; // <= 0x00000003 + count -= bits; // [2, 4, 6, ..., 32] + } + if (1) { + // iteration 0 + mask >>= (1 << 0); // 0x00000001 + unsigned bits = ((unsigned)(mask < x)) << 0; // [0, 1] + x >>= bits; // <= 0x00000001 + count -= bits; // [1, 2, 3, ..., 32] + } + + // If the original input was zero, there will have been no shifts, so this + // gives a result of 32. Otherwise, x is now exactly 1, so subtracting from + // count gives a result from [0, 1, 2, ..., 31].
+ return count - x; +} +#endif + +#if defined(CONFIG_CLZL_IMPL) || defined (CONFIG_CLZLL_IMPL) +static CONST inline unsigned clz64(uint64_t x) +{ + unsigned count = 64; + uint64_t mask = UINT64_MAX; + + // Although we could implement this using clz32, we spell out the + // iterations in full for slightly better code generation at low + // optimisation levels, and to allow us to reuse the proof machinery we + // developed for clz32. + if (1) { + // iteration 5 + mask >>= (1 << 5); // 0x00000000ffffffff + unsigned bits = ((unsigned)(mask < x)) << 5; // [0, 32] + x >>= bits; // <= 0x00000000ffffffff + count -= bits; // [32, 64] + } + if (1) { + // iteration 4 + mask >>= (1 << 4); // 0x000000000000ffff + unsigned bits = ((unsigned)(mask < x)) << 4; // [0, 16] + x >>= bits; // <= 0x000000000000ffff + count -= bits; // [16, 32, 48, 64] + } + if (1) { + // iteration 3 + mask >>= (1 << 3); // 0x00000000000000ff + unsigned bits = ((unsigned)(mask < x)) << 3; // [0, 8] + x >>= bits; // <= 0x00000000000000ff + count -= bits; // [8, 16, 24, ..., 64] + } + if (1) { + // iteration 2 + mask >>= (1 << 2); // 0x000000000000000f + unsigned bits = ((unsigned)(mask < x)) << 2; // [0, 4] + x >>= bits; // <= 0x000000000000000f + count -= bits; // [4, 8, 12, ..., 64] + } + if (1) { + // iteration 1 + mask >>= (1 << 1); // 0x0000000000000003 + unsigned bits = ((unsigned)(mask < x)) << 1; // [0, 2] + x >>= bits; // <= 0x0000000000000003 + count -= bits; // [2, 4, 6, ..., 64] + } + if (1) { + // iteration 0 + mask >>= (1 << 0); // 0x0000000000000001 + unsigned bits = ((unsigned)(mask < x)) << 0; // [0, 1] + x >>= bits; // <= 0x0000000000000001 + count -= bits; // [1, 2, 3, ..., 64] + } + + return count - x; +} +#endif + +#ifdef CONFIG_CLZL_IMPL +int __clzdi2(unsigned long x) +{ + return sizeof(unsigned long) == 4 ? 
clz32(x) : clz64(x); +} +#endif + +#ifdef CONFIG_CLZLL_IMPL +int __clzti2(unsigned long long x) +{ + return clz64(x); +} +#endif + +#ifdef CONFIG_CTZL_IMPL +// Count trailing zeros. +// See comments on clz32. +static CONST inline unsigned ctz32(uint32_t x) +{ + unsigned count = (x == 0); + uint32_t mask = UINT32_MAX; + + // Each iteration i (counting backwards) considers the least significant + // 2^(i+1) bits of x as the current focus. At the first iteration, the + // focus is the whole input. The iteration contains a test which determines + // whether there are any 1 bits in the lower half (2^i bits) of the focus, + // setting `bits` to zero if there are, or 2^i if not. Shifting by `bits` + // then narrows the focus to the lower 2^i bits for the next iteration. + // After the final iteration, the focus is just the least significant bit, + // and the least significant 1 bit of the original input (if any) has been + // shifted into this position. The trailing zero count can be determined + // from the total shift. + // + // If the initial input is zero, every iteration causes a shift, for a + // total shift count of 31, so in that case, we add one for a total count + // of 32. In the comments, xi is the initial value of x. + // + // The iterations are given a very regular structure to facilitate proofs, + // while also generating reasonably efficient binary code.
+ + if (1) { + // iteration 4 + mask >>= (1 << 4); // 0x0000ffff + unsigned bits = ((unsigned)((x & mask) == 0)) << 4; // [0, 16] + x >>= bits; // xi != 0 --> x & 0x0000ffff != 0 + count += bits; // if xi != 0 then [0, 16] else 17 + } + if (1) { + // iteration 3 + mask >>= (1 << 3); // 0x000000ff + unsigned bits = ((unsigned)((x & mask) == 0)) << 3; // [0, 8] + x >>= bits; // xi != 0 --> x & 0x000000ff != 0 + count += bits; // if xi != 0 then [0, 8, 16, 24] else 25 + } + if (1) { + // iteration 2 + mask >>= (1 << 2); // 0x0000000f + unsigned bits = ((unsigned)((x & mask) == 0)) << 2; // [0, 4] + x >>= bits; // xi != 0 --> x & 0x0000000f != 0 + count += bits; // if xi != 0 then [0, 4, 8, ..., 28] else 29 + } + if (1) { + // iteration 1 + mask >>= (1 << 1); // 0x00000003 + unsigned bits = ((unsigned)((x & mask) == 0)) << 1; // [0, 2] + x >>= bits; // xi != 0 --> x & 0x00000003 != 0 + count += bits; // if xi != 0 then [0, 2, 4, ..., 30] else 31 + } + if (1) { + // iteration 0 + mask >>= (1 << 0); // 0x00000001 + unsigned bits = ((unsigned)((x & mask) == 0)) << 0; // [0, 1] + x >>= bits; // xi != 0 --> x & 0x00000001 != 0 + count += bits; // if xi != 0 then [0, 1, 2, ..., 31] else 32 + } + return count; } -#endif /* CONFIG_ARCH_RISCV */ +#endif + +#if defined(CONFIG_CTZL_IMPL) || defined (CONFIG_CTZLL_IMPL) +static CONST inline unsigned ctz64(uint64_t x) +{ + unsigned count = (x == 0); + uint64_t mask = UINT64_MAX; + + if (1) { + // iteration 5 + mask >>= (1 << 5); // 0x00000000ffffffff + unsigned bits = ((unsigned)((x & mask) == 0)) << 5; // [0, 32] + x >>= bits; // xi != 0 --> x & 0x00000000ffffffff != 0 + count += bits; // if xi != 0 then [0, 32] else 33 + } + if (1) { + // iteration 4 + mask >>= (1 << 4); // 0x000000000000ffff + unsigned bits = ((unsigned)((x & mask) == 0)) << 4; // [0, 16] + x >>= bits; // xi != 0 --> x & 0x000000000000ffff != 0 + count += bits; // if xi != 0 then [0, 16, 32, 48] else 49 + } + if (1) { + // iteration 3 + mask >>= (1 << 3); // 
0x00000000000000ff + unsigned bits = ((unsigned)((x & mask) == 0)) << 3; // [0, 8] + x >>= bits; // xi != 0 --> x & 0x00000000000000ff != 0 + count += bits; // if xi != 0 then [0, 8, 16, ..., 56] else 57 + } + if (1) { + // iteration 2 + mask >>= (1 << 2); // 0x000000000000000f + unsigned bits = ((unsigned)((x & mask) == 0)) << 2; // [0, 4] + x >>= bits; // xi != 0 --> x & 0x000000000000000f != 0 + count += bits; // if xi != 0 then [0, 4, 8, ..., 60] else 61 + } + if (1) { + // iteration 1 + mask >>= (1 << 1); // 0x0000000000000003 + unsigned bits = ((unsigned)((x & mask) == 0)) << 1; // [0, 2] + x >>= bits; // xi != 0 --> x & 0x0000000000000003 != 0 + count += bits; // if xi != 0 then [0, 2, 4, ..., 62] else 63 + } + if (1) { + // iteration 0 + mask >>= (1 << 0); // 0x0000000000000001 + unsigned bits = ((unsigned)((x & mask) == 0)) << 0; // [0, 1] + x >>= bits; // xi != 0 --> x & 0x0000000000000001 != 0 + count += bits; // if xi != 0 then [0, 1, 2, ..., 63] else 64 + } + + return count; +} +#endif + +#ifdef CONFIG_CTZL_IMPL +int __ctzdi2(unsigned long x) +{ + return sizeof(unsigned long) == 4 ? ctz32(x) : ctz64(x); +} +#endif + +#ifdef CONFIG_CTZLL_IMPL +int __ctzti2(unsigned long long x) +{ + return ctz64(x); +} +#endif