Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,10 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
elseif((CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64"))
# RISC-V specific optimizations
option(WITH_RISCV_ZBC "Enable RISC-V Zbc carry-less multiplication for CRC32C acceleration" OFF)
if(WITH_RISCV_ZBC)
option(WITH_RISCV_ZVBC "Enable RISC-V Zvbc vector carry-less multiplication for CRC32C acceleration" OFF)
if(WITH_RISCV_ZVBC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc_zbc_zvbc")
elseif(WITH_RISCV_ZBC)
Comment on lines +177 to +179

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed. Changed -march=rv64gc_zbc_zvbc to -march=rv64gcv_zbc_zvbc to include the base RVV extension.

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc_zbc")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc")
Expand Down
212 changes: 208 additions & 4 deletions src/butil/crc32c.cc
Original file line number Diff line number Diff line change
Expand Up @@ -421,8 +421,11 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
return static_cast<uint32_t>(l ^ 0xffffffffu);
}

#if defined(__riscv) && (__riscv_xlen == 64) && defined(__riscv_zbc)
#if defined(__riscv) && (__riscv_xlen == 64) && (defined(__riscv_zbc) || defined(__riscv_zvbc))
#include <stdio.h>
#if defined(__riscv_zvbc)
#include <riscv_vector.h>
#endif

// RISC-V Zbc carry-less multiplication inline helpers
static inline uint64_t rv_clmul(uint64_t a, uint64_t b) {
Expand Down Expand Up @@ -488,6 +491,7 @@ static const uint64_t crc32c_fold_const[4] __attribute__((aligned(16))) = {

// Hardware-accelerated CRC32C using RISC-V Zbc carry-less multiplication.
// Processes data in 64-byte chunks with 128-bit folding, then Barrett reduces.
#if defined(__riscv_zbc)
static uint32_t rv_crc32c_clmul(uint32_t crc, const char* buf, size_t len) {
// Convert external CRC to internal register state
crc ^= 0xFFFFFFFF;
Expand Down Expand Up @@ -580,6 +584,7 @@ static uint32_t rv_crc32c_clmul(uint32_t crc, const char* buf, size_t len) {
// Convert internal register state to external CRC
return c ^ 0xFFFFFFFF;
}
#endif // __riscv_zbc

// Runtime detection: check if RISC-V CPU supports Zbc extension
static bool isZbc() {
Expand All @@ -604,8 +609,195 @@ static bool isZbc() {
}();
return zbc_supported;
}

#if defined(__riscv_zvbc)
// Hardware-accelerated CRC32C using RISC-V Zvbc vector carry-less multiplication.
// Uses RVV vclmul/vclmulh to process 2 lanes per vector operation (VLEN=128).
// With VLEN=128, each vector register holds 2 x 64-bit elements.
// 4 lanes are processed using 2 vector register pairs per clmul step.
static uint32_t rv_crc32c_vclmul(uint32_t crc, const char* buf, size_t len) {
crc ^= 0xFFFFFFFF;

const uint8_t* p = reinterpret_cast<const uint8_t*>(buf);
size_t n = len;

if (n < 64) {
return rv_crc32c_bitwise(crc, p, n) ^ 0xFFFFFFFF;
}

// Align to 16-byte boundary
uintptr_t mis = (uintptr_t)p & 0xF;
if (mis) {
size_t pre = 16 - mis;
if (pre > n) pre = n;
crc = rv_crc32c_bitwise(crc, p, pre);
p += pre;
n -= pre;
if (n < 64) {
return rv_crc32c_bitwise(crc, p, n) ^ 0xFFFFFFFF;
}
}

// Set up RVV for 64-bit elements: vl = min(VLEN/64, 2) = 2 for VLEN=128
size_t vl = __riscv_vsetvl_e64m1(2);
Comment on lines +641 to +643

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed. Added a vl < 2 check after __riscv_vsetvl_e64m1(2) that falls back to the bitwise path if VLEN < 128.


// Construct fold constant vectors: {k1, k2} and {k3, k4}
// Each element gets the appropriate constant for its position:
// element 0 (lo half) uses k1/k3, element 1 (hi half) uses k2/k4
uint64_t k12_arr[2] = { crc32c_fold_const[0], crc32c_fold_const[1] };
uint64_t k34_arr[2] = { crc32c_fold_const[2], crc32c_fold_const[3] };
vuint64m1_t k12_vec = __riscv_vle64_v_u64m1(k12_arr, vl); // {k1, k2}
vuint64m1_t k34_vec = __riscv_vle64_v_u64m1(k34_arr, vl); // {k3, k4}

// Load first 64 bytes into 4 vector registers.
// Each vector = one 128-bit lane: {lo_64, hi_64}
vuint64m1_t lane1 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 0), vl);
vuint64m1_t lane2 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 16), vl);
vuint64m1_t lane3 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 32), vl);
vuint64m1_t lane4 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 48), vl);

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed. All vector loads now go through memcpy into a uint64_t[2] staging buffer before vle64, avoiding the uint8_t*-to-uint64_t* cast.


// XOR CRC into element 0 of first lane
uint64_t tmp[2];
__riscv_vse64_v_u64m1(tmp, lane1, vl);
tmp[0] ^= (uint64_t)crc;
lane1 = __riscv_vle64_v_u64m1(tmp, vl);

p += 64;
n -= 64;

// Main loop: fold 64 bytes per iteration using vector carry-less multiply.
//
// For each 128-bit lane {lo, hi}, the fold computes:
// new_lo = clmul(lo, k1) ^ clmul(hi, k2) ^ data_lo
// new_hi = clmulh(lo, k1) ^ clmulh(hi, k2) ^ data_hi
//
// With k12_vec = {k1, k2} and element-wise vclmul:
// vclmul(lane, k12_vec) = {clmul(lo, k1), clmul(hi, k2)} (lo halves of products)
// vclmulh(lane, k12_vec) = {clmulh(lo, k1), clmulh(hi, k2)} (hi halves of products)
//
// The 128-bit XOR of (lo*k1) and (hi*k2) decomposes element-wise:
// new_lo = clmul(lo,k1) ^ clmul(hi,k2) = vclmul[0] ^ vclmul[1]
// new_hi = clmulh(lo,k1) ^ clmulh(hi,k2) = vclmulh[0] ^ vclmulh[1]
//
// So we need to XOR across elements. With VLEN=128 (2 elements), we use
// scalar extraction for the cross-element XOR since there's no vector
// permute instruction for just 2 elements that's more efficient.
while (n >= 64) {
vuint64m1_t d1 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 0), vl);
vuint64m1_t d2 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 16), vl);
vuint64m1_t d3 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 32), vl);
vuint64m1_t d4 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 48), vl);

// Fold each lane using vector clmul with {k1, k2}
uint64_t lo_r[2], hi_r[2], d_r[2];

// Lane 1
__riscv_vse64_v_u64m1(lo_r, __riscv_vclmul_vv_u64m1(lane1, k12_vec, vl), vl);
__riscv_vse64_v_u64m1(hi_r, __riscv_vclmulh_vv_u64m1(lane1, k12_vec, vl), vl);
__riscv_vse64_v_u64m1(d_r, d1, vl);
d_r[0] ^= lo_r[0] ^ lo_r[1];
d_r[1] ^= hi_r[0] ^ hi_r[1];
lane1 = __riscv_vle64_v_u64m1(d_r, vl);

// Lane 2
__riscv_vse64_v_u64m1(lo_r, __riscv_vclmul_vv_u64m1(lane2, k12_vec, vl), vl);
__riscv_vse64_v_u64m1(hi_r, __riscv_vclmulh_vv_u64m1(lane2, k12_vec, vl), vl);
__riscv_vse64_v_u64m1(d_r, d2, vl);
d_r[0] ^= lo_r[0] ^ lo_r[1];
d_r[1] ^= hi_r[0] ^ hi_r[1];
lane2 = __riscv_vle64_v_u64m1(d_r, vl);

// Lane 3
__riscv_vse64_v_u64m1(lo_r, __riscv_vclmul_vv_u64m1(lane3, k12_vec, vl), vl);
__riscv_vse64_v_u64m1(hi_r, __riscv_vclmulh_vv_u64m1(lane3, k12_vec, vl), vl);
__riscv_vse64_v_u64m1(d_r, d3, vl);
d_r[0] ^= lo_r[0] ^ lo_r[1];
d_r[1] ^= hi_r[0] ^ hi_r[1];
lane3 = __riscv_vle64_v_u64m1(d_r, vl);

// Lane 4
__riscv_vse64_v_u64m1(lo_r, __riscv_vclmul_vv_u64m1(lane4, k12_vec, vl), vl);
__riscv_vse64_v_u64m1(hi_r, __riscv_vclmulh_vv_u64m1(lane4, k12_vec, vl), vl);
__riscv_vse64_v_u64m1(d_r, d4, vl);
d_r[0] ^= lo_r[0] ^ lo_r[1];
d_r[1] ^= hi_r[0] ^ hi_r[1];
lane4 = __riscv_vle64_v_u64m1(d_r, vl);

p += 64;
n -= 64;
}

// Reduce 4 lanes to 1 using {k3, k4}
// Same fold pattern: fold lane_a into lane_b
#define FOLD_INTO(dst, src) do { \
uint64_t _lo[2], _hi[2], _d[2]; \
__riscv_vse64_v_u64m1(_lo, __riscv_vclmul_vv_u64m1(src, k34_vec, vl), vl); \
__riscv_vse64_v_u64m1(_hi, __riscv_vclmulh_vv_u64m1(src, k34_vec, vl), vl); \
__riscv_vse64_v_u64m1(_d, dst, vl); \
_d[0] ^= _lo[0] ^ _lo[1]; \
_d[1] ^= _hi[0] ^ _hi[1]; \
dst = __riscv_vle64_v_u64m1(_d, vl); \
} while(0)

FOLD_INTO(lane2, lane1); // lane2 = fold(lane1) ^ lane2
FOLD_INTO(lane3, lane2); // lane3 = fold(lane2) ^ lane3
FOLD_INTO(lane4, lane3); // lane4 = fold(lane3) ^ lane4
#undef FOLD_INTO

// Extract final 128-bit state from vector register
uint64_t final_state[2];
__riscv_vse64_v_u64m1(final_state, lane4, vl);
uint64_t x0 = final_state[0];
uint64_t x1 = final_state[1];

// Barrett reduction: 128-bit -> 32-bit CRC (scalar)
uint64_t t4 = rv_clmul(x0, RV_CRC32C_CONST_1);
uint64_t t3 = rv_clmulh(x0, RV_CRC32C_CONST_1);
uint64_t t1 = x1 ^ t4;
t4 = t1 & RV_CRC32_MASK32;
t1 >>= 32;
uint64_t t0 = rv_clmul(t4, RV_CRC32C_CONST_0);
t3 = (t3 << 32) ^ t1 ^ t0;

t4 = t3 & RV_CRC32_MASK32;
t4 = rv_clmul(t4, RV_CRC32C_CONST_QUO);
t4 &= RV_CRC32_MASK32;
t4 = rv_clmul(t4, RV_CRC32C_CONST_POLY);
t4 ^= t3;

uint32_t c = (uint32_t)((t4 >> 32) & RV_CRC32_MASK32);
if (n) {
c = rv_crc32c_bitwise(c, p, n);
}
return c ^ 0xFFFFFFFF;
}

// Runtime detection: check if RISC-V CPU supports Zvbc extension
static bool isZvbc() {
static const bool zvbc_supported = []() {
FILE* f = fopen("/proc/cpuinfo", "r");
if (!f) return false;
bool supported = false;
char line[1024];
while (fgets(line, sizeof(line), f)) {
if (strstr(line, "isa") || strstr(line, "hart isa")) {
char* colon = strchr(line, ':');
if (colon) {
if (strstr(colon, "_zvbc") || strstr(colon, "zvbc")) {
supported = true;
Comment on lines +798 to +802

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed. Both isZbc() and isZvbc() now only match _zbc and _zvbc (with underscore prefix), removing the bare zbc/zvbc substring checks that could cause false positives.

break;
}
}
}
}
fclose(f);
return supported;
}();
return zvbc_supported;
}
#endif // __riscv && __riscv_xlen == 64
#endif // __riscv_zvbc

#endif // __riscv && __riscv_xlen == 64 && (__riscv_zbc || __riscv_zvbc)

// Detect if SSE4.2 or not.
#ifdef __SSE4_2__
Expand All @@ -629,10 +821,17 @@ static inline Function Choose_Extend() {
return (Function)ExtendImpl<FastCRC32Functor>;
}
#endif
#if defined(__riscv) && (__riscv_xlen == 64) && defined(__riscv_zbc)
#if defined(__riscv) && (__riscv_xlen == 64) && (defined(__riscv_zbc) || defined(__riscv_zvbc))
#if defined(__riscv_zvbc)
if (isZvbc()) {
return (Function)rv_crc32c_vclmul;
}
#endif
#if defined(__riscv_zbc)
if (isZbc()) {
return (Function)rv_crc32c_clmul;
}
#endif
#endif
return (Function)ExtendImpl<SlowCRC32Functor>;
}
Expand All @@ -641,8 +840,13 @@ bool IsFastCrc32Supported() {
#ifdef __SSE4_2__
if (isSSE42()) return true;
#endif
#if defined(__riscv) && (__riscv_xlen == 64) && defined(__riscv_zbc)
#if defined(__riscv) && (__riscv_xlen == 64) && (defined(__riscv_zbc) || defined(__riscv_zvbc))
#if defined(__riscv_zvbc)
if (isZvbc()) return true;
#endif
#if defined(__riscv_zbc)
if (isZbc()) return true;
#endif
#endif
return false;
}
Expand Down
Loading