快速十六进制转八进制
-
QQ群上有群友提到了十六进制转八进制,借此练练手写一个快速十六进制转八进制代码并在此分享:Godbolt链接
consteval std::array<uint8_t, 256> gen_lookup() noexcept { std::array<uint8_t, 256> lookup{}; lookup.fill(0); lookup['0'] = 0; lookup['1'] = 1; lookup['2'] = 2; lookup['3'] = 3; lookup['4'] = 4; lookup['5'] = 5; lookup['6'] = 6; lookup['7'] = 7; lookup['8'] = 8; lookup['9'] = 9; lookup['a'] = 10; lookup['b'] = 11; lookup['c'] = 12; lookup['d'] = 13; lookup['e'] = 14; lookup['f'] = 15; lookup['A'] = 10; lookup['B'] = 11; lookup['C'] = 12; lookup['D'] = 13; lookup['E'] = 14; lookup['F'] = 15; return lookup; } consteval std::array<std::array<char, 2>, 64> hex_to_oct_lookup() noexcept { std::array<std::array<char, 2>, 64> lookup{}; for (size_t i = 0; i < 64; ++i) { lookup[i][0] = '0' + (i % 8); lookup[i][1] = '0' + (i / 8); } return lookup; } constexpr auto hex_lookup = gen_lookup(); constexpr auto hex_to_oct_lookup_table = hex_to_oct_lookup(); std::string hex_to_oct(const std::string& hex) noexcept { const auto hex_length = hex.length(); if (hex_length == 0) return "0"; const auto groups = (hex_length + 2) / 3; const auto oct_digits = groups * 4; std::string res(oct_digits, '0'); for (size_t i = hex_length % 3; i < hex_length; i += 3) { const size_t curr_group = (i + 2) / 3; const auto h0 = hex_lookup[static_cast<uint8_t>(hex[i + 2])]; // [3:0] const auto h1 = hex_lookup[static_cast<uint8_t>(hex[i + 1])]; // [7:4] const auto h2 = hex_lookup[static_cast<uint8_t>(hex[i + 0])]; // [11:8] const uint8_t o12 = h0 | ((h1 & 0b11) << 4); // [5:0] const uint8_t o34 = (h1 >> 2) | (h2 << 2); // [11:6] const auto [o1, o2] = hex_to_oct_lookup_table[o12]; const auto [o3, o4] = hex_to_oct_lookup_table[o34]; res[curr_group * 4 + 3] = o1; res[curr_group * 4 + 2] = o2; res[curr_group * 4 + 1] = o3; res[curr_group * 4 + 0] = o4; } if (hex_length % 3 != 0) { std::array<uint8_t, 3> h{0, 0, 0}; if (hex_length % 3 == 1) { h[0] = hex_lookup[static_cast<uint8_t>(hex[0])]; } else // hex_length % 3 == 2 { h[1] = hex_lookup[static_cast<uint8_t>(hex[0])]; h[0] = hex_lookup[static_cast<uint8_t>(hex[1])]; } const uint8_t o12 = h[0] | ((h[1] & 0b11) << 4); // [5:0] const uint8_t o34 = (h[1] >> 2) | (h[2] << 2); // [11:6] const auto [o1, o2] = hex_to_oct_lookup_table[o12]; const auto [o3, o4] = hex_to_oct_lookup_table[o34]; res[3] = o1; res[2] = o2; res[1] = o3; res[0] = o4; } const auto first_non_zero = res.find_first_not_of('0'); if (first_non_zero == std::string::npos) return "0"; return res.substr(first_non_zero); }优化点
- 用了小型查找表快速将16进制字符转成对应的十六进制数字
- 每12bit为一组转换,同样使用小型查找表将6bit直接转成2个八进制字符
- 全程不使用分支,提高执行效率
- 使用
consteval直接在编译期生成查找表 - 所有查找表均足够小并且按64字节对齐,对齐L1缓存行,提升访存效率
性能测试
测试环境:
- CPU: Intel i7-13620H
- RAM: DDR5-5200 32GiB
- OS: Debian 13 (Linux 6.12.73+deb13-amd64)
转换100M个十六进制字符,对比上述代码和Gemini写的版本(Gemini版本由群友提供):
Optimized version: 84.01 ms (stddev: 1.12 ms), 1190284058.7910337 hex chars/sec Gemini version: 1464.58 ms (stddev: 31.58 ms), 68279030.44209935 hex chars/sec上述代码可以达到1.2G/s的速度,按照4.9GHz的主频,大约每字符4周期
正确性测试
使用Python脚本生成从1~40位16进制数测例,并使用Xmake并行测试,全部通过
设计妥协
- 为了快速转换,没有错误检测,所有非法字符均映射为0
Gemini版本代码
static std::string hex_to_oct_gemini_version(std::string hex) { // 补齐长度为 3 的倍数,方便分段 int padding = (3 - hex.length() % 3) % 3; std::string prefix(padding, '0'); hex = prefix + hex; std::string res = ""; // 每次处理 3 个 16 进制字符 for (int i = 0; i < hex.length(); i += 3) { // 1. 将 3 位 16 进制转为 12 位整数 int combined = 0; for (int j = 0; j < 3; ++j) { char c = hex[i + j]; int v = (std::isdigit(c) ? c - '0' : std::toupper(c) - 'A' + 10); combined = (combined << 4) | v; } // 2. 将 12 位整数转为 4 位 8 进制 std::string chunk = ""; for (int j = 0; j < 4; ++j) { chunk = std::to_string(combined & 7) + chunk; combined >>= 3; } res += chunk; } // 最后处理前导零 size_t start = res.find_first_not_of('0'); return (start == std::string::npos) ? "0" : res.substr(start); }明显可见不少槽点:
- 非常多的三目运算(
std::isdigit、std::toupper等内置函数里面可能仍然有分支),降低执行效率 - 用了过多的临时
std::string,尽管有SSO优化,效率仍然低下 - 没有提前给
std::string预留空间 - 直接用
std::to_string转换过于重量级,且两个std::string相连接也会有额外的开销
-
改进
经过@yyq252517提醒并测试,直接使用12bit的查找表可以更快:
consteval std::array<std::array<char, 4>, 4096> hex_to_oct_lookup() noexcept { std::array<std::array<char, 4>, 4096> lookup{}; for (size_t i = 0; i < 4096; ++i) { lookup[i][0] = '0' + ((i >> 0) & 0b111); lookup[i][1] = '0' + ((i >> 3) & 0b111); lookup[i][2] = '0' + ((i >> 6) & 0b111); lookup[i][3] = '0' + ((i >> 9) & 0b111); } return lookup; } // ... std::string hex_to_oct(const std::string& hex) noexcept { const auto hex_length = hex.length(); if (hex_length == 0) return "0"; const auto groups = (hex_length + 2) / 3; const auto oct_digits = groups * 4; std::string res(oct_digits, '0'); for (size_t i = hex_length % 3; i < hex_length; i += 3) { const size_t curr_group = (i + 2) / 3; const uint16_t h0 = hex_lookup[std::bit_cast<uint8_t>(hex[i + 2])]; // [3:0] const uint16_t h1 = hex_lookup[std::bit_cast<uint8_t>(hex[i + 1])]; // [7:4] const uint16_t h2 = hex_lookup[std::bit_cast<uint8_t>(hex[i + 0])]; // [11:8] const uint16_t idx = (h2 << 8) | (h1 << 4) | h0; // [11:0] const auto [o1, o2, o3, o4] = hex_to_oct_lookup_table[idx]; res[curr_group * 4 + 3] = o1; res[curr_group * 4 + 2] = o2; res[curr_group * 4 + 1] = o3; res[curr_group * 4 + 0] = o4; } if (hex_length % 3 != 0) { std::array<uint16_t, 3> h{0, 0, 0}; if (hex_length % 3 == 1) { h[0] = hex_lookup[std::bit_cast<uint8_t>(hex[0])]; } else // hex_length % 3 == 2 { h[1] = hex_lookup[std::bit_cast<uint8_t>(hex[0])]; h[0] = hex_lookup[std::bit_cast<uint8_t>(hex[1])]; } const uint16_t idx = (h[2] << 8) | (h[1] << 4) | h[0]; const auto [o1, o2, o3, o4] = hex_to_oct_lookup_table[idx]; res[3] = o1; res[2] = o2; res[1] = o3; res[0] = o4; } const auto first_non_zero = res.find_first_not_of('0'); if (first_non_zero == std::string::npos) return "0"; return res.substr(first_non_zero); }使用和此前一致的测试环境,可以得到:
Optimized version: 72.16 ms (stddev: 0.45 ms), 1385897448.4948883 hex chars/sec比此前快了15%!此前低估了现代CPU的L1缓存系统的能力,看来它处理
4096*4byte = 16KiB的查找表也不在话下