改进
经 @yyq252517 提醒并实测，直接使用 12 bit 索引的查找表（一次查表即可把 3 个十六进制字符转换为 4 个八进制字符）可以更快：
/// Builds, at compile time, a 4096-entry table that maps every 12-bit value
/// to its four octal digits as ASCII characters.
///
/// Digits are stored least-significant first: entry[0] holds bits [2:0] and
/// entry[3] holds bits [11:9].
///
/// @return The fully populated table (4096 entries of 4 chars each).
consteval std::array<std::array<char, 4>, 4096> hex_to_oct_lookup() noexcept
{
    std::array<std::array<char, 4>, 4096> table{};
    for (size_t value = 0; value < table.size(); ++value)
    {
        // Peel off three bits per octal digit, lowest digit first.
        size_t remaining = value;
        for (char& digit : table[value])
        {
            digit = static_cast<char>('0' + (remaining & 0b111));
            remaining >>= 3;
        }
    }
    return table;
}
// ...
/// Converts a hexadecimal digit string to its octal representation.
///
/// The input is consumed in groups of three hex characters (12 bits), each of
/// which is turned into four octal characters with a single table lookup.
/// When the length is not a multiple of three, the leading one or two
/// characters form a zero-extended partial group.
///
/// NOTE(review): `hex_lookup` and `hex_to_oct_lookup_table` are defined
/// outside this block; presumably the former maps a character byte to its
/// 4-bit value and the latter is the result of `hex_to_oct_lookup()` - confirm.
/// No input validation is performed here.
///
/// @param hex Hexadecimal digits, most significant first.
/// @return Octal digits without leading zeros; "0" for an empty or all-zero input.
std::string hex_to_oct(const std::string& hex) noexcept
{
    const auto hex_length = hex.length();
    if (hex_length == 0) return "0";

    // One output quartet per (possibly partial) triple of input characters.
    const auto quartets = (hex_length + 2) / 3;
    std::string res(quartets * 4, '0');

    // Looks up the value of the input character at `pos`.
    const auto nibble = [&hex](size_t pos) -> uint16_t
    {
        return hex_lookup[std::bit_cast<uint8_t>(hex[pos])];
    };

    // Packs three nibbles into a 12-bit key and writes the four octal
    // characters of output group `group`, most significant digit first.
    const auto emit = [&res](size_t group, uint16_t hi, uint16_t mid, uint16_t lo)
    {
        const uint16_t key = (hi << 8) | (mid << 4) | lo; // [11:0]
        const auto [d0, d1, d2, d3] = hex_to_oct_lookup_table[key];
        const size_t base = group * 4;
        res[base + 0] = d3;
        res[base + 1] = d2;
        res[base + 2] = d1;
        res[base + 3] = d0;
    };

    // Leading partial group: one or two characters, zero-extended on the left.
    const size_t lead = hex_length % 3;
    if (lead != 0)
    {
        const uint16_t mid = (lead == 2) ? nibble(0) : uint16_t{0};
        const uint16_t lo = nibble(lead - 1);
        emit(0, 0, mid, lo);
    }

    // Remaining full groups; the first one lands in slot 1 if a partial
    // group exists, otherwise in slot 0.
    size_t group = (lead + 2) / 3;
    for (size_t pos = lead; pos < hex_length; pos += 3, ++group)
    {
        emit(group, nibble(pos), nibble(pos + 1), nibble(pos + 2));
    }

    // Drop leading zeros, keeping a single "0" if nothing else remains.
    const auto first_significant = res.find_first_not_of('0');
    if (first_significant == std::string::npos) return "0";
    return res.substr(first_significant);
}
使用和此前一致的测试环境,可以得到:
Optimized version: 72.16 ms (stddev: 0.45 ms), 1385897448.4948883 hex chars/sec
比此前快了 15%！此前低估了现代 CPU 的 L1 缓存系统的能力，看来它处理 4096 × 4 byte = 16 KiB 的查找表也不在话下。
感谢@yyq252517