快速十六进制转八进制

Stehsaer

QQ群上有群友提到了十六进制转八进制，借此练练手写一个快速十六进制转八进制代码并在此分享：Godbolt链接

consteval std::array<uint8_t, 256> gen_lookup() noexcept
{
	std::array<uint8_t, 256> lookup{};
	lookup.fill(0);

	lookup['0'] = 0;
	lookup['1'] = 1;
	lookup['2'] = 2;
	lookup['3'] = 3;
	lookup['4'] = 4;
	lookup['5'] = 5;
	lookup['6'] = 6;
	lookup['7'] = 7;
	lookup['8'] = 8;
	lookup['9'] = 9;
	lookup['a'] = 10;
	lookup['b'] = 11;
	lookup['c'] = 12;
	lookup['d'] = 13;
	lookup['e'] = 14;
	lookup['f'] = 15;
	lookup['A'] = 10;
	lookup['B'] = 11;
	lookup['C'] = 12;
	lookup['D'] = 13;
	lookup['E'] = 14;
	lookup['F'] = 15;

	return lookup;
}

consteval std::array<std::array<char, 2>, 64> hex_to_oct_lookup() noexcept
{
	std::array<std::array<char, 2>, 64> lookup{};

	for (size_t i = 0; i < 64; ++i)
	{
		lookup[i][0] = '0' + (i % 8);
		lookup[i][1] = '0' + (i / 8);
	}

	return lookup;
}

constexpr auto hex_lookup = gen_lookup();
constexpr auto hex_to_oct_lookup_table = hex_to_oct_lookup();

std::string hex_to_oct(const std::string& hex) noexcept
{
	const auto hex_length = hex.length();
	if (hex_length == 0) return "0";

	const auto groups = (hex_length + 2) / 3;
	const auto oct_digits = groups * 4;

	std::string res(oct_digits, '0');

	for (size_t i = hex_length % 3; i < hex_length; i += 3)
	{
		const size_t curr_group = (i + 2) / 3;

		const auto h0 = hex_lookup[static_cast<uint8_t>(hex[i + 2])];  // [3:0]
		const auto h1 = hex_lookup[static_cast<uint8_t>(hex[i + 1])];  // [7:4]
		const auto h2 = hex_lookup[static_cast<uint8_t>(hex[i + 0])];  // [11:8]

		const uint8_t o12 = h0 | ((h1 & 0b11) << 4);  // [5:0]
		const uint8_t o34 = (h1 >> 2) | (h2 << 2);    // [11:6]

		const auto [o1, o2] = hex_to_oct_lookup_table[o12];
		const auto [o3, o4] = hex_to_oct_lookup_table[o34];

		res[curr_group * 4 + 3] = o1;
		res[curr_group * 4 + 2] = o2;
		res[curr_group * 4 + 1] = o3;
		res[curr_group * 4 + 0] = o4;
	}

	if (hex_length % 3 != 0)
	{
		std::array<uint8_t, 3> h{0, 0, 0};

		if (hex_length % 3 == 1)
		{
			h[0] = hex_lookup[static_cast<uint8_t>(hex[0])];
		}
		else  // hex_length % 3 == 2
		{
			h[1] = hex_lookup[static_cast<uint8_t>(hex[0])];
			h[0] = hex_lookup[static_cast<uint8_t>(hex[1])];
		}

		const uint8_t o12 = h[0] | ((h[1] & 0b11) << 4);  // [5:0]
		const uint8_t o34 = (h[1] >> 2) | (h[2] << 2);    // [11:6]

		const auto [o1, o2] = hex_to_oct_lookup_table[o12];
		const auto [o3, o4] = hex_to_oct_lookup_table[o34];

		res[3] = o1;
		res[2] = o2;
		res[1] = o3;
		res[0] = o4;
	}

	const auto first_non_zero = res.find_first_not_of('0');
	if (first_non_zero == std::string::npos) return "0";
	return res.substr(first_non_zero);
}

优化点

用了小型查找表快速将16进制字符转成对应的十六进制数字
每12bit为一组转换，同样使用小型查找表将6bit直接转成2个八进制字符
全程不使用分支，提高执行效率
使用consteval直接在编译期生成查找表
所有查找表均足够小并且按64字节对齐，对齐L1缓存行，提升访存效率

性能测试

测试环境：

CPU: Intel i7-13620H

RAM: DDR5-5200 32GiB

OS: Debian 13 (Linux 6.12.73+deb13-amd64)

转换100M个十六进制字符，对比上述代码和Gemini写的版本（Gemini版本由群友提供）：

Optimized version: 84.01 ms (stddev: 1.12 ms), 1190284058.7910337 hex chars/sec
Gemini version: 1464.58 ms (stddev: 31.58 ms), 68279030.44209935 hex chars/sec

上述代码可以达到1.2G/s的速度，按照4.9GHz的主频，大约每字符4周期

正确性测试

使用Python脚本生成从1~40位16进制数测例，并使用Xmake并行测试，全部通过

设计妥协

为了快速转换，没有错误检测，所有非法字符均映射为0

Gemini版本代码

static std::string hex_to_oct_gemini_version(std::string hex)
{
	// 补齐长度为 3 的倍数，方便分段
	int padding = (3 - hex.length() % 3) % 3;
	std::string prefix(padding, '0');
	hex = prefix + hex;

	std::string res = "";
	// 每次处理 3 个 16 进制字符
	for (int i = 0; i < hex.length(); i += 3)
	{
		// 1. 将 3 位 16 进制转为 12 位整数
		int combined = 0;
		for (int j = 0; j < 3; ++j)
		{
			char c = hex[i + j];
			int v = (std::isdigit(c) ? c - '0' : std::toupper(c) - 'A' + 10);
			combined = (combined << 4) | v;
		}

		// 2. 将 12 位整数转为 4 位 8 进制
		std::string chunk = "";
		for (int j = 0; j < 4; ++j)
		{
			chunk = std::to_string(combined & 7) + chunk;
			combined >>= 3;
		}
		res += chunk;
	}

	// 最后处理前导零
	size_t start = res.find_first_not_of('0');
	return (start == std::string::npos) ? "0" : res.substr(start);
}

明显可见不少槽点：

非常多的三目运算（std::isdigit、std::toupper等内置函数里面可能仍然有分支），降低执行效率
用了过多的临时std::string，尽管有SSO优化，效率仍然低下
没有提前给std::string预留空间
直接用std::to_string转换过于重量级，且两个std::string相连接也会有额外的开销

Stehsaer

改进

经过@yyq252517提醒并测试，直接使用12bit的查找表可以更快：

consteval std::array<std::array<char, 4>, 4096> hex_to_oct_lookup() noexcept
{
	std::array<std::array<char, 4>, 4096> lookup{};

	for (size_t i = 0; i < 4096; ++i)
	{
		lookup[i][0] = '0' + ((i >> 0) & 0b111);
		lookup[i][1] = '0' + ((i >> 3) & 0b111);
		lookup[i][2] = '0' + ((i >> 6) & 0b111);
		lookup[i][3] = '0' + ((i >> 9) & 0b111);
	}

	return lookup;
}
// ...
std::string hex_to_oct(const std::string& hex) noexcept
{
	const auto hex_length = hex.length();
	if (hex_length == 0) return "0";

	const auto groups = (hex_length + 2) / 3;
	const auto oct_digits = groups * 4;

	std::string res(oct_digits, '0');

	for (size_t i = hex_length % 3; i < hex_length; i += 3)
	{
		const size_t curr_group = (i + 2) / 3;

		const uint16_t h0 = hex_lookup[std::bit_cast<uint8_t>(hex[i + 2])];  // [3:0]
		const uint16_t h1 = hex_lookup[std::bit_cast<uint8_t>(hex[i + 1])];  // [7:4]
		const uint16_t h2 = hex_lookup[std::bit_cast<uint8_t>(hex[i + 0])];  // [11:8]

		const uint16_t idx = (h2 << 8) | (h1 << 4) | h0;  // [11:0]

		const auto [o1, o2, o3, o4] = hex_to_oct_lookup_table[idx];

		res[curr_group * 4 + 3] = o1;
		res[curr_group * 4 + 2] = o2;
		res[curr_group * 4 + 1] = o3;
		res[curr_group * 4 + 0] = o4;
	}

	if (hex_length % 3 != 0)
	{
		std::array<uint16_t, 3> h{0, 0, 0};

		if (hex_length % 3 == 1)
		{
			h[0] = hex_lookup[std::bit_cast<uint8_t>(hex[0])];
		}
		else  // hex_length % 3 == 2
		{
			h[1] = hex_lookup[std::bit_cast<uint8_t>(hex[0])];
			h[0] = hex_lookup[std::bit_cast<uint8_t>(hex[1])];
		}

		const uint16_t idx = (h[2] << 8) | (h[1] << 4) | h[0];

		const auto [o1, o2, o3, o4] = hex_to_oct_lookup_table[idx];

		res[3] = o1;
		res[2] = o2;
		res[1] = o3;
		res[0] = o4;
	}

	const auto first_non_zero = res.find_first_not_of('0');
	if (first_non_zero == std::string::npos) return "0";
	return res.substr(first_non_zero);
}

使用和此前一致的测试环境，可以得到：

Optimized version: 72.16 ms (stddev: 0.45 ms), 1385897448.4948883 hex chars/sec

比此前快了15%！此前低估了现代CPU的L1缓存系统的能力，看来它处理4096*4byte = 16KiB的查找表也不在话下

感谢@yyq252517

D2Learn Forums

快速十六进制转八进制

优化点

性能测试

正确性测试

设计妥协

Gemini版本代码

改进