跳转至内容
  • 版块
  • 最新
  • 标签
  • 热门
  • Online Tools
  • 用户
  • 群组
折叠
品牌标识

D2Learn Forums

  1. 主页
  2. General Discussion | 综合讨论
  3. 快速十六进制转八进制

快速十六进制转八进制

已定时 已固定 已锁定 已移动 General Discussion | 综合讨论
算法
2 帖子 1 发布者 41 浏览
  • 从旧到新
  • 从新到旧
  • 最多赞同
登录后回复
此主题已被删除。只有拥有主题管理权限的用户可以查看。
  • StehsaerS 离线
    StehsaerS 离线
    Stehsaer
    编写于 最后由 编辑
    #1

    QQ群上有群友提到了十六进制转八进制,借此练练手写一个快速十六进制转八进制代码并在此分享:Godbolt链接

    consteval std::array<uint8_t, 256> gen_lookup() noexcept
    {
    	std::array<uint8_t, 256> lookup{};
    	lookup.fill(0);
    
    	lookup['0'] = 0;
    	lookup['1'] = 1;
    	lookup['2'] = 2;
    	lookup['3'] = 3;
    	lookup['4'] = 4;
    	lookup['5'] = 5;
    	lookup['6'] = 6;
    	lookup['7'] = 7;
    	lookup['8'] = 8;
    	lookup['9'] = 9;
    	lookup['a'] = 10;
    	lookup['b'] = 11;
    	lookup['c'] = 12;
    	lookup['d'] = 13;
    	lookup['e'] = 14;
    	lookup['f'] = 15;
    	lookup['A'] = 10;
    	lookup['B'] = 11;
    	lookup['C'] = 12;
    	lookup['D'] = 13;
    	lookup['E'] = 14;
    	lookup['F'] = 15;
    
    	return lookup;
    }
    
    consteval std::array<std::array<char, 2>, 64> hex_to_oct_lookup() noexcept
    {
    	std::array<std::array<char, 2>, 64> lookup{};
    
    	for (size_t i = 0; i < 64; ++i)
    	{
    		lookup[i][0] = '0' + (i % 8);
    		lookup[i][1] = '0' + (i / 8);
    	}
    
    	return lookup;
    }
    
    constexpr auto hex_lookup = gen_lookup();
    constexpr auto hex_to_oct_lookup_table = hex_to_oct_lookup();
    
    std::string hex_to_oct(const std::string& hex) noexcept
    {
    	const auto hex_length = hex.length();
    	if (hex_length == 0) return "0";
    
    	const auto groups = (hex_length + 2) / 3;
    	const auto oct_digits = groups * 4;
    
    	std::string res(oct_digits, '0');
    
    	for (size_t i = hex_length % 3; i < hex_length; i += 3)
    	{
    		const size_t curr_group = (i + 2) / 3;
    
    		const auto h0 = hex_lookup[static_cast<uint8_t>(hex[i + 2])];  // [3:0]
    		const auto h1 = hex_lookup[static_cast<uint8_t>(hex[i + 1])];  // [7:4]
    		const auto h2 = hex_lookup[static_cast<uint8_t>(hex[i + 0])];  // [11:8]
    
    		const uint8_t o12 = h0 | ((h1 & 0b11) << 4);  // [5:0]
    		const uint8_t o34 = (h1 >> 2) | (h2 << 2);    // [11:6]
    
    		const auto [o1, o2] = hex_to_oct_lookup_table[o12];
    		const auto [o3, o4] = hex_to_oct_lookup_table[o34];
    
    		res[curr_group * 4 + 3] = o1;
    		res[curr_group * 4 + 2] = o2;
    		res[curr_group * 4 + 1] = o3;
    		res[curr_group * 4 + 0] = o4;
    	}
    
    	if (hex_length % 3 != 0)
    	{
    		std::array<uint8_t, 3> h{0, 0, 0};
    
    		if (hex_length % 3 == 1)
    		{
    			h[0] = hex_lookup[static_cast<uint8_t>(hex[0])];
    		}
    		else  // hex_length % 3 == 2
    		{
    			h[1] = hex_lookup[static_cast<uint8_t>(hex[0])];
    			h[0] = hex_lookup[static_cast<uint8_t>(hex[1])];
    		}
    
    		const uint8_t o12 = h[0] | ((h[1] & 0b11) << 4);  // [5:0]
    		const uint8_t o34 = (h[1] >> 2) | (h[2] << 2);    // [11:6]
    
    		const auto [o1, o2] = hex_to_oct_lookup_table[o12];
    		const auto [o3, o4] = hex_to_oct_lookup_table[o34];
    
    		res[3] = o1;
    		res[2] = o2;
    		res[1] = o3;
    		res[0] = o4;
    	}
    
    	const auto first_non_zero = res.find_first_not_of('0');
    	if (first_non_zero == std::string::npos) return "0";
    	return res.substr(first_non_zero);
    }
    

    优化点

    1. 用了小型查找表快速将16进制字符转成对应的十六进制数字
    2. 每12bit为一组转换,同样使用小型查找表将6bit直接转成2个八进制字符
    3. 全程不使用分支,提高执行效率
    4. 使用consteval直接在编译期生成查找表
    5. 所有查找表均足够小并且按64字节对齐,对齐L1缓存行,提升访存效率

    性能测试

    测试环境:

    • CPU: Intel i7-13620H
    • RAM: DDR5-5200 32GiB
    • OS: Debian 13 (Linux 6.12.73+deb13-amd64)

    转换100M个十六进制字符,对比上述代码和Gemini写的版本(Gemini版本由群友提供):

    Optimized version: 84.01 ms (stddev: 1.12 ms), 1190284058.7910337 hex chars/sec
    Gemini version: 1464.58 ms (stddev: 31.58 ms), 68279030.44209935 hex chars/sec
    

    上述代码可以达到1.2G/s的速度,按照4.9GHz的主频,大约每字符4周期

    正确性测试

    使用Python脚本生成从1~40位16进制数测例,并使用Xmake并行测试,全部通过

    设计妥协

    • 为了快速转换,没有错误检测,所有非法字符均映射为0

    Gemini版本代码

    static std::string hex_to_oct_gemini_version(std::string hex)
    {
    	// 补齐长度为 3 的倍数,方便分段
    	int padding = (3 - hex.length() % 3) % 3;
    	std::string prefix(padding, '0');
    	hex = prefix + hex;
    
    	std::string res = "";
    	// 每次处理 3 个 16 进制字符
    	for (int i = 0; i < hex.length(); i += 3)
    	{
    		// 1. 将 3 位 16 进制转为 12 位整数
    		int combined = 0;
    		for (int j = 0; j < 3; ++j)
    		{
    			char c = hex[i + j];
    			int v = (std::isdigit(c) ? c - '0' : std::toupper(c) - 'A' + 10);
    			combined = (combined << 4) | v;
    		}
    
    		// 2. 将 12 位整数转为 4 位 8 进制
    		std::string chunk = "";
    		for (int j = 0; j < 4; ++j)
    		{
    			chunk = std::to_string(combined & 7) + chunk;
    			combined >>= 3;
    		}
    		res += chunk;
    	}
    
    	// 最后处理前导零
    	size_t start = res.find_first_not_of('0');
    	return (start == std::string::npos) ? "0" : res.substr(start);
    }
    

    明显可见不少槽点:

    1. 非常多的三目运算(std::isdigit、std::toupper等内置函数里面可能仍然有分支),降低执行效率
    2. 用了过多的临时std::string,尽管有SSO优化,效率仍然低下
    3. 没有提前给std::string预留空间
    4. 直接用std::to_string转换过于重量级,且两个std::string相连接也会有额外的开销
    1 条回复 最后回复
    1
    • StehsaerS 离线
      StehsaerS 离线
      Stehsaer
      编写于 最后由 编辑
      #2

      改进

      经过@yyq252517提醒并测试,直接使用12bit的查找表可以更快:

      consteval std::array<std::array<char, 4>, 4096> hex_to_oct_lookup() noexcept
      {
      	std::array<std::array<char, 4>, 4096> lookup{};
      
      	for (size_t i = 0; i < 4096; ++i)
      	{
      		lookup[i][0] = '0' + ((i >> 0) & 0b111);
      		lookup[i][1] = '0' + ((i >> 3) & 0b111);
      		lookup[i][2] = '0' + ((i >> 6) & 0b111);
      		lookup[i][3] = '0' + ((i >> 9) & 0b111);
      	}
      
      	return lookup;
      }
      // ...
      std::string hex_to_oct(const std::string& hex) noexcept
      {
      	const auto hex_length = hex.length();
      	if (hex_length == 0) return "0";
      
      	const auto groups = (hex_length + 2) / 3;
      	const auto oct_digits = groups * 4;
      
      	std::string res(oct_digits, '0');
      
      	for (size_t i = hex_length % 3; i < hex_length; i += 3)
      	{
      		const size_t curr_group = (i + 2) / 3;
      
      		const uint16_t h0 = hex_lookup[std::bit_cast<uint8_t>(hex[i + 2])];  // [3:0]
      		const uint16_t h1 = hex_lookup[std::bit_cast<uint8_t>(hex[i + 1])];  // [7:4]
      		const uint16_t h2 = hex_lookup[std::bit_cast<uint8_t>(hex[i + 0])];  // [11:8]
      
      		const uint16_t idx = (h2 << 8) | (h1 << 4) | h0;  // [11:0]
      
      		const auto [o1, o2, o3, o4] = hex_to_oct_lookup_table[idx];
      
      		res[curr_group * 4 + 3] = o1;
      		res[curr_group * 4 + 2] = o2;
      		res[curr_group * 4 + 1] = o3;
      		res[curr_group * 4 + 0] = o4;
      	}
      
      	if (hex_length % 3 != 0)
      	{
      		std::array<uint16_t, 3> h{0, 0, 0};
      
      		if (hex_length % 3 == 1)
      		{
      			h[0] = hex_lookup[std::bit_cast<uint8_t>(hex[0])];
      		}
      		else  // hex_length % 3 == 2
      		{
      			h[1] = hex_lookup[std::bit_cast<uint8_t>(hex[0])];
      			h[0] = hex_lookup[std::bit_cast<uint8_t>(hex[1])];
      		}
      
      		const uint16_t idx = (h[2] << 8) | (h[1] << 4) | h[0];
      
      		const auto [o1, o2, o3, o4] = hex_to_oct_lookup_table[idx];
      
      		res[3] = o1;
      		res[2] = o2;
      		res[1] = o3;
      		res[0] = o4;
      	}
      
      	const auto first_non_zero = res.find_first_not_of('0');
      	if (first_non_zero == std::string::npos) return "0";
      	return res.substr(first_non_zero);
      }
      

      使用和此前一致的测试环境,可以得到:

      Optimized version: 72.16 ms (stddev: 0.45 ms), 1385897448.4948883 hex chars/sec
      

      比此前快了15%!此前低估了现代CPU的L1缓存系统的能力,看来它处理4096*4byte = 16KiB的查找表也不在话下

      感谢@yyq252517

      1 条回复 最后回复
      0

      • 登录

      • 没有帐号? 注册

      • 登录或注册以进行搜索。
      d2learn forums Powered by NodeBB
      • 第一个帖子
        最后一个帖子
      0
      • 版块
      • 最新
      • 标签
      • 热门
      • Online Tools
      • 用户
      • 群组