我的 C 代码中有一个从 short 到 float 的类型转换,它成了整个程序的瓶颈.
这段代码对硬件设备缓冲区中的数据进行转换,该缓冲区本身存放的是 short 类型的数据,代表来自一台高级光子计数器的输入.
// Scalar baseline: scale every 16-bit sample by 1/value and store it as float.
// The reciprocal is computed once so the loop body multiplies instead of divides.
float factor = 1.0f / value;
for (int i = 0; i < W*H; i++)  // 25% of time is spent doing this
{
    int pixel = source[i];            // ushort -> int
    destination[i] = pixel * factor;  // int*float -> float
}
一些细节
>值应从0到2 ^ 16-1,它表示高灵敏度相机的像素值
>我在配备i7处理器(i7 960,SSE 4.2和4.1)的多核x86机器上.
>源按8字节边界对齐(硬件设备的要求)
> W * H总是可被8整除,大部分时间W和H可被8整除
这让我感到难过,有什么我可以做的吗?
我正在使用 Visual Studio 2012 …
解决方法
这是一个基本的SSE4.1实现:
// SSE4.1 version: converts 8 samples per iteration.
// Requires source and destination to be 16-byte aligned (aligned load/store
// intrinsics are used) and W*H to be a multiple of 8.
__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < W*H; i += 8)
{
    //  Load 8 16-bit ushorts.
    //  vi = {a,b,c,d,e,f,g,h}
    __m128i vi = _mm_load_si128(reinterpret_cast<const __m128i*>(source + i));

    //  Convert to 32-bit integers (zero-extension, shown as 16-bit lanes)
    //  vi0 = {a,0,b,0,c,0,d,0}
    //  vi1 = {e,0,f,0,g,0,h,0}
    __m128i vi0 = _mm_cvtepu16_epi32(vi);
    // unpackhi_epi64(vi,vi) moves the upper four shorts into the low half so
    // that the same zero-extending conversion can be applied to them.
    __m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi, vi));

    //  Convert to float
    __m128 vf0 = _mm_cvtepi32_ps(vi0);
    __m128 vf1 = _mm_cvtepi32_ps(vi1);

    //  Multiply
    vf0 = _mm_mul_ps(vf0, factor);
    vf1 = _mm_mul_ps(vf1, factor);

    //  Store
    _mm_store_ps(destination + i + 0, vf0);
    _mm_store_ps(destination + i + 4, vf1);
}
这假定:
>源和目标都与16个字节对齐.
> W * H是8的倍数.
通过进一步展开此循环可以做得更好. (见下文)
这里的想法如下:
>将8个 short 加载到单个SSE寄存器中.
>将寄存器拆分为两个:一个存放低4个 short,另一个存放高4个 short.
>将两个寄存器零扩展为32位整数.
>将它们转换为浮点数.
>乘以因子.
>将它们存放到目的地.
编辑:
我已经有一段时间没做过这种类型的优化了,所以我干脆动手把循环展开试了一下.
酷睿i7 920 @ 3.5 GHz
Visual Studio 2012 – Release x64:
Original Loop      : 4.374 seconds
Vectorize no unroll: 1.665
Vectorize unroll 2 : 1.416
进一步展开导致收益递减.
这是测试代码:
#include <smmintrin.h>
#include <time.h>
#include <stdlib.h>
#include <iostream>
#include <malloc.h>

// Benchmark comparing three ways of scaling 16-bit samples into floats:
// a plain scalar loop, an SSE4.1 loop handling 8 samples per iteration, and
// the same SSE4.1 loop unrolled to 16 samples per iteration.
//
// NOTE(review): the buffers are declared as (signed) short, so the scalar loop
// sign-extends while the SSE loops zero-extend. The two agree only for
// non-negative samples, which is all this benchmark generates (0..size-1).

/// Scalar reference: destination[i] = source[i] * (1 / value) for i in [0, size).
void default_loop(float *destination, const short *source, float value, int size) {
    const float factor = 1.0f / value;
    for (int i = 0; i < size; i++) {
        const int sample = source[i];
        destination[i] = sample * factor;
    }
}

/// SSE4.1 version, 8 samples per iteration.
/// Both pointers must be 16-byte aligned and size must be a multiple of 8.
void vectorize8_unroll1(float *destination, const short *source, float value, int size) {
    const __m128 factor = _mm_set1_ps(1.0f / value);
    for (int i = 0; i < size; i += 8) {
        //  Load 8 16-bit ushorts.
        const __m128i vi = _mm_load_si128(reinterpret_cast<const __m128i*>(source + i));

        //  Convert to 32-bit integers (low four lanes, then high four lanes).
        const __m128i vi0 = _mm_cvtepu16_epi32(vi);
        const __m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi, vi));

        //  Convert to float
        __m128 vf0 = _mm_cvtepi32_ps(vi0);
        __m128 vf1 = _mm_cvtepi32_ps(vi1);

        //  Multiply
        vf0 = _mm_mul_ps(vf0, factor);
        vf1 = _mm_mul_ps(vf1, factor);

        //  Store
        _mm_store_ps(destination + i + 0, vf0);
        _mm_store_ps(destination + i + 4, vf1);
    }
}

/// SSE4.1 version unrolled twice, 16 samples per iteration.
/// Both pointers must be 16-byte aligned and size must be a multiple of 16.
void vectorize8_unroll2(float *destination, const short *source, float value, int size) {
    const __m128 factor = _mm_set1_ps(1.0f / value);
    for (int i = 0; i < size; i += 16) {
        __m128i a0 = _mm_load_si128(reinterpret_cast<const __m128i*>(source + i + 0));
        __m128i a1 = _mm_load_si128(reinterpret_cast<const __m128i*>(source + i + 8));

        //  Split into two registers
        __m128i b0 = _mm_unpackhi_epi64(a0, a0);
        __m128i b1 = _mm_unpackhi_epi64(a1, a1);

        //  Convert to 32-bit integers
        a0 = _mm_cvtepu16_epi32(a0);
        b0 = _mm_cvtepu16_epi32(b0);
        a1 = _mm_cvtepu16_epi32(a1);
        b1 = _mm_cvtepu16_epi32(b1);

        //  Convert to float
        __m128 c0 = _mm_cvtepi32_ps(a0);
        __m128 d0 = _mm_cvtepi32_ps(b0);
        __m128 c1 = _mm_cvtepi32_ps(a1);
        __m128 d1 = _mm_cvtepi32_ps(b1);

        //  Multiply
        c0 = _mm_mul_ps(c0, factor);
        d0 = _mm_mul_ps(d0, factor);
        c1 = _mm_mul_ps(c1, factor);
        d1 = _mm_mul_ps(d1, factor);

        //  Store
        _mm_store_ps(destination + i + 0, c0);
        _mm_store_ps(destination + i + 4, d0);
        _mm_store_ps(destination + i + 8, c1);
        _mm_store_ps(destination + i + 12, d1);
    }
}

/// Prints the sum of the output buffer. Besides letting the three variants be
/// compared, reading the results keeps the compiler from discarding the work.
void print_sum(const float *destination, int size) {
    float sum = 0;
    for (int i = 0; i < size; i++) {
        sum += destination[i];
    }
    std::cout << sum << std::endl;
}

int main() {
    const int size = 8000;  // multiple of 16, as required by vectorize8_unroll2

    short *source      = static_cast<short*>(_mm_malloc(size * sizeof(short), 16));
    float *destination = static_cast<float*>(_mm_malloc(size * sizeof(float), 16));
    if (source == nullptr || destination == nullptr) {
        std::cout << "allocation failed" << std::endl;
        _mm_free(source);
        _mm_free(destination);
        return 1;
    }

    for (int i = 0; i < size; i++) {
        source[i] = static_cast<short>(i);
    }

    const float value = 1.1f;

    const int iterations = 1000000;
    clock_t start;

    //  Default Loop
    start = clock();
    for (int it = 0; it < iterations; it++) {
        default_loop(destination, source, value, size);
    }
    std::cout << (double)(clock() - start) / CLOCKS_PER_SEC << std::endl;
    print_sum(destination, size);

    //  Vectorize 8, no unroll
    start = clock();
    for (int it = 0; it < iterations; it++) {
        vectorize8_unroll1(destination, source, value, size);
    }
    std::cout << (double)(clock() - start) / CLOCKS_PER_SEC << std::endl;
    print_sum(destination, size);

    //  Vectorize 8, unroll 2
    start = clock();
    for (int it = 0; it < iterations; it++) {
        vectorize8_unroll2(destination, source, value, size);
    }
    std::cout << (double)(clock() - start) / CLOCKS_PER_SEC << std::endl;
    print_sum(destination, size);

    _mm_free(source);
    _mm_free(destination);

    system("pause");
}