x86 - Horizontal trailing maximum on AVX or SSE -


i have __m256i register consisting of 16bit values , want maximum values on each trailing element zeroes.

to give example:

input:  1 0 0 3 0 0 4 5 0 0 0 0 4 3 0 2 output: 1 1 1 3 3 3 4 5 5 5 5 5 4 3 3 2 

are there efficient way of doing on avx or avx architecture? maybe log(16) = 4 iterations?

addition: solution on 128 bit numbers 8 uint_16's in appreciated also.

you can in log_2(simd_width) steps indeed. idea shift input vector x_vec 2 bytes. blend x_vec shifted vector such x_vec replaced shifted vector, @ 0 positions of x_vec. process repeated shifts of 4, 8, , 16 bytes. can uncomment printf-s in code see happens between x_vec , x_trail.

#include <stdio.h> #include <x86intrin.h> /*  gcc -o3 -wall -m64 -march=broadwell -falign-loops=16 horz_trail_max.c   */ int print_vec_short(__m256i x);  __m256i hor_tr_max(__m256i x_vec){     __m256i  0         = _mm256_setzero_si256();     __m256i  pshufb_cnst  = _mm256_set_epi64x(0x8080808080808080,0x8080808080808080,0x0f0e0f0e0f0e0f0e,0x0f0e0f0e0f0e0f0e);         __m256i  mask1        = _mm256_cmpeq_epi16(x_vec,zero);     __m256i  t1           = _mm256_slli_si256(x_vec,2);                 /* _mm256_slli_si256() doesn't cross 128b lanes          */     __m256i  t2           = _mm256_blendv_epi8(x_vec,t1,mask1);      __m256i  mask3        = _mm256_cmpeq_epi16(t2,zero);     __m256i  t3           = _mm256_slli_si256(t2,4);     __m256i  t4           = _mm256_blendv_epi8(t2,t3,mask3);      __m256i  mask5        = _mm256_cmpeq_epi16(t4,zero);     __m256i  t5           = _mm256_slli_si256(t4,8);     __m256i  t6           = _mm256_blendv_epi8(t4,t5,mask5);      __m256i  mask7        = _mm256_cmpeq_epi16(t6,zero);     __m256i  t7_0         = _mm256_shuffle_epi8(t6,pshufb_cnst);        /* _mm256_slli_si256() doesn't cross 128b boundaries. therefore need shuffle , permute here. */     __m256i  t7_1         = _mm256_permute2x128_si256(t7_0,t7_0,0x01);  /* t7_1={t6[7], t6[7],...,t6[7],  0,0,0,0, 0,0,0,0}                                                      */     __m256i  x_trail      = _mm256_blendv_epi8(t6,t7_1,mask7);  /* uncomment next few lines print values of intermediate variables */  /*     printf("\n15...0     =   15   14   13   12     11   10    9    8      7    6    5    4       3    2    1    0\n");     printf("x_vec      = ");print_vec_short(x_vec      );printf("mask1      = ");print_vec_short(mask1      );     printf("t1         = ");print_vec_short(t1         );printf("t2         = ");print_vec_short(t2         );     printf("mask3      = ");print_vec_short(mask3      );printf("t3         = ");print_vec_short(t3         );     printf("t4         = ");print_vec_short(t4         );printf("mask5      = ");print_vec_short(mask5      );     printf("t5         = ");print_vec_short(t5         );printf("t6         = ");print_vec_short(t6         );     printf("mask7      = ");print_vec_short(mask7      );printf("t7_0       = ");print_vec_short(t7_0       );     printf("t7_1       = ");print_vec_short(t7_1       );printf("x_trail    = ");print_vec_short(x_trail    );printf("\n"); */     return x_trail; }   int hor_tr_max_n(short int * x_in, short int * x_out, int n){     __m256i  minus_1       = _mm256_set1_epi8(-1);     __m256i  0          = _mm256_setzero_si256();     __m256i  pshufb_cnst   = _mm256_set_epi64x(0x8080808080808080,0x8080808080808080,0x0f0e0f0e0f0e0f0e,0x0f0e0f0e0f0e0f0e);        int      indx_last_nz  = 0;     (int i=0;i<n;i=i+16){         __m256i  x_vec        = _mm256_load_si256((__m256i*)&x_in[i]);          __m256i  mask1        = _mm256_cmpeq_epi16(x_vec,zero);         __m256i  t1           = _mm256_slli_si256(x_vec,2);         __m256i  t2           = _mm256_blendv_epi8(x_vec,t1,mask1);         __m256i  mask3        = _mm256_cmpeq_epi16(t2,zero);         __m256i  t3           = _mm256_slli_si256(t2,4);         __m256i  t4           = _mm256_blendv_epi8(t2,t3,mask3);         __m256i  mask5        = _mm256_cmpeq_epi16(t4,zero);         __m256i  t5           = _mm256_slli_si256(t4,8);         __m256i  t6           = _mm256_blendv_epi8(t4,t5,mask5);         __m256i  mask7        = _mm256_cmpeq_epi16(t6,zero);         __m256i  t7_0         = _mm256_shuffle_epi8(t6,pshufb_cnst);         __m256i  t7_1         = _mm256_permute2x128_si256(t7_0,t7_0,0x01);         __m256i  x_trail      = _mm256_blendv_epi8(t6,t7_1,mask7);          __m256i  isnonzero    = _mm256_xor_si256(mask1,minus_1);                                                                                            int      mvmsk_nonz   = _mm256_movemask_epi8(isnonzero);                                                                                            int      lz_x_vec     = _lzcnt_u32( mvmsk_nonz ) >>1;                                                                                               __m256i  x_last_nz    = _mm256_broadcastw_epi16(_mm_load_si128((__m128i*)&x_in[indx_last_nz]));                                                                                 indx_last_nz = mvmsk_nonz ? (i+15-lz_x_vec) : indx_last_nz;                                                                                 __m256i  x_tr_is_zero = _mm256_cmpeq_epi16(x_trail,zero);         __m256i  x_trail_upd  = _mm256_blendv_epi8(x_trail,x_last_nz,x_tr_is_zero);                                     _mm256_store_si256((__m256i*)&x_out[i],x_trail_upd);     }     return 0; }   int main() { #define test 0  #if test == 0     printf("test 0: test functionality\n");     short   x[16] = {1, 0, 0, 3, 0, 0, 4, 5, 0, 0, 0, 0, 4, 3, 0, 2}; //    short   x[16] = {0, 0, 0, 3, 0, 0, 4, 5, 0, 0, 0, 0, 4, 3, 0, 2}; //    short   x[16] = {1, 0, 0, 3, 0, 0, 4000, 0, 0, 0, 10, 0, 0, 3, 0, 2}; //    short   x[16] = {1100, 0, 0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 5000, 0, 0, 0}; //    short   x[16] = {1100, 0, 0, 0, 0, 0, 0, 8888,    0, 0, 0, 0, 5000, 0, 0, 0};      printf("\n15...0     =   15   14   13   12     11   10    9    8      7    6    5    4       3    2    1    0\n");     __m256i  x_vec        = _mm256_loadu_si256((__m256i*)x);     printf("x_vec      = ");print_vec_short(x_vec        );     __m256i  x_trail      = hor_tr_max(x_vec);     printf("x_trail    = ");print_vec_short(x_trail      );  #elif test == 1  || test == 2      int i, i_o, k;     int n = 8000;     int d = 50;     short int *x_in;     short int *x_out;     x_in  = _mm_malloc(n*sizeof(short int),32);     x_out = _mm_malloc(n*sizeof(short int),32);     int j = 73659343;                            /* generate pseudo random array a.                                               */     (i = 0;i < n;i++){        j = j*653+1;                                     k = (j & 0x3ff00)>>8;                     /* k pseudo random number between 0 , 1023                                       */        if (k < d){                               /* small d, x_in has many zeros, try e.g. d=6, d=60 , d=600                     */                    x_in[i] = (j&0xffe)+1-2048;            /* set x_in[i] nonzero.                                                              */        }else{           x_in[i] = 0;        }     } #endif     #if test == 1     printf("test 1: test performance short int arrays of size n. use:  perf stat -d ./a.out \n");     (i_o=0;i_o<400000;i_o++){                              /* compiler should not interchange inner , outer loop after function inlining, check compiler output (-s). */         hor_tr_max_n(x_in,x_out,n);     }          #elif test == 2     printf("test 2: test performance of unrolled scalar loop short int arrays of size n. use:  perf stat -d ./a.out\n");     short int prev_x  = 0;     (i_o=0;i_o<400000;i_o++){                              /* compiler should not interchange inner , outer loop, check compiler output (-s). */         (i=0;i<n;i=i+4){             short int x_in_i0 = x_in[i];                                                                  short int x_in_i1 = x_in[i+1];                                                                short int x_in_i2 = x_in[i+2];                                                                short int x_in_i3 = x_in[i+3];                                                                          prev_x  = (x_in_i0)?(x_in_i0):(prev_x);  x_out[i]   = prev_x;                                  prev_x  = (x_in_i1)?(x_in_i1):(prev_x);  x_out[i+1] = prev_x;                                  prev_x  = (x_in_i2)?(x_in_i2):(prev_x);  x_out[i+2] = prev_x;                                  prev_x  = (x_in_i3)?(x_in_i3):(prev_x);  x_out[i+3] = prev_x;                    }     }          #elif test == 3     printf("test 3: estimate approximately latency , throughput of hor_tr_max with: perf stat -d ./a.out \n");     int i;     short   x0[16] = {1, 0, 0, 3, 0, 0, 4, 5, 0, 0, 0, 0, 4, 3, 0, 2};     short   x1[16] = {0, 0, 0, 3, 0, 12, 4, 5, 0, 0, 0, 0, 4, 3, 0, 2};     short   x2[16] = {1, 0, 0, 3, 0, 0, 4, 5, 0, 0, 10, 0, 4, 3, 0, 2};     short   x3[16] = {110, 0, 0, 1113, 0, 0, 4, 5, 0, 0, 0, 0, 4000, 3, 0, 2};     short   x4[16] = {110, 4, 0, 1113, 0, 0, 4, 5, 0, 7, 0, 0, 4000, 3, 0, 2};      __m256i  x_vec0   = _mm256_loadu_si256((__m256i*)x0); printf("x_vec0 = ");print_vec_short(x_vec0); __m256i x_trail0 = hor_tr_max(x_vec0);                                       __m256i  x_vec1   = _mm256_loadu_si256((__m256i*)x1); printf("x_vec1 = ");print_vec_short(x_vec1); __m256i x_trail1 = hor_tr_max(x_vec1);                                     __m256i  x_vec2   = _mm256_loadu_si256((__m256i*)x2); printf("x_vec2 = ");print_vec_short(x_vec2); __m256i x_trail2 = hor_tr_max(x_vec2);                                     __m256i  x_vec3   = _mm256_loadu_si256((__m256i*)x3); printf("x_vec3 = ");print_vec_short(x_vec3); __m256i x_trail3 = hor_tr_max(x_vec3);                                       __m256i  x_vec4   = _mm256_loadu_si256((__m256i*)x4); printf("x_vec4 = ");print_vec_short(x_vec4); __m256i x_trail4 = hor_tr_max(x_vec4);                                        for(i=0;i<100000000;i++){              x_trail0      = hor_tr_max(x_trail0);    /* use line latency testing, uncomment next 4 lines throughput testing */ //             x_trail1      = hor_tr_max(x_trail1); //             x_trail2      = hor_tr_max(x_trail2); //             x_trail3      = hor_tr_max(x_trail3); //             x_trail4      = hor_tr_max(x_trail4);              }     printf("x_trail0 = ");print_vec_short(x_trail0 );     printf("x_trail1 = ");print_vec_short(x_trail1 );     printf("x_trail2 = ");print_vec_short(x_trail2 );     printf("x_trail3 = ");print_vec_short(x_trail3 );     printf("x_trail4 = ");print_vec_short(x_trail4 ); #endif     #if test == 1  || test == 2       (i=0;i<400;i++){         printf("%6i %6hi  %6hi\n",i,x_in[i],x_out[i]);     } #endif         return 0; }  int print_vec_short(__m256i x){     short int v[16];     _mm256_storeu_si256((__m256i *)v,x);     printf("%4hi %4hi %4hi %4hi | %4hi %4hi %4hi %4hi | %4hi %4hi %4hi %4hi  | %4hi %4hi %4hi %4hi\n",            v[15],v[14],v[13],v[12],v[11],v[10],v[9],v[8],v[7],v[6],v[5],v[4],v[3],v[2],v[1],v[0]);     return 0; } 

the output is:

15...0     =   15   14   13   12     11   10    9    8      7    6    5    4       3    2    1    0 x_vec      =    2    0    3    4 |    0    0    0    0 |    5    4    0    0  |    3    0    0    1 x_trail    =    2    3    3    4 |    5    5    5    5 |    5    4    3    3  |    3    1    1    1 


this function hor_tr_max has latency , throughput of approximately 14.2 , 6.4 cycles (intel skylake core i5-6500). note standard unrolled scalar loop such as:

short int prev_x  = 0; (i=0;i<n;i=i+4){     short int x_in_i0 = x_in[i];                                                          short int x_in_i1 = x_in[i+1];                                                        short int x_in_i2 = x_in[i+2];                                                        short int x_in_i3 = x_in[i+3];                                                                  prev_x  = (x_in_i0)?(x_in_i0):(prev_x);  x_out[i]   = prev_x;                          prev_x  = (x_in_i1)?(x_in_i1):(prev_x);  x_out[i+1] = prev_x;                          prev_x  = (x_in_i2)?(x_in_i2):(prev_x);  x_out[i+2] = prev_x;                          prev_x  = (x_in_i3)?(x_in_i3):(prev_x);  x_out[i+3] = prev_x;            } 

takes 1.26 cycle per short int, 20.2 cycles per 16 short int-s. so, vectorization profitable here.


horizontal trailing maximum arrays of size n

we can use hor_tr_max compute horizontal trailing maximum of array of size n, n larger 16. however, output of step i needed compute next step. loop carried dependence results in low performance of code. function hor_tr_max_n, in code above, implements different method makes dependency chain shorter, beneficial due out of order scheduling.

function hor_tr_max_n costs 12.2 cycles per 16 short ints, 40 percent less unrolled scalar loop.

it forthcoming skylake-sp processor, vectorization of 'horizontal trailing maximum' more profitable, due wider vector registers.


Comments

Popular posts from this blog

inversion of control - Autofac named registration constructor injection -

verilog - Systemverilog dynamic casting issues -

ios - Change Storyboard View using Seague -