c - AVX scalar operations are much faster -


I tested the following simple function

void mul(double *a, double *b) {   (int = 0; i<n; i++) a[i] *= b[i]; } 

with very large arrays so that it is memory-bandwidth bound. The test code I use is below. When I compile with -O2 it takes 1.7 seconds. When I compile with -O2 -mavx it takes only 1.0 seconds. The non-VEX-encoded scalar operations are 70% slower! Why is this?

Here is the assembly for -O2 and -O2 -mavx: a vimdiff of <code>-O2</code> and <code>-O2 -mavx</code>

https://godbolt.org/g/w4p60f

System: i7-6700HQ @ 2.60 GHz (Skylake), 32 GB memory, Ubuntu 16.10, GCC 6.3

test code

// gcc -O2 -fopenmp test.c
// or
// gcc -O2 -mavx -fopenmp test.c
#include <string.h>
#include <stdio.h>
#include <x86intrin.h>
#include <omp.h>

#define n 1000000
#define r 1000

/* Multiplies a[i] by b[i] in place for i in [0, n). */
void mul(double *a, double *b) {
  for (int i = 0; i < n; i++) a[i] *= b[i];
}

/* Times r passes of mul() over two n-element arrays with omp_get_wtime()
 * and prints the elapsed time, the derived bandwidth and the fraction of
 * maxbw that was reached. */
int main(void) {
  double *a = (double*)_mm_malloc(sizeof *a * n, 32);
  double *b = (double*)_mm_malloc(sizeof *b * n, 32);

  // b must be initialized to get the correct bandwidth!!!
  memset(a, 1, sizeof *a * n);
  memset(b, 1, sizeof *b * n);

  double dtime;
  // 3.0 (not 3) keeps the whole expression in floating point; with integer
  // operands the divisions truncate the result to a whole number of GiB.
  // NOTE(review): the factor 3 presumably counts two loads and one store
  // per element -- confirm.
  const double mem = 3.0*sizeof(double)*n*r/1024/1024/1024;
  const double maxbw = 34.1;
  dtime = -omp_get_wtime();
  for(int i=0; i<r; i++) mul(a,b);
  dtime += omp_get_wtime();
  printf("time %.2f s, %.1f gb/s, efficency %.1f%%\n", dtime, mem/dtime, 100*mem/dtime/maxbw);

  _mm_free(a), _mm_free(b);
}

The problem is related to a dirty upper half of an AVX register after calling omp_get_wtime(). This is a problem particularly for Skylake processors.

The first time I read about this problem was here. Since then, other people have observed this problem: here and here.

Using gdb I found that omp_get_wtime() calls clock_gettime. I rewrote my code to use clock_gettime() and I see the same problem.

void fix_avx() { __asm__ __volatile__ ( "vzeroupper" : : : ); } void fix_sse() { } void (*fix)();  double get_wtime() {   struct timespec time;   clock_gettime(clock_monotonic, &time);   #ifndef  __avx__    fix();   #endif   return time.tv_sec + 1e-9*time.tv_nsec; }  void dispatch() {   fix = fix_sse;   #if defined(__intel_compiler)   if (_may_i_use_cpu_feature (_feature_avx)) fix = fix_avx;   #else   #if defined(__gnuc__) && !defined(__clang__)   __builtin_cpu_init();   #endif   if(__builtin_cpu_supports("avx")) fix = fix_avx;   #endif } 

Stepping through the code with gdb, I see that the first time clock_gettime is called it calls _dl_runtime_resolve_avx(). I believe the problem is in this function, based on this comment. This function appears to be called only the first time clock_gettime is called.

With GCC the problem goes away by using //__asm__ __volatile__ ( "vzeroupper" : : : ); after the first call to clock_gettime. With Clang, however (using clang -O2 -fno-vectorize, since Clang vectorizes even at -O2), it only goes away when it is used after every call to clock_gettime.

Here is the code I used to test this (with GCC 6.3 and Clang 3.8):

#include <string.h> #include <stdio.h> #include <x86intrin.h> #include <time.h>  void fix_avx() { __asm__ __volatile__ ( "vzeroupper" : : : ); } void fix_sse() { } void (*fix)();  double get_wtime() {   struct timespec time;   clock_gettime(clock_monotonic, &time);   #ifndef  __avx__    fix();   #endif   return time.tv_sec + 1e-9*time.tv_nsec; }  void dispatch() {   fix = fix_sse;   #if defined(__intel_compiler)   if (_may_i_use_cpu_feature (_feature_avx)) fix = fix_avx;   #else   #if defined(__gnuc__) && !defined(__clang__)   __builtin_cpu_init();   #endif   if(__builtin_cpu_supports("avx")) fix = fix_avx;   #endif }  #define n 1000000 #define r 1000  void mul(double *a, double *b) {   (int = 0; i<n; i++) a[i] *= b[i]; }  int main() {   dispatch();   const double mem = 3*sizeof(double)*n*r/1024/1024/1024;   const double maxbw = 34.1;    double *a = (double*)_mm_malloc(sizeof *a * n, 32);   double *b = (double*)_mm_malloc(sizeof *b * n, 32);    //b must initialized correct bandwidth!!!   memset(a, 1, sizeof *a * n);   memset(b, 1, sizeof *b * n);    double dtime;   //dtime = get_wtime(); // call once fix gcc   //printf("%f\n", dtime);   //fix = fix_sse;    dtime = -get_wtime();   for(int i=0; i<r; i++) mul(a,b);   dtime += get_wtime();   printf("time %.2f s, %.1f gb/s, efficency %.1f%%\n", dtime, mem/dtime, 100*mem/dtime/maxbw);    _mm_free(a), _mm_free(b); } 

If I disable lazy function call resolution with -z now (e.g. clang -O2 -fno-vectorize -z now foo.c), then Clang only needs __asm__ __volatile__ ( "vzeroupper" : : : ); after the first call to clock_gettime, just like GCC.

I expected that with -z now I would only need __asm__ __volatile__ ( "vzeroupper" : : : ); right after main(), but I still need it after the first call to clock_gettime.


Comments

Popular posts from this blog

inversion of control - Autofac named registration constructor injection -

verilog - Systemverilog dynamic casting issues -

ios - Change Storyboard View using Seague -