C - AVX scalar operations are much faster
I tested the following simple function:
void mul(double *a, double *b) { for (int i = 0; i < N; i++) a[i] *= b[i]; } with large arrays so that it is memory-bandwidth bound. The test code I use is below. When I compile with -O2 it takes 1.7 seconds. When I compile with -O2 -mavx it takes 1.0 seconds. The non-VEX-encoded scalar operations are 70% slower! Why is this?
Here is the assembly for -O2 and for -O2 -mavx.
System: i7-6700HQ @ 2.60 GHz (Skylake), 32 GB memory, Ubuntu 16.10, GCC 6.3.
Test code:
// gcc -O2 -fopenmp test.c
// or
// gcc -O2 -mavx -fopenmp test.c
#include <string.h>
#include <stdio.h>
#include <x86intrin.h>
#include <omp.h>

#define N 1000000 /* number of doubles in each array */
#define R 1000    /* number of times mul() is repeated */

/* Multiply the first N elements of a in place by the matching elements of b. */
void mul(double *a, double *b)
{
    for (int i = 0; i < N; i++)
        a[i] *= b[i];
}

/*
 * Time R calls of mul() on two 32-byte-aligned arrays with omp_get_wtime()
 * and print the elapsed time, the resulting bandwidth, and that bandwidth as
 * a percentage of maxbw. Returns 0 on success, 1 if an allocation fails.
 */
int main(void)
{
    double *a = _mm_malloc(sizeof *a * N, 32);
    double *b = _mm_malloc(sizeof *b * N, 32);
    if (!a || !b) {
        fprintf(stderr, "allocation failed\n");
        if (a)
            _mm_free(a);
        if (b)
            _mm_free(b);
        return 1;
    }

    // b must be initialized to get the correct bandwidth!!!
    memset(a, 1, sizeof *a * N);
    memset(b, 1, sizeof *b * N);

    /*
     * Each element costs three double-sized transfers (read a, read b,
     * write a). The leading 3.0 keeps the whole expression in floating point;
     * in integer arithmetic the divisions would truncate the result.
     */
    const double mem = 3.0 * sizeof(double) * N * R / 1024 / 1024 / 1024;
    const double maxbw = 34.1; /* reference bandwidth treated as 100% */

    double dtime = -omp_get_wtime();
    for (int i = 0; i < R; i++)
        mul(a, b);
    dtime += omp_get_wtime();

    printf("time %.2f s, %.1f gb/s, efficency %.1f%%\n", dtime, mem/dtime, 100*mem/dtime/maxbw);

    _mm_free(a);
    _mm_free(b);
    return 0;
}
The problem is related to a dirty upper half of an AVX register after calling omp_get_wtime(). This is a problem particularly for Skylake processors.
The first time I read about this problem was here. Since then, other people have observed this problem: here and here.
Using gdb I found that omp_get_wtime() calls clock_gettime. I rewrote my code to use clock_gettime() and I see the same problem.
void fix_avx() { __asm__ __volatile__ ( "vzeroupper" : : : ); } void fix_sse() { } void (*fix)(); double get_wtime() { struct timespec time; clock_gettime(clock_monotonic, &time); #ifndef __avx__ fix(); #endif return time.tv_sec + 1e-9*time.tv_nsec; } void dispatch() { fix = fix_sse; #if defined(__intel_compiler) if (_may_i_use_cpu_feature (_feature_avx)) fix = fix_avx; #else #if defined(__gnuc__) && !defined(__clang__) __builtin_cpu_init(); #endif if(__builtin_cpu_supports("avx")) fix = fix_avx; #endif } stepping through code gdb see first time clock_gettime called calls _dl_runtime_resolve_avx(). believe problem in function based on this comment. function appears called first time clock_gettime called.
With GCC the problem goes away by using __asm__ __volatile__ ( "vzeroupper" : : : ); after the first call to clock_gettime. With Clang (using clang -O2 -fno-vectorize, since Clang vectorizes at -O2) it goes away by using it after every call to clock_gettime.
Here is the code I used to test this (with GCC 6.3 and Clang 3.8):
#include <string.h> #include <stdio.h> #include <x86intrin.h> #include <time.h> void fix_avx() { __asm__ __volatile__ ( "vzeroupper" : : : ); } void fix_sse() { } void (*fix)(); double get_wtime() { struct timespec time; clock_gettime(clock_monotonic, &time); #ifndef __avx__ fix(); #endif return time.tv_sec + 1e-9*time.tv_nsec; } void dispatch() { fix = fix_sse; #if defined(__intel_compiler) if (_may_i_use_cpu_feature (_feature_avx)) fix = fix_avx; #else #if defined(__gnuc__) && !defined(__clang__) __builtin_cpu_init(); #endif if(__builtin_cpu_supports("avx")) fix = fix_avx; #endif } #define n 1000000 #define r 1000 void mul(double *a, double *b) { (int = 0; i<n; i++) a[i] *= b[i]; } int main() { dispatch(); const double mem = 3*sizeof(double)*n*r/1024/1024/1024; const double maxbw = 34.1; double *a = (double*)_mm_malloc(sizeof *a * n, 32); double *b = (double*)_mm_malloc(sizeof *b * n, 32); //b must initialized correct bandwidth!!! memset(a, 1, sizeof *a * n); memset(b, 1, sizeof *b * n); double dtime; //dtime = get_wtime(); // call once fix gcc //printf("%f\n", dtime); //fix = fix_sse; dtime = -get_wtime(); for(int i=0; i<r; i++) mul(a,b); dtime += get_wtime(); printf("time %.2f s, %.1f gb/s, efficency %.1f%%\n", dtime, mem/dtime, 100*mem/dtime/maxbw); _mm_free(a), _mm_free(b); } if disable lazy function call resolution -z now (e.g. clang -o2 -fno-vectorize -z foo.c) clang needs __asm__ __volatile__ ( "vzeroupper" : : : ); after first call clock_gettime gcc.
I expected that with -z now I would only need __asm__ __volatile__ ( "vzeroupper" : : : ); right after main(), but I still need it after the first call to clock_gettime.
Comments
Post a Comment