The cost of AVX

To complement the information in my comparison of Haswell with Sandybridge I have performed a detailed study of clock rate and absorbed power using a recent version of turbostat that also shows RAPL readings.

I have used the following simple program that exploits gcc function-versioning to switch between generic (sse2) and native (avx or avx2) code. The switch happens by touching and removing a file.

// Build-time switches for the kernel below:
//  - uncomment the DIV define to accumulate z[i] + x[i]/y[i]
//    instead of z[i] + x[i]*y[i];
//  - select the floating-point element type through the Float alias.
// #define DIV
using Float = float;
// using Float = double;

namespace {
/// Accumulates z[k] + x[k]*y[k] (z[k] + x[k]/y[k] when DIV is defined)
/// over the first n elements of the three arrays.
///
/// @param x  first operand array, read at indices 0 .. n-1
/// @param y  second operand array (multiplier, or divisor with DIV)
/// @param z  addend array, read at indices 0 .. n-1
/// @param n  number of elements to process; 0 yields 0
/// @return   the accumulated total as a Float
inline Float _sum0(Float const* x, Float const* y, Float const* z, int n) {
  Float acc = 0;
  // Tell GCC it may ignore assumed loop-carried dependences when
  // vectorizing the loop that follows.
#pragma GCC ivdep
  for (int k = 0; k != n; ++k) {
#ifdef DIV
    Float const term = z[k] + x[k] / y[k];
#else
    Float const term = z[k] + x[k] * y[k];
#endif
    acc += term;
  }
  return acc;
}
}  // namespace

/// Generic version of the multiversioned sum(): target "default" is the
/// function-versioning fallback, built without any extra architecture option.
/// Forwards its arguments unchanged to _sum0 and returns that result.
__attribute__ ((target ("default")))
Float sum(Float const * x, Float const * y, Float const * z, int n) {
  Float const total = _sum0(x, y, z, n);
  return total;
}


/// Haswell version of the multiversioned sum(): the same kernel as the
/// other versions, compiled by GCC with the arch=haswell target.
/// Forwards its arguments unchanged to _sum0 and returns that result.
///
/// @param x,y,z  input arrays, each read at indices 0 .. n-1 by _sum0
/// @param n      number of elements to process
/// @return       the value computed by _sum0
Float  __attribute__ ((__target__ ("arch=haswell")))
sum(Float const *  x, Float const *  y, Float const *  z, int n) {
  // The first parameter was mistyped as "xm" with the comma missing,
  // which left x undeclared and the whole definition ill-formed.
  return _sum0(x,y,z,n);
}

/// Sandy Bridge version of the multiversioned sum(): the same kernel as the
/// other versions, compiled by GCC with the arch=sandybridge target.
/// Forwards its arguments unchanged to _sum0 and returns that result.
__attribute__ ((__target__ ("arch=sandybridge")))
Float sum(Float const * x, Float const * y, Float const * z, int n) {
  Float const total = _sum0(x, y, z, n);
  return total;
}



/// Nehalem version of the multiversioned sum(): the same kernel as the
/// other versions, compiled by GCC with the arch=nehalem target.
/// Forwards its arguments unchanged to _sum0 and returns that result.
__attribute__ ((__target__ ("arch=nehalem")))
Float sum(Float const * x, Float const * y, Float const * z, int n) {
  Float const total = _sum0(x, y, z, n);
  return total;
}

#include<cstdlib>
#include <fstream>
#include <iostream>

int main(int npar, char * par[]) {

  alignas(32) Float x[10240],y[10240],z[10240];
  for (int i=0; i!=10240; ++i)
    x[i]=y[i]=z[i]=(1+i)*0.1;

while (1) {
    {
    std::ifstream in("endAvx");
    if (in) return 0;
    in.close();
    }
    std::ifstream in("goAvx");
    if(in) {
     for (int i=0; i<1000; ++i)
      s += sum(x,y,z,10240);
    } else {
     for (int i=0; i<1000; ++i)
      s += _sum0(x,y,z,10240);
    }
    in.close();
  }

  return 0;
}

compiled with c++ -std=c++1y -Ofast avxSpeed.cpp and run as taskset -c 1 ./a.out &

I run turbostat -i 1 in a different window (as root).

on Haswell I got

   Core     CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz     SMI  CPU%c1  CPU%c3  CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt 
      -       -     487   12.51    3891    3492       0   12.54    0.02    0.00   74.93      57      57    0.00    0.00    0.00    0.00   27.58   18.17    0.00
      0       0       1    0.02    3878    3492       0    0.06    0.00    0.00   99.92      46      57    0.00    0.00    0.00    0.00   27.58   18.17    0.00
      0       4       0    0.01    3888    3492       0    0.07
      1       1    3891   99.99    3891    3492       0    0.01    0.00    0.00    0.00      57
then
touch goAvx
   Core     CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz     SMI  CPU%c1  CPU%c3  CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt 
      -       -     437   12.51    3492    3492       0   12.53    0.03    0.02   74.92      60      60    0.00    0.00    0.00    0.00   25.83   19.66    0.00
      0       0       0    0.01    3491    3492       0    0.03    0.00    0.06   99.90      49      60    0.00    0.00    0.00    0.00   25.83   19.66    0.00
      0       4       0    0.01    3500    3492       0    0.04
      1       1    3492  100.00    3492    3492       0    0.00    0.00    0.00    0.00      60
then
rm goAvx
and
   Core     CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz     SMI  CPU%c1  CPU%c3  CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt 
      -       -     487   12.51    3891    3492       0   12.54    0.01    0.03   74.90      61      61    0.00    0.00    0.00    0.00   27.71   18.30    0.00
      0       0       1    0.02    3864    3492       0    0.07    0.00    0.06   99.85      48      61    0.00    0.00    0.00    0.00   27.71   18.30    0.00
      0       4       1    0.02    3873    3492       0    0.07
      1       1    3891   99.99    3891    3492       0    0.01    0.00    0.00    0.00      61
touch  goAvx
   Core     CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz     SMI  CPU%c1  CPU%c3  CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt 
      -       -     437   12.51    3492    3492       0   12.53    0.05    0.01   74.90      62      62    0.00    0.00    0.00    0.00   25.64   19.55    0.00
      0       0       0    0.01    3493    3492       0    0.05    0.07    0.02   99.85      52      62    0.00    0.00    0.00    0.00   25.64   19.55    0.00
      0       4       0    0.01    3497    3492       0    0.05
      1       1    3492  100.00    3492    3492       0    0.00    0.00    0.00    0.00      62
rm goAvx
   Core     CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz     SMI  CPU%c1  CPU%c3  CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt 
      -       -     487   12.51    3891    3492       0   12.56    0.05    0.00   74.88      62      62    0.00    0.00    0.00    0.00   27.59   18.29    0.00
      0       0       1    0.02    3842    3492       0    0.09    0.05    0.00   99.84      50      62    0.00    0.00    0.00    0.00   27.59   18.29    0.00
      0       4       1    0.02    3859    3492       0    0.09
      1       1    3890   99.98    3891    3492       0    0.02    0.00    0.00    0.00      62

on SandyBridge

   Core     CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz     SMI  CPU%c1  CPU%c3  CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt 
      -       -     474   12.50    3790    3392       0   12.57    0.00   74.92    0.00      56      56    0.00    0.00    0.00    0.00   24.68   20.93    0.30
      0       0       1    0.02    3682    3392       0    0.08    0.00   99.90    0.00      50      56    0.00    0.00    0.00    0.00   24.68   20.93    0.30
      0       4       0    0.01    3646    3392       0    0.09
      1       1    3789   99.97    3790    3392       0    0.03    0.00    0.00    0.00      56
touch goAvx
   Core     CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz     SMI  CPU%c1  CPU%c3  CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt 
      -       -     474   12.51    3790    3392       0   12.55    0.03   74.91    0.00      60      60    0.00    0.00    0.00    0.00   28.59   24.84    0.30
      0       0       1    0.02    3673    3392       0    0.07    0.07   99.84    0.00      50      60    0.00    0.00    0.00    0.00   28.59   24.84    0.30
      0       4       0    0.01    3667    3392       0    0.08
      1       1    3789   99.96    3790    3392       0    0.04    0.00    0.00    0.00      60
rm goAvx
   Core     CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz     SMI  CPU%c1  CPU%c3  CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt 
      -       -     474   12.51    3791    3392       0   12.54    0.01   74.94    0.00      58      58    0.00    0.00    0.00    0.00   24.85   21.08    0.30
      0       0       1    0.02    3692    3392       0    0.05    0.05   99.88    0.00      51      58    0.00    0.00    0.00    0.00   24.85   21.08    0.30
      0       4       0    0.01    3695    3392       0    0.07
      1       1    3790   99.97    3791    3392       0    0.03    0.00    0.00    0.00      58

Running Scimark2

first test: running on core 0 the little test above and on core 1 scimark compiled for nehalem architecture

 taskset -c 0 ./avxSpeed &
taskset -c 1 ./scimark2_410_nhl 
       
Using       2.00 seconds min time per kenel.
running permutation 0
Composite Score:         2393.74
FFT             Mflops:  1909.24    (N=1024)
SOR             Mflops:  2073.59    (100 x 100)
MonteCarlo:     Mflops:   912.46
Sparse matmult  Mflops:  2478.81    (N=1000, nz=5000)
LU              Mflops:  4594.60    (M=100, N=100)
|   1909.24|   2073.59|    912.46|   2478.81|   4594.60||
[innocent@vinavx2 scimark2]$ touch goAvx
[innocent@vinavx2 scimark2]$ taskset -c 1 ./scimark2_410_nhl

Using       2.00 seconds min time per kenel.
running permutation 0
Composite Score:         2166.63
FFT             Mflops:  1727.07    (N=1024)
SOR             Mflops:  1873.70    (100 x 100)
MonteCarlo:     Mflops:   827.83
Sparse matmult  Mflops:  2251.50    (N=1000, nz=5000)
LU              Mflops:  4153.05    (M=100, N=100)
|   1727.07|   1873.70|    827.83|   2251.50|   4153.05||
conclusion too obvious to comment. (I will not report the turbostat output, which is similar to what is shown below...)


second test: running 4 scimark2 jobs in parallel: my version of scimark can run the micro-benchmarks in a different order: here I make sure that each job runs a different sequence...

source run4 ./scimark2_410_nhl

    Core     CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz     SMI  CPU%c1  CPU%c3  CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt 
       -       -    1847   50.02    3692    3492       0   49.98    0.00    0.00    0.00      54      54    0.00    0.00    0.00    0.00   54.90   46.26    0.00
       0       0    3692  100.00    3692    3492       0    0.00    0.00    0.00    0.00      54      54    0.00    0.00    0.00    0.00   54.90   46.26    0.00
       0       4       0    0.01    3694    3492       0   99.99
       1       1    3692  100.00    3692    3492       0    0.00    0.00    0.00    0.00      49
       1       5       1    0.02    3691    3492       0   99.98
       2       2    3692  100.00    3692    3492       0    0.00    0.00    0.00    0.00      52
       2       6       6    0.15    3691    3492       0   99.85
       3       3    3692  100.00    3692    3492       0    0.00    0.00    0.00    0.00      47
       3       7       0    0.01    3692    3492       0   99.99

    Core     CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz     SMI  CPU%c1  CPU%c3  CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt 
       -       -    1847   50.03    3691    3492       0   49.97    0.00    0.00    0.00      69      69    0.00    0.00    0.00    0.00   62.26   53.47    0.00
       0       0    3691   99.99    3691    3492       0    0.01    0.00    0.00    0.00      69      69    0.00    0.00    0.00    0.00   62.26   53.47    0.00
       0       4       6    0.16    3687    3492       0   99.84
       1       1    3691   99.99    3691    3492       0    0.01    0.00    0.00    0.00      67
       1       5       2    0.06    3660    3492       0   99.94
       2       2    3691   99.99    3691    3492       0    0.01    0.00    0.00    0.00      64
       2       6       2    0.05    3671    3492       0   99.95
       3       3    3691   99.99    3692    3492       0    0.01    0.00    0.00    0.00      64
       3       7       1    0.01    3694    3492       0   99.99
  
Composite Score: 2265.40
1695.41 1981.10 880.54 2380.75 4389.17

now running the same sequence (first snapshot at the beginning, second snapshot while MC is running, third toward the end)

   Core     CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz     SMI  CPU%c1  CPU%c3  CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt 
       -       -    1847   50.03    3692    3492       0   49.97    0.00    0.00    0.00      67      67    0.00    0.00    0.00    0.00   59.13   50.66    0.00
       0       0    3692  100.00    3692    3492       0    0.00    0.00    0.00    0.00      67      67    0.00    0.00    0.00    0.00   59.13   50.66    0.00
       0       4       4    0.11    3693    3492       0   99.89
       1       1    3692  100.00    3692    3492       0    0.00    0.00    0.00    0.00      66
       1       5       0    0.01    3691    3492       0   99.99
       2       2    3692  100.00    3692    3492       0    0.00    0.00    0.00    0.00      62
       2       6       2    0.07    3690    3492       0   99.93
       3       3    3692  100.00    3692    3492       0    0.00    0.00    0.00    0.00      62
       3       7       2    0.06    3689    3492       0   99.94

     Core     CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz     SMI  CPU%c1  CPU%c3  CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt 
       -       -    1847   50.02    3692    3492       0   49.98    0.00    0.00    0.00      61      61    0.00    0.00    0.00    0.00   47.03   38.40    0.00
       0       0    3692  100.00    3692    3492       0    0.00    0.00    0.00    0.00      61      61    0.00    0.00    0.00    0.00   47.03   38.40    0.00
       0       4       1    0.01    3696    3492       0   99.99
       1       1    3692  100.00    3692    3492       0    0.00    0.00    0.00    0.00      61
       1       5       0    0.01    3693    3492       0   99.99
       2       2    3692  100.00    3692    3492       0    0.00    0.00    0.00    0.00      58
       2       6       5    0.14    3692    3492       0   99.86
       3       3    3692  100.00    3692    3492       0    0.00    0.00    0.00    0.00      56
       3       7       0    0.01    3694    3492       0   99.99

    Core     CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz     SMI  CPU%c1  CPU%c3  CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt 
       -       -    1846   50.02    3692    3492       0   49.98    0.00    0.00    0.00      83      83    0.00    0.00    0.00    0.00   76.05   66.75    0.00
       0       0    3692  100.00    3692    3492       0    0.00    0.00    0.00    0.00      83      83    0.00    0.00    0.00    0.00   76.05   66.75    0.00
       0       4       4    0.12    3692    3492       0   99.88
       1       1    3692  100.00    3692    3492       0    0.00    0.00    0.00    0.00      82
       1       5       0    0.01    3696    3492       0   99.99
       2       2    3692  100.00    3692    3492       0    0.00    0.00    0.00    0.00      80
       2       6       0    0.01    3694    3492       0   99.99
       3       3    3692  100.00    3692    3492       0    0.00    0.00    0.00    0.00      77
       3       7       1    0.02    3691    3492       0   99.98
 
Composite Score: 2290.03
1832.28 1980.45 874.71 2379.90 4382.79

now running compiled for haswell: different sequences

source run4 ./scimark2_410_hsw

    Core     CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz     SMI  CPU%c1  CPU%c3  CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt 
       -       -    1747   50.03    3492    3492       0   49.97    0.00    0.00    0.00      54      54    0.00    0.00    0.00    0.00   48.97   40.23    0.00
       0       0    3492  100.00    3492    3492       0    0.00    0.00    0.00    0.00      54      54    0.00    0.00    0.00    0.00   48.97   40.23    0.00
       0       4       4    0.13    3492    3492       0   99.87
       1       1    3492  100.00    3492    3492       0    0.00    0.00    0.00    0.00      48
       1       5       1    0.04    3492    3492       0   99.96
       2       2    3492  100.00    3492    3492       0    0.00    0.00    0.00    0.00      49
       2       6       1    0.03    3491    3492       0   99.97
       3       3    3492  100.00    3492    3492       0    0.00    0.00    0.00    0.00      48
       3       7       1    0.02    3493    3492       0   99.98
    Core     CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz     SMI  CPU%c1  CPU%c3  CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt 
       -       -    1747   50.03    3492    3492       0   49.97    0.00    0.00    0.00      74      74    0.00    0.00    0.00    0.00   55.72   48.35    0.00
       0       0    3492  100.00    3492    3492       0    0.00    0.00    0.00    0.00      74      74    0.00    0.00    0.00    0.00   55.72   48.35    0.00
       0       4       0    0.01    3493    3492       0   99.99
       1       1    3492  100.00    3492    3492       0    0.00    0.00    0.00    0.00      72
       1       5       1    0.02    3501    3492       0   99.98
       2       2    3492  100.00    3492    3492       0    0.00    0.00    0.00    0.00      63
       2       6       6    0.16    3492    3492       0   99.84
       3       3    3492  100.00    3492    3492       0    0.00    0.00    0.00    0.00      64
       3       7       0    0.01    3492    3492       0   99.99
   
Composite Score: 2401.44
1879.05 2153.12 845.34 2437.37 4694.69

same sequence: we notice that the initial benchmarks use avx and the clock is low, then during MC (no real avx, actually just scalar integers) the clock is higher, then arrive those that use avx and are vectorized. The clock goes down and power usage increases by 28W (only 14W w.r.t. sse code in the same region)

    Core     CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz     SMI  CPU%c1  CPU%c3  CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt 
       -       -    1747   50.03    3492    3492       0   49.97    0.00    0.00    0.00      62      62    0.00    0.00    0.00    0.00   56.79   48.25    0.00
       0       0    3492  100.00    3492    3492       0    0.00    0.00    0.00    0.00      62      62    0.00    0.00    0.00    0.00   56.79   48.25    0.00
       0       4       0    0.01    3491    3492       0   99.99
       1       1    3492  100.00    3492    3492       0    0.00    0.00    0.00    0.00      59
       1       5       1    0.01    3498    3492       0   99.99
       2       2    3492  100.00    3492    3492       0    0.00    0.00    0.00    0.00      57
       2       6       8    0.22    3492    3492       0   99.78
       3       3    3492  100.00    3492    3492       0    0.00    0.00    0.00    0.00      55
       3       7       1    0.02    3495    3492       0   99.98

  Core     CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz     SMI  CPU%c1  CPU%c3  CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt 
       -       -    1847   50.02    3692    3492       0   49.98    0.00    0.00    0.00      60      60    0.00    0.00    0.00    0.00   48.52   39.67    0.00
       0       0    3692  100.00    3692    3492       0    0.00    0.00    0.00    0.00      60      60    0.00    0.00    0.00    0.00   48.52   39.67    0.00
       0       4       6    0.16    3692    3492       0   99.84
       1       1    3692  100.00    3692    3492       0    0.00    0.00    0.00    0.00      60
       1       5       0    0.01    3698    3492       0   99.99
       2       2    3692  100.00    3692    3492       0    0.00    0.00    0.00    0.00      57
       2       6       1    0.01    3689    3492       0   99.99
       3       3    3692  100.00    3692    3492       0    0.00    0.00    0.00    0.00      55
       3       7       0    0.01    3692    3492       0   99.99
   Core     CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz     SMI  CPU%c1  CPU%c3  CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt 
       -       -    1747   50.03    3492    3492       0   49.97    0.00    0.00    0.00      82      82    0.00    0.00    0.00    0.00   73.72   67.72    0.00
       0       0    3492  100.00    3492    3492       0    0.00    0.00    0.00    0.00      82      82    0.00    0.00    0.00    0.00   73.72   67.72    0.00
       0       4       2    0.06    3491    3492       0   99.94
       1       1    3492  100.00    3492    3492       0    0.00    0.00    0.00    0.00      81
       1       5       0    0.01    3502    3492       0   99.99
       2       2    3492  100.00    3492    3492       0    0.00    0.00    0.00    0.00      79
       2       6       4    0.12    3492    3492       0   99.88
       3       3    3492  100.00    3492    3492       0    0.00    0.00    0.00    0.00      74
       3       7       0    0.01    3492    3492       0   99.99

Composite Score: 2483.89 Composite Score: 2398.95

1881.67 2274.68 893.21 2573.69 4796.21
1881.08 2276.17 893.36 2480.60 4463.54

Conclusions

On SandyBridge AVX costs 4 watts and does not affect clock speed.
On Haswell it costs only 1.25 (even 0 or "negative") watts but brings the clock back to nominal.

-- VincenzoInnocente - 09 Jul 2014

Edit | Attach | Watch | Print version | History: r5 < r4 < r3 < r2 < r1 | Backlinks | Raw View | WYSIWYG | More topic actions
Topic revision: r5 - 2016-11-17 - VincenzoInnocente
 
    • Cern Search Icon Cern Search
    • TWiki Search Icon TWiki Search
    • Google Search Icon Google Search

    Main All webs login

This site is powered by the TWiki collaboration platform. Powered by Perl. Copyright © 2008-2024 by the contributing authors. All material on this collaboration platform is the property of the contributing authors.
or Ideas, requests, problems regarding TWiki? use Discourse or Send feedback