#ifndef __AVX__
#error Implementation only for AVX capable architectures
#endif

#include <immintrin.h>

/**
 * Dot product of two double-precision vectors using 256-bit AVX.
 *
 * The input is consumed eight elements per iteration, as two groups of
 * four doubles feeding two independent accumulators; the remaining
 * (n % 8) elements are handled by a scalar tail loop.
 *
 * @param u  First vector; at least n readable doubles. No alignment
 *           requirement (unaligned loads are used).
 * @param v  Second vector; same requirements as u.
 * @param n  Number of elements. Values <= 0 yield 0.0.
 * @return   Sum of u[i] * v[i] for i in [0, n).
 */
double DotProductAVX(const double *u, const double *v, int n)
{
    /* A negative n would wrap to a huge unsigned remainder below and
     * make the tail loop read far out of bounds. */
    if (n <= 0) {
        return 0.0;
    }

    const unsigned count = (unsigned)n;
    const unsigned quot = count / 8;
    const unsigned rem = count % 8;

    __m256d t0 = _mm256_setzero_pd();
    __m256d t1 = _mm256_setzero_pd();

    for (unsigned k = 0; k < quot; k++) {
        /* Elements 0..3 of the current group of eight. */
        __m256d f0 = _mm256_loadu_pd(u);
        __m256d f1 = _mm256_loadu_pd(v);
        f0 = _mm256_mul_pd(f0, f1);
        t0 = _mm256_add_pd(t0, f0);

        /* Elements 4..7, accumulated separately from t0. */
        __m256d f2 = _mm256_loadu_pd(u + 4);
        __m256d f3 = _mm256_loadu_pd(v + 4);
        f2 = _mm256_mul_pd(f2, f3);
        t1 = _mm256_add_pd(t1, f2);

        u += 8;
        v += 8;
    }

    /* hadd yields { t0[0]+t0[1], t1[0]+t1[1], t0[2]+t0[3], t1[2]+t1[3] };
     * adding the four lanes therefore sums every lane of t0 and t1. */
    t0 = _mm256_hadd_pd(t0, t1);

    /* Unaligned store: no 32-byte alignment requirement on tmp. */
    double tmp[4];
    _mm256_storeu_pd(tmp, t0);
    double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];

    /* Scalar tail for the last n % 8 elements. */
    for (unsigned k = 0; k < rem; k++) {
        result += *u++ * *v++;
    }

    return result;
}