// NOTE(review): this listing is a lossy extraction. Original file line numbers
// are fused into the text and several source lines are absent (the
// declarations of `offset` and `result`, any updates to `offset`, the closing
// braces and the return statement are not visible here). Confirm against the
// real file before relying on the comments below.
//
// The first physical line also carries the compile-time guard that rejects
// builds without __SSE4_1__ and the two intrinsic header includes.
//
// IntDotProductSSE: accumulates the products u[i] * v[i] of two int8_t
// arrays into a 32-bit result, handling 8 elements per step with SSE
// intrinsics and then adding scalar products for the remainder.
//   u, v : input arrays of signed 8-bit values.
//   n    : element count; used below to derive the last offset at which a
//          full group of 8 elements still fits.
18 #if !defined(__SSE4_1__) 19 #error Implementation only for SSE 4.1 capable architectures 25 #include <emmintrin.h> 26 #include <smmintrin.h> 33 static int32_t IntDotProductSSE(
const int8_t* u,
const int8_t* v,
int n) {
// Largest starting offset from which 8 elements can still be read.
34 int max_offset = n - 8;
// Take the SIMD path only when at least one full group of 8 is available.
// (`offset` is declared on a line this listing omits.)
39 if (offset <= max_offset) {
// Load the low 64 bits (8 x int8) from the start of each input.
41 __m128i packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u));
42 __m128i packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v));
// Sign-extend the 8 bytes of each operand to 8 x int16.
43 __m128i sum = _mm_cvtepi8_epi16(packed1);
44 packed2 = _mm_cvtepi8_epi16(packed2);
// Multiply the int16 lanes pairwise and add adjacent products, leaving
// 4 x int32 partial sums in `sum`.
48 sum = _mm_madd_epi16(sum, packed2);
// Process further groups of 8 while a full group remains.
// NOTE(review): the statement that advances `offset` is not visible here.
49 while (offset <= max_offset) {
50 packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u + offset));
51 packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v + offset));
53 packed1 = _mm_cvtepi8_epi16(packed1);
54 packed2 = _mm_cvtepi8_epi16(packed2);
55 packed1 = _mm_madd_epi16(packed1, packed2);
// Add this group's 4 partial sums into the running 4 x int32 accumulator.
56 sum = _mm_add_epi32(sum, packed1);
// Two horizontal adds collapse the 4 partial sums into the low lane,
// which is then extracted as a scalar.
59 sum = _mm_hadd_epi32(sum, sum);
60 sum = _mm_hadd_epi32(sum, sum);
61 result = _mm_cvtsi128_si32(sum);
// Scalar product for an element at `offset`; presumably the body of a tail
// loop over the elements left after the groups of 8 -- TODO confirm, the
// loop header is not visible in this listing.
64 result += u[offset] * v[offset];
// PartialMatrixDotVector1: computes a single output value from one block of
// int8 weights and the int8 input vector, and stores it through `v`.
//   wi     : int8 weights; elements [0, num_in) are dotted with `u`, and
//            wi[num_in] is read as an extra additive term (presumably the
//            bias for this output -- TODO confirm against the caller).
//   scales : pointer to the scale factor applied to the result.
//   u      : int8 input vector with num_in elements.
//   num_in : number of elements in the dot product.
// NOTE(review): the listing omits the rest of the parameter list (`v`, which
// is written below, is never declared in view) and the closing brace.
71 static void PartialMatrixDotVector1(
const int8_t* wi,
const double* scales,
72 const int8_t* u,
int num_in,
// Integer dot product, converted to double for the rescaling below.
74 double total = IntDotProductSSE(u, wi, num_in);
// Divide the integer sum by INT8_MAX, add the extra weight entry stored at
// index num_in, then multiply by the scale factor.
76 *v = (total / INT8_MAX + wi[num_in]) * *scales;
// matrixDotVector: calls PartialMatrixDotVector1 once per output, dim1 times.
//   dim1   : number of outputs (loop trip count).
//   dim2   : row width; the dot-product length passed down is dim2 - 1.
//   wi     : int8 weight data.
//   scales : scale factors.
//   u      : int8 input vector.
//   v      : destination for the double results.
// NOTE(review): the declaration of `output`, any per-iteration advancement of
// `wi`, `scales` and `v`, and the closing braces are not visible in this
// listing -- verify against the real file.
79 static void matrixDotVector(
int dim1,
int dim2,
const int8_t* wi,
80 const double* scales,
const int8_t* u,
double* v) {
81 const int num_out = dim1;
// One entry per row is excluded from the dot-product length; presumably it
// is the extra term read as wi[num_in] by the helper -- TODO confirm.
82 const int num_in = dim2 - 1;
85 for (; output < num_out; output++) {
86 PartialMatrixDotVector1(wi, scales, u, num_in, v);
static const IntSimdMatrix intSimdMatrixSSE