User:David.Monniaux/Optimization

SSE floating-point vectorization in IA32/AMD64
/*
 * dest = a * transpose(b): dest[i][k] = sum over j of a[i][j] * b[k][j].
 * Storing b transposed makes both inner accesses stride-1, which is what
 * lets the compilers vectorize the dot-product loop.
 *
 * m, n, p : matrix dimensions (a is m x n, b is p x n, dest is m x p)
 * The restrict qualifiers promise the three matrices do not alias.
 *
 * FIX: the row pointers were declared `scalar *`, which drops the const
 * qualifier from `const scalar a[...][n]` — a constraint violation that
 * GCC/ICC diagnose.  They are now `const scalar *`.
 */
void multTransp(const int m, const int n, const int p,
                scalar dest[const restrict m][p],
                const scalar a[const restrict m][n],
                const scalar b[const restrict p][n])
{
  for (int i = 0; i < m; i++) {
    for (int k = 0; k < p; k++) {
      /* Dot product of row i of a with row k of b. */
      scalar s = 0.0;
      const scalar *ap = a[i], *bp = b[k];
      for (int j = 0; j < n; j++) {
        s += ap[j] * bp[j];
      }
      dest[i][k] = s;
    }
  }
}

IEEE single precision
gcc 4.2 2005/12/12 icc 9.0

Opteron: 2.2 GHz Athlon64: 2 GHz

gcc -std=c99 -O3 -fprefetch-loop-arrays -funsafe-math-optimizations -ftree-vectorize -ftree-vectorizer-verbose=3 matrixMult.c -o matrixMult_gcc icc -O3 -xW -fp-model fast -std=c99 -vec-report2 matrixMult.c -o matrixMult_icc

gcc -std=c99 -O3 matrixMult.c -o matrixMult_gcc icc -O2 -xW -fp-model strict -std=c99 -vec-report2 matrixMult.c -o matrixMult_icc

100x100x100 matrix mult, 5000 matrices
Athlon64 icc vectorized: 3103 gcc vectorized: 3334 icc non vectorized: 11024 gcc non vectorized: 10883

Opteron icc vectorized: 2823 gcc vectorized: 3029 icc non vectorized: 9761 gcc non vectorized: 9678

1000x1000x1000 matrix mult, 2 matrices
Athlon64 icc vectorized 4223 gcc vectorized 5132 icc non vectorized 5366 gcc non vectorized 5294

Opteron icc vectorized 4178 gcc vectorized 4500 (3980 min) icc non vectorized 5818 gcc non vectorized 5731

Conclusion
A naïve implementation of matrix multiplication has a speed that depends almost entirely on cache/memory speed rather than on arithmetic throughput.

MMX/SSE integer vectorization
/*
 * Scalar reference version: blend two byte vectors with 8-bit weights,
 *   dest[i] = round((c1*s1[i] + c2*s2[i]) / 255)
 * for i in [0, n).  The division by 255 uses the classic fixed-point
 * trick: t += 0x80 (rounding bias), then ((t >> 8) + t) >> 8, which is
 * exact for all t in range.  The restrict qualifiers promise dest does
 * not alias s1/s2.
 *
 * NOTE(review): the loop body on this wiki page was destroyed by HTML
 * extraction (everything between `i` and `> 8) + t) >> 8` was eaten);
 * it is reconstructed here to match the SSE versions below, which
 * compute exactly pmullw + paddw(0x80) + the two-shift divide-by-255.
 */
void combineVectorsInt8(uint8_t c1,
                        uint8_t c2,
                        int n,
                        uint8_t * restrict dest,
                        uint8_t * restrict const s1,
                        uint8_t * restrict const s2)
{
  for (int i = 0; i < n; i++) {
    /* 32-bit accumulator: c1*s1[i] + c2*s2[i] can reach 130050. */
    uint32_t t = (uint32_t)c1 * s1[i] + (uint32_t)c2 * s2[i] + 0x80;
    dest[i] = (uint8_t)(((t >> 8) + t) >> 8);
  }
}

/* 64-bit MMX register viewed four ways: as 8 bytes (t8), as 4 shorts
 * (t16), or as the corresponding GCC vector types (v8, v16) accepted by
 * the __builtin_ia32_* MMX intrinsics.  Type punning through a union is
 * the GCC-sanctioned way to reinterpret lanes without aliasing UB. */
typedef union { uint8_t t8[8]; uint16_t t16[4]; uint8_t v8 __attribute__ ((vector_size (8))); uint16_t v16 __attribute__ ((vector_size (8))); } v64;

/*
 * MMX (64-bit) version of combineVectorsInt8: dest[i] = round((c1*s1[i]
 * + c2*s2[i]) / 255), 8 bytes per iteration.  Bytes are widened to
 * 16-bit lanes with punpck{h,l}bw against zero, weighted with pmullw,
 * biased by 0x80, divided by 255 with the two-shift trick, and packed
 * back with packuswb.  Any tail of n % 8 bytes is left untouched.
 *
 * FIX: the original initialized the zero vector with `zero.v8 ^=
 * zero.v8`, which READS an uninitialized object — undefined behavior
 * that a modern compiler may exploit.  Zero-initialize it instead.
 *
 * NOTE(review): no __builtin_ia32_emms() is issued after the MMX block;
 * callers mixing this with x87 floating point must run EMMS themselves.
 */
void combineVectorsInt8SSE_v8(uint8_t c1,
                              uint8_t c2,
                              int n,
                              v64 * restrict dest,
                              v64 * restrict const s1,
                              v64 * restrict const s2)
{
  v64 zero = { .t8 = {0} };   /* was: zero.v8 ^= zero.v8 (UB) */
  v64 c1v, c2v, shift;
  for (int i = 0; i < 4; i++) {
    c1v.t16[i] = c1;
    c2v.t16[i] = c2;
    shift.t16[i] = 0x80;      /* rounding bias for the /255 trick */
  }

  for (int i = 0; i < n / 8; i++) {
    v64 dh, dl;

    { /* high 4 bytes, in 16-bit lanes */
      v64 s1h, s2h, dht;
      s1h.v16 = __builtin_ia32_punpckhbw (s1[i].v16, zero.v16);
      s2h.v16 = __builtin_ia32_punpckhbw (s2[i].v16, zero.v16);
      dht.v16 = __builtin_ia32_paddw(
                  __builtin_ia32_paddw(
                    __builtin_ia32_pmullw(c1v.v16, s1h.v16),
                    __builtin_ia32_pmullw(c2v.v16, s2h.v16)),
                  shift.v16);
      /* t/255 == ((t >> 8) + t) >> 8 after the +0x80 bias */
      dh.v16 = __builtin_ia32_psrlw(dht.v16, 8);
      dh.v16 = __builtin_ia32_paddw(dh.v16, dht.v16);
      dh.v16 = __builtin_ia32_psrlw(dh.v16, 8);
    }

    { /* low 4 bytes, same pipeline */
      v64 s1l, s2l, dlt;
      s1l.v16 = __builtin_ia32_punpcklbw (s1[i].v16, zero.v16);
      s2l.v16 = __builtin_ia32_punpcklbw (s2[i].v16, zero.v16);
      dlt.v16 = __builtin_ia32_paddw(
                  __builtin_ia32_paddw(
                    __builtin_ia32_pmullw(c1v.v16, s1l.v16),
                    __builtin_ia32_pmullw(c2v.v16, s2l.v16)),
                  shift.v16);
      dl.v16 = __builtin_ia32_psrlw(dlt.v16, 8);
      dl.v16 = __builtin_ia32_paddw(dl.v16, dlt.v16);
      dl.v16 = __builtin_ia32_psrlw(dl.v16, 8);
    }

    /* Saturating pack of the two 4x16-bit halves back to 8 bytes. */
    dest[i].v8 = __builtin_ia32_packuswb(dl.v16, dh.v16);
  }
}

/* 128-bit SSE2 (XMM) register viewed four ways: as 16 bytes (t8), as 8
 * shorts (t16), or as the corresponding GCC vector types (v8, v16)
 * accepted by the __builtin_ia32_*128 intrinsics.  Union punning avoids
 * strict-aliasing UB when reinterpreting lanes. */
typedef union { uint8_t t8[16]; uint16_t t16[8]; uint8_t v8 __attribute__ ((vector_size (16))); uint16_t v16 __attribute__ ((vector_size (16))); } v128;

/*
 * SSE2 (128-bit XMM) version of combineVectorsInt8: dest[i] =
 * round((c1*s1[i] + c2*s2[i]) / 255), 16 bytes per iteration.  Same
 * pipeline as the MMX version — widen with punpck{h,l}bw128, weight
 * (here with GCC's native vector * and +), bias by 0x80, divide by 255
 * with two psrlwi128 shifts, pack with packuswb128.  Any tail of
 * n % 16 bytes is left untouched.
 *
 * FIXES:
 *  - `zero.v8 ^= zero.v8` read an uninitialized object (undefined
 *    behavior); the zero vector is now zero-initialized.
 *  - `i < n / sizeof(v128)` compared signed n against size_t, so a
 *    negative n was converted to a huge unsigned value; divide as
 *    signed instead.
 */
void combineVectorsInt8SSE_v16(uint8_t c1,
                               uint8_t c2,
                               int n,
                               v128 * restrict dest,
                               v128 * restrict const s1,
                               v128 * restrict const s2)
{
  v128 zero = { .t8 = {0} };  /* was: zero.v8 ^= zero.v8 (UB) */
  v128 c1v, c2v, shift;
  for (int i = 0; i < (int)(sizeof(v128) / 2); i++) {
    c1v.t16[i] = c1;
    c2v.t16[i] = c2;
    shift.t16[i] = 0x80;      /* rounding bias for the /255 trick */
  }

  for (int i = 0; i < n / (int)sizeof(v128); i++) {
    v128 dh, dl;

    { /* high 8 bytes, in 16-bit lanes */
      v128 s1h, s2h, dht;
      s1h.v16 = __builtin_ia32_punpckhbw128 (s1[i].v16, zero.v16);
      s2h.v16 = __builtin_ia32_punpckhbw128 (s2[i].v16, zero.v16);
      dht.v16 = (c1v.v16 * s1h.v16) + (c2v.v16 * s2h.v16) + shift.v16;
      /* t/255 == ((t >> 8) + t) >> 8 after the +0x80 bias */
      dh.v16 = __builtin_ia32_psrlwi128(dht.v16, 8);
      dh.v16 += dht.v16;
      dh.v16 = __builtin_ia32_psrlwi128(dh.v16, 8);
    }

    { /* low 8 bytes, same pipeline */
      v128 s1l, s2l, dlt;
      s1l.v16 = __builtin_ia32_punpcklbw128 (s1[i].v16, zero.v16);
      s2l.v16 = __builtin_ia32_punpcklbw128 (s2[i].v16, zero.v16);
      dlt.v16 = (c1v.v16 * s1l.v16) + (c2v.v16 * s2l.v16) + shift.v16;
      dl.v16 = __builtin_ia32_psrlwi128(dlt.v16, 8);
      dl.v16 += dlt.v16;
      dl.v16 = __builtin_ia32_psrlwi128(dl.v16, 8);
    }

    /* Saturating pack of the two 8x16-bit halves back to 16 bytes. */
    dest[i].v8 = __builtin_ia32_packuswb128(dl.v16, dh.v16);
  }
}

Timings (lower is better); the Pentium-3 has no SSE2, so that entry is absent:

    CPU          clock (GHz)   normal   sse1 (64-bit MMX)   sse2 (128-bit XMM)
    Pentium-M        1.8         519          111                 109
    Pentium-4        2.4        1909         3203                 244
    Pentium-4        1.6        2863         4748                 357
    Pentium-3        0.8        3541          224                  —
    Athlon64         2.0         572          168                 148

Conclusion: the Pentium-4 totally sucks at MMX, better use normal operations or recode everything using XMM.