Comments on: Instruction Level Parallelism Hi Jacob, Your absolutely right, that does definitely speed up the SSE code further. Changing to <pre> static inline __m128 mat4_mul_vec4_faster( const mat4 & m, __m128 v ){ __m128 t0 = v; __m128 t1 = v; __m128 t2 = v; __m128 t3 = v; t0 = _mm_shuffle_ps( t0, t0, 0xff ); t1 = _mm_shuffle_ps( t1, t1, 0xaa ); t0 = _mm_mul_ps( t0, m.col0 ); t1 = _mm_mul_ps( t1, m.col1 ); t2 = _mm_shuffle_ps( t2, t2, 0x55 ); t3 = _mm_shuffle_ps( t3, t3, 0x00 ); t2 = _mm_mul_ps( t2, m.col2 ); t3 = _mm_mul_ps( t3, m.col3 ); t0 = _mm_add_ps( t0, t1 ); t2 = _mm_add_ps( t2, t3 ); t0 = _mm_add_ps( t0, t2 ); return t0; } </pre> gives the result, <pre> slower 0x000538ee faster 0x000445de speed up 18.181% </pre> While messing around with this, found a further improvement (requiring SSE2), <pre> static inline __m128 mat4_mul_vec4_faster( const mat4 & m, __m128 v ){ __m128 t0, t1, t2, t3 = v; t0 = (__m128) _mm_shuffle_epi32( (__m128i) v, 0xff ); t1 = (__m128) _mm_shuffle_epi32( (__m128i) v, 0xaa ); t0 = _mm_mul_ps( t0, m.col0 ); t1 = _mm_mul_ps( t1, m.col1 ); t2 = (__m128) _mm_shuffle_epi32( (__m128i) v, 0x55 ); t3 = _mm_shuffle_ps( t3, t3, 0x00 ); t2 = _mm_mul_ps( t2, m.col2 ); t3 = _mm_mul_ps( t3, m.col3 ); t0 = _mm_add_ps( t0, t1 ); t2 = _mm_add_ps( t2, t3 ); t0 = _mm_add_ps( t0, t2 ); return t0; </pre> this gives, <pre> slower 0x000538ec faster 0x00040916 speed up 22.727% </pre> mulps has 4 cycles latency on my cpu, vs 1 for pshufd/shufps, so executing the mulps as soon as possible makes sense. Actually a bit suprised GCC didn't do this itself. But to be honest, don't yet have a good explanation for why pshufd helps so much. It does save some register copies, but will also incur a 2-cycle bypass delay. And replacing the final shufps with pshufd slows it back down again. 
Will comment back here when i do have a solid explanation, but if anyone reading this does already know, please post :) [results on Core i7, cpuid family 6, model 30, stepping 5] Hi Jacob,
You're absolutely right, that does definitely speed up the SSE code further.
Changing to

 
  static inline __m128 mat4_mul_vec4_faster( const mat4 & m, __m128 v ){
 
    __m128 t0 = v;
 
    __m128 t1 = v;
 
    __m128 t2 = v;
 
    __m128 t3 = v;
 
    t0 = _mm_shuffle_ps( t0, t0, 0xff );
 
    t1 = _mm_shuffle_ps( t1, t1, 0xaa );
 
    t0 = _mm_mul_ps( t0, m.col0 );
 
    t1 = _mm_mul_ps( t1, m.col1 );
 
    t2 = _mm_shuffle_ps( t2, t2, 0x55 );
 
    t3 = _mm_shuffle_ps( t3, t3, 0x00 );
 
    t2 = _mm_mul_ps( t2, m.col2 );
 
    t3 = _mm_mul_ps( t3, m.col3 );
 
    t0 = _mm_add_ps( t0, t1 );
 
    t2 = _mm_add_ps( t2, t3 );
 
    t0 = _mm_add_ps( t0, t2 );
 
    return t0;
 
  }
 
  

gives the result,

 
  slower 0x000538ee
 
  faster 0x000445de
 
  speed up 18.181%
 
  

While messing around with this, found a further improvement (requiring SSE2),

 
  static inline __m128 mat4_mul_vec4_faster( const mat4 & m, __m128 v ){
 
    __m128 t0, t1, t2, t3 = v;
 
    t0 = (__m128) _mm_shuffle_epi32( (__m128i) v, 0xff );
 
    t1 = (__m128) _mm_shuffle_epi32( (__m128i) v, 0xaa );
 
    t0 = _mm_mul_ps( t0, m.col0 );
 
    t1 = _mm_mul_ps( t1, m.col1 );
 
    t2 = (__m128) _mm_shuffle_epi32( (__m128i) v, 0x55 );
 
    t3 = _mm_shuffle_ps( t3, t3, 0x00 );
 
    t2 = _mm_mul_ps( t2, m.col2 );
 
    t3 = _mm_mul_ps( t3, m.col3 );
 
    t0 = _mm_add_ps( t0, t1 );
 
    t2 = _mm_add_ps( t2, t3 );
 
    t0 = _mm_add_ps( t0, t2 );
 
    return t0;
 
  }
 
  

this gives,

 
  slower 0x000538ec
 
  faster 0x00040916
 
  speed up 22.727%
 
  

mulps has 4 cycles latency on my cpu, vs 1 for pshufd/shufps, so executing the mulps as soon as possible makes sense. Actually a bit surprised GCC didn’t do this itself.
But to be honest, don’t yet have a good explanation for why pshufd helps so much. It does save some register copies, but will also incur a 2-cycle bypass delay. And replacing the final shufps with pshufd slows it back down again. Will comment back here when I do have a solid explanation, but if anyone reading this does already know, please post :)

[results on Core i7, cpuid family 6, model 30, stepping 5]

]]>
By: Jakob/2011/05/25/instruction-level-parallelism/#comment-4873 Jakob Wed, 25 May 2011 19:16:21 +0000 Thanks for spotting that Dan. Fixed now. Thanks for spotting that Dan. Fixed now.

]]>
By: Jaewon/2011/05/25/instruction-level-parallelism/#comment-4848 Jaewon Wed, 25 May 2011 10:43:21 +0000 You've lost the include paths in your sample code, thanks to HTML and You’ve lost the include paths in your sample code, thanks to HTML and

]]>