Last time around we looked at how instruction level parallelism can be achieved when you have an understanding of how your instructions get scheduled. But the example we used was simple enough that we could ignore an important feature of the PPU and SPU architectures. That feature is dual issue. Under the right conditions, both of these architectures can issue not one, but two instructions every clock cycle.

First up we are going to look at the conditions for dual issue on an SPU, then we are going to look at optimizing loops using software pipelining.

On an SPU, the instructions are split into two categories, pipeline 0 and pipeline 1 instructions. Appendix B.1.2 of the Cell Broadband Engine Programming Handbook (

This intrinsics version is a smidgen more than 6 times as fast as the reference version, so often we’d say that’s good enough and move on to another problem. But this most certainly can still be made faster. We’ll use this intrinsics version as the basis for our assembler version that we’ll optimize further by pipelining the loop. We still haven’t explained what “pipelining the loop” means, but don’t worry, nearly there.

First up, let’s translate this function into assembly code.

assembler_0.s

 
          .section    .text.assembler, "ax", @progbits

          .global     assembler
          .type       assembler, @function

          # void assembler( void *__restrict__ out, const void *__restrict__ in,
          #                 uint32_t count, uint32_t stride )
          #
          # For each input element, reads one (possibly unaligned) packed 32-bit
          # value laid out as x[31:21] y[20:11] z[10:1] s[0] (layout implied by
          # the shifts and masks in the loop below), converts the fields to four
          # floats { x, y, z, s } and stores one 16-byte result per element.
          # x/y/z are mapped into [-1,+1] by the fma with *_scale and -1.0f;
          # s becomes +/-1.0f (bit 0 shifted into the sign bit of 1.0f).
          # Four stride-separated elements are processed per loop iteration, so
          # count is rounded up to a multiple of four.
          #
          # $3    void *__restrict__ out
          # $4    const void *__restrict__ in
          # $5    uint32_t count
          # $6    uint32_t stride
          .set        out,            3
          .set        in,             4
          .set        count,          5
          .set        stride,         6
  assembler:

          # Input pointers for the four elements of one iteration, and the
          # temporaries used while patching their shuffle controls.
          .set        p_ina,          in
          .set        p_inb,          7
          .set        p_inc,          8
          .set        p_ind,          9
          .set        p_ina_tmp,      10
          .set        p_inb_tmp,      11
          .set        p_inc_tmp,      12
          .set        p_ind_tmp,      13

          # Loop-invariant constants, set up once before the loop.
          .set        strd4,          14
          .set        onef,           15
          .set        neg_onef,       16
          .set        shufabcd,       17
          .set        second_qword,   18
          .set        x_scale,        19
          .set        yz_scale,       20
          .set        shufAaBb,       21
          .set        shufCcDd,       22
          .set        x3ff,           23

          # Per-iteration working registers.
          .set        qa0,            24
          .set        qa1,            25
          .set        qb0,            26
          .set        qb1,            27
          .set        qc0,            28
          .set        qc1,            29
          .set        qd0,            30
          .set        qd1,            31
          .set        shuf_a,         32
          .set        shuf_b,         33
          .set        shuf_c,         34
          .set        shuf_d,         35
          .set        u32a,           36
          .set        u32b,           37
          .set        u32c,           38
          .set        u32d,           39
          .set        u32ac,          40
          .set        u32bd,          41
          .set        u32,            42
          .set        x,              43
          .set        y,              44
          .set        z,              45
          .set        s,              46
          .set        xazaxbzb,       47
          .set        xczcxdzd,       48
          .set        yasaybsb,       49
          .set        ycscydsd,       50
          .set        outa,           51
          .set        outb,           52
          .set        outc,           53
          .set        outd,           54

          # count = ( count + 3 ) & ~3 : round up to whole iterations of four.
          ai          count, count, 3
          andi        count, count, -4

          shli        strd4, stride, 2                # per-iteration pointer step = stride * 4
          ilhu        onef, 0x3f80                    # 0x3f800000 =  1.0f in every word
          ilhu        neg_onef, 0xbf80                # 0xbf800000 = -1.0f in every word
          cwd         shufabcd, 0 ( $1 )              # word-insert shufb control for address $1 (stack pointer)
          andbi       shufabcd, shufabcd, 15          # AND every control byte with 15
          ilh         second_qword, 0x1010            # every byte = 0x10 (shufb "second operand" bit)
          lqr         x_scale, _x_scale
          lqr         yz_scale, _yz_scale
          lqr         shufAaBb, _shufAaBb
          orbi        shufCcDd, shufAaBb, 8           # same interleave pattern, words 2/3 instead of 0/1
          il          x3ff, 0x3ff                     # 10-bit field mask

          # Four input pointers, one stride apart.
          a           p_inb, p_ina, stride
          a           p_inc, p_inb, stride
          a           p_ind, p_inc, stride

      loop:
          ai          count, count, -4                # four elements per trip
          # Each packed u32 may straddle a 16-byte boundary, so load both
          # quadwords at every pointer.
          lqd         qa0, 0  ( p_ina )
          lqd         qa1, 16 ( p_ina )
          lqd         qb0, 0  ( p_inb )
          lqd         qb1, 16 ( p_inb )
          lqd         qc0, 0  ( p_inc )
          lqd         qc1, 16 ( p_inc )
          lqd         qd0, 0  ( p_ind )
          lqd         qd1, 16 ( p_ind )
          # Rotate the base control by each address so it selects the word at
          # that address, then patch the control bytes that must fetch from the
          # second quadword when the word straddles the 16-byte boundary.
          # NOTE(review): confirm the exact rotqby/shlqby/andc bit mechanics
          # against the SPU ISA; only the intent is documented here.
          rotqby      shuf_a, shufabcd, p_ina
          rotqby      shuf_b, shufabcd, p_inb
          rotqby      shuf_c, shufabcd, p_inc
          rotqby      shuf_d, shufabcd, p_ind
          andi        p_ina_tmp, p_ina, 15
          andi        p_inb_tmp, p_inb, 15
          andi        p_inc_tmp, p_inc, 15
          andi        p_ind_tmp, p_ind, 15
          shlqby      p_ina_tmp, second_qword, p_ina_tmp
          shlqby      p_inb_tmp, second_qword, p_inb_tmp
          shlqby      p_inc_tmp, second_qword, p_inc_tmp
          shlqby      p_ind_tmp, second_qword, p_ind_tmp
          andc        shuf_a, shuf_a, p_ina_tmp
          andc        shuf_b, shuf_b, p_inb_tmp
          andc        shuf_c, shuf_c, p_inc_tmp
          andc        shuf_d, shuf_d, p_ind_tmp
          # Advance all four pointers by stride*4 for the next iteration.
          a           p_ina, p_ina, strd4
          a           p_inb, p_inb, strd4
          a           p_inc, p_inc, strd4
          a           p_ind, p_ind, strd4
          # Extract each element's u32 from its quadword pair, then gather the
          # four values into a single quadword via the interleave control.
          shufb       u32a, qa0, qa1, shuf_a
          shufb       u32b, qb0, qb1, shuf_b
          shufb       u32c, qc0, qc1, shuf_c
          shufb       u32d, qd0, qd1, shuf_d
          shufb       u32ac, u32a, u32c, shufAaBb
          shufb       u32bd, u32b, u32d, shufAaBb
          shufb       u32, u32ac, u32bd, shufAaBb
          # Field extraction: x = u32 >> 21 (11 bits), y = ( u32 >> 11 ) & 0x3ff,
          # z = ( u32 >> 1 ) & 0x3ff, s = bit 0 moved into the sign position.
          rotmi       x, u32, -21
          rotmi       y, u32, -11
          rotmi       z, u32, -1
          shli        s, u32, 31
          and         y, y, x3ff
          and         z, z, x3ff
          or          s, s, onef                      # s = +/-1.0f (sign bit | 1.0f)
          # Unsigned int -> float, then scale-and-bias into [-1,+1].
          cuflt       x, x, 0
          cuflt       y, y, 0
          cuflt       z, z, 0
          fma         x, x, x_scale,  neg_onef
          fma         y, y, yz_scale, neg_onef
          fma         z, z, yz_scale, neg_onef
          # Transpose the four lanes into per-element { x, y, z, s } outputs:
          # e.g. outa = interleave( {xa,za,xb,zb}, {ya,sa,yb,sb} ) = {xa,ya,za,sa}.
          shufb       xazaxbzb, x, z, shufAaBb
          shufb       xczcxdzd, x, z, shufCcDd
          shufb       yasaybsb, y, s, shufAaBb
          shufb       ycscydsd, y, s, shufCcDd
          shufb       outa, xazaxbzb, yasaybsb, shufAaBb
          shufb       outb, xazaxbzb, yasaybsb, shufCcDd
          shufb       outc, xczcxdzd, ycscydsd, shufAaBb
          shufb       outd, xczcxdzd, ycscydsd, shufCcDd
          stqd        outa, 0x00 ( out )
          stqd        outb, 0x10 ( out )
          stqd        outc, 0x20 ( out )
          stqd        outd, 0x30 ( out )
          ai          out, out, 0x40                  # four 16-byte results written
          brnz        count, loop

          bi          $0                              # return

          .size       assembler, .-assembler

          .section    .rodata.assembler, "a", @progbits
          .align      4
          # x_scale  ~= 2/2047 (11-bit field), yz_scale ~= 2/1023 (10-bit fields):
          # value * scale - 1.0 maps the full integer range onto [-1,+1].
  _x_scale:           .float  0.0009770396, 0.0009770396, 0.0009770396, 0.0009770396
  _yz_scale:          .float  0.0019550342, 0.0019550342, 0.0019550342, 0.0019550342
          # Interleave control: word0(A), word0(B), word1(A), word1(B).
  _shufAaBb:          .long   0x00010203,   0x10111213,   0x04050607,   0x14151617
 
  

The translation was a pretty trivial, straightforward one, but it sucks. GCC scheduled the intrinsics version much better. This initial assembly version is 1.4 times as expensive as the intrinsics one. So, using our new-found understanding of instruction latency and dual issue rules, let’s schedule this code better. For brevity, we’ll just show the instructions, not all the .sets and .rodata, as they stay the same.

abbreviated assembler_1.s

 
          # Dual-issue scheduled version of assembler.  Each line pairs an
          # even-pipeline (pipe 0) instruction with an odd-pipeline (pipe 1)
          # instruction; the rightmost column is the issue cycle.  Commented-out
          # /*nop*/ and /*lnop*/ mark issue slots left empty; the uncommented
          # nop/lnop are real instructions keeping the even/odd address pairing
          # needed for dual issue.  hbrr pre-hints the loop branch target.
          .align      3
  assembler:

          ai          count, count, 3 ;               lqr         x_scale, _x_scale
          andi        count, count, -4 ;              lqr         yz_scale, _yz_scale
          shli        strd4, stride, 2 ;              lqr         shufAaBb, _shufAaBb
          ilhu        onef, 0x3f80 ;                  cwd         shufabcd, 0 ( $1 )
          ilhu        neg_onef, 0xbf80 ;              hbrr        loop_branch, loop
          ilh         second_qword, 0x1010 ;          /*lnop*/
          il          x3ff, 0x3ff ;                   /*lnop*/
          andbi       shufabcd, shufabcd, 15 ;        /*lnop*/
          a           p_inb, p_ina, stride ;          /*lnop*/
          a           p_inc, p_inb, stride ;          /*lnop*/
          a           p_ind, p_inc, stride ;          /*lnop*/
          orbi        shufCcDd, shufAaBb, 8 ;         lnop

      loop:
          # pipeline 0                                # pipeline 1                                      # cycle
          ai          count, count, -4 ;              lqd         qa0, 0  ( p_ina )                     #   0
          andi        p_ina_tmp, p_ina, 15 ;          lqd         qa1, 16 ( p_ina )                     #   1
          andi        p_inb_tmp, p_inb, 15 ;          lqd         qb0, 0  ( p_inb )                     #   2
          andi        p_inc_tmp, p_inc, 15 ;          lqd         qb1, 16 ( p_inb )                     #   3
          andi        p_ind_tmp, p_ind, 15 ;          lqd         qc0, 0  ( p_inc )                     #   4
          /*nop*/ ;                                   lqd         qc1, 16 ( p_inc )                     #   5
          /*nop*/ ;                                   lqd         qd0, 0  ( p_ind )                     #   6
          /*nop*/ ;                                   lqd         qd1, 16 ( p_ind )                     #   7
          /*nop*/ ;                                   rotqby      shuf_a, shufabcd, p_ina               #   8
          a           p_ina, p_ina, strd4 ;           rotqby      shuf_b, shufabcd, p_inb               #   9
          a           p_inb, p_inb, strd4 ;           rotqby      shuf_c, shufabcd, p_inc               #  10
          a           p_inc, p_inc, strd4 ;           rotqby      shuf_d, shufabcd, p_ind               #  11
          a           p_ind, p_ind, strd4 ;           shlqby      p_ina_tmp, second_qword, p_ina_tmp    #  12
          /*nop*/ ;                                   shlqby      p_inc_tmp, second_qword, p_inc_tmp    #  13
          /*nop*/ ;                                   shlqby      p_inb_tmp, second_qword, p_inb_tmp    #  14
          nop ;                                       shlqby      p_ind_tmp, second_qword, p_ind_tmp    #  15
          andc        shuf_a, shuf_a, p_ina_tmp ;     /*lnop*/                                          #  16
          andc        shuf_c, shuf_c, p_inc_tmp ;     /*lnop*/                                          #  17
          andc        shuf_b, shuf_b, p_inb_tmp ;     shufb       u32a, qa0, qa1, shuf_a                #  18
          andc        shuf_d, shuf_d, p_ind_tmp ;     shufb       u32c, qc0, qc1, shuf_c                #  19
          /*nop*/ ;                                   shufb       u32b, qb0, qb1, shuf_b                #  20
          /*nop*/ ;                                   shufb       u32d, qd0, qd1, shuf_d                #  21
          /*nop*/ ;                                   /*lnop*/                                          #  22
          /*nop*/ ;                                   shufb       u32ac, u32a, u32c, shufAaBb           #  23
          /*nop*/ ;                                   /*lnop*/                                          #  24
          /*nop*/ ;                                   shufb       u32bd, u32b, u32d, shufAaBb           #  25
          /*nop*/ ;                                   /*lnop*/                                          #  26
          /*nop*/ ;                                   /*lnop*/                                          #  27
          /*nop*/ ;                                   /*lnop*/                                          #  28
          nop ;                                       shufb       u32, u32ac, u32bd, shufAaBb           #  29
          /*nop*/ ;                                   /*lnop*/                                          #  30
          /*nop*/ ;                                   /*lnop*/                                          #  31
          /*nop*/ ;                                   /*lnop*/                                          #  32
          rotmi       z, u32, -1 ;                    /*lnop*/                                          #  33
          rotmi       y, u32, -11 ;                   /*lnop*/                                          #  34
          rotmi       x, u32, -21 ;                   /*lnop*/                                          #  35
          shli        s, u32, 31 ;                    /*lnop*/                                          #  36
          and         z, z, x3ff ;                    /*lnop*/                                          #  37
          and         y, y, x3ff ;                    /*lnop*/                                          #  38
          cuflt       z, z, 0 ;                       /*lnop*/                                          #  39
          cuflt       x, x, 0 ;                       /*lnop*/                                          #  40
          cuflt       y, y, 0 ;                       /*lnop*/                                          #  41
          or          s, s, onef ;                    /*lnop*/                                          #  42
          /*nop*/ ;                                   /*lnop*/                                          #  43
          /*nop*/ ;                                   /*lnop*/                                          #  44
          /*nop*/ ;                                   /*lnop*/                                          #  45
          fma         z, z, yz_scale, neg_onef ;      /*lnop*/                                          #  46
          fma         x, x, x_scale,  neg_onef ;      /*lnop*/                                          #  47
          fma         y, y, yz_scale, neg_onef ;      /*lnop*/                                          #  48
          /*nop*/ ;                                   /*lnop*/                                          #  49
          /*nop*/ ;                                   /*lnop*/                                          #  50
          /*nop*/ ;                                   /*lnop*/                                          #  51
          /*nop*/ ;                                   shufb       xazaxbzb, x, z, shufAaBb              #  52
          /*nop*/ ;                                   shufb       yasaybsb, y, s, shufAaBb              #  53
          /*nop*/ ;                                   shufb       xczcxdzd, x, z, shufCcDd              #  54
          /*nop*/ ;                                   shufb       ycscydsd, y, s, shufCcDd              #  55
          /*nop*/ ;                                   /*lnop*/                                          #  56
          /*nop*/ ;                                   shufb       outa, xazaxbzb, yasaybsb, shufAaBb    #  57
          /*nop*/ ;                                   shufb       outb, xazaxbzb, yasaybsb, shufCcDd    #  58
          /*nop*/ ;                                   shufb       outc, xczcxdzd, ycscydsd, shufAaBb    #  59
          /*nop*/ ;                                   shufb       outd, xczcxdzd, ycscydsd, shufCcDd    #  60
          /*nop*/ ;                                   stqd        outa, 0x00 ( out )                    #  61
          /*nop*/ ;                                   stqd        outb, 0x10 ( out )                    #  62
          /*nop*/ ;                                   stqd        outc, 0x20 ( out )                    #  63
          nop ;                                       stqd        outd, 0x30 ( out )                    #  64
          ai          out, out, 0x40 ; loop_branch:   brnz        count, loop                           #  65

          bi          $0

          .size       assembler, .-assembler
 
  

This version is now fractionally faster than the intrinsics version, but pretty much the same. Look how many nops there are in there. This code is nowhere near fully utilizing the SPU. Ideally we’d like to be issuing two instructions every cycle. The loop has 63 instructions that do something useful, and 69 nops (mostly commented out), so it is achieving less than 50% utilization of the SPU.

Finally, it’s time to look at software pipelining. The idea is to make use of the spare instruction cycles by executing several iterations of the loop at the same time. But this is not unrolling the loop. Instead, the body of the loop is wrapped around on itself. So let’s see how it can be done.

The loop has 27 pipe 0 instructions, and 36 pipe 1 (not including any nops). That means that the pipe 1 instructions are the limiting factor, the loop cannot be executed in less than 36 cycles. First we split the loop after cycle 34, and reschedule the remaining instructions back into the start of the loop.

The ROTMI from cycle 35 can be slotted into cycle 5. The SHLI from cycle 36 can be slotted into cycle 6, and so on. So now when the SPU is executing one iteration of the loop, these instructions correspond to one loop behind most of the other instructions. Not every instruction was able to be scheduled the first time wrapping around. Several instructions remained, so again we wrapped them around. The end result was three iterations of the loop pipelined inside each other.

We are going to need some prologue code before entering the loop, as otherwise the first two iterations will be using some undefined values. We simply take two copies of the loop, and remove the instructions that were pipelined from a previous iteration. There is then the question of an epilogue to the loop. Because all the stores were scheduled in the same iteration of the loop, we have an option. One possibility is to create an epilogue the same way as we did a prologue; the other is to just execute the loop two more times. We chose to do without an epilogue to save some code space.

Unfortunately it isn’t always as “easy” as this. When wrapping the loop around like this, we need to be very careful that the resulting loop actually still does the same thing. We need to make sure that the instructions are placed before any others that modify the registers they use. We’ll see an example of this problem soon.

Below is the optimized function with pipelining (again we have abbreviated it, not copying all the .set pseudo ops).

abbreviated assembler_2.s

 
          # Software-pipelined version of assembler: three iterations of the
          # original loop body execute concurrently.  Unmarked instructions are
          # stage 1 (this iteration's loads and shuffle-control setup); /*2*/
          # marks stage 2 (float conversion of the data loaded one iteration
          # earlier); /*3*/ marks stage 3 (stores for data loaded two
          # iterations earlier).  The two blocks before `loop:` are the
          # prologue: copies of the loop body with the not-yet-valid
          # later-stage instructions removed, priming the pipeline.
          .align      3
  assembler:

          ai          count, count, 3 ;               lqr         x_scale, _x_scale
          andi        count, count, -4 ;              lqr         yz_scale, _yz_scale
          shli        strd4, stride, 2 ;              lqr         shufAaBb, _shufAaBb
          ilhu        onef, 0x3f80 ;                  cwd         shufabcd, 0 ( $1 )
          ilhu        neg_onef, 0xbf80 ;              hbrr        loop_branch, loop
          ilh         second_qword, 0x1010 ;          /*lnop*/
          il          x3ff, 0x3ff ;                   /*lnop*/
          andbi       shufabcd, shufabcd, 15 ;        /*lnop*/
          a           p_inb, p_ina, stride ;          /*lnop*/
          a           p_inc, p_inb, stride ;          /*lnop*/
          a           p_ind, p_inc, stride ;          /*lnop*/
          orbi        shufCcDd, shufAaBb, 8 ;         /*lnop*/

          # Prologue, first copy: stage 1 only.
          /*nop*/ ;                                   lqd         qa0, 0  ( p_ina )
          andi        p_ina_tmp, p_ina, 15 ;          lqd         qa1, 16 ( p_ina )
          andi        p_inb_tmp, p_inb, 15 ;          lqd         qb0, 0  ( p_inb )
          andi        p_inc_tmp, p_inc, 15 ;          lqd         qb1, 16 ( p_inb )
          andi        p_ind_tmp, p_ind, 15 ;          lqd         qc0, 0  ( p_inc )
          /*nop*/ ;                                   lqd         qc1, 16 ( p_inc )
          /*nop*/ ;                                   lqd         qd0, 0  ( p_ind )
          /*nop*/ ;                                   lqd         qd1, 16 ( p_ind )
          /*nop*/ ;                                   rotqby      shuf_a, shufabcd, p_ina
          a           p_ina, p_ina, strd4 ;           rotqby      shuf_b, shufabcd, p_inb
          a           p_inb, p_inb, strd4 ;           rotqby      shuf_c, shufabcd, p_inc
          a           p_inc, p_inc, strd4 ;           rotqby      shuf_d, shufabcd, p_ind
          a           p_ind, p_ind, strd4 ;           shlqby      p_ina_tmp, second_qword, p_ina_tmp
          /*nop*/ ;                                   shlqby      p_inc_tmp, second_qword, p_inc_tmp
          /*nop*/ ;                                   shlqby      p_inb_tmp, second_qword, p_inb_tmp
          nop ;                                       shlqby      p_ind_tmp, second_qword, p_ind_tmp
          andc        shuf_a, shuf_a, p_ina_tmp ;     /*lnop*/
          andc        shuf_c, shuf_c, p_inc_tmp ;     /*lnop*/
          andc        shuf_b, shuf_b, p_inb_tmp ;     shufb       u32a, qa0, qa1, shuf_a
          andc        shuf_d, shuf_d, p_ind_tmp ;     shufb       u32c, qc0, qc1, shuf_c
          /*nop*/ ;                                   shufb       u32b, qb0, qb1, shuf_b
          /*nop*/ ;                                   shufb       u32d, qd0, qd1, shuf_d
          /*nop*/ ;                                   /*lnop*/
          /*nop*/ ;                                   shufb       u32ac, u32a, u32c, shufAaBb
          /*nop*/ ;                                   /*lnop*/
          /*nop*/ ;                                   shufb       u32bd, u32b, u32d, shufAaBb
          /*nop*/ ;                                   /*lnop*/
          /*nop*/ ;                                   /*lnop*/
          /*nop*/ ;                                   /*lnop*/
          nop ;                                       shufb       u32, u32ac, u32bd, shufAaBb
          /*nop*/ ;                                   /*lnop*/
          /*nop*/ ;                                   /*lnop*/
          /*nop*/ ;                                   /*lnop*/
          rotmi       z, u32, -1 ;                    lnop
          rotmi       y, u32, -11 ;                   /*lnop*/

          # Prologue, second copy: stage 1 for the second group, stage 2 for
          # the first group.
          /*nop*/ ;                                   lqd         qa0, 0  ( p_ina )
          andi        p_ina_tmp, p_ina, 15 ;          lqd         qa1, 16 ( p_ina )
          andi        p_inb_tmp, p_inb, 15 ;          lqd         qb0, 0  ( p_inb )
          andi        p_inc_tmp, p_inc, 15 ;          lqd         qb1, 16 ( p_inb )
          andi        p_ind_tmp, p_ind, 15 ;          lqd         qc0, 0  ( p_inc )
    /*2*/ rotmi       x, u32, -21 ;                   lqd         qc1, 16 ( p_inc )
    /*2*/ shli        s, u32, 31 ;                    lqd         qd0, 0  ( p_ind )
    /*2*/ and         z, z, x3ff ;                    lqd         qd1, 16 ( p_ind )
    /*2*/ and         y, y, x3ff ;                    rotqby      shuf_a, shufabcd, p_ina
          a           p_ina, p_ina, strd4 ;           rotqby      shuf_b, shufabcd, p_inb
          a           p_inb, p_inb, strd4 ;           rotqby      shuf_c, shufabcd, p_inc
          a           p_inc, p_inc, strd4 ;           rotqby      shuf_d, shufabcd, p_ind
          a           p_ind, p_ind, strd4 ;           shlqby      p_ina_tmp, second_qword, p_ina_tmp
    /*2*/ cuflt       z, z, 0 ;                       shlqby      p_inc_tmp, second_qword, p_inc_tmp
    /*2*/ cuflt       x, x, 0 ;                       shlqby      p_inb_tmp, second_qword, p_inb_tmp
    /*2*/ cuflt       y, y, 0 ;                       shlqby      p_ind_tmp, second_qword, p_ind_tmp
          andc        shuf_a, shuf_a, p_ina_tmp ;     /*lnop*/
          andc        shuf_c, shuf_c, p_inc_tmp ;     /*lnop*/
          andc        shuf_b, shuf_b, p_inb_tmp ;     shufb       u32a, qa0, qa1, shuf_a
          andc        shuf_d, shuf_d, p_ind_tmp ;     shufb       u32c, qc0, qc1, shuf_c
    /*2*/ or          s, s, onef ;                    shufb       u32b, qb0, qb1, shuf_b
    /*2*/ fma         z, z, yz_scale, neg_onef ;      shufb       u32d, qd0, qd1, shuf_d
    /*2*/ fma         x, x, x_scale,  neg_onef ;      lnop
    /*2*/ fma         y, y, yz_scale, neg_onef ;      shufb       u32ac, u32a, u32c, shufAaBb
          /*nop*/ ;                                   /*lnop*/
          /*nop*/ ;                                   shufb       u32bd, u32b, u32d, shufAaBb
          /*nop*/ ;                                   /*lnop*/
          /*nop*/ ;                                   /*lnop*/
          /*nop*/ ;                                   shufb       xazaxbzb, x, z, shufAaBb            /*2*/
          /*nop*/ ;                                   shufb       u32, u32ac, u32bd, shufAaBb
          /*nop*/ ;                                   shufb       yasaybsb, y, s, shufAaBb            /*2*/
          /*nop*/ ;                                   shufb       xczcxdzd, x, z, shufCcDd            /*2*/
          /*nop*/ ;                                   shufb       ycscydsd, y, s, shufCcDd            /*2*/
          rotmi       z, u32, -1 ;                    lnop
          rotmi       y, u32, -11 ;                   shufb       outa, xazaxbzb, yasaybsb, shufAaBb  /*2*/

      loop:
          # pipeline 0                                # pipeline 1                                            # cycle
          ai          count, count, -4 ;              lqd         qa0, 0  ( p_ina )                           #   0
          andi        p_ina_tmp, p_ina, 15 ;          lqd         qa1, 16 ( p_ina )                           #   1
          andi        p_inb_tmp, p_inb, 15 ;          lqd         qb0, 0  ( p_inb )                           #   2
          andi        p_inc_tmp, p_inc, 15 ;          lqd         qb1, 16 ( p_inb )                           #   3
          andi        p_ind_tmp, p_ind, 15 ;          lqd         qc0, 0  ( p_inc )                           #   4
    /*2*/ rotmi       x, u32, -21 ;                   lqd         qc1, 16 ( p_inc )                           #   5
    /*2*/ shli        s, u32, 31 ;                    lqd         qd0, 0  ( p_ind )                           #   6
    /*2*/ and         z, z, x3ff ;                    lqd         qd1, 16 ( p_ind )                           #   7
    /*2*/ and         y, y, x3ff ;                    rotqby      shuf_a, shufabcd, p_ina                     #   8
          a           p_ina, p_ina, strd4 ;           rotqby      shuf_b, shufabcd, p_inb                     #   9
          a           p_inb, p_inb, strd4 ;           rotqby      shuf_c, shufabcd, p_inc                     #  10
          a           p_inc, p_inc, strd4 ;           rotqby      shuf_d, shufabcd, p_ind                     #  11
          a           p_ind, p_ind, strd4 ;           shlqby      p_ina_tmp, second_qword, p_ina_tmp          #  12
    /*2*/ cuflt       z, z, 0 ;                       shlqby      p_inc_tmp, second_qword, p_inc_tmp          #  13
    /*2*/ cuflt       x, x, 0 ;                       shlqby      p_inb_tmp, second_qword, p_inb_tmp          #  14
    /*2*/ cuflt       y, y, 0 ;                       shlqby      p_ind_tmp, second_qword, p_ind_tmp          #  15
          andc        shuf_a, shuf_a, p_ina_tmp ;     shufb       outb, xazaxbzb, yasaybsb, shufCcDd  /*3*/   #  16
          andc        shuf_c, shuf_c, p_inc_tmp ;     shufb       outc, xczcxdzd, ycscydsd, shufAaBb  /*3*/   #  17
          andc        shuf_b, shuf_b, p_inb_tmp ;     shufb       u32a, qa0, qa1, shuf_a                      #  18
          andc        shuf_d, shuf_d, p_ind_tmp ;     shufb       u32c, qc0, qc1, shuf_c                      #  19
    /*2*/ or          s, s, onef ;                    shufb       u32b, qb0, qb1, shuf_b                      #  20
    /*2*/ fma         z, z, yz_scale, neg_onef ;      shufb       u32d, qd0, qd1, shuf_d                      #  21
    /*2*/ fma         x, x, x_scale,  neg_onef ;      shufb       outd, xczcxdzd, ycscydsd, shufCcDd  /*3*/   #  22
    /*2*/ fma         y, y, yz_scale, neg_onef ;      shufb       u32ac, u32a, u32c, shufAaBb                 #  23
          /*nop*/ ;                                   stqd        outa, 0x00 ( out )                  /*3*/   #  24
          /*nop*/ ;                                   shufb       u32bd, u32b, u32d, shufAaBb                 #  25
          /*nop*/ ;                                   stqd        outb, 0x10 ( out )                  /*3*/   #  26
          /*nop*/ ;                                   stqd        outc, 0x20 ( out )                  /*3*/   #  27
          /*nop*/ ;                                   shufb       xazaxbzb, x, z, shufAaBb            /*2*/   #  28
          /*nop*/ ;                                   shufb       u32, u32ac, u32bd, shufAaBb                 #  29
          /*nop*/ ;                                   shufb       yasaybsb, y, s, shufAaBb            /*2*/   #  30
          /*nop*/ ;                                   shufb       xczcxdzd, x, z, shufCcDd            /*2*/   #  31
          nop ;                                       shufb       ycscydsd, y, s, shufCcDd            /*2*/   #  32
          rotmi       z, u32, -1 ;                    stqd        outd, 0x30 ( out )                  /*3*/   #  33
          rotmi       y, u32, -11 ;                   shufb       outa, xazaxbzb, yasaybsb, shufAaBb  /*2*/   #  34
    /*3*/ ai          out, out, 0x40 ; loop_branch:   brnz        count, loop                                 #  35

          bi          $0

          .size       assembler, .-assembler
 
  

Nice, 61 down to 36 cycles. We are issuing a pipe 1 instruction every cycle. But what if we could trade some pipe 1 instructions for equivalent pipe 0 instructions? This is where some quite interesting, and somewhat counter intuitive, optimizations come into play.

Part of the code used for generating the SHUFB controls,

 
          andi        p_ina_tmp, p_ina, 15
 
          shlqby      p_ina_tmp, second_qword, p_ina_tmp
 
  

can be replaced with

 
          cgtb        p_ina_tmp, cmp_addr_mod16, p_ina_mod16_byte
 
          andbi       p_ina_tmp, p_ina_tmp, 16
 
          a           p_ina_mod16_byte, p_ina_mod16_byte, stride4_mod16_byte
 
          andbi       p_ina_mod16_byte, p_ina_mod16_byte, 15
 
  

where cmp_addr_mod16 is preloaded with {0x100f0e0d,0x0c0b0a09,0x08070605,0x04030201}, p_ina_mod16_byte contains p_ina%16 in every byte, and stride4_mod16_byte contains (stride*4)%16 in every byte. I say “somewhat counter intuitive”, as we have locally increased the latency, from 6 to 8 cycles.

So we have traded one pipe 1 instruction for three pipe 0 instructions. If we also do this for p_inb_tmp, then we’ll have 33 pipe 0 instructions and 34 pipe 1.

This was a little trickier to schedule this time, since we had more instructions in fewer slots. The problem was that the lifetime of some variables was too long. Originally, to calculate x, we had the following code,

 
          rotmi       x, u32, -21
 
          cuflt       x, x, 0
 
          fma         x, x, x_scale, neg_onef
 
  

That is a long 17 cycles, and it was preventing the

 
          shufb       xazaxbzb, x, z, shufAaBb
 
  

from being correctly scheduled. So what we did is break x into several different variables,

 
          rotmi       x0, u32, -21
 
          cuflt       x1, x0, 0
 
          fma         x2, x1, x_scale, neg_onef
 
  

The SHUFB only needed to use x2, so this could then be scheduled without a problem. The register usage increased, but we just managed to stay within the registers specified as volatile by the ABI, so we didn’t need to go saving and restoring registers in the function prologue/epilogue.

Another trick in this final version is to remove some of the loop prologue by using the “trashcan-stores” technique. The idea is to call the loop body extra times rather than prologue code. The output written by these extra loop iterations is written to a “trashcan” location on the stack instead of the caller’s buffer. To implement this, the input argument out has been renamed to out_tmp, and in the setup code we added

 
          ai          out, $1, -0x40
 
          ai          out_tmp, out_tmp, -0x40
 
  

and to the loop, we replaced the

 
          # stores ...
 
          ai          out, out, 0x40
 
  

with

 
          ai          out_tmp, out_tmp, 0x40
 
          # stores ...
 
          ai          out, out_tmp, 0
 
  

After exchanging pipe 1 for pipe 0 instructions, we were left with one less pipe 0 instruction, which is now used to copy out_tmp to out. With a second temporary variable we could have eliminated all of the prologue code, but the register copy would cost us an extra cycle.

assembler.s

 
          .section    .text.assembler, "ax", @progbits

          .global     assembler
          .type       assembler, @function

          # $3    void *__restrict__ out
          # $4    const void *__restrict__ in
          # $5    uint32_t count
          # $6    uint32_t stride
          #
          # Unpacks "count" packed tangent words (x:11 y:10 z:10 sign:1 bit
          # fields -- see COMPRESSED_TANGENT in test.c++) into float
          # quadwords, producing four results per loop iteration.  The loop
          # is software pipelined and hand-scheduled for SPU dual issue:
          # each line pairs one pipe 0 instruction with one pipe 1
          # instruction.  NOTE(review): the /*2*/ and /*3*/ tags appear to
          # mark instructions belonging to later software-pipeline stages,
          # i.e. they operate on data fetched in earlier iterations --
          # confirm against the accompanying article text.
          .set        out_tmp,            3
          .set        in,                 4
          .set        count,              5
          .set        stride,             6
          .align      3
  assembler:

          .set        p_ina,              in
          .set        p_inb,              7
          .set        p_inc,              8
          .set        p_ind,              9
          .set        p_ina_0,            10
          .set        p_inb_0,            11
          .set        p_inc_0,            12
          .set        p_ind_0,            13

          .set        strd4,              14
          .set        onef,               15
          .set        neg_onef,           16
          .set        shufabcd,           17
          .set        second_qword,       18
          .set        x_scale,            19
          .set        yz_scale,           20
          .set        shufAaBb,           21
          .set        shufCcDd,           22
          .set        x3ff,               23

          .set        qa0,                24
          .set        qa1,                25
          .set        qb0,                26
          .set        qb1,                27
          .set        qc0,                28
          .set        qc1,                29
          .set        qd0,                30
          .set        qd1,                31
          .set        shuf_a,             32
          .set        shuf_b,             33
          .set        shuf_c,             34
          .set        shuf_d,             35
          .set        u32a,               36
          .set        u32b,               37
          .set        u32c,               38
          .set        u32d,               39
          .set        u32ac,              40
          .set        u32bd,              41
          .set        u32,                42
          .set        x,                  43
          .set        y,                  44
          .set        z,                  45
          .set        s,                  46
          .set        xazaxbzb,           47
          .set        xczcxdzd,           48
          .set        yasaybsb,           49
          .set        ycscydsd,           50
          .set        outa,               51
          .set        outb,               52
          .set        outc,               53
          .set        outd,               54
          .set        p_ina_mod16_byte,   55
          .set        p_inb_mod16_byte,   56
          .set        byte_splat,         57
          .set        cmp_addr_mod16,     58
          .set        strd4_mod16b,       59
          .set        p_ina_mod16b,       60
          .set        p_inb_mod16b,       61
          .set        x0,                 62
          .set        y0,                 63
          .set        z0,                 64
          .set        s0,                 65
          .set        x1,                 66
          .set        y1,                 67
          .set        z1,                 68
          .set        s1,                 69
          .set        x2,                 70
          .set        y2,                 71
          .set        z2,                 72
          .set        y3,                 73
          .set        z3,                 74
          .set        p_ina_1,            75
          .set        p_inb_1,            76
          .set        p_inc_1,            77
          .set        p_ind_1,            78
          .set        out,                79

          # Loop-invariant setup.  count is rounded up so the extra
          # pipeline-priming iterations run the full loop body; their
          # stores land in a "trashcan" area at $1-0x40 rather than the
          # caller's buffer (the out/out_tmp trick described in the text).
          ilh         byte_splat, 0x0303 ;                        lqr         cmp_addr_mod16, _cmp_addr_mod16
          ai          count, count, 7 ;                           lqr         x_scale, _x_scale
          andi        count, count, -4 ;                          lqr         yz_scale, _yz_scale
          shli        strd4, stride, 2 ;                          lqr         shufAaBb, _shufAaBb
          ilhu        onef, 0x3f80 ;                              cwd         shufabcd, 0 ( $1 )
          ilhu        neg_onef, 0xbf80 ;                          hbrr        loop_branch, loop
          ilh         second_qword, 0x1010 ;                      shufb       p_ina_mod16_byte, p_ina, p_ina, byte_splat
          il          x3ff, 0x3ff ;                               shufb       p_inb_mod16_byte, p_inb, p_inb, byte_splat
          andbi       shufabcd, shufabcd, 15 ;                    shufb       strd4_mod16b, strd4, strd4, byte_splat
          a           p_inb, p_ina, stride ;                      /*lnop*/
          a           p_inc, p_inb, stride ;                      /*lnop*/
          a           p_ind, p_inc, stride ;                      /*lnop*/
          orbi        shufCcDd, shufAaBb, 8 ;                     /*lnop*/
          andbi       p_ina_mod16_byte, p_ina_mod16_byte, 15 ;    /*lnop*/
          andbi       p_inb_mod16_byte, p_inb_mod16_byte, 15 ;    /*lnop*/
          andbi       strd4_mod16b, strd4_mod16b, 15 ;            /*lnop*/
          ai          out, $1, -0x40 ;                            /*lnop*/
          ai          out_tmp, out_tmp, -0x40 ;                   /*lnop*/

          # Loop prologue: prime the software pipeline by issuing the first
          # iteration's loads and shuffle-control computations up front.
          /*nop*/ ;                                               lqd         qa0, 0  ( p_ina )
          cgtb        p_ina_0, cmp_addr_mod16, p_ina_mod16b ;     lqd         qa1, 16 ( p_ina )
          cgtb        p_inb_0, cmp_addr_mod16, p_inb_mod16b ;     lqd         qb0, 0  ( p_inb )
          andi        p_inc_0, p_inc, 15 ;                        lqd         qb1, 16 ( p_inb )
          andi        p_ind_0, p_ind, 15 ;                        lqd         qc0, 0  ( p_inc )
          /*nop*/ ;                                               lqd         qc1, 16 ( p_inc )
          /*nop*/ ;                                               lqd         qd0, 0  ( p_ind )
          /*nop*/ ;                                               lqd         qd1, 16 ( p_ind )
          /*nop*/ ;                                               rotqby      shuf_a, shufabcd, p_ina
          a           p_ina, p_ina, strd4 ;                       rotqby      shuf_b, shufabcd, p_inb
          a           p_inb, p_inb, strd4 ;                       rotqby      shuf_c, shufabcd, p_inc
          a           p_inc, p_inc, strd4 ;                       rotqby      shuf_d, shufabcd, p_ind
          a           p_ind, p_ind, strd4 ;                       shlqby      p_inc_1, second_qword, p_inc_0
          andbi       p_ina_1, p_ina_0, 16 ;                      shlqby      p_ind_1, second_qword, p_ind_0
          andbi       p_inb_1, p_inb_0, 16 ;                      /*lnop*/
          andc        shuf_a, shuf_a, p_ina_1 ;                   /*lnop*/
          andc        shuf_c, shuf_c, p_inc_1 ;                   lnop
          andc        shuf_b, shuf_b, p_inb_1 ;                   shufb       u32a, qa0, qa1, shuf_a
          andc        shuf_d, shuf_d, p_ind_1 ;                   shufb       u32c, qc0, qc1, shuf_c
          /*nop*/ ;                                               shufb       u32b, qb0, qb1, shuf_b
          /*nop*/ ;                                               shufb       u32d, qd0, qd1, shuf_d
          /*nop*/ ;                                               /*lnop*/
          nop ;                                                   shufb       u32ac, u32a, u32c, shufAaBb
          /*nop*/ ;                                               /*lnop*/
          a           p_ina_mod16b, p_ina_mod16b, strd4_mod16b ;  shufb       u32bd, u32b, u32d, shufAaBb
          a           p_inb_mod16b, p_inb_mod16b, strd4_mod16b ;  /*lnop*/
          andbi       p_ina_mod16b, p_ina_mod16b, 15 ;            /*lnop*/
          andbi       p_inb_mod16b, p_inb_mod16b, 15 ;            /*lnop*/
          /*nop*/ ;                                               shufb       u32, u32ac, u32bd, shufAaBb

      # Steady-state kernel: 34 dual-issued cycles (0..33) per four
      # decompressed outputs.
      loop:
          # pipeline 0                                            # pipeline 1                                        # cycle
          ai          count, count, -4 ;                          lqd         qa0, 0  ( p_ina )                       #   0
          cgtb        p_ina_0, cmp_addr_mod16, p_ina_mod16b ;     lqd         qa1, 16 ( p_ina )                       #   1
          cgtb        p_inb_0, cmp_addr_mod16, p_inb_mod16b ;     lqd         qb0, 0  ( p_inb )                       #   2
          andi        p_inc_0, p_inc, 15 ;                        lqd         qb1, 16 ( p_inb )                       #   3
          andi        p_ind_0, p_ind, 15 ;                        lqd         qc0, 0  ( p_inc )                       #   4
    /*2*/ rotmi       z0, u32, -1 ;                               lqd         qc1, 16 ( p_inc )                       #   5
    /*2*/ rotmi       y0, u32, -11 ;                              lqd         qd0, 0  ( p_ind )                       #   6
    /*2*/ rotmi       x0, u32, -21 ;                              lqd         qd1, 16 ( p_ind )                       #   7
    /*2*/ shli        s0, u32, 31 ;                               rotqby      shuf_a, shufabcd, p_ina                 #   8
          a           p_ina, p_ina, strd4 ;                       rotqby      shuf_b, shufabcd, p_inb                 #   9
          a           p_inb, p_inb, strd4 ;                       rotqby      shuf_c, shufabcd, p_inc                 #  10
          a           p_inc, p_inc, strd4 ;                       rotqby      shuf_d, shufabcd, p_ind                 #  11
          a           p_ind, p_ind, strd4 ;                       shlqby      p_inc_1, second_qword, p_inc_0          #  12
          andbi       p_ina_1, p_ina_0, 16 ;                      shlqby      p_ind_1, second_qword, p_ind_0          #  13
          andbi       p_inb_1, p_inb_0, 16 ;                      shufb       xazaxbzb, x2, z3, shufAaBb         /*3*/#  14
    /*2*/ and         z1, z0, x3ff ;                              shufb       yasaybsb, y3, s1, shufAaBb         /*3*/#  15
          andc        shuf_a, shuf_a, p_ina_1 ;                   shufb       xczcxdzd, x2, z3, shufCcDd         /*3*/#  16
          andc        shuf_c, shuf_c, p_inc_1 ;                   shufb       ycscydsd, y3, s1, shufCcDd         /*3*/#  17
          andc        shuf_b, shuf_b, p_inb_1 ;                   shufb       u32a, qa0, qa1, shuf_a                  #  18
          andc        shuf_d, shuf_d, p_ind_1 ;                   shufb       u32c, qc0, qc1, shuf_c                  #  19
    /*2*/ and         y1, y0, x3ff ;                              shufb       u32b, qb0, qb1, shuf_b                  #  20
    /*2*/ cuflt       z2, z1, 0 ;                                 shufb       u32d, qd0, qd1, shuf_d                  #  21
    /*2*/ cuflt       x1, x0, 0 ;                                 shufb       outa, xazaxbzb, yasaybsb, shufAaBb /*3*/#  22
    /*2*/ cuflt       y2, y1, 0 ;                                 shufb       u32ac, u32a, u32c, shufAaBb             #  23
    /*3*/ ai          out_tmp, out_tmp, 0x40 ;                    shufb       outb, xazaxbzb, yasaybsb, shufCcDd /*3*/#  24
          a           p_ina_mod16b, p_ina_mod16b, strd4_mod16b ;  shufb       u32bd, u32b, u32d, shufAaBb             #  25
          a           p_inb_mod16b, p_inb_mod16b, strd4_mod16b ;  shufb       outc, xczcxdzd, ycscydsd, shufAaBb /*3*/#  26
          andbi       p_ina_mod16b, p_ina_mod16b, 15 ;            shufb       outd, xczcxdzd, ycscydsd, shufCcDd /*3*/#  27
    /*2*/ fma         z3, z2, yz_scale, neg_onef ;                stqd        outa, 0x00 ( out )                 /*3*/#  28
    /*2*/ fma         x2, x1, x_scale,  neg_onef ;                shufb       u32, u32ac, u32bd, shufAaBb             #  29
    /*2*/ fma         y3, y2, yz_scale, neg_onef ;                stqd        outb, 0x10 ( out )                 /*3*/#  30
          andbi       p_inb_mod16b, p_inb_mod16b, 15 ;            stqd        outc, 0x20 ( out )                 /*3*/#  31
    /*2*/ or          s1, s0, onef ;                              stqd        outd, 0x30 ( out )                 /*3*/#  32
    /*3*/ ai          out, out_tmp, 0 ;            loop_branch:   brnz        count, loop                             #  33

          bi          $0

          .size       assembler, .-assembler

           .section    .rodata.assembler, "a", @progbits
          .align      4
          # Quadword constants loaded once by the setup code above.
  _x_scale:           .float  0.0009770396, 0.0009770396, 0.0009770396, 0.0009770396
  _yz_scale:          .float  0.0019550342, 0.0019550342, 0.0019550342, 0.0019550342
  _shufAaBb:          .long   0x00010203,   0x10111213,   0x04050607,   0x14151617
  _cmp_addr_mod16:    .long   0x100f0e0d,   0x0c0b0a09,   0x08070605,   0x04030201
 
  

So our final results are,

 
                 cycles       speed up
 
      reference  0x00001f36
 
      intrinsics 0x0000052f   6.021
 
      assembler  0x00000290   12.180
 
  

When a subject is complicated, reading different explanations from different people can really help your understanding. Also here on #AltDevBlogADay, Jaymin Kessler has put together some excellent videos on the same topic, if you haven’t already, have a look at /2011/03/16/software-pipelining-failed-video-experiment/. The “trashcan-stores” technique comes from Jon Rocatis, and is described in the comment he posted to Jaymin’s blog post.

If you want to mess around with this code, or verify results, the test setup used was,

test.c++

 
  #include <spu_intrinsics.h>
 
  #include <stdint.h>
 
  #include <stdio.h>
 
  #include <string.h>
 
  
 
  /* Packs a tangent-space vector into the bit layout the three decode
   * routines unpack: x in bits 31..21 (11 bits), y in bits 20..11,
   * z in bits 10..1 (10 bits each), bitangent sign in bit 0. */
  #define COMPRESSED_TANGENT( TANGENT_X, TANGENT_Y, TANGENT_Z, BITANGENT_SIGN ) \
     ((uint32_t)(((TANGENT_X)*0.5f+0.5f)*0x7ff))<<21 \
    |((uint32_t)(((TANGENT_Y)*0.5f+0.5f)*0x3ff))<<11 \
    |((uint32_t)(((TANGENT_Z)*0.5f+0.5f)*0x3ff))<<1  \
    |((BITANGENT_SIGN)<0.f)
  /* Filler words: two per element give the 12-byte input stride used below. */
  #define PADDING 0xdecafbad
  /* Twelve test tangents: the six axis directions with both bitangent signs. */
  #define TEST_VALUES                                                     \
    COMPRESSED_TANGENT(+1.f, 0.f, 0.f,+1.f), PADDING, PADDING,            \
    COMPRESSED_TANGENT(-1.f, 0.f, 0.f,+1.f), PADDING, PADDING,            \
    COMPRESSED_TANGENT( 0.f,+1.f, 0.f,+1.f), PADDING, PADDING,            \
    COMPRESSED_TANGENT( 0.f,-1.f, 0.f,+1.f), PADDING, PADDING,            \
    COMPRESSED_TANGENT( 0.f, 0.f,+1.f,+1.f), PADDING, PADDING,            \
    COMPRESSED_TANGENT( 0.f, 0.f,-1.f,+1.f), PADDING, PADDING,            \
    COMPRESSED_TANGENT(+1.f, 0.f, 0.f,-1.f), PADDING, PADDING,            \
    COMPRESSED_TANGENT(-1.f, 0.f, 0.f,-1.f), PADDING, PADDING,            \
    COMPRESSED_TANGENT( 0.f,+1.f, 0.f,-1.f), PADDING, PADDING,            \
    COMPRESSED_TANGENT( 0.f,-1.f, 0.f,-1.f), PADDING, PADDING,            \
    COMPRESSED_TANGENT( 0.f, 0.f,+1.f,-1.f), PADDING, PADDING,            \
    COMPRESSED_TANGENT( 0.f, 0.f,-1.f,-1.f), PADDING, PADDING
  #define TEST_VALUES_X16                                                 \
    TEST_VALUES, TEST_VALUES, TEST_VALUES, TEST_VALUES,                   \
    TEST_VALUES, TEST_VALUES, TEST_VALUES, TEST_VALUES,                   \
    TEST_VALUES, TEST_VALUES, TEST_VALUES, TEST_VALUES,                   \
    TEST_VALUES, TEST_VALUES, TEST_VALUES, TEST_VALUES
  #define TEST_VALUES_X256                                                \
    TEST_VALUES_X16, TEST_VALUES_X16, TEST_VALUES_X16, TEST_VALUES_X16,   \
    TEST_VALUES_X16, TEST_VALUES_X16, TEST_VALUES_X16, TEST_VALUES_X16,   \
    TEST_VALUES_X16, TEST_VALUES_X16, TEST_VALUES_X16, TEST_VALUES_X16,   \
    TEST_VALUES_X16, TEST_VALUES_X16, TEST_VALUES_X16, TEST_VALUES_X16
  /* Input buffer: the 12-value pattern repeated 256 times (3 words/element). */
  static const uint32_t test_data[]={
    TEST_VALUES_X256
  };
  #define ARRAYLEN(ARRAY) (sizeof(ARRAY)/sizeof(*(ARRAY)))

  /* Output buffer: four words per 3-word input element; 16-byte aligned
   * because the assembler version writes whole quadwords with stqd. */
  static uint32_t results[ (ARRAYLEN(test_data)/3)*4 ]
    __attribute__((__aligned__(16)));
 
  
 
  /* Three implementations of the same tangent decompression routine,
   * defined in reference.c++, intrinsics.c++ and assembler.s. */
  extern void reference( void *__restrict__ out,const void *__restrict__ in,
    uint32_t count, uint32_t stride );
  extern void intrinsics( void *__restrict__ out,const void *__restrict__ in,
    uint32_t count, uint32_t stride );
  /* C linkage so the symbol matches the unmangled .global label in assembler.s. */
  extern "C" void assembler( void *__restrict__ out,const void *__restrict__ in,
    uint32_t count, uint32_t stride );
 
  
 
  /* Times each of the three decode implementations over the shared test
   * set and prints raw cycle counts plus speed-ups versus the reference. */
  int main( int argc, char **argv ) {

    (void) argc;
    (void) argv;

    /* Each measurement arms the SPU decrementer with -1, runs one
     * implementation over all the elements (12-byte input stride), then
     * reads back how far the decrementer counted down. */
    spu_writech( SPU_WrDec, -1 );
    reference( results, test_data, ARRAYLEN(test_data)/3, 12 );
    const uint32_t cycles_reference = -spu_readch( SPU_RdDec );

    spu_writech( SPU_WrDec, -1 );
    intrinsics( results, test_data, ARRAYLEN(test_data)/3, 12 );
    const uint32_t cycles_intrinsics = -spu_readch( SPU_RdDec );

    spu_writech( SPU_WrDec, -1 );
    assembler( results, test_data, ARRAYLEN(test_data)/3, 12 );
    const uint32_t cycles_assembler = -spu_readch( SPU_RdDec );

    /* Speed-up is relative to the plain C++ reference implementation. */
    printf(
      "           cycles       speed up\n"
      "reference  0x%08x\n"
      "intrinsics 0x%08x   %.3f\n"
      "assembler  0x%08x   %.3f\n",
      cycles_reference,
      cycles_intrinsics, (float)cycles_reference/(float)cycles_intrinsics,
      cycles_assembler,  (float)cycles_reference/(float)cycles_assembler );

    return 0;
  }
 
  

build.sh

 
  #! /bin/sh

  set -e

  # Compile the three C++ translation units with identical flags, assemble
  # the hand-written SPU routine, then link everything into the test binary.
  for unit in test reference intrinsics; do
      spu-g++ -c -O3 -Werror -Wall -pedantic ${unit}.c++ -o ${unit}.o
  done
  spu-as --fatal-warnings assembler.s -o assembler.o
  spu-g++ test.o reference.o intrinsics.o assembler.o -o test

  # Push the binary to the PS3 (address in PS3_USER_IP) and run it there.
  scp test ${PS3_USER_IP}:/tmp/test
  ssh ${PS3_USER_IP} /tmp/test
  

To be honest, you do need to think carefully if it is really the right move to optimize like this. Chances are, your scarce dev time can be put to better use optimizing something else. As we saw, the code became very fast, twice as fast as a fairly respectable intrinsics implementation, but it also became quite difficult to maintain. If you now want to make any change, you pretty much always need to reschedule the whole thing again from scratch. Luckily, if you are a registered PS3 developer, then Sony has a very awesome tool which takes the manual pain out of the process. But as always, to fully get the most from your tools, you need to understand how they work. Specifically, reducing the lifetime of a variable like we did for x by using several temporaries helps. And if you’re working on some other hardware where the tools do not exist, then this can get you that last elusive boost of speed you need.

Next time around, we’ll have a look at branch prediction, and how to remove branches.