Device Works learn. build.

Microbenchmarks on ARM Cortex M

After the last post we’ve got some cool hybrid c/asm code. It must be faster than the naive solution, but how do we know? Happily there exists dedicated debug hardware that can count the cycles of execution without disturbing them! Let’s start over with the basic code from the last post and add this in.

int dot(short * a, short * b, int len) {
  int r=0;
  while(--len!=-1) {
    r+=(*a++)*(*b++);
  }
  return r;
}
#define array_len(a) (sizeof(a)/sizeof(a[0]))

int main(char * argc, char * argv[]) {
  short a[512] = {1}

  int r = dot( a, a, array_len(a));

  return r;
}

Now let’s add some cycle counting in main.

#include "stdio.h"
int dot(short * a, short * b, int len) {
  int r=0;
  while(--len!=-1) {
    r+=(*a++)*(*b++);
  }
  return r;
}
#define array_len(a) (sizeof(a)/sizeof(a[0]))
int main(char * argc, char * argv[]) {
  short a[512] = {1}

  // These addresses are defined by ARM
  volatile unsigned int *DWT_CYCCNT = (volatile unsigned int *)0xE0001004;
  volatile unsigned int *DWT_CONTROL = (volatile unsigned int *)0xE0001000;
  volatile unsigned int *SCB_DEMCR = (volatile unsigned int *)0xE000EDFC;

  //Turn on the cycle counter and reset it to 0
  *SCB_DEMCR = *SCB_DEMCR | 0x01000000;
  *DWT_CYCCNT = 0;
  *DWT_CONTROL = *DWT_CONTROL | 1;

  int r = dot( a, a, array_len(a));

  //Turn it off
  *DWT_CONTROL = *DWT_CONTROL | 0;
  printf("%d cycles\n", *DWT_CYCCNT );

  return r;
}

That’s 6185 cycles. Let’s see how we did with SMLAD.

#include "stdint.h"


__attribute__((always_inline)) static inline int32_t SMLAD(uint32_t a, uint32_t b, int32_t acc)
{
  int32_t r;
  asm ( "SMLAD %0, %1, %2, %3"
    : "=r"(r)
    : "r"(a), "r"(b), "r"(acc));
  return r;
}

int dot(short * a, short * b, int len) {
  int32_t r=0;
  uint32_t *pa=a, *pb=b;
  len>>=1;
  while(--len!=-1) {
    r = SMLAD(*pa++,*pb++,r);
  }
  return r;
}

Using O3 it’s 3362 cycles. Using Os it’s 4142. How about unrolling?

int dot(short * a, short * b, int len) {
  int32_t r=0;
  uint64_t *pa=a, *pb=b;
  len>>=2;
  while(--len!=-1) {
    r = SMLAD(*pa,*pb,r);
    r = SMLAD(*pa++>>32, *pb++>>32, r);
  }
  return r;
}

Gets us 2470 cycles with O3 and 2349 with Os. Let’s unroll further.

int dot(short * a, short * b, int len) {
  int32_t r=0;
  uint64_t *pa=a, *pb=b;
  len>>=3;
  while(--len!=-1) {
    r = SMLAD(*pa,*pb,r);
    r = SMLAD(*pa++>>32, *pb++>>32, r);
    r = SMLAD(*pa,*pb,r);
    r = SMLAD(*pa++>>32, *pb++>>32, r);
  }
  return r;
}

1703 with O3, 2026 with Os

int dot(short * a, short * b, int len) {
  int32_t r=0;
  uint64_t *pa=a, *pb=b;
  len>>=4;
  while(--len!=-1) {
    r = SMLAD(*pa,*pb,r);
    r = SMLAD(*pa++>>32, *pb++>>32, r);
    r = SMLAD(*pa,*pb,r);
    r = SMLAD(*pa++>>32, *pb++>>32, r);
    r = SMLAD(*pa,*pb,r);
    r = SMLAD(*pa++>>32, *pb++>>32, r);
    r = SMLAD(*pa,*pb,r);
    r = SMLAD(*pa++>>32, *pb++>>32, r);
  }
  return r;
}

1746 with O3, 1610 with Os.

int dot(short * a, short * b, int len) {
  int32_t r=0;
  uint64_t *pa=a, *pb=b;
  len>>=5;
  while(--len!=-1) {
    r = SMLAD(*pa,*pb,r);
    r = SMLAD(*pa++>>32, *pb++>>32, r);
    r = SMLAD(*pa,*pb,r);
    r = SMLAD(*pa++>>32, *pb++>>32, r);
    r = SMLAD(*pa,*pb,r);
    r = SMLAD(*pa++>>32, *pb++>>32, r);
    r = SMLAD(*pa,*pb,r);
    r = SMLAD(*pa++>>32, *pb++>>32, r);
    r = SMLAD(*pa,*pb,r);
    r = SMLAD(*pa++>>32, *pb++>>32, r);
    r = SMLAD(*pa,*pb,r);
    r = SMLAD(*pa++>>32, *pb++>>32, r);
    r = SMLAD(*pa,*pb,r);
    r = SMLAD(*pa++>>32, *pb++>>32, r);
    r = SMLAD(*pa,*pb,r);
    r = SMLAD(*pa++>>32, *pb++>>32, r);
  }
  return r;
}

Cycles by unroll and optimization level

Unroll O3 Os
1 6185  
2 3362 4142
4 1703 2026
8 1746 1610
16 1918 1338
32 2016 1170
64 2214 1217

There’s a catch with these unrolls, the input buffers need to be a multiple of the unroll length. But we have one more trick.

Duff’s device

We can reduce the requirement to be a multiple of the unroll down to being a multiple of the smallest internal unroll with Duff. In this case the smalles internal unroll is 8, to match the load multiple instruction.

int dot(short * a, short * b, int len) {
  int32_t r=0;
  uint64_t *pa=a, *pb=b;
  int rm = len & 0xf;
  len>>=4;
  switch(rm%8) {
  do {
    case 0:
    r = SMLAD(*pa,*pb,r);
    r = SMLAD(*pa++>>32, *pb++>>32, r);
    case 3:
    r = SMLAD(*pa,*pb,r);
    r = SMLAD(*pa++>>32, *pb++>>32, r);
    case 2:
    r = SMLAD(*pa,*pb,r);
    r = SMLAD(*pa++>>32, *pb++>>32, r);
    case 1:
    r = SMLAD(*pa,*pb,r);
    r = SMLAD(*pa++>>32, *pb++>>32, r);
  } while(--len!=0);
  }

  return r;
}

But it costs us a few cycles, and O3 is faster.

Unroll Os O3
16 2133 1813
32 1753 1443
64 1982 1499

It’s clear it takes advantage of the inner unroll.

Fastest Duff (unroll by 32 O3)

<dot>:
	b4f0      	push	{r4, r5, r6, r7}
	f002 0407 	and.w	r4, r2, #7
	3c01      	subs	r4, #1
	1152      	asrs	r2, r2, #5
	2c06      	cmp	r4, #6
	d85f      	bhi.n	3386e <dot+0xce>
	e8df f004 	tbb	[pc, r4]
	6561      	.short	0x6561
	04716d69 	.word	0x04716d69
	5a          	.byte	0x5a
	00          	.byte	0x00
	4603      	mov	r3, r0
	f04f 0c00 	mov.w	ip, #0
	681d      	ldr	r5, [r3, #0]
	6858      	ldr	r0, [r3, #4]
	680e      	ldr	r6, [r1, #0]
	684c      	ldr	r4, [r1, #4]
	fb25 cc06 	smlad	ip, r5, r6, ip
	3308      	adds	r3, #8
	3108      	adds	r1, #8
	fb20 cc04 	smlad	ip, r0, r4, ip
	e893 0021 	ldmia.w	r3, {r0, r5}
	e891 0050 	ldmia.w	r1, {r4, r6}
	fb20 cc04 	smlad	ip, r0, r4, ip
	fb25 cc06 	smlad	ip, r5, r6, ip
	3308      	adds	r3, #8
	3108      	adds	r1, #8
	681d      	ldr	r5, [r3, #0]
	6858      	ldr	r0, [r3, #4]
	680e      	ldr	r6, [r1, #0]
	684c      	ldr	r4, [r1, #4]
	fb25 cc06 	smlad	ip, r5, r6, ip
	3308      	adds	r3, #8
	3108      	adds	r1, #8
	fb20 cc04 	smlad	ip, r0, r4, ip
	e893 0041 	ldmia.w	r3, {r0, r6}
	e891 0030 	ldmia.w	r1, {r4, r5}
	fb20 cc04 	smlad	ip, r0, r4, ip
	fb26 cc05 	smlad	ip, r6, r5, ip
	3308      	adds	r3, #8
	3108      	adds	r1, #8
	6858      	ldr	r0, [r3, #4]
	684c      	ldr	r4, [r1, #4]
	681e      	ldr	r6, [r3, #0]
	680d      	ldr	r5, [r1, #0]
	fb26 cc05 	smlad	ip, r6, r5, ip
	3308      	adds	r3, #8
	3108      	adds	r1, #8
	fb20 cc04 	smlad	ip, r0, r4, ip
	3a01      	subs	r2, #1
	681d      	ldr	r5, [r3, #0]
	685c      	ldr	r4, [r3, #4]
	680e      	ldr	r6, [r1, #0]
	6848      	ldr	r0, [r1, #4]
	fb25 c506 	smlad	r5, r5, r6, ip
	fb24 5000 	smlad	r0, r4, r0, r5
	d031      	beq.n	3389c <dot+0xfc>
	3308      	adds	r3, #8
	3108      	adds	r1, #8
	685e      	ldr	r6, [r3, #4]
	680d      	ldr	r5, [r1, #0]
	684c      	ldr	r4, [r1, #4]
	681f      	ldr	r7, [r3, #0]
	fb27 0005 	smlad	r0, r7, r5, r0
	3308      	adds	r3, #8
	3108      	adds	r1, #8
	fb26 0c04 	smlad	ip, r6, r4, r0
	685c      	ldr	r4, [r3, #4]
	684e      	ldr	r6, [r1, #4]
	6818      	ldr	r0, [r3, #0]
	680d      	ldr	r5, [r1, #0]
	fb20 cc05 	smlad	ip, r0, r5, ip
	3308      	adds	r3, #8
	3108      	adds	r1, #8
	fb24 cc06 	smlad	ip, r4, r6, ip
	e7ac      	b.n	337c0 <dot+0x20>
	4603      	mov	r3, r0
	f04f 0c00 	mov.w	ip, #0
	e7f0      	b.n	33850 <dot+0xb0>
	4603      	mov	r3, r0
	2000      	movs	r0, #0
	e7e3      	b.n	3383c <dot+0x9c>
	4603      	mov	r3, r0
	f04f 0c00 	mov.w	ip, #0
	e7d3      	b.n	33824 <dot+0x84>
	4603      	mov	r3, r0
	f04f 0c00 	mov.w	ip, #0
	e7c5      	b.n	33810 <dot+0x70>
	4603      	mov	r3, r0
	f04f 0c00 	mov.w	ip, #0
	e7b7      	b.n	337fc <dot+0x5c>
	4603      	mov	r3, r0
	f04f 0c00 	mov.w	ip, #0
	e7a9      	b.n	337e8 <dot+0x48>
	4603      	mov	r3, r0
	f04f 0c00 	mov.w	ip, #0
	e79b      	b.n	337d4 <dot+0x34>
	bcf0      	pop	{r4, r5, r6, r7}
	4770      	bx	lr

Fastest overall (Unroll by 32 Os):

<dot>:
	b5f0      	push	{r4, r5, r6, r7, lr}
	4603      	mov	r3, r0
	1192      	asrs	r2, r2, #6
	2000      	movs	r0, #0
	f112 32ff 	adds.w	r2, r2, #4294967295	; 0xffffffff
	f0c0 8083 	bcc.w	338b6 <dot+0x116>
	685c      	ldr	r4, [r3, #4]
	680f      	ldr	r7, [r1, #0]
	681e      	ldr	r6, [r3, #0]
	684d      	ldr	r5, [r1, #4]
	fb26 0607 	smlad	r6, r6, r7, r0
	6898      	ldr	r0, [r3, #8]
	fb24 6505 	smlad	r5, r4, r5, r6
	699f      	ldr	r7, [r3, #24]
	688c      	ldr	r4, [r1, #8]
	69de      	ldr	r6, [r3, #28]
	fb20 5504 	smlad	r5, r0, r4, r5
	68d8      	ldr	r0, [r3, #12]
	68cc      	ldr	r4, [r1, #12]
	fb20 5504 	smlad	r5, r0, r4, r5
	6918      	ldr	r0, [r3, #16]
	690c      	ldr	r4, [r1, #16]
	fb20 5504 	smlad	r5, r0, r4, r5
	6958      	ldr	r0, [r3, #20]
	694c      	ldr	r4, [r1, #20]
	fb20 5504 	smlad	r5, r0, r4, r5
	69cc      	ldr	r4, [r1, #28]
	6988      	ldr	r0, [r1, #24]
	fb27 5000 	smlad	r0, r7, r0, r5
	6a1d      	ldr	r5, [r3, #32]
	fb26 0604 	smlad	r6, r6, r4, r0
	6b1f      	ldr	r7, [r3, #48]	; 0x30
	6a4c      	ldr	r4, [r1, #36]	; 0x24
	6a08      	ldr	r0, [r1, #32]
	fb25 6000 	smlad	r0, r5, r0, r6
	6a5d      	ldr	r5, [r3, #36]	; 0x24
	6a9e      	ldr	r6, [r3, #40]	; 0x28
	fb25 0c04 	smlad	ip, r5, r4, r0
	6acc      	ldr	r4, [r1, #44]	; 0x2c
	6add      	ldr	r5, [r3, #44]	; 0x2c
	6a88      	ldr	r0, [r1, #40]	; 0x28
	fb26 c000 	smlad	r0, r6, r0, ip
	6b5e      	ldr	r6, [r3, #52]	; 0x34
	fb25 0504 	smlad	r5, r5, r4, r0
	6b4c      	ldr	r4, [r1, #52]	; 0x34
	6b08      	ldr	r0, [r1, #48]	; 0x30
	fb27 5000 	smlad	r0, r7, r0, r5
	6b9d      	ldr	r5, [r3, #56]	; 0x38
	fb26 0604 	smlad	r6, r6, r4, r0
	6c9f      	ldr	r7, [r3, #72]	; 0x48
	6bcc      	ldr	r4, [r1, #60]	; 0x3c
	6b88      	ldr	r0, [r1, #56]	; 0x38
	fb25 6000 	smlad	r0, r5, r0, r6
	6bdd      	ldr	r5, [r3, #60]	; 0x3c
	6c1e      	ldr	r6, [r3, #64]	; 0x40
	fb25 0c04 	smlad	ip, r5, r4, r0
	6c4c      	ldr	r4, [r1, #68]	; 0x44
	6c5d      	ldr	r5, [r3, #68]	; 0x44
	6c08      	ldr	r0, [r1, #64]	; 0x40
	fb26 c000 	smlad	r0, r6, r0, ip
	6cde      	ldr	r6, [r3, #76]	; 0x4c
	fb25 0504 	smlad	r5, r5, r4, r0
	6ccc      	ldr	r4, [r1, #76]	; 0x4c
	6c88      	ldr	r0, [r1, #72]	; 0x48
	fb27 5000 	smlad	r0, r7, r0, r5
	6d1d      	ldr	r5, [r3, #80]	; 0x50
	fb26 0604 	smlad	r6, r6, r4, r0
	6e1f      	ldr	r7, [r3, #96]	; 0x60
	6d4c      	ldr	r4, [r1, #84]	; 0x54
	6d08      	ldr	r0, [r1, #80]	; 0x50
	fb25 6000 	smlad	r0, r5, r0, r6
	6d5d      	ldr	r5, [r3, #84]	; 0x54
	6d9e      	ldr	r6, [r3, #88]	; 0x58
	fb25 0c04 	smlad	ip, r5, r4, r0
	6dcc      	ldr	r4, [r1, #92]	; 0x5c
	6ddd      	ldr	r5, [r3, #92]	; 0x5c
	6d88      	ldr	r0, [r1, #88]	; 0x58
	fb26 c000 	smlad	r0, r6, r0, ip
	6e5e      	ldr	r6, [r3, #100]	; 0x64
	fb25 0504 	smlad	r5, r5, r4, r0
	6e4c      	ldr	r4, [r1, #100]	; 0x64
	6e08      	ldr	r0, [r1, #96]	; 0x60
	fb27 5000 	smlad	r0, r7, r0, r5
	6e9d      	ldr	r5, [r3, #104]	; 0x68
	fb26 0604 	smlad	r6, r6, r4, r0
	6f8f      	ldr	r7, [r1, #120]	; 0x78
	6ecc      	ldr	r4, [r1, #108]	; 0x6c
	6e88      	ldr	r0, [r1, #104]	; 0x68
	fb25 6000 	smlad	r0, r5, r0, r6
	6edd      	ldr	r5, [r3, #108]	; 0x6c
	6f1e      	ldr	r6, [r3, #112]	; 0x70
	fb25 0c04 	smlad	ip, r5, r4, r0
	6f4d      	ldr	r5, [r1, #116]	; 0x74
	6f5c      	ldr	r4, [r3, #116]	; 0x74
	6f08      	ldr	r0, [r1, #112]	; 0x70
	fb26 c000 	smlad	r0, r6, r0, ip
	6fde      	ldr	r6, [r3, #124]	; 0x7c
	fb24 0405 	smlad	r4, r4, r5, r0
	6fc8      	ldr	r0, [r1, #124]	; 0x7c
	6f9d      	ldr	r5, [r3, #120]	; 0x78
	fb25 4407 	smlad	r4, r5, r7, r4
	3380      	adds	r3, #128	; 0x80
	3180      	adds	r1, #128	; 0x80
	fb26 4000 	smlad	r0, r6, r0, r4
	e778      	b.n	337a8 <dot+0x8>
	bdf0      	pop	{r4, r5, r6, r7, pc}

Try playing with it yourself on the compiler explorer.