algorithm - tinyAVR: best known multiplication routines for 8-bit and 16-bit factors? -


"faster avr200b.asm"? mpy8u-routine avr200b.asm processors of atmel's avr family not implement of mul instructions seems pretty generic, mpy16u looks sloppy rotating both lower result bytes 16 times instead of 8. antonio presented fast 16×16→16 unsigned multiplication using 64 cycles worst case excluding call/return overhead.
arbitrarily suggest optimisation goals worst case cycle count, word count (ram , flash), register usage, , expected cycle count in order of decreasing priority.
(there reduced core avrs ("single digit"-attiny, 10/20/40) differences including timing, suggested ignore.)

(caution: don't take claim herein granted, @ least not without independent affirmation.)

what best known 8×8→8/16, 16×16→16/32 , 16×8→16/24 bit multiplication routines avrs without mul?

unsigned, shift , add, partial product shifting right, unrolled 8×8→16, 16×16→32 (this avr200b presents. keep boring, shaved off cycle here, cycle there, , threw in 16×16→16.)

;************************************************************** ;* ;* "mpy8u" - 8x8 bit unsigned multiplication ;* ;* multiplies 2 register variables mp8u , mc8u. ;* result placed in registers m8uh, m8ul ; ;* number of words  : 39 + return ;* number of cycles : 33 + return ;* low registers used   : none ;* high registers used  : 3 (mc8u,mp8u/m8ul,m8uh) ;* ;* note: result , multiplier low byte same register. ;* causes multiplier overwritten result. ;* ;**************************************************************  .def    mc8u    = r16   ; multiplicand .def    mp8u    = r17   ; multiplier (mp, m'plier) .def    m8ul    = r17   ; partial product (pp)/result low byte .def    m8uh    = r18   ; result high byte  .cseg ; deadlines: noad81: 8, noad82: 11 even:               ;      4    not shift pp, yet     lsr   mp8u      ; 1    5    shift m'plier process bit 1     brcs  noad81    ;1/2   6/7  multiplicand in pp     sbrs  mp8u, 0   ;1/2   7/8  if m'plier original bit 2 clear     clr   m8uh      ; 1    8        mp bits 0-2 0: clear pp     lsr   mp8u      ; 1    9    shift multiplier alignment     rjmp  noad82    ; 2   11    in time ... again.  mpy8u:     mov   m8uh, mc8u; 1    1    move m'cand res high byte                     ;           (not yet knowing used)      lsr   mp8u      ; 1    2    shift m'plier process bit 0     brcc       ;1/2   3/4  if carry set     lsr   m8uh      ; 1    4        shift right res high byte     ror   m8ul      ; 1    5    rot right result low , m'plier 1      brcc  noad81    ;1/2   6/7  if carry set     add   m8uh, mc8u; 1    7        add m'cand result high byte noad81:             ;      7     ror   m8uh      ; 1    8    rotate right result high byte     ror   m8ul      ; 1    9    ror result low , m'plier 2      brcc  noad82    ;1/2  10/11 if carry set     add   m8uh, mc8u; 1   11        add m'cand result high byte noad82:             ;     11     ror   m8uh      ; 1   12    rotate right result high byte done82:             ;     12     ror   m8ul      ; 1   13    result low byte , multiplier 3      brcc  noad83    ;1/2  14/15 if carry set     add   m8uh, mc8u; 1   15        add m'cand result high byte noad83:     ror   m8uh      ; 1   16    rotate right result high byte     ror   m8ul      ; 1   17    result low byte , multiplier      brcc  noad84    ;1/2  18/19 if carry set     add   m8uh, mc8u; 1   19        add m'cand result high byte noad84: ror m8uh    ; 1   20    rotate right result high byte     ror   m8ul      ; 1   21    result low byte , multiplier      brcc  noad85    ;1/2  22/23 if carry set     add   m8uh, mc8u; 1   23        add m'cand result high byte noad85: ror m8uh    ; 1   24    shift right result high byte     ror   m8ul      ; 1   25    result low byte , multiplier      brcc  noad86    ;1/2  26/27 if carry set     add   m8uh, mc8u; 1   27        add m'cand result high byte noad86: ror m8uh    ; 1   28    rotate right result high byte     ror   m8ul      ; 1   29    result low byte , multiplier      brcc  noad87    ;1/2  30/31 if carry set     add   m8uh, mc8u; 1   31        add m'cand result high byte noad87: ror m8uh    ; 1   32    rotate right result high byte     ror   m8ul      ; 1   33    rotate right result low byte      ret 

16×16→32

;*************************************************************** ;* ;* "mpy16u" - 16x16->32 bit unsigned multiplication ;* ;* subroutine multiplies 2 16-bit register variables ;*  mp16uh:mp16ul (replaced low product) , mc16uh:mc16ul. ; (madd16u have added in value in acc1:acc0 (lame-?)) ;* result placed in m16u3:m16u2:m16u1:m16u0. ;* ;* number of words  :135 + return (configuration dependent) ;* number of cycles : 97 + return ;* (avr200b.asm mpy16u improved: 100, as-is: 116) ;* low registers used   : none ;* high registers used  : 6 (mp16ul,mp16uh, ;*                           mc16ul/m16u0,mc16uh/m16u1, ;*                           acc0/m16u2, acc1/m16u3) ;* ;***************************************************************  #if orderedmultiply .def    mcinl   = r16   ; multiplicand (m'cand) low byte .def    mcinh   = r17   ; multiplicand high byte .def    mc16ul  = r22   ; multiplicand low byte .def    mc16uh  = r23   ; multiplicand high byte .def    acc0    = r20   ; accumulator byte 0 .def    acc1    = r21   ; accumulator byte 1 (msb) # warning 2 warnings preceding definitions of # warning  r20 , r21 due , may ignored #else .def    mc16ul  = r16   ; multiplicand low byte .def    mc16uh  = r17   ; multiplicand high byte #endif .def    mp16ul  = r18   ; multiplier (m'plier) low byte .def    mp16uh  = r19   ; multiplier high byte # warning 2 warnings preceding definitions of # warning  r18 , r19 due , may ignored .def    m16u0   = r18   ; result byte 0 (lsb) .def    m16u1   = r19   ; result byte 1 .def    m16u2   = r20   ; result byte 2 .def    m16u3   = r21   ; result byte 3 (msb)  .cseg code0: rjmp mpy16u ; deadlines: done1: 11, noadd1: 9, 2: 15, 3: 21, 4: 25 mp00:                   ;     4     bit 0 of m'plier 0 ; trick: jmp conditional 1 clock late: duplicate condition     lsr   mp16ul        ; 1   5     bit 1 c     brcs  noadd1        ;1/2  6/7   if set, start accumulating #if shortheadstart ; 4 cycles saved neither rotating nor adding: ;    enough clear & jump "normal control flow"     clr   m16u2         ; 1   7     clear 2 highest result bytes     clr   m16u3         ; 1   8     (no sub clear c: sets z)     brne  done1         ;1/2  9/10  c must cleared -     rjmp  mp7done       ; 2  11 #else ; instead of clearing partial product when non-zero, ;  1 go on looking first bit set     lsr   mp16ul        ; 1   8     bit 2 c     brcs  noadd2        ;1/2  9/10  if set, start accumulating     lsr   mp16ul        ; 1  10     bit 3     brcs  noadd3        ;1/2 11/12  if set, start accumulating     lsr   mp16ul        ; 1  12     bit 4     brcs  noadd4        ;1/2 13/14  if set, start accumulating     lsr   mp16ul        ; 1  14     bit 5     brcs  noadd5        ;1/2 15/16  if set, start accumulating     lsr   mp16ul        ; 1  16     bit 6     brcs  noadd6        ;1/2 17/18  if set, start accumulating     lsr   mp16ul        ; 1  18     bit 7     brcs  noadd7        ;1/2 19/20  if set, start accumulating ; weeeell, there's 0 in m16u0 , next mp16uh     lsr   mp16uh        ; 1  20     bit 8     brcs  noclear       ; 1  21/22  if not set     clr   m16u2         ; 1  22         clear pp -     clr   m16u3         ; 1  23         @ long last noclear:     brne  noadd8        ;1/2 24/25  upper bits set. duh!     mov   m16u2, m16u1  ; 1  25     mov   m16u3, m16u2  ; 1  26     clr   m16u3         ; 1  27     *0, *256 faster ...     ret  #endif #if orderedmultiply ompy16u:    ; ordering may faster on average (see *early*),             ; adds 6 cycles , 7 words     cp    mcinl, mp16ul ;-6     cpc   mcinh, mp16uh ;-5     brlo  mclower       ;-4     if m'cand not lower     movw  mc16ul, mcinl ;-3         use m'cand     rjmp  ordered       ;-2 mclower:                ;-3     if m'cand lower     movw  mc16ul, mp16ul;-2         use m'plier m'cand     movw  mp16ul, mcinl ;-1         , vice-versa ordered: #endif  mpy16u:                 ;     0     not knowing used:     movw  m16u2, mc16ul ; 1   1     set partial product mc ; using asr sets overflow flag b0^b7: use?     asr   mp16ul        ; 1   2     shift m'plier low, keep bit7     brcc  mp00          ;1/2  3/4   if m'plier low bit 1 noadd0:     ror   m16u3         ; 1   4     rotate right result byte 3     ror   m16u2         ; 1   5     rotate right result byte 2 ;   ror   m16u1         ; rotate res byte 1 , m'plier high     ror   m16u0         ; 1   6     res byte 0 , m'plier low ; in seizure of coding exuberance, ;  1 check consecutive 1 bits ...     brcc  noadd1        ;1/2  7/8   if carry set     add   m16u2, mc16ul ; 1   8         add mc lo res byte 2     adc   m16u3, mc16uh ; 1   9         add mc hi res byte 3 noadd1:     ror   m16u3         ; 1  10     rotate right result byte 3     ror   m16u2         ; 1  11     rotate right result byte 2 ;   ror   m16u1         ; rotate res byte 1 , m'plier high done1:                  ;    11     ror   m16u0         ; 1  12     res byte 0 , m'plier low      brcc  noadd2        ;1/2 13/14  if carry set     add   m16u2, mc16ul ; 1  +1         add mc lo res byte 2     adc   m16u3, mc16uh ; 1  +2         add mc hi res byte 3 noadd2: ror m16u3       ; 1  +3     rotate right result byte 3     ror   m16u2         ; 1  +4     rotate right result byte 2     ror   m16u0         ; 1  18     res byte 0 , m'plier low      brcc  noadd3        ;1/2 19/20  if carry set     add   m16u2, mc16ul ; 1  +1         add mc lo res byte 2     adc   m16u3, mc16uh ; 1  +2         add mc hi res byte 3 noadd3: ror m16u3       ; 1  +3     rotate right result byte 3     ror   m16u2         ; 1  +4     rotate right result byte 2     ror   m16u0         ; 1  24     res byte 0 , m'plier low      brcc  noadd4        ;1/2 25/26  if carry set     add   m16u2, mc16ul ; 1  +1         add mc lo res byte 2     adc   m16u3, mc16uh ; 1  +2         add mc hi res byte 3 noadd4: ror m16u3       ; 1  +3     rotate right result byte 3     ror   m16u2         ; 1  +4     rotate right result byte 2     ror   m16u0         ; 1  30     res byte 0 , m'plier low      brcc  noadd5        ;1/2 31/32  if carry set     add   m16u2, mc16ul ; 1  +1         add mc lo res byte 2     adc   m16u3, mc16uh ; 1  +2         add mc hi res byte 3 noadd5: ror m16u3       ; 1  +3     rotate right result byte 3     ror   m16u2         ; 1  +4     rotate right result byte 2     ror   m16u0         ; 1  36     res byte 0 , m'plier low      brcc  noadd6        ;1/2 37/38  if carry set     add   m16u2, mc16ul ; 1  +1         add mc lo res byte 2     adc   m16u3, mc16uh ; 1  +2         add mc hi res byte 3 noadd6: ror m16u3       ; 1  +3     rotate right result byte 3     ror   m16u2         ; 1  +4     rotate right result byte 2     ror   m16u0         ; 1  42     res byte 0 , m'plier low      brcc  noadd7        ;1/2 43/44  if carry set     add   m16u2, mc16ul ; 1  +1         add mc lo res byte 2     adc   m16u3, mc16uh ; 1  +2         add mc hi res byte 3 noadd7: ror m16u3       ; 1  +3     rotate right result byte 3     ror   m16u2         ; 1  +4     rotate right result byte 2     ror   m16u0         ; 1  48     res byte 0 , m'plier low  ; instead of least significant bit of next byte, ;  never got rotated one, ;  shifted c highest bit mp16ul - again. ; using asr mp16uh set overflow flag b8^b15, ;  asr mp16uh set overflow flag b8^b7: use? ; quick check suggested "booth subtraction" on bit 7 ;  entirely possible. booth gains runs of identical bits ;  - 2 of signified overflow cleared. ; denoting addition +, subtractions - ; (the 1 done trailing) , "do nothings" (fast!) 0, ; relevant sequences ; (left: before recoding, right: after) ; 000+   00+-   (-1+2        1) ; 00++   0+0-   (-1  +4      3) ; 0+0+   +0--   (-1-2  +8    5) 0++- (-1+2+4) equivalent ; 0+++   +00-   (-1    +8    7) ; +00+   +0+-   (-1+2  +8    9) ; +0++   ++0-   (-1  +4+8   11) (-1-4+16 looks worse) ; ++0+  +00--   (-1-2   +16 13) (-1+8+ 8 looks worse) ; ++++  +000-   (-1     +16 15) ; deeming cases few non-zeros (operations) less critical, ; critical sequences? 0, 1, 3 good, ;  7 sort of. promises massive , messy - manana.  ; so, start on next byte of m'plier:     lsr   mp16uh        ; 1  49     shift multiplier high ;   brcc  noadd8        ;1/2 50/51  if carry set  #if !noceckearly ; (now wouldn't fun have out? more ; if knew multiplier (have chance to) ;  less multiplicand ...) ;   breq  earlyout      ;1/2 50/51 ; how not hurt worst case? ; 0 after shift easy: never come back! ;  "worst" thing might happen ninth bit set ;  - final addition , done. ; hard part non-zero, , c set: delayed 1 cycle ;   (where branch-on-zero-or-cary-clear instruction?!)     brcc  checkearly    ;1/2 50/51  if carry set #else ; 'til know how .define noearlyout 1     brcc  noadd8        ;1/2 50/51  if carry set #endif     add   m16u2, mc16ul ; 1  +1         add mc lo res byte 2     adc   m16u3, mc16uh ; 1  +2         add mc hi res byte 3 noadd8:     ror   m16u3         ; 1  +3     rotate right result byte 3     ror   m16u2         ; 1  +4     rotate right result byte 2     ror   m16u1         ; 1  54     res byte 1 , m'plier high ;   ror   m16u0         ;rotate res byte 0 , m'plier low      brcc  noadd9        ;1/2 55/56  if carry set add9:     add   m16u2, mc16ul ; 1  +1         add mc lo res byte 2     adc   m16u3, mc16uh ; 1  +2         add mc hi res byte 3 noadd9:     ror   m16u3         ; 1  +3     rotate right result byte 3     ror   m16u2         ; 1  +4     rotate right result byte 2 mp9done:     ror   m16u1         ; 1  61     res byte 1 , m'plier high      brcc  noadd10       ;1/2 62/63  if carry set     add   m16u2, mc16ul ; 1  +1         add mc lo res byte 2     adc   m16u3, mc16uh ; 1  +2         add mc hi res byte 3 noadd10:ror m16u3       ; 1  +3     rotate right result byte 3     ror   m16u2         ; 1  +4     rotate right result byte 2     ror   m16u1         ; 1  67     res byte 1 , m'plier high      brcc  noadd11       ;1/2 68/69  if carry set     add   m16u2, mc16ul ; 1  +1         add mc lo res byte 2     adc   m16u3, mc16uh ; 1  +2         add mc hi res byte 3 noadd11:ror m16u3       ; 1  +3     rotate right result byte 3     ror   m16u2         ; 1  +4     rotate right result byte 2     ror   m16u1         ; 1  73     res byte 1 , m'plier high      brcc  noadd12       ;1/2 74/75  if carry set     add   m16u2, mc16ul ; 1  +1         add mc lo res byte 2     adc   m16u3, mc16uh ; 1  +2         add mc hi res byte 3 noadd12:ror m16u3       ; 1  +3     rotate right result byte 3     ror   m16u2         ; 1  +4     rotate right result byte 2     ror   m16u1         ; 1  79     res byte 1 , m'plier high      brcc  noadd13       ;1/2 80/81  if carry set     add   m16u2, mc16ul ; 1  +1         add mc lo res byte 2     adc   m16u3, mc16uh ; 1  +2         add mc hi res byte 3 noadd13:ror m16u3       ; 1  +3     rotate right result byte 3     ror   m16u2         ; 1  +4     rotate right result byte 2     ror   m16u1         ; 1  85     res byte 1 , m'plier high      brcc  noadd14       ;1/2 86/83  if carry set     add   m16u2, mc16ul ; 1  +1         add mc lo res byte 2     adc   m16u3, mc16uh ; 1  +2         add mc hi res byte 3 noadd14:ror m16u3       ; 1  +3     rotate right result byte 3     ror   m16u2         ; 1  +4     rotate right result byte 2     ror   m16u1         ; 1  91     res byte 1 , m'plier high      brcc  noadd15       ;1/2 92/93  if carry set     add   m16u2, mc16ul ; 1  +1         add mc lo res byte 2     adc   m16u3, mc16uh ; 1  +2         add mc hi res byte 3 noadd15:ror m16u3       ; 1  +3     rotate right result byte 3     ror   m16u2         ; 1  +4     rotate right result byte 2     ror   m16u1         ; 1  97     res byte 1 , m'plier high      ret                 ; 3 100 :-/ #if !nocheckearly ; deadlines: add9 56, noadd9 58 checkearly:             ;    51     adds 15 words out ; make jump add9 1 cycle late, so: ; check after ror ; (which real pity needing 8 0 bits instead of 7) ;   breq  highzero      ;1/2        if m'plier high byte <> 0     ror   m16u3         ; 1  +1         rotate right res byte 3     ror   m16u2         ; 1  +2         rotate right res byte 2     ror   m16u1         ; 1  +3         res byte 1 , mp high     brcs  add9          ;1/2 56     if carry clear     brne  noadd9        ; 2  57     don't add time ; partial product in m16u3:m16u2:m16u1 ; - 7 bits left of position needed ;   lsl   m16u1         ;           no need: zero.     lsl   m16u2         ; 1  57     roll ...     rol   m16u3         ; 1  58 #endif #if !nocheckearly || !noearlyout earlyout:               ;    51 ; multiplier high 7 bits 0 ; can't use movw here because of overlap, if not alignment     mov   m16u1, m16u2  ; 1  59     place byte 1 of product     mov   m16u2, m16u3  ; 1  60     place byte 2 of product     clr   m16u3         ; 1  61     clear highest byte of result # if !noearlyout ; if reaching earlyout c (possibly) set bit 8, ;     account     brcc  nofinaladd    ;1/2 62     if bit 8 of m'plier clear     add   m16u1, mc16ul ; 1  63         add mc lo res byte 1     adc   m16u2, mc16uh ; 1  64         add mc hi res byte 2     adc   m16u3, m16u3  ; 1  65         1 bit might pending nofinaladd: # endif     ret                     ; 3 68  pretend done #endif 

to continued (16×16→16) - don't hold breath.


Comments

Popular posts from this blog

php - failed to open stream: HTTP request failed! HTTP/1.0 400 Bad Request -

java - How to filter a backspace keyboard input -

java - Show Soft Keyboard when EditText Appears -