performance – Fastest (in clock cycles) 16-bit x 16-bit unsigned integer division algorithm for ATMEGA1284?

I am trying to create an optimized 16-bit division algorithm for the AVR ATMEGA1284. The goal is to reduce the number of clock cycles as much as possible.

AVR INSTRUCTION SET MANUAL:
https://ww1.microchip.com/downloads/en/devicedoc/atmel-0856-avr-instruction-set-manual.pdf

AVR200: Multiply and Divide Routines:
https://ww1.microchip.com/downloads/en/AppNotes/doc0936.pdf

A standard shift and subtract type division algorithm suggested by Atmel/Microchip takes between 173 clock cycles and 243 clock cycles, depending on if you unroll the loop or not.

What I have so far takes a maximum of 68 clock cycles. I ran an exhaustive test 16-hour test proving that the algorithm returns the correct result for all 2^32 combinations of inputs and outputs. So I am not looking for any validation that the algorithm returns the correct results. I am looking for ways to reduce either the code size, lookup table size, or number of clock cycles.

The constraints are as follows.

  • The dividend is an unsigned 16-bit number passed into the algorithm in a pair of 8-bit registers.
  • The divisor is an unsigned 16-bit number passed into the algorithm in a pair of 8-bit registers.
  • The algorithm returns the quotient in a pair of 8-bit registers.
  • The algorithm also returns the remainder in a pair of 8-bit registers.
  • The algorithm code (plus any lookup tables) must consume less than 5K bytes of flash memory.
  • The algorithm may return any values for division by 0.

Here is what I have so far.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;ARGUMENTS:  r16, r17, r18, r19
;  r16:r17 = N (numerator)
;  r18:r19 = D (divisor)
;RETURNS:    r20, r21
;  r20:r21 (quotient)
;  r22:r23 (remainder)
;
;DESCRIPTION:  divides an unsigned 16 bit number N by unsigned 16 bit divisor D
;  Typical run time is 57 to 68 clock cycles.
;
;ALGORITHM OVERVIEW
;
;RZERO = 0;
;if(D < 256){
;  N2 = (N * ((R1H_TBL(D) << 8) + R1L_TBL(D))) >> 16;
;  P  = N2 * D
;}else{
;  D1 = (R1H_TBL(D) * D) >> 8
;  N1 = (R1H_TBL(D) * N) >> 8
;  if(D1 < 256){
;    N2 = N1 >> 8;
;  }else{
;    N2 = N2 * R2_TBL(D1 & 0xFF);
;  }
;  P = N2 * D;
;  if(P > 65535){
;    N2 = N2 - 1    ;//Decrement quotient
;    N1 = N2 - P + D;//Calculate remainder
;    return;//return quotient in N2, remainder in N1
;  }
;}
;N1 = N - P;
;if(P > N){
;  N2 = N2 - 1;//decrease quotient
;  N1 = N1 + D;//increase reamainder
;  return;//return quotient in N2, remainder in N1
;}
;if(N1 > D){
;  N2 = N2 + 1;
;  N1 = N1 - D;
;  return;//return quotient in N2, remainder in N1
;}
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.def NH    = r16 .def NL    = r17
.def DH    = r18 .def DL    = r19
.def N2H   = r20 .def N2L   = r21
.def N1H   = r22 .def N1L   = r23
.def PRODL = r0  .def PRODH = r1
.def PH    = r2  .def PL    = r3
.def D1H   = r4  .def D1L   = r5
.def RZERO = r6
.def Rx    = r7 

idivu_16x16:  
    clr RZERO                 ;1
    ;Check that DH is not zero
    tst DH                    ;1
    brne idivu_16x16_dhne   ;2
    ;code for D < 256   
idivu_16x8:
    ;lookup low byte of recipricol into P.
    ;where P = min(2^16 / D,2^16-1)
    mov zl, DL               ;1
    ldi zh, high(R1L_TBL*2)  ;1 
    lpm PL, Z                ;3
    ldi zh, high(R1H_TBL*2)  ;1 
    lpm PH, Z                ;3
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ;calculate N2 = (P * N) >> 16
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ;     NH:NL
    ;  X  RH:RL
    ;------------------------------------------
    ;   N2H    |   N2L    |  N1H     | dropped
    ;----------+----------+----------+---------
    ;          |          | H(PL*NL) | L(PL*NL)
    ;          | H(PL*NH) | L(PL*NH) |
    ;          | H(PH*NL) | L(PH*NL) |
    ; H(PH*NH) | L(PH*NH) |          |
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
    mul NL , PL     ;1  PL*NL
    mov N1H, PRODH  ;1  N1H <= H(PL*NL)
    mul NH , PH     ;1  PH*NH
    mov N2H, PRODH  ;1  N2H <= H(PH*NH)
    mov N2L, PRODL  ;1  N2L <= L(PH*NH)
    mul NH , PL     ;1  PL*NH
    add N1H, PRODL  ;1  N1H <= H(PL*NL) + L(PL*NH) 
    adc N2L, PRODH  ;1  N2L <= L(PH*NH) + H(PL*NH)
    adc N2H, RZERO  ;1  propagate carry to N2H      
    mul NL , PH     ;1  PH*NL
    add N1H, PRODL  ;1  N1H <= H(PL*NL) + L(PL*NH) + L(PH*NL)
    adc N2L, PRODH  ;1  N2L <= H(PH*NL) + L(PH*NH) + H(PL*NH)
    adc N2H, RZERO  ;1  propagate carry to N2H  
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ;calculate P = N2 * DL ,note DH=0
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;    
    mul N2L, DL              ;1
    mov PL, PRODL            ;1
    mov PH, PRODH            ;1
    mul N2H, DL              ;1
    add PH, PRODL            ;1
    rjmp idivu_16x16_adj_n ;2
    ;code for D >= 256
idivu_16x16_dhne:          
    ;Lookup Rx = min(256 / DH, 255)     
    mov zl, DH               ;1
    ldi zh, high(R1H_TBL*2)  ;1 
    lpm Rx, Z                ;3
    ;D1 = (D * Rx) >> 8          
    mul Rx , DH              ;1
    mov D1L, PRODL           ;1
    mov D1H, PRODH           ;1
    mul Rx , DL              ;1
    add D1L, PRODH           ;1
    adc D1H, RZERO           ;1
    ;N1 = (D * Rx) >> 8          
    mul Rx , NH              ;1
    mov N1L, PRODL           ;1
    mov N1H, PRODH           ;1
    mul Rx , NL              ;1
    add N1L, PRODH           ;1
    adc N1H, RZERO           ;1
    ;if D1H = 0 then use Rx = 256, otherwise use table   
    tst D1H                  ;1
    brne idivu_16x16_dxhne ;2
    mov N2L, N1H             ;1
    clr N2H                  ;1
    rjmp idivu_16x16_checkn;2
    idivu_16x16_dxhne:
    ;Lookup Rx = (2 ^ 16)  (256 + D1H)
    mov zl, D1L              ;1
    ldi zh, high(R2_TBL*2)   ;1
    lpm Rx, Z                ;3
    ;N2 = (N1 * R2) >> 16
    mul Rx  , N1H            ;1
    mov PL  , PRODL          ;1
    mov N2L , PRODH          ;1
    mul Rx , N1L             ;1
    add PL , PRODH           ;1
    adc N2L, RZERO           ;1
    clr N2H                  ;1
    idivu_16x16_checkn:
    ;Check result (it may be off by +/- 1)
    ;P = N2 * D
    ;NOTE:  N2 <=255 so NxH = 0, also P < 2^16 so we can discard upper byte of DH * NxL
    mul DL , N2L             ;1
    mov PL, PRODL            ;1
    mov PH, PRODH            ;1
    mul DH , N2L             ;1
    add PH , PRODL           ;1 
    brcc idivu_16x16_adj_n ;2
    ;if multiply overflowed then...
    ;decrement quotient
    ;calculate remainder as N - P + D
    subi N2L, 0x01           ;1
    sbci N2H, 0x00           ;1
    mov N1L, NL              ;1
    mov N1H, NH              ;1
    sub N1L, PL              ;1
    sbc N1H, PH              ;1
    add  N1L, DL             ;1
    adc  N1H, DH             ;1
    rjmp idivu_16x16_end   ;2
    ;Adjust result up or down by 1 if needed.
    idivu_16x16_adj_n:
    ;Add -P to N, with result in P
    mov N1L, NL              ;1
    mov N1H, NH              ;1
    sub N1L, PL              ;1
    sbc N1H, PH              ;1
    brsh idivu_16x16_pltn  ;2
    idivu_16x16_decn2:
    ;if P > N then decrement quotient, add to remainder
    subi N2L, 0x01           ;1
    sbci N2H, 0x00           ;1
    add  N1L, DL             ;1
    adc  N1H, DH             ;1
    rjmp idivu_16x16_end   ;2
    idivu_16x16_pltn:
    ;test remainder to D 
    cp  N1L, DL              ;1
    cpc N1H, DH              ;1
    ;if remainder < D then goto end
    brlo idivu_16x16_end   ;2
    ;if remainder >= D then increment quotient, reduce remainder
    subi N2L, 0xFF           ;1
    sbci N2H, 0xFF           ;1
    sub N1L, DL              ;1
    sbc N1H, DH              ;1
    idivu_16x16_end:
    ret
    .undef NH    .undef NL   
    .undef DH    .undef DL   
    .undef N2H   .undef N2L  
    .undef N1H   .undef N1L  
    .undef PRODL .undef PRODH
    .undef PH    .undef PL   
    .undef D1H   .undef D1L  
    .undef RZERO 
    .undef Rx