rts/gmp/mpn/x86/divrem_1.asm

   1 dnl  x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.
   2
   3 dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 2.1 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public
  18 dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
  19 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
  20 dnl  Suite 330, Boston, MA 02111-1307, USA.
  21
  22
  23 dnl        cycles/limb
  24 dnl  K6        20
  25 dnl  P5        44
  26 dnl  P6        39
  27 dnl  486   approx 43 maybe
  28 dnl
  29 dnl
  30 dnl  The following have their own optimized divrem_1 implementations, but
  31 dnl  for reference the code here runs as follows.
  32 dnl
  33 dnl        cycles/limb
  34 dnl  P6MMX     39
  35 dnl  K7        42
  36
  37
  38 include(`../config.m4')
  39
  40
  41 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
  42 C                         mp_srcptr src, mp_size_t size, mp_limb_t divisor);
  43 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
  44 C                          mp_srcptr src, mp_size_t size, mp_limb_t divisor);
  45 C
  46 C Divide src,size by divisor and store the quotient in dst+xsize,size.
  47 C Extend the division to fractional quotient limbs in dst,xsize.  Return the
  48 C remainder.  Either or both xsize and size can be 0.
  49 C
  50 C mpn_divrem_1c takes a carry parameter which is an initial high limb,
  51 C effectively one extra limb at the top of src,size.  Must have
  52 C carry<divisor.
  53 C
  54 C
  55 C Essentially the code is the same as the division based part of
  56 C mpn/generic/divrem_1.c, but has the following advantages.
  57 C
  58 C - If gcc isn't being used then divrem_1.c will get the generic C
  59 C   udiv_qrnnd() and be rather slow.
  60 C
  61 C - On K6, using the loop instruction is a 10% speedup, but gcc doesn't
  62 C   generate that instruction (as of gcc 2.95.2 at least).
  63 C
  64 C A test is done to see if the high limb is less the the divisor, and if so
  65 C one less div is done.  A div is between 20 and 40 cycles on the various
  66 C x86s, so assuming high<divisor about half the time, then this test saves
  67 C half that amount.  The branch misprediction penalty on each chip is less
  68 C than half a div.
  69 C
  70 C
  71 C K6: Back-to-back div instructions run at 20 cycles, the same as the loop
  72 C     here, so it seems there's nothing to gain by rearranging the loop.
  73 C     Pairing the mov and loop instructions was found to gain nothing.  (The
  74 C     same is true of the mpn/x86/mod_1.asm loop.)
  75 C
  76 C     With a "decl/jnz" rather than a "loop" this code runs at 22 cycles.
  77 C     The loop_or_decljnz macro is an easy way to get a 10% speedup.
  78 C
  79 C     The fast K6 multiply might be thought to suit a multiply-by-inverse,
  80 C     but that algorithm has been found to suffer from the releatively poor
  81 C     carry handling on K6 and too many auxiliary instructions.  The
  82 C     fractional part however could be done at about 13 c/l.
  83 C
  84 C P5: Moving the load down to pair with the store might save 1 cycle, but
  85 C     that doesn't seem worth bothering with, since it'd be only a 2.2%
  86 C     saving.
  87 C
  88 C     Again here the auxiliary instructions hinder a multiply-by-inverse,
  89 C     though there might be a 10-15% speedup available
  90
  91
  92 defframe(PARAM_CARRY,  24)
  93 defframe(PARAM_DIVISOR,20)
  94 defframe(PARAM_SIZE,   16)
  95 defframe(PARAM_SRC,    12)
  96 defframe(PARAM_XSIZE,  8)
  97 defframe(PARAM_DST,    4)
  98
  99         .text
 100         ALIGN(16)
 101
 102 PROLOGUE(mpn_divrem_1c)
 103 deflit(`FRAME',0)
 104
 105         movl    PARAM_SIZE, %ecx
 106         pushl   %edi            FRAME_pushl()
 107
 108         movl    PARAM_SRC, %edi
 109         pushl   %esi            FRAME_pushl()
 110
 111         movl    PARAM_DIVISOR, %esi
 112         pushl   %ebx            FRAME_pushl()
 113
 114         movl    PARAM_DST, %ebx
 115         pushl   %ebp            FRAME_pushl()
 116
 117         movl    PARAM_XSIZE, %ebp
 118         orl     %ecx, %ecx
 119
 120         movl    PARAM_CARRY, %edx
 121         jz      LF(mpn_divrem_1,fraction)
 122
 123         leal    -4(%ebx,%ebp,4), %ebx   C dst one limb below integer part
 124         jmp     LF(mpn_divrem_1,integer_top)
 125
 126 EPILOGUE()
 127
 128
 129 PROLOGUE(mpn_divrem_1)
 130 deflit(`FRAME',0)
 131
 132         movl    PARAM_SIZE, %ecx
 133         pushl   %edi            FRAME_pushl()
 134
 135         movl    PARAM_SRC, %edi
 136         pushl   %esi            FRAME_pushl()
 137
 138         movl    PARAM_DIVISOR, %esi
 139         orl     %ecx,%ecx
 140
 141         jz      L(size_zero)
 142         pushl   %ebx            FRAME_pushl()
 143
 144         movl    -4(%edi,%ecx,4), %eax   C src high limb
 145         xorl    %edx, %edx
 146
 147         movl    PARAM_DST, %ebx
 148         pushl   %ebp            FRAME_pushl()
 149
 150         movl    PARAM_XSIZE, %ebp
 151         cmpl    %esi, %eax
 152
 153         leal    -4(%ebx,%ebp,4), %ebx   C dst one limb below integer part
 154         jae     L(integer_entry)
 155
 156
 157         C high<divisor, so high of dst is zero, and avoid one div
 158
 159         movl    %edx, (%ebx,%ecx,4)
 160         decl    %ecx
 161
 162         movl    %eax, %edx
 163         jz      L(fraction)
 164
 165
 166 L(integer_top):
 167         C eax   scratch (quotient)
 168         C ebx   dst+4*xsize-4
 169         C ecx   counter
 170         C edx   scratch (remainder)
 171         C esi   divisor
 172         C edi   src
 173         C ebp   xsize
 174
 175         movl    -4(%edi,%ecx,4), %eax
 176 L(integer_entry):
 177
 178         divl    %esi
 179
 180         movl    %eax, (%ebx,%ecx,4)
 181         loop_or_decljnz L(integer_top)
 182
 183
 184 L(fraction):
 185         orl     %ebp, %ecx
 186         jz      L(done)
 187
 188         movl    PARAM_DST, %ebx
 189
 190
 191 L(fraction_top):
 192         C eax   scratch (quotient)
 193         C ebx   dst
 194         C ecx   counter
 195         C edx   scratch (remainder)
 196         C esi   divisor
 197         C edi
 198         C ebp
 199
 200         xorl    %eax, %eax
 201
 202         divl    %esi
 203
 204         movl    %eax, -4(%ebx,%ecx,4)
 205         loop_or_decljnz L(fraction_top)
 206
 207
 208 L(done):
 209         popl    %ebp
 210         movl    %edx, %eax
 211         popl    %ebx
 212         popl    %esi
 213         popl    %edi
 214         ret
 215
 216
 217 L(size_zero):
 218 deflit(`FRAME',8)
 219         movl    PARAM_XSIZE, %ecx
 220         xorl    %eax, %eax
 221
 222         movl    PARAM_DST, %edi
 223
 224         cld     C better safe than sorry, see mpn/x86/README.family
 225
 226         rep
 227         stosl
 228
 229         popl    %esi
 230         popl    %edi
 231         ret
 232 EPILOGUE()