rts/gmp/mpn/x86/mul_basecase.asm

   1 dnl  x86 mpn_mul_basecase -- Multiply two limb vectors and store the result
   2 dnl  in a third limb vector.
   3
   4
   5 dnl  Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation,
   6 dnl  Inc.
   7 dnl
   8 dnl  This file is part of the GNU MP Library.
   9 dnl
  10 dnl  The GNU MP Library is free software; you can redistribute it and/or
  11 dnl  modify it under the terms of the GNU Lesser General Public License as
  12 dnl  published by the Free Software Foundation; either version 2.1 of the
  13 dnl  License, or (at your option) any later version.
  14 dnl
  15 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  16 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18 dnl  Lesser General Public License for more details.
  19 dnl
  20 dnl  You should have received a copy of the GNU Lesser General Public
  21 dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
  22 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
  23 dnl  Suite 330, Boston, MA 02111-1307, USA.
  24
  25
  26 include(`../config.m4')
  27
  28
  29 C void mpn_mul_basecase (mp_ptr wp,
  30 C                        mp_srcptr xp, mp_size_t xsize,
  31 C                        mp_srcptr yp, mp_size_t ysize);
  32 C
  33 C This was written in haste since the Pentium optimized code that was used
  34 C for all x86 machines was slow for the Pentium II.  This code would benefit
  35 C from some cleanup.
  36 C
  37 C To shave off some percentage of the run-time, one should make 4 variants
  38 C of the L(outer) loop, for the four different outcomes of xsize mod 4.  That
  39 C would avoid L(oop0) altogether.  Code expansion would be > 4-fold for that
  40 C part of the function, but since it is not very large, that would be
  41 C acceptable.
  42 C
  43 C The mul loop (at L(oopM)) might need some tweaking.  Its current speed is
  44 C unknown.
  45
  46 defframe(PARAM_YSIZE,20)            C 5th argument: ysize, limb count of yp
  47 defframe(PARAM_YP,   16)            C 4th argument: yp (slot is advanced in place by L(outer))
  48 defframe(PARAM_XSIZE,12)            C 3rd argument: xsize, limb count of xp
  49 defframe(PARAM_XP,   8)             C 2nd argument: xp
  50 defframe(PARAM_WP,   4)             C 1st argument: wp, destination
  51
  52 defframe(VAR_MULTIPLIER, -4)        C local: the y limb currently being multiplied in
  53 defframe(VAR_COUNTER,    -8)        C local: number of L(outer) passes still to run
  54 deflit(VAR_STACK_SPACE,  8)         C bytes reserved for the two locals above
  55
  56         .text
  57         ALIGN(8)                    C alignment request for the function entry below
  58
  59 PROLOGUE(mpn_mul_basecase)
     C -----------------------------------------------------------------------
     C void mpn_mul_basecase (mp_ptr wp, mp_srcptr xp, mp_size_t xsize,
     C                        mp_srcptr yp, mp_size_t ysize)
     C
     C Schoolbook multiply: writes the xsize+ysize limb product of
     C {xp,xsize} and {yp,ysize} to wp.  The first row (x * yp[0]) is stored
     C directly by L(oopM); every further y limb is multiplied in and
     C accumulated into wp by L(oop0)/L(oopX), one limb further up each pass.
     C
     C In:     all five arguments on the stack (see the PARAM_ slots).
     C Needs:  xsize >= 1 and ysize >= 1, since xp[0] and yp[0] are read
     C         unconditionally.  When xsize is 1 the code returns after two
     C         limbs without looking at ysize, i.e. it relies on
     C         xsize >= ysize.
     C Clobb:  eax, ecx, edx, flags.  esi, ebp, edi (and ebx when used) are
     C         saved and restored.  The PARAM_YP argument slot is overwritten.
     C
     C Register roles after setup:
     C   esi = running pointer into x        edi = running pointer into w
     C   ebp = yp during the first row, scratch carry limb inside L(oopX)
     C   ebx = carry limb                    ecx = inner loop counter
     C -----------------------------------------------------------------------
  60 deflit(`FRAME',0)
  61
  62         subl    $VAR_STACK_SPACE,%esp   C room for VAR_MULTIPLIER and VAR_COUNTER
  63         pushl   %esi
  64         pushl   %ebp
  65         pushl   %edi
  66 deflit(`FRAME',eval(VAR_STACK_SPACE+12))
     C FRAME now accounts for the locals plus the three pushes above
  67
  68         movl    PARAM_XP,%esi
  69         movl    PARAM_WP,%edi
  70         movl    PARAM_YP,%ebp
  71
  72         movl    (%esi),%eax             C load xp[0]
  73         mull    (%ebp)                  C edx:eax = xp[0] * yp[0]
  74         movl    %eax,(%edi)             C store low limb to wp[0]
  75         movl    PARAM_XSIZE,%ecx        C xsize
  76         decl    %ecx                    C ecx = x limbs left; xsize = 1 is taken to mean ysize = 1 too
  77         jz      L(done)                 C 1x1 product: only the high limb remains to be stored
  78
  79         pushl   %ebx                    C ebx is only needed (and saved) on the general path
  80 FRAME_pushl()
  81         movl    %edx,%ebx               C ebx = carry limb (high half of xp[0]*yp[0])
  82
  83         leal    4(%esi),%esi            C step to xp[1]
  84         leal    4(%edi),%edi            C step to wp[1]
  85
     C First row: wp[j] = low(xp[j]*yp[0] + carry), for j = 1 .. xsize-1.
  86 L(oopM):
  87         movl    (%esi),%eax             C load next limb at xp[j]
  88         leal    4(%esi),%esi
  89         mull    (%ebp)                  C edx:eax = xp[j] * yp[0]
  90         addl    %ebx,%eax               C add carry-in to the low half
  91         movl    %edx,%ebx               C mov leaves CF from the add intact
  92         adcl    $0,%ebx                 C ebx = high half + CF = carry-out
  93         movl    %eax,(%edi)
  94         leal    4(%edi),%edi
  95         decl    %ecx
  96         jnz     L(oopM)
  97
  98         movl    %ebx,(%edi)             C most significant limb of product
  99         addl    $4,%edi                 C edi = wp + xsize + 1 limbs
 100         movl    PARAM_XSIZE,%eax
 101         shll    $2,%eax                 C eax = xsize in bytes
 102         subl    %eax,%edi               C rewind: edi = &wp[1], base of the next row
 103         subl    %eax,%esi               C rewind: esi = &xp[0]
 104
 105         movl    PARAM_YSIZE,%eax        C ysize
 106         decl    %eax
 107         jz      L(skip)                 C ysize = 1: the first row is the whole product
 108         movl    %eax,VAR_COUNTER        C ysize-1 rows remain to be accumulated
 109
     C One pass per remaining y limb: {edi,xsize} += x * y limb, then store the
     C carry-out as a new top limb.  ebp is reused as scratch in L(oopX), so yp
     C lives in its argument slot and the multiplier in a stack local.
 110 L(outer):
 111         movl    PARAM_YP,%ebp           C yp
 112         addl    $4,%ebp                 C make ebp point to next y limb
 113         movl    %ebp,PARAM_YP           C remember the advanced pointer for the next pass
 114         movl    (%ebp),%eax             C copy y limb ...
 115         movl    %eax,VAR_MULTIPLIER     C ... to stack slot
 116         movl    PARAM_XSIZE,%ecx
 117
 118         xorl    %ebx,%ebx               C carry limb = 0 at the start of a row
 119         andl    $3,%ecx                 C ecx = xsize mod 4 limbs for the one-at-a-time loop
 120         jz      L(end0)
 121
     C Leftover loop: handles xsize mod 4 limbs, one per iteration.
 122 L(oop0):
 123         movl    (%esi),%eax
 124         mull    VAR_MULTIPLIER
 125         leal    4(%esi),%esi
 126         addl    %ebx,%eax               C low half + carry-in
 127         movl    $0,%ebx                 C must be mov, not xor: CF from the add is still needed
 128         adcl    %ebx,%edx               C high half += CF
 129         addl    %eax,(%edi)             C accumulate into the result limb
 130         adcl    %edx,%ebx               C propagate carry into cylimb
 131
 132         leal    4(%edi),%edi
 133         decl    %ecx
 134         jnz     L(oop0)
 135
 136 L(end0):
 137         movl    PARAM_XSIZE,%ecx
 138         shrl    $2,%ecx                 C ecx = number of 4-limb groups
 139         jz      L(endX)
 140
     C Unrolled loop: four limbs per iteration.  ebx and ebp alternate as the
     C pending limb; each add to memory is issued after the following mull so
     C that its CF feeds straight into the adcl of the new low half.
 141         ALIGN(8)
 142 L(oopX):
 143         movl    (%esi),%eax
 144         mull    VAR_MULTIPLIER
 145         addl    %eax,%ebx               C ebx = lo0 + carry-in
 146         movl    $0,%ebp                 C mov keeps CF
 147         adcl    %edx,%ebp               C ebp = hi0 + CF
 148
 149         movl    4(%esi),%eax
 150         mull    VAR_MULTIPLIER
 151         addl    %ebx,(%edi)             C limb 0 of the group accumulated
 152         adcl    %eax,%ebp       C new lo + cylimb
 153         movl    $0,%ebx
 154         adcl    %edx,%ebx               C ebx = hi1 + CF
 155
 156         movl    8(%esi),%eax
 157         mull    VAR_MULTIPLIER
 158         addl    %ebp,4(%edi)            C limb 1 of the group accumulated
 159         adcl    %eax,%ebx       C new lo + cylimb
 160         movl    $0,%ebp
 161         adcl    %edx,%ebp               C ebp = hi2 + CF
 162
 163         movl    12(%esi),%eax
 164         mull    VAR_MULTIPLIER
 165         addl    %ebx,8(%edi)            C limb 2 of the group accumulated
 166         adcl    %eax,%ebp       C new lo + cylimb
 167         movl    $0,%ebx
 168         adcl    %edx,%ebx               C ebx = hi3 + CF
 169
 170         addl    %ebp,12(%edi)           C limb 3 of the group accumulated
 171         adcl    $0,%ebx         C propagate carry into cylimb
 172
 173         leal    16(%esi),%esi
 174         leal    16(%edi),%edi
 175         decl    %ecx
 176         jnz     L(oopX)
 177
 178 L(endX):
 179         movl    %ebx,(%edi)             C carry-out becomes the new top limb of the result
 180         addl    $4,%edi
 181
 182         C we incremented wp and xp in the loop above; compensate
 183         movl    PARAM_XSIZE,%eax
 184         shll    $2,%eax
 185         subl    %eax,%edi               C net effect: edi is one limb above this row's base
 186         subl    %eax,%esi               C esi = &xp[0] again
 187
 188         movl    VAR_COUNTER,%eax
 189         decl    %eax
 190         movl    %eax,VAR_COUNTER        C mov keeps ZF from the decl for the branch below
 191         jnz     L(outer)
 192
     C General-path exit: ebx was pushed, so four registers are restored.
 193 L(skip):
 194         popl    %ebx
 195         popl    %edi
 196         popl    %ebp
 197         popl    %esi
 198         addl    $VAR_STACK_SPACE,%esp   C release the locals reserved at entry
 199         ret
 200
     C 1x1 exit: taken before ebx was pushed, so only three registers are restored.
 201 L(done):
 202         movl    %edx,4(%edi)       C store to wp[1]
 203         popl    %edi
 204         popl    %ebp
 205         popl    %esi
 206         addl    $VAR_STACK_SPACE,%esp   C release the locals reserved at entry
 207         ret
 208
 209 EPILOGUE()