rts/gmp/mpn/x86/pentium/diveby3.asm

   1 dnl  Intel P5 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
   2 dnl
   3 dnl  P5: 15.0 cycles/limb
   4
   5
   6 dnl  Copyright (C) 2000 Free Software Foundation, Inc.
   7 dnl
   8 dnl  This file is part of the GNU MP Library.
   9 dnl
  10 dnl  The GNU MP Library is free software; you can redistribute it and/or
  11 dnl  modify it under the terms of the GNU Lesser General Public License as
  12 dnl  published by the Free Software Foundation; either version 2.1 of the
  13 dnl  License, or (at your option) any later version.
  14 dnl
  15 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  16 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18 dnl  Lesser General Public License for more details.
  19 dnl
  20 dnl  You should have received a copy of the GNU Lesser General Public
  21 dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
  22 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
  23 dnl  Suite 330, Boston, MA 02111-1307, USA.
  24
  25
  26 include(`../config.m4')
  27
  28
  29 C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
  30 C                              mp_limb_t carry);
  31
  32 defframe(PARAM_CARRY,16)
  33 defframe(PARAM_SIZE, 12)
  34 defframe(PARAM_SRC,   8)
  35 defframe(PARAM_DST,   4)
  36
  37 dnl  multiplicative inverse of 3, modulo 2^32
  38 deflit(INVERSE_3,        0xAAAAAAAB)
  39
  40 dnl  ceil(b/3), ceil(b*2/3) and floor(b*2/3) where b=2^32
  41 deflit(ONE_THIRD_CEIL,   0x55555556)
  42 deflit(TWO_THIRDS_CEIL,  0xAAAAAAAB)
  43 deflit(TWO_THIRDS_FLOOR, 0xAAAAAAAA)
  44
  45         .text
  46         ALIGN(8)
  47
  48 PROLOGUE(mpn_divexact_by3c)
  49 deflit(`FRAME',0)
  50
  51         movl    PARAM_SRC, %ecx
  52         movl    PARAM_SIZE, %edx
  53
  54         decl    %edx
  55         jnz     L(two_or_more)
  56
  57         movl    (%ecx), %edx
  58         movl    PARAM_CARRY, %eax       C risk of cache bank clash here
  59
  60         movl    PARAM_DST, %ecx
  61         subl    %eax, %edx
  62
  63         sbbl    %eax, %eax              C 0 or -1
  64
  65         imull   $INVERSE_3, %edx, %edx
  66
  67         negl    %eax                    C 0 or 1
  68         cmpl    $ONE_THIRD_CEIL, %edx
  69
  70         sbbl    $-1, %eax               C +1 if edx>=ceil(b/3)
  71         cmpl    $TWO_THIRDS_CEIL, %edx
  72
  73         sbbl    $-1, %eax               C +1 if edx>=ceil(b*2/3)
  74         movl    %edx, (%ecx)
  75
  76         ret
  77
  78
  79 L(two_or_more):
  80         C eax
  81         C ebx
  82         C ecx   src
  83         C edx   size-1
  84         C esi
  85         C edi
  86         C ebp
  87
  88         pushl   %ebx    FRAME_pushl()
  89         pushl   %esi    FRAME_pushl()
  90
  91         pushl   %edi    FRAME_pushl()
  92         pushl   %ebp    FRAME_pushl()
  93
  94         movl    PARAM_DST, %edi
  95         movl    PARAM_CARRY, %esi
  96
  97         movl    (%ecx), %eax            C src low limb
  98         xorl    %ebx, %ebx
  99
 100         sub     %esi, %eax
 101         movl    $TWO_THIRDS_FLOOR, %esi
 102
 103         leal    (%ecx,%edx,4), %ecx     C &src[size-1]
 104         leal    (%edi,%edx,4), %edi     C &dst[size-1]
 105
 106         adcl    $0, %ebx                C carry, 0 or 1
 107         negl    %edx                    C -(size-1)
 108
 109
 110 C The loop needs a source limb ready at the top, which leads to one limb
 111 C handled separately at the end, and the special case above for size==1.
 112 C There doesn't seem to be any scheduling that would keep the speed but move
 113 C the source load and carry subtract up to the top.
 114 C
 115 C The destination cache line prefetching adds 1 cycle to the loop but is
 116 C considered worthwhile.  The slowdown is a factor of 1.07, but will prevent
 117 C repeated write-throughs if the destination isn't in L1.  A version using
 118 C an outer loop to prefetch only every 8 limbs (a cache line) proved to be
 119 C no faster, due to unavoidable branch mispreditions in the inner loop.
 120 C
 121 C setc is 2 cycles on P54, so an adcl is used instead.  If the movl $0,%ebx
 122 C could be avoided then the src limb fetch could pair up and save a cycle.
 123 C This would probably mean going to a two limb loop with the carry limb
 124 C alternately positive or negative, since an sbbl %ebx,%ebx will leave a
 125 C value which is in the opposite sense to the preceding sbbl/adcl %ebx,%eax.
 126 C
 127 C A register is used for TWO_THIRDS_FLOOR because a cmp can't be done as
 128 C "cmpl %edx, $n" with the immediate as the second operand.
 129 C
 130 C The "4" source displacement is in the loop rather than the setup because
 131 C this gets L(top) aligned to 8 bytes at no cost.
 132
 133         ALIGN(8)
 134 L(top):
 135         C eax   source limb, carry subtracted
 136         C ebx   carry (0 or 1)
 137         C ecx   &src[size-1]
 138         C edx   counter, limbs, negative
 139         C esi   TWO_THIRDS_FLOOR
 140         C edi   &dst[size-1]
 141         C ebp   scratch (result limb)
 142
 143         imull   $INVERSE_3, %eax, %ebp
 144
 145         cmpl    $ONE_THIRD_CEIL, %ebp
 146         movl    (%edi,%edx,4), %eax     C dst cache line prefetch
 147
 148         sbbl    $-1, %ebx               C +1 if ebp>=ceil(b/3)
 149         cmpl    %ebp, %esi
 150
 151         movl    4(%ecx,%edx,4), %eax    C next src limb
 152
 153         sbbl    %ebx, %eax              C and further -1 if ebp>=ceil(b*2/3)
 154         movl    $0, %ebx
 155
 156         adcl    $0, %ebx                C new carry
 157         movl    %ebp, (%edi,%edx,4)
 158
 159         incl    %edx
 160         jnz     L(top)
 161
 162
 163
 164         imull   $INVERSE_3, %eax, %edx
 165
 166         cmpl    $ONE_THIRD_CEIL, %edx
 167         movl    %edx, (%edi)
 168
 169         sbbl    $-1, %ebx       C +1 if edx>=ceil(b/3)
 170         cmpl    $TWO_THIRDS_CEIL, %edx
 171
 172         sbbl    $-1, %ebx       C +1 if edx>=ceil(b*2/3)
 173         popl    %ebp
 174
 175         movl    %ebx, %eax
 176         popl    %edi
 177
 178         popl    %esi
 179         popl    %ebx
 180
 181         ret
 182
 183 EPILOGUE()