3 dnl m4 macros for x86 assembler.
6 dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
8 dnl This file is part of the GNU MP Library.
10 dnl The GNU MP Library is free software; you can redistribute it and/or
11 dnl modify it under the terms of the GNU Lesser General Public License as
12 dnl published by the Free Software Foundation; either version 2.1 of the
13 dnl License, or (at your option) any later version.
15 dnl The GNU MP Library is distributed in the hope that it will be useful,
16 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 dnl Lesser General Public License for more details.
20 dnl You should have received a copy of the GNU Lesser General Public
21 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23 dnl Suite 330, Boston, MA 02111-1307, USA.
28 dnl m4 isn't perfect for processing BSD style x86 assembler code. The main problems are:
31 dnl 1. Doing define(foo,123) and then using foo in an addressing mode like
32 dnl foo(%ebx) expands as a macro rather than a constant. This is worked
33 dnl around by using deflit() from asm-defs.m4, instead of define().
35 dnl 2. Immediates in macro definitions need a space or `' to stop the $
36 dnl looking like a macro parameter. For example,
38 dnl define(foo, `mov $ 123, %eax')
40 dnl This is only a problem in macro definitions, not in ordinary text,
41 dnl nor in macro parameters like text passed to forloop() or ifdef().
dnl Number of bytes in an mp limb, given here as 4.  deflit() rather than
dnl define() is the form item 1 above recommends for constants that may be
dnl written directly in front of a parenthesized addressing mode.
44 deflit(BYTES_PER_MP_LIMB, 4)
47 dnl --------------------------------------------------------------------------
48 dnl Replacement PROLOGUE/EPILOGUE with more sophisticated error checking.
49 dnl Nesting and overlapping not allowed.
53 dnl Usage: PROLOGUE(functionname)
55 dnl Generate a function prologue. functionname gets GSYM_PREFIX added.
58 dnl PROLOGUE(mpn_add_n)
63 m4_assert_defined(`PROLOGUE_cpu')
64 `ifdef(`PROLOGUE_current_function',
65 `m4_error(`PROLOGUE'(`PROLOGUE_current_function') needs an `EPILOGUE'() before `PROLOGUE'($1)
68 define(`PROLOGUE_current_function',`$1')dnl
69 PROLOGUE_cpu(GSYM_PREFIX`'$1)')
74 dnl Notice the function name is passed to EPILOGUE_cpu(), letting it use $1
75 dnl instead of the long PROLOGUE_current_function symbol.
79 m4_assert_defined(`EPILOGUE_cpu')
80 `ifdef(`PROLOGUE_current_function',,
81 `m4_error(`EPILOGUE'() with no `PROLOGUE'()
83 EPILOGUE_cpu(GSYM_PREFIX`'PROLOGUE_current_function)`'dnl
84 undefine(`PROLOGUE_current_function')')
87 `ifdef(`PROLOGUE_current_function',
88 `m4_error(`EPILOGUE() for PROLOGUE('PROLOGUE_current_function`) never seen
92 dnl Usage: PROLOGUE_assert_inside()
94 dnl Use this unquoted on a line on its own at the start of a macro
95 dnl definition to add some code to check the macro is only used inside a
96 dnl PROLOGUE/EPILOGUE pair, and that hence PROLOGUE_current_function is defined.
99 define(PROLOGUE_assert_inside,
101 ``PROLOGUE_assert_inside_internal'(m4_doublequote($`'0))`dnl '')
103 define(PROLOGUE_assert_inside_internal,
105 `ifdef(`PROLOGUE_current_function',,
106 `m4_error(`$1 used outside a PROLOGUE / EPILOGUE pair
110 dnl Usage: L(labelname)
111 dnl LF(functionname,labelname)
113 dnl Generate a local label in the current or given function. For LF(),
114 dnl functionname gets GSYM_PREFIX added, the same as with PROLOGUE().
116 dnl For example, in a function mpn_add_n (and with MPN_PREFIX __gmpn),
118 dnl L(bar) => L__gmpn_add_n__bar
119 dnl LF(somefun,bar) => Lsomefun__bar
121 dnl The function name and label name get two underscores between them rather
122 dnl than one to guard against clashing with a separate external symbol that
123 dnl happened to be called functionname_labelname. (Though this would only
124 dnl happen if the local label prefix is empty.) Underscores are used so
125 dnl the whole label will still be a valid C identifier and so can be easily
128 dnl LSYM_PREFIX can be L$, so defn() is used to prevent L expanding as the
129 dnl L macro and making an infinite recursion.
132 m4_assert_defined(`LSYM_PREFIX')
133 `defn(`LSYM_PREFIX')GSYM_PREFIX`'$1`'__$2')
137 PROLOGUE_assert_inside()
138 `LF(PROLOGUE_current_function,`$1')')
141 dnl Called: PROLOGUE_cpu(gsym)
142 dnl EPILOGUE_cpu(gsym)
156 dnl --------------------------------------------------------------------------
157 dnl Various x86 macros.
161 dnl Usage: ALIGN_OFFSET(bytes,offset)
163 dnl Align to `offset' away from a multiple of `bytes'.
165 dnl This is useful for testing, for example align to something very strict
166 dnl and see what effect offsets from it have, "ALIGN_OFFSET(256,32)".
168 dnl Generally you wouldn't execute across the padding, but it's done with
169 dnl nop's so it'll work.
174 forloop(`i',1,$2,` nop
178 dnl Usage: defframe(name,offset)
180 dnl Make a definition like the following with which to access a parameter
181 dnl or variable on the stack.
183 dnl define(name,`FRAME+offset(%esp)')
185 dnl Actually m4_empty_if_zero(FRAME+offset) is used, which will save one
186 dnl byte if FRAME+offset is zero, by putting (%esp) rather than 0(%esp).
187 dnl Use define(`defframe_empty_if_zero_disabled',1) if for some reason the
188 dnl zero offset is wanted.
190 dnl The new macro also gets a check that when it's used FRAME is actually
191 dnl defined, and that the final %esp offset isn't negative, which would
192 dnl mean an attempt to access something below the current %esp.
194 dnl deflit() is used rather than a plain define(), so the new macro won't
195 dnl delete any following parenthesized expression. name(%edi) will come
196 dnl out say as 16(%esp)(%edi). This isn't valid assembler and should
197 dnl provoke an error, which is better than silently giving just 16(%esp).
199 dnl See README.family for more on the suggested way to access the stack
205 m4_assert_defined(`FRAME')
206 `defframe_check_notbelow(`$1',$2,FRAME)dnl
207 defframe_empty_if_zero(FRAME+($2))(%esp)')')
209 dnl Called: defframe_empty_if_zero(expression)
210 define(defframe_empty_if_zero,
211 `ifelse(defframe_empty_if_zero_disabled,1,
213 `m4_empty_if_zero($1)')')
215 dnl Called: defframe_check_notbelow(`name',offset,FRAME)
216 define(defframe_check_notbelow,
218 `ifelse(eval(($3)+($2)<0),1,
219 `m4_error(`$1 at frame offset $2 used when FRAME is only $3 bytes
223 dnl Usage: FRAME_pushl()
225 dnl FRAME_addl_esp(n)
226 dnl FRAME_subl_esp(n)
228 dnl Adjust FRAME appropriately for a pushl or popl, or for an addl or subl of %esp.
231 dnl Using these macros is completely optional. Sometimes it makes more
232 dnl sense to put explicit deflit(`FRAME',N) forms, especially when there's
233 dnl jumps and different sequences of FRAME values need to be used in
234 dnl different places.
238 m4_assert_defined(`FRAME')
239 `deflit(`FRAME',eval(FRAME+4))')
243 m4_assert_defined(`FRAME')
244 `deflit(`FRAME',eval(FRAME-4))')
246 define(FRAME_addl_esp,
248 m4_assert_defined(`FRAME')
249 `deflit(`FRAME',eval(FRAME-($1)))')
251 define(FRAME_subl_esp,
253 m4_assert_defined(`FRAME')
254 `deflit(`FRAME',eval(FRAME+($1)))')
257 dnl Usage: defframe_pushl(name)
259 dnl Do a combination of a FRAME_pushl() and a defframe() to name the stack
260 dnl location just pushed. This should come after a pushl instruction.
261 dnl Putting it on the same line works and avoids lengthening the code. For example,
264 dnl pushl %eax defframe_pushl(VAR_COUNTER)
266 dnl Notice the defframe() is done with an unquoted -FRAME thus giving its
267 dnl current value without tracking future changes.
dnl Expands to FRAME_pushl() immediately followed by defframe(`$1',-FRAME).
dnl The -FRAME argument is not separately quoted, so it is expanded when
dnl defframe_pushl is used, after the FRAME_pushl() ahead of it has run.
269 define(defframe_pushl,
270 `FRAME_pushl()defframe(`$1',-FRAME)')
273 dnl --------------------------------------------------------------------------
274 dnl Assembler instruction macros.
278 dnl Usage: emms_or_femms
279 dnl femms_available_p
281 dnl femms_available_p expands to 1 or 0 according to whether the AMD 3DNow
282 dnl femms instruction is available. emms_or_femms expands to femms if
283 dnl available, or emms if not.
285 dnl emms_or_femms is meant for use in the K6 directory where plain K6
286 dnl (without femms) and K6-2 and K6-3 (with a slightly faster femms) are
287 dnl supported together.
289 dnl On K7 femms is no longer faster and is just an alias for emms, so plain
290 dnl emms may as well be used.
292 define(femms_available_p,
293 m4_assert_numargs(-1)
295 `HAVE_TARGET_CPU_k62',
296 `HAVE_TARGET_CPU_k63',
297 `HAVE_TARGET_CPU_athlon')')
dnl Expands to `femms' when femms_available_p gives 1, and to `emms'
dnl otherwise.  The m4_assert_numargs(-1) check is defined elsewhere;
dnl presumably it insists the macro is used without parentheses -- confirm
dnl against asm-defs.m4.
299 define(emms_or_femms,
300 m4_assert_numargs(-1)
301 `ifelse(femms_available_p,1,`femms',`emms')')
306 dnl The gas 2.9.1 that comes with FreeBSD 3.4 doesn't support femms, so the
307 dnl following is a replacement using .byte.
309 dnl If femms isn't available, an emms is generated instead, for convenience
310 dnl when testing on a machine without femms.
313 m4_assert_numargs(-1)
314 `ifelse(femms_available_p,1,
315 `.byte 15,14 C AMD 3DNow femms',
317 m4_warning(`warning, using emms in place of femms, use for testing only
321 dnl Usage: jadcl0(op)
323 dnl Issue a jnc/incl as a substitute for adcl $0,op. This isn't an exact
324 dnl replacement, since it doesn't set the flags like adcl does.
326 dnl This finds a use in K6 mpn_addmul_1, mpn_submul_1, mpn_mul_basecase and
327 dnl mpn_sqr_basecase because on K6 an adcl is slow, the branch
328 dnl misprediction penalty is small, and the multiply algorithm used leads
329 dnl to a carry bit on average only 1/4 of the time.
331 dnl jadcl0_disabled can be set to 1 to instead issue an ordinary adcl for
332 dnl comparison. For example,
334 dnl define(`jadcl0_disabled',1)
336 dnl When using a register operand, eg. "jadcl0(%edx)", the jnc/incl code is
337 dnl the same size as an adcl. This makes it possible to use the exact same
338 dnl computed jump code when testing the relative speed of jnc/incl and adcl
339 dnl with jadcl0_disabled.
343 `ifelse(jadcl0_disabled,1,
350 dnl Usage: cmov_available_p
352 dnl Expand to 1 if cmov is available, 0 if not.
354 define(cmov_available_p,
356 `HAVE_TARGET_CPU_pentiumpro',
357 `HAVE_TARGET_CPU_pentium2',
358 `HAVE_TARGET_CPU_pentium3',
359 `HAVE_TARGET_CPU_athlon')')
362 dnl Usage: x86_lookup(target, key,value, key,value, ...)
363 dnl x86_lookup_p(target, key,value, key,value, ...)
365 dnl Look for `target' among the `key' parameters.
367 dnl x86_lookup expands to the corresponding `value', or generates an error
368 dnl if `target' isn't found.
370 dnl x86_lookup_p expands to 1 if `target' is found, or 0 if not.
373 `ifelse(eval($#<3),1,
374 `m4_error(`unrecognised part of x86 instruction: $1
376 `ifelse(`$1',`$2', `$3',
377 `x86_lookup(`$1',shift(shift(shift($@))))')')')
380 `ifelse(eval($#<3),1, `0',
381 `ifelse(`$1',`$2', `1',
382 `x86_lookup_p(`$1',shift(shift(shift($@))))')')')
385 dnl Usage: x86_opcode_reg32(reg)
386 dnl x86_opcode_reg32_p(reg)
388 dnl x86_opcode_reg32 expands to the standard 3 bit encoding for the given
389 dnl 32-bit register, eg. `%ebp' turns into 5.
391 dnl x86_opcode_reg32_p expands to 1 if reg is a valid 32-bit register, or 0 if not.
394 define(x86_opcode_reg32,
396 `x86_lookup(`$1',x86_opcode_reg32_list)')
398 define(x86_opcode_reg32_p,
400 `x86_lookup_p(`$1',x86_opcode_reg32_list)')
402 define(x86_opcode_reg32_list,
413 dnl Usage: x86_opcode_tttn(cond)
415 dnl Expand to the 4-bit "tttn" field value for the given x86 branch
416 dnl condition (like `c', `ae', etc), as found in x86_opcode_tttn_list.
dnl An unrecognised condition is reported as an error by x86_lookup().
dnl The table name must be spelt exactly as it is defined below
dnl (x86_opcode_tttn_list), otherwise it is not expanded and every lookup
dnl fails.
418 define(x86_opcode_tttn,
420 `x86_lookup(`$1',x86_opcode_tttn_list)')
422 define(x86_opcode_tttn_list,
425 `b', 2, `c', 2, `nae',2,
426 `nb', 3, `nc', 3, `ae', 3,
433 `p', 10, `pe', 10, `npo',10,
434 `np', 11, `npe',11, `po', 11,
441 dnl Usage: cmovCC(srcreg,dstreg)
443 dnl Generate a cmov instruction if the target supports cmov, or simulate it
444 dnl with a conditional jump if not (the latter being meant only for
445 dnl testing). For example,
447 dnl cmovz( %eax, %ebx)
449 dnl cmov instructions are generated using .byte sequences, since only
450 dnl recent versions of gas know cmov.
452 dnl The source operand can only be a plain register. (m4 code implementing
453 dnl full memory addressing modes exists, believe it or not, but isn't
454 dnl currently needed and isn't included.)
456 dnl All the standard conditions are defined. Attempting to use one without
457 dnl the macro parentheses, such as just "cmovbe %eax, %ebx", will provoke
458 dnl an error. This ensures the necessary .byte sequences aren't
459 dnl accidentally missed.
461 dnl Called: define_cmov_many(cond,tttn,cond,tttn,...)
dnl Walks the argument list two at a time: define_cmov() is applied to the
dnl leading (cond,tttn) pair, then the macro calls itself on the remaining
dnl arguments via shift(shift($@)).  It stops, expanding to nothing, once
dnl m4_length() of the first argument gives 0 (m4_length is defined
dnl elsewhere; presumably it is the string length -- confirm).
462 define(define_cmov_many,
463 `ifelse(m4_length(`$1'),0,,
464 `define_cmov(`$1',`$2')define_cmov_many(shift(shift($@)))')')
466 dnl Called: define_cmov(cond,tttn)
470 m4_instruction_wrapper()
472 `cmov_internal'(m4_doublequote($`'0),``$1',`$2'',dnl
473 m4_doublequote($`'1),m4_doublequote($`'2)))')
dnl Apply define_cmov() to every (cond,tttn) pair in x86_opcode_tttn_list.
dnl The list name is unquoted so that it expands into separate arguments.
475 define_cmov_many(x86_opcode_tttn_list)
478 dnl Called: cmov_internal(name,cond,tttn,src,dst)
479 define(cmov_internal,
481 `ifelse(cmov_available_p,1,
482 `cmov_bytes_tttn(`$1',`$3',`$4',`$5')',
483 `m4_warning(`warning, simulating cmov with jump, use for testing only
484 ')cmov_simulate(`$2',`$4',`$5')')')
486 dnl Called: cmov_simulate(cond,src,dst)
487 dnl If this is going to be used with memory operands for the source it will
488 dnl need to be changed to do a fetch even if the condition is false, so as
489 dnl to trigger exceptions the same way a real cmov does.
490 define(cmov_simulate,
492 `j$1 1f C cmov$1 $2, $3
497 dnl Called: cmov_bytes_tttn(name,tttn,src,dst)
498 define(cmov_bytes_tttn,
503 eval(192+8*x86_opcode_reg32(`$4')+x86_opcode_reg32(`$3')) dnl
507 dnl Usage: loop_or_decljnz label
509 dnl Generate either a "loop" instruction or a "decl %ecx / jnz", whichever
510 dnl is better. "loop" is better on K6 and probably on 386, on other chips
511 dnl separate decl/jnz is better.
513 dnl This macro is just for mpn/x86/divrem_1.asm and mpn/x86/mod_1.asm where
514 dnl this loop_or_decljnz variation is enough to let the code be shared by
517 define(loop_or_decljnz,
518 `ifelse(loop_is_better_p,1,
dnl Expands to whatever m4_ifdef_anyof_p() gives for the four
dnl HAVE_TARGET_CPU_ symbols listed (k6, k62, k63, i386); loop_or_decljnz
dnl compares the result against 1.  m4_ifdef_anyof_p is defined elsewhere;
dnl presumably it yields 1 when any of the named symbols is defined and 0
dnl otherwise -- confirm against asm-defs.m4.
523 define(loop_is_better_p,
524 `m4_ifdef_anyof_p(`HAVE_TARGET_CPU_k6',
525 `HAVE_TARGET_CPU_k62',
526 `HAVE_TARGET_CPU_k63',
527 `HAVE_TARGET_CPU_i386')')
530 dnl Usage: Zdisp(inst,op,op,op)
532 dnl Generate explicit .byte sequences if necessary to force a byte-sized
533 dnl zero displacement on an instruction. For example,
535 dnl Zdisp( movl, 0,(%esi), %eax)
539 dnl .byte 139,70,0 C movl 0(%esi), %eax
541 dnl If the displacement given isn't 0, then normal assembler code is
542 dnl generated. For example,
544 dnl Zdisp( movl, 4,(%esi), %eax)
548 dnl movl 4(%esi), %eax
550 dnl This means a single Zdisp() form can be used with an expression for the
551 dnl displacement, and .byte will be used only if necessary. The
552 dnl displacement argument is eval()ed.
554 dnl Because there aren't many places a 0(reg) form is wanted, Zdisp is
555 dnl implemented with a table of instructions and encodings. A new entry is
556 dnl needed for any different operation or registers.
559 `define(`Zdisp_found',0)dnl
560 Zdisp_match( movl, %eax, 0,(%edi), `137,71,0', $@)`'dnl
561 Zdisp_match( movl, %ebx, 0,(%edi), `137,95,0', $@)`'dnl
562 Zdisp_match( movl, %esi, 0,(%edi), `137,119,0', $@)`'dnl
563 Zdisp_match( movl, 0,(%ebx), %eax, `139,67,0', $@)`'dnl
564 Zdisp_match( movl, 0,(%ebx), %esi, `139,115,0', $@)`'dnl
565 Zdisp_match( movl, 0,(%esi), %eax, `139,70,0', $@)`'dnl
566 Zdisp_match( movl, 0,(%esi,%ecx,4), %eax, `0x8b,0x44,0x8e,0x00', $@)`'dnl
567 Zdisp_match( addl, %ebx, 0,(%edi), `1,95,0', $@)`'dnl
568 Zdisp_match( addl, %ecx, 0,(%edi), `1,79,0', $@)`'dnl
569 Zdisp_match( addl, %esi, 0,(%edi), `1,119,0', $@)`'dnl
570 Zdisp_match( subl, %ecx, 0,(%edi), `41,79,0', $@)`'dnl
571 Zdisp_match( adcl, 0,(%edx), %esi, `19,114,0', $@)`'dnl
572 Zdisp_match( sbbl, 0,(%edx), %esi, `27,114,0', $@)`'dnl
573 Zdisp_match( movq, 0,(%eax,%ecx,8), %mm0, `0x0f,0x6f,0x44,0xc8,0x00', $@)`'dnl
574 Zdisp_match( movq, 0,(%ebx,%eax,4), %mm0, `0x0f,0x6f,0x44,0x83,0x00', $@)`'dnl
575 Zdisp_match( movq, 0,(%ebx,%eax,4), %mm2, `0x0f,0x6f,0x54,0x83,0x00', $@)`'dnl
576 Zdisp_match( movq, 0,(%esi), %mm0, `15,111,70,0', $@)`'dnl
577 Zdisp_match( movq, %mm0, 0,(%edi), `15,127,71,0', $@)`'dnl
578 Zdisp_match( movq, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7f,0x54,0x81,0x00', $@)`'dnl
579 Zdisp_match( movq, %mm2, 0,(%edx,%eax,4), `0x0f,0x7f,0x54,0x82,0x00', $@)`'dnl
580 Zdisp_match( movq, %mm0, 0,(%edx,%ecx,8), `0x0f,0x7f,0x44,0xca,0x00', $@)`'dnl
581 Zdisp_match( movd, 0,(%eax,%ecx,8), %mm1, `0x0f,0x6e,0x4c,0xc8,0x00', $@)`'dnl
582 Zdisp_match( movd, 0,(%edx,%ecx,8), %mm0, `0x0f,0x6e,0x44,0xca,0x00', $@)`'dnl
583 Zdisp_match( movd, %mm0, 0,(%eax,%ecx,4), `0x0f,0x7e,0x44,0x88,0x00', $@)`'dnl
584 Zdisp_match( movd, %mm0, 0,(%ecx,%eax,4), `0x0f,0x7e,0x44,0x81,0x00', $@)`'dnl
585 Zdisp_match( movd, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7e,0x54,0x81,0x00', $@)`'dnl
586 ifelse(Zdisp_found,0,
587 `m4_error(`unrecognised instruction in Zdisp: $1 $2 $3 $4
591 `ifelse(eval(m4_stringequal_p(`$1',`$6')
592 && m4_stringequal_p(`$2',0)
593 && m4_stringequal_p(`$3',`$8')
594 && m4_stringequal_p(`$4',`$9')),1,
595 `define(`Zdisp_found',1)dnl
597 ` .byte $5 C `$1 0$3, $4'',
600 `ifelse(eval(m4_stringequal_p(`$1',`$6')
601 && m4_stringequal_p(`$2',`$7')
602 && m4_stringequal_p(`$3',0)
603 && m4_stringequal_p(`$4',`$9')),1,
604 `define(`Zdisp_found',1)dnl
606 ` .byte $5 C `$1 $2, 0$4'',
607 ` $6 $7, $8$9')')')')
610 dnl Usage: shldl(count,src,dst)
611 dnl shrdl(count,src,dst)
612 dnl shldw(count,src,dst)
613 dnl shrdw(count,src,dst)
615 dnl Generate a double-shift instruction, possibly omitting a %cl count
616 dnl parameter if that's what the assembler requires, as indicated by
617 dnl WANT_SHLDL_CL in config.m4. For example,
619 dnl shldl( %cl, %eax, %ebx)
621 dnl turns into either
623 dnl shldl %cl, %eax, %ebx
627 dnl Immediate counts are always passed through unchanged. For example,
629 dnl shrdl( $2, %esi, %edi)
631 dnl shrdl $2, %esi, %edi
634 dnl If you forget to use the macro form "shldl( ...)" and instead write
635 dnl just a plain "shldl ...", an error results. This ensures the necessary
636 dnl variant treatment of %cl isn't accidentally bypassed.
638 define(define_shd_instruction,
640 m4_instruction_wrapper()
642 `shd_instruction'(m4_doublequote($`'0),m4_doublequote($`'1),dnl
643 m4_doublequote($`'2),m4_doublequote($`'3)))')
645 dnl Effectively: define(shldl,`shd_instruction(`$0',`$1',`$2',`$3')') etc
646 define_shd_instruction(shldl)
647 define_shd_instruction(shrdl)
648 define_shd_instruction(shldw)
649 define_shd_instruction(shrdw)
651 dnl Called: shd_instruction(op,count,src,dst)
652 define(shd_instruction,
654 m4_assert_defined(`WANT_SHLDL_CL')
655 `ifelse(eval(m4_stringequal_p(`$2',`%cl') && !WANT_SHLDL_CL),1,
657 ``$1' `$2', `$3', `$4'')')
660 dnl Usage: ASSERT(cond, instructions)
662 dnl If WANT_ASSERT is 1, output the given instructions and expect the given
663 dnl flags condition to then be satisfied. For example,
665 dnl ASSERT(ne, `cmpl %eax, %ebx')
667 dnl The instructions can be omitted to just assert a flags condition with
668 dnl no extra calculation. For example,
672 dnl When `instructions' is not empty, a pushf/popf is added to preserve the
673 dnl flags, but the instructions themselves must preserve any registers that
674 dnl matter. FRAME is adjusted for the push and pop, so the instructions
675 dnl given can use defframe() stack variables.
678 m4_assert_numargs_range(1,2)
679 `ifelse(WANT_ASSERT,1,
681 ifelse(`$2',,,` pushf ifdef(`FRAME',`FRAME_pushl()')')
684 ud2 C assertion failed
686 ifelse(`$2',,,` popf ifdef(`FRAME',`FRAME_popl()')')
690 dnl Usage: movl_text_address(label,register)
692 dnl Get the address of a text segment label, using either a plain movl or a
693 dnl position-independent calculation, as necessary. For example,
695 dnl movl_text_address(L(foo),%eax)
697 dnl This macro is only meant for use in ASSERT()s or when testing, since
698 dnl the PIC sequence it generates will want to be done with a ret balancing
699 dnl the call on CPUs with return address branch prediction.
701 dnl The addl generated here has a backward reference to 1b, and so won't
702 dnl suffer from the two forwards references bug in old gas (described in
703 dnl mpn/x86/README.family).
705 define(movl_text_address,