1 \section[COptJumps]{Macros for tail-jumping}
3 % this file is part of the C-as-assembler document
10 %************************************************************************
12 \subsection[COptJumps-portable]{Tail-(non-)jumping in ``portable~C''}
14 %************************************************************************
17 #if ! (defined(__STG_TAILJUMPS__) && defined(__GNUC__))
/* Portable (non-GNU-C or non-tail-jumping) case: a "tail jump" is just
   a C return of the continuation's address, cast to a code pointer; the
   plain-C mini-interpreter loop then calls it. */
19 #define JMP_(target) return((F_) (target))
/* Resuming after a call is no different from jumping, portably. */
20 #define RESUME_(target) JMP_(target)
23 Don't need to do anything magical for the mini-interpreter, because
24 we're really going to use the plain old C one (and the debugging
25 variant, too, for that matter).
27 %************************************************************************
29 \subsection[COptJumps-optimised]{Tail-jumping in ``optimised~C''}
31 %************************************************************************
34 #else /* __STG_TAILJUMPS__ && __GNUC__ */
37 GCC will have assumed that pushing/popping of C-stack frames is going
38 on when it generated its code, and used stack space accordingly.
39 However, we actually {\em post-process away} all such stack-framery
40 (see \tr{ghc/driver/ghc-asm.lprl}). Things will be OK, however, if we
41 initially make sure there are @RESERVED_C_STACK_BYTES@ on the C-stack
42 to begin with, for local variables.
45 #define RESERVED_C_STACK_BYTES (512 * sizeof(I_)) /* MUST BE OF GENEROUS ALIGNMENT */
48 The platform-specific details are given in alphabetical order.
50 %************************************************************************
52 \subsubsection[COptJumps-alpha]{Tail-jumping on Alphas}
54 %************************************************************************
56 We have to set the procedure value register (\$27) before branching, so
57 that the target function can load the gp (\$29) as appropriate.
59 It seems that \tr{_procedure} can't be declared within the body of the
60 \tr{JMP_} macro...at least, not if we want it to be \$27, which we do!
64 /* ToDo: less specific? */
67 Jumping to a new block of code, we need to set up $27 to point
68 at the target, so that the callee can establish its gp (as an
69 offset from its own starting address). For some reason, gcc
70 refuses to give us $27 for _procedure if it's declared as a
71 local variable, so the workaround is to make it a global.
73 Note: The local variable works in gcc 2.6.2, but fails in 2.5.8.
76 /* MOVED: to COptRegs.lh -- very unsatisfactorily.
77 Otherwise, we can get a "global register variable follows a
78 function definition" error.
80 Once we can take gcc 2.6.x as std, then we can use
81 the local variant, and the problem goes away. (WDP 95/02)
83 register void *_procedure __asm__("$27");
87 do { _procedure = (void *)(cont); \
92 When we resume at the point where a call was originally made,
93 we need to restore $26, so that gp can be reloaded appropriately.
94 However, sometimes we ``resume'' by entering a new function
95 (typically EnterNodeCode), so we need to set up $27 as well.
98 #define RESUME_(cont) \
99 do { _procedure = (void *)(cont); \
100 __asm__ volatile("mov $27,$26"); \
/* Alpha: on entry to the threaded world, save the callee-saved integer
   registers ($9-$15) and float registers ($f2-$f9) just below the stack
   pointer ($30), then drop $30 by RESERVED_C_STACK_BYTES plus the size
   of that save area (8 doubles + 8 longs, per the "K" operand below). */
104 #define MINI_INTERPRETER_SETUP \
105 __asm__ volatile ("stq $9,-8($30)\n" \
106 "stq $10,-16($30)\n" \
107 "stq $11,-24($30)\n" \
108 "stq $12,-32($30)\n" \
109 "stq $13,-40($30)\n" \
110 "stq $14,-48($30)\n" \
111 "stq $15,-56($30)\n" \
112 "stt $f2,-64($30)\n" \
113 "stt $f3,-72($30)\n" \
114 "stt $f4,-80($30)\n" \
115 "stt $f5,-88($30)\n" \
116 "stt $f6,-96($30)\n" \
117 "stt $f7,-104($30)\n" \
118 "stt $f8,-112($30)\n" \
119 "stt $f9,-120($30)\n" \
120 "lda $30,-%0($30)" : : \
121 "K" (RESERVED_C_STACK_BYTES+8*sizeof(double)+8*sizeof(long)));
/* Mirror of MINI_INTERPRETER_SETUP: emit the global miniInterpretEnd
   label (the mangled code jumps here to leave the threaded world), pop
   the reserved stack area, and reload the callee-saved registers.
   NOTE(review): the reload of $9 (stored at -8($30) in SETUP) is not
   visible in this excerpt -- confirm it exists in the full source. */
123 #define MINI_INTERPRETER_END \
124 __asm__ volatile (".align 3\n" \
125 ".globl miniInterpretEnd\n" \
126 "miniInterpretEnd:\n" \
127 "lda $30,%0($30)\n" \
129 "ldq $10,-16($30)\n" \
130 "ldq $11,-24($30)\n" \
131 "ldq $12,-32($30)\n" \
132 "ldq $13,-40($30)\n" \
133 "ldq $14,-48($30)\n" \
134 "ldq $15,-56($30)\n" \
135 "ldt $f2,-64($30)\n" \
136 "ldt $f3,-72($30)\n" \
137 "ldt $f4,-80($30)\n" \
138 "ldt $f5,-88($30)\n" \
139 "ldt $f6,-96($30)\n" \
140 "ldt $f7,-104($30)\n" \
141 "ldt $f8,-112($30)\n" \
142 "ldt $f9,-120($30)" : : \
143 "K" (RESERVED_C_STACK_BYTES+8*sizeof(double)+8*sizeof(long)));
148 %************************************************************************
150 \subsubsection[COptJumps-Hpux]{Tail-jumping on a HP-PA machine running HP-UX}
152 %************************************************************************
155 #if hppa1_1_hp_hpux_TARGET
157 /* do FUNBEGIN/END the easy way */
/* "--- BEGIN ---" / "--- END ---" are marker pseudo-instructions for
   the assembly post-processor (see the \tr{ghc-asm} note above); they
   are presumably stripped before real assembly -- they are not valid
   HP-PA instructions. */
158 #define FUNBEGIN __asm__ volatile ("--- BEGIN ---");
159 #define FUNEND __asm__ volatile ("--- END ---");
161 /* The stack grows up! Local variables are allocated just above the
162 frame pointer, and extra arguments are stashed just below the stack
163 pointer, so the safe space is again in the middle (cf. sparc).
165 Sven Panne <Sven.Panne@informatik.uni-muenchen.de> writes:
167 But now for the really bad news: Some nasty guy in the threaded world
168 modifies R3 (the frame pointer)!! This should not happen (as far as I
169 know R3 should be a callee-saves register). Sadly, I can't reproduce
170 this behaviour consistently. Perhaps it is some strange point of our
171 boxes here? (uname -svrm gives HP-UX A.09.05 A 9000/715)
175 So here is my next try: Don't calculate the register buffer by _adding_
176 to FP[r3], but by _subtracting_ from SP! The patch below should result in the
177 same addresses (+/- some bytes :-) By the way, is the SP[r30] after returning
178 from the threaded world the same as the one before entering it?
179 I really hope so, otherwise %#*&!!
183 do { void *_procedure = (void *)(cont); \
187 #define RESUME_(cont) JMP_(cont)
/* HP-PA (stack grows UP): save callee-saved general registers r3-r18
   and float registers fr12-fr21 into a register buffer addressed via
   r19, computed by SUBTRACTING from sp (r30) rather than adding to the
   frame pointer (r3) -- see Sven Panne's note above about r3 being
   clobbered in the threaded world.  The commented-out lines are the
   old r3-relative variant, kept for reference.
   NOTE(review): "116 * sizeof(long)" looks odd next to the 16 longs
   actually saved -- presumably it also skips the outgoing-argument
   area; confirm against the full source. */
189 #define MINI_INTERPRETER_SETUP \
190 StgChar space[RESERVED_C_STACK_BYTES+16*sizeof(long)+10*sizeof(double)]; \
191 /* __asm__ volatile ("ldo %0(%%r3),%%r19\n" */ \
192 __asm__ volatile ("ldo %0(%%r30),%%r19\n" \
193 "\tstw %%r3, 0(0,%%r19)\n" \
194 "\tstw %%r4, 4(0,%%r19)\n" \
195 "\tstw %%r5, 8(0,%%r19)\n" \
196 "\tstw %%r6,12(0,%%r19)\n" \
197 "\tstw %%r7,16(0,%%r19)\n" \
198 "\tstw %%r8,20(0,%%r19)\n" \
199 "\tstw %%r9,24(0,%%r19)\n" \
200 "\tstw %%r10,28(0,%%r19)\n" \
201 "\tstw %%r11,32(0,%%r19)\n" \
202 "\tstw %%r12,36(0,%%r19)\n" \
203 "\tstw %%r13,40(0,%%r19)\n" \
204 "\tstw %%r14,44(0,%%r19)\n" \
205 "\tstw %%r15,48(0,%%r19)\n" \
206 "\tstw %%r16,52(0,%%r19)\n" \
207 "\tstw %%r17,56(0,%%r19)\n" \
208 "\tstw %%r18,60(0,%%r19)\n" \
209 "\tldo 80(%%r19),%%r19\n" \
210 "\tfstds %%fr12,-16(0,%%r19)\n" \
211 "\tfstds %%fr13, -8(0,%%r19)\n" \
212 "\tfstds %%fr14, 0(0,%%r19)\n" \
213 "\tfstds %%fr15, 8(0,%%r19)\n" \
214 "\tldo 32(%%r19),%%r19\n" \
215 "\tfstds %%fr16,-16(0,%%r19)\n" \
216 "\tfstds %%fr17, -8(0,%%r19)\n" \
217 "\tfstds %%fr18, 0(0,%%r19)\n" \
218 "\tfstds %%fr19, 8(0,%%r19)\n" \
219 "\tldo 32(%%r19),%%r19\n" \
220 "\tfstds %%fr20,-16(0,%%r19)\n" \
221 "\tfstds %%fr21, -8(0,%%r19)\n" : : \
222 /* "n" (RESERVED_C_STACK_BYTES - (116 * sizeof(long) + 10 * sizeof(double))) : "%r19" ); */ \
223 "n" (-(116 * sizeof(long) + 10 * sizeof(double))) : "%r19" );
/* Mirror of the HP-PA SETUP above: export miniInterpretEnd (HP-UX SOM
   needs both the CODE and the ENTRY,PRIV_LEV=3 export), recompute the
   register buffer address from sp (r30) into r19, and reload r3-r18
   and fr12-fr21 in the same layout SETUP stored them. */
225 #define MINI_INTERPRETER_END \
226 __asm__ volatile (".align 4\n" \
227 "\t.EXPORT miniInterpretEnd,CODE\n" \
228 "\t.EXPORT miniInterpretEnd,ENTRY,PRIV_LEV=3\n" \
229 "miniInterpretEnd\n" \
230 /* "\tldo %0(%%r3),%%r19\n" */ \
231 "\tldo %0(%%r30),%%r19\n" \
232 "\tldw 0(0,%%r19),%%r3\n" \
233 "\tldw 4(0,%%r19),%%r4\n" \
234 "\tldw 8(0,%%r19),%%r5\n" \
235 "\tldw 12(0,%%r19),%%r6\n" \
236 "\tldw 16(0,%%r19),%%r7\n" \
237 "\tldw 20(0,%%r19),%%r8\n" \
238 "\tldw 24(0,%%r19),%%r9\n" \
239 "\tldw 28(0,%%r19),%%r10\n" \
240 "\tldw 32(0,%%r19),%%r11\n" \
241 "\tldw 36(0,%%r19),%%r12\n" \
242 "\tldw 40(0,%%r19),%%r13\n" \
243 "\tldw 44(0,%%r19),%%r14\n" \
244 "\tldw 48(0,%%r19),%%r15\n" \
245 "\tldw 52(0,%%r19),%%r16\n" \
246 "\tldw 56(0,%%r19),%%r17\n" \
247 "\tldw 60(0,%%r19),%%r18\n" \
248 "\tldo 80(%%r19),%%r19\n" \
249 "\tfldds -16(0,%%r19),%%fr12\n" \
250 "\tfldds -8(0,%%r19),%%fr13\n" \
251 "\tfldds 0(0,%%r19),%%fr14\n" \
252 "\tfldds 8(0,%%r19),%%fr15\n" \
253 "\tldo 32(%%r19),%%r19\n" \
254 "\tfldds -16(0,%%r19),%%fr16\n" \
255 "\tfldds -8(0,%%r19),%%fr17\n" \
256 "\tfldds 0(0,%%r19),%%fr18\n" \
257 "\tfldds 8(0,%%r19),%%fr19\n" \
258 "\tldo 32(%%r19),%%r19\n" \
259 "\tfldds -16(0,%%r19),%%fr20\n" \
260 "\tfldds -8(0,%%r19),%%fr21\n" : : \
261 /* "n" (RESERVED_C_STACK_BYTES - (116 * sizeof(long) + 10 * sizeof(double))) : "%r19"); */ \
262 "n" (-(116 * sizeof(long) + 10 * sizeof(double))) : "%r19");
264 #endif /* hppa1.1-hp-hpux* */
267 %************************************************************************
269 \subsubsection[COptJumps-iX86]{Tail-jumping on a 386/486}
271 %************************************************************************
276 /* *not* a good way to do this (WDP 96/05) */
277 #if defined(solaris2_TARGET_OS) || defined(linux_TARGET_OS)
/* ELF-style targets use the plain symbol name... */
278 #define MINI_INTERPRET_END "miniInterpretEnd"
/* ...a.out-style targets prefix an underscore.
   NOTE(review): an #else before this line and an #endif after it
   appear to be missing from this excerpt -- confirm against the full
   source. */
280 #define MINI_INTERPRET_END "_miniInterpretEnd"
283 /* do FUNBEGIN/END the easy way */
/* Marker pseudo-instructions for the assembly post-processor;
   presumably stripped before real assembly. */
284 #define FUNBEGIN __asm__ volatile ("--- BEGIN ---");
285 #define FUNEND __asm__ volatile ("--- END ---");
287 /* try "m68k-style" for now */
/* __DISCARD__ is a do-nothing call whose only purpose is to force GCC
   to pop deferred argument pushes before a tail jump (see the m68k
   notes later in this file). */
288 extern void __DISCARD__(STG_NO_ARGS);
293 target = (void *)(cont); \
297 #define RESUME_(target) JMP_(target)
299 /* The safe part of the stack frame is near the top */
/* x86: save the callee-saved registers ebx/esi/edi/ebp into the top of
   the reserved C-stack area, addressed via eax (%c0 emits the constant
   without the '$' prefix).
   NOTE(review): the closing of this asm statement (clobber list and
   final parenthesis) is not visible in this excerpt. */
301 #define MINI_INTERPRETER_SETUP \
302 StgChar space[RESERVED_C_STACK_BYTES+4*sizeof(long)]; \
303 __asm__ volatile ("leal %c0(%%esp),%%eax\n" \
304 "\tmovl %%ebx,0(%%eax)\n" \
305 "\tmovl %%esi,4(%%eax)\n" \
306 "\tmovl %%edi,8(%%eax)\n" \
307 "\tmovl %%ebp,12(%%eax)\n" \
308 : : "n" (RESERVED_C_STACK_BYTES) \
311 /* the initial "addl $f,%esp" in ..._END compensates for
312 the "call" (rather than a jump) in miniInterpret.
/* x86 exit point: emit the global miniInterpretEnd label, then undo
   the effect of reaching here via a "call" (the "addl $4,%esp" pops the
   pushed return address -- see the comment above) and restore
   ebx/esi/edi/ebp from the save area.
   NOTE(review): the lines closing the first asm statement (between the
   label and the second asm) are not visible in this excerpt. */
315 #define MINI_INTERPRETER_END \
316 __asm__ volatile (".align 4\n" \
317 ".globl " MINI_INTERPRET_END "\n" \
318 MINI_INTERPRET_END ":\n" \
321 __asm__ volatile ("addl $4,%%esp\n" \
322 "\tleal %c0(%%esp),%%eax\n" \
323 "\tmovl 0(%%eax),%%ebx\n" \
324 "\tmovl 4(%%eax),%%esi\n" \
325 "\tmovl 8(%%eax),%%edi\n" \
326 "\tmovl 12(%%eax),%%ebp" \
327 : : "n" (RESERVED_C_STACK_BYTES) : "%eax");
329 #endif /* __i[3456]86__ */
332 %************************************************************************
334 \subsubsection[COptJumps-m68k]{Tail-jumping on m68k boxes}
336 %************************************************************************
338 For 680x0s, we use a quite-magic @JMP_@ macro, which includes
339 beginning- and end-of-function markers.
/* Marker pseudo-instructions bracketing each function body for the
   assembly post-processor; presumably stripped before real assembly. */
344 #define FUNBEGIN __asm__ volatile ("--- BEGIN ---");
345 #define FUNEND __asm__ volatile ("--- END ---");
348 The call to \tr{__DISCARD__} in @JMP_@ is fodder for GCC, to force it
349 to pop arguments to previous function calls before the end of the
350 current function. This is unnecessary if we can manage to compile
351 with \tr{-fomit-frame-pointer} as well as \tr{-fno-defer-pop}. (WDP
352 95/02: Either false or dodgy.) At the moment, the asm mangler removes
353 these calls to \tr{__DISCARD__}.
357 extern void __DISCARD__(STG_NO_ARGS);
362 target = (void *)(cont); \
366 #define RESUME_(target) JMP_(target)
/* m68k: save callee-saved address registers a2-a6 and data registers
   d2-d7 (11 longs) above the reserved area via a single moveml, then
   point a6 at that save area. */
368 #define MINI_INTERPRETER_SETUP \
369 StgChar space[RESERVED_C_STACK_BYTES+11*sizeof(long)]; \
370 __asm__ volatile ("moveml a2-a6/d2-d7,sp@(%c0)\n" \
371 "\tlea sp@(%c0),a6" : : "J" (RESERVED_C_STACK_BYTES));
/* Mirror of the m68k SETUP: emit the (underscore-prefixed, a.out)
   _miniInterpretEnd label and reload a2-a6/d2-d7.
   NOTE(review): one line of this macro (between the label and the
   moveml) is not visible in this excerpt -- confirm against the full
   source. */
373 #define MINI_INTERPRETER_END \
374 __asm__ volatile (".even\n" \
375 ".globl _miniInterpretEnd\n" \
376 "_miniInterpretEnd:\n" \
378 "\tmoveml sp@(%c0),a2-a6/d2-d7" : : "J" (RESERVED_C_STACK_BYTES));
380 #endif /* __m68k__ */
383 %************************************************************************
385 \subsubsection[COptJumps-mips]{Tail-jumping on a MIPS box}
387 %************************************************************************
390 #if mipseb_TARGET_ARCH || mipsel_TARGET_ARCH
392 /* do FUNBEGIN/END the easy way */
/* Marker pseudo-instructions for the assembly post-processor;
   presumably stripped before real assembly. */
393 #define FUNBEGIN __asm__ volatile ("--- BEGIN ---");
394 #define FUNEND __asm__ volatile ("--- END ---");
396 /* try "m68k-style" for now */
/* __DISCARD__: GCC fodder to force pending argument pops before a tail
   jump; the mangler removes the calls (see the m68k notes above). */
397 extern void __DISCARD__(STG_NO_ARGS);
399 /* this is "alpha-style" */
401 do { __DISCARD__(); \
402 _procedure = (void *)(cont); \
406 #define RESUME_(target) JMP_(target)
408 /* _All_ callee-saved regs, whether we steal them or not, must be saved
/* MIPS: save the callee-saved float registers ($f20-$f30, the
   even-numbered pairs, 6 doubles), integer registers $16-$23, and $fp
   (9 longs) into the top of the reserved area, addressed via $2.  The
   extra 16 bytes in the offset skip the argument-register save area
   above $sp (see the comment after this macro). */
412 #define MINI_INTERPRETER_SETUP \
413 StgChar space[RESERVED_C_STACK_BYTES+6*sizeof(double)+9*sizeof(long)]; \
414 __asm__ volatile ("addu $2,$sp,%0\n" \
415 "\ts.d $f20,0($2)\n" \
416 "\ts.d $f22,8($2)\n" \
417 "\ts.d $f24,16($2)\n" \
418 "\ts.d $f26,24($2)\n" \
419 "\ts.d $f28,32($2)\n" \
420 "\ts.d $f30,40($2)\n" \
421 "\tsw $16,48($2)\n" \
422 "\tsw $17,52($2)\n" \
423 "\tsw $18,56($2)\n" \
424 "\tsw $19,60($2)\n" \
425 "\tsw $20,64($2)\n" \
426 "\tsw $21,68($2)\n" \
427 "\tsw $22,72($2)\n" \
428 "\tsw $23,76($2)\n" \
429 "\tsw $fp,80($2)\n" \
430 : : "I" (RESERVED_C_STACK_BYTES+16) : "$2" );
432 /* the 16 bytes is for the argument-register save-area above $sp */
/* Mirror of the MIPS SETUP: emit the global miniInterpretEnd label and
   reload $f20-$f30 (even pairs), $16-$23 and $fp from the same layout. */
434 #define MINI_INTERPRETER_END \
435 __asm__ volatile (".align 2\n" \
436 ".globl miniInterpretEnd\n" \
437 "miniInterpretEnd:\n" \
438 "\taddu $2,$sp,%0\n" \
439 "\tl.d $f20,0($2)\n" \
440 "\tl.d $f22,8($2)\n" \
441 "\tl.d $f24,16($2)\n" \
442 "\tl.d $f26,24($2)\n" \
443 "\tl.d $f28,32($2)\n" \
444 "\tl.d $f30,40($2)\n" \
445 "\tlw $16,48($2)\n" \
446 "\tlw $17,52($2)\n" \
447 "\tlw $18,56($2)\n" \
448 "\tlw $19,60($2)\n" \
449 "\tlw $20,64($2)\n" \
450 "\tlw $21,68($2)\n" \
451 "\tlw $22,72($2)\n" \
452 "\tlw $23,76($2)\n" \
453 "\tlw $fp,80($2)\n" \
454 : : "I" (RESERVED_C_STACK_BYTES+16) : "$2" );
459 %************************************************************************
461 \subsubsection[COptJumps-powerpc]{Tail-jumping on an IBM PowerPC running AIX}
463 %************************************************************************
466 #if powerpc_TARGET_ARCH || rs6000_TARGET_ARCH
468 /* do FUNBEGIN/END the easy way */
/* Marker pseudo-instructions for the assembly post-processor;
   presumably stripped before real assembly. */
469 #define FUNBEGIN __asm__ volatile ("--- BEGIN ---");
470 #define FUNEND __asm__ volatile ("--- END ---");
472 /* try "m68k-style" for now */
/* __DISCARD__: GCC fodder to force pending argument pops before a tail
   jump; the mangler removes the calls (see the m68k notes above). */
473 extern void __DISCARD__(STG_NO_ARGS);
475 /* this is "alpha-style" */
477 do { void *_procedure = (void *)(cont); \
481 #define RESUME_(target) JMP_(target)
483 /* _All_ callee-saved regs, whether we steal them or not, must be saved
/* PowerPC/RS6000 (AIX, 32-bit): "stm 13,-176(1)" stores GPRs r13-r31
   (19 longs) below the stack pointer (r1) in one go, then f14-f19 are
   stored individually below that.
   NOTE(review): only f14-f19 are saved here, although f14-f31 are
   callee-saved under the AIX ABI -- confirm the remaining FP registers
   are genuinely unused by the threaded world. */
487 #define MINI_INTERPRETER_SETUP \
488 StgChar space[RESERVED_C_STACK_BYTES+6*sizeof(double)+19*sizeof(long)]; \
489 __asm__ volatile ("stm 13,-176(1)\n" \
490 "\tstfd 14,-200(1)\n" \
491 "\tstfd 15,-208(1)\n" \
492 "\tstfd 16,-216(1)\n" \
493 "\tstfd 17,-224(1)\n" \
494 "\tstfd 18,-232(1)\n" \
495 "\tstfd 19,-240(1)\n" \
496 : : "I" (RESERVED_C_STACK_BYTES+16) : "1" );
498 /* the 16 bytes is for the argument-register save-area above $sp */
/* Mirror of the PowerPC SETUP: emit the global miniInterpretEnd label,
   reload r13-r31 with a single "lm", then f14-f19 individually, using
   the same offsets from r1 that SETUP stored to. */
500 #define MINI_INTERPRETER_END \
501 __asm__ volatile (".globl miniInterpretEnd\n" \
502 "miniInterpretEnd:\n" \
503 "\tlm 13,-176(1)\n" \
504 "\tlfd 14,-200(1)\n" \
505 "\tlfd 15,-208(1)\n" \
506 "\tlfd 16,-216(1)\n" \
507 "\tlfd 17,-224(1)\n" \
508 "\tlfd 18,-232(1)\n" \
509 "\tlfd 19,-240(1)\n" \
510 : : "I" (RESERVED_C_STACK_BYTES+16) : "1" );
515 %************************************************************************
517 \subsubsection[COptJumps-sparc]{Tail-jumping on Sun4s}
519 %************************************************************************
521 We want tailjumps to be calls, because `call xxx' is the only Sparc branch
522 that allows an arbitrary label as a target. (Gcc's ``goto *target'' construct
523 ends up loading the label into a register and then jumping, at the cost of
524 two extra instructions for the 32-bit load.)
526 When entering the threaded world, we stash our return address in a known
527 location so that \tr{%i7} is available as an extra callee-saves register.
528 Of course, we have to restore this when coming out of the threaded world.
530 I hate this god-forsaken architecture. Since the top of the reserved
531 stack space is used for globals and the bottom is reserved for outgoing arguments,
532 we have to stick our return address somewhere in the middle. Currently, I'm
533 allowing 100 extra outgoing arguments beyond the first 6. --JSM
536 #if sparc_TARGET_ARCH
538 #ifdef solaris2_TARGET_OS
/* Solaris 2 (ELF) uses the plain symbol; other SPARC targets (a.out)
   prefix an underscore.
   NOTE(review): the #else between the two definitions (and the closing
   #endif) are not visible in this excerpt -- confirm against the full
   source. */
539 #define MINI_INTERPRET_END "miniInterpretEnd"
541 #define MINI_INTERPRET_END "_miniInterpretEnd"
/* Tail jump = call through a function pointer; per the comment below,
   this compiles to a "call" (i.e. a jmpl clobbering only %o7). */
544 #define JMP_(cont) ((F_) (cont))()
545 /* Oh so happily, the above turns into a "call" instruction,
546 which, on a SPARC, is nothing but a "jmpl" with the
547 return address in %o7 [which we don't care about].
549 #define RESUME_(target) JMP_(target)
/* SPARC: stash the return address (%i7) in slot 100 of the reserved
   area -- past the globals at the top and the outgoing-argument space
   at the bottom (the "100 extra outgoing arguments" note above) -- so
   that %i7 is free as an extra callee-saves register in the threaded
   world. */
551 #define MINI_INTERPRETER_SETUP \
552 StgChar space[RESERVED_C_STACK_BYTES+sizeof(void *)]; \
553 register void *i7 __asm__("%i7"); \
554 ((void **)(space))[100] = i7;
/* Mirror of the SPARC SETUP: emit the exit label and reload %i7 from
   the stash.  Note this macro refers to the `i7` and `space` locals
   declared by SETUP, so both must be expanded in the same function. */
556 #define MINI_INTERPRETER_END \
557 __asm__ volatile (".align 4\n" \
558 ".global " MINI_INTERPRET_END "\n" \
559 MINI_INTERPRET_END ":\n" \
560 "\tld %1,%0" : "=r" (i7) : "m" (((void **)(space))[100]));
562 #endif /* __sparc__ */
565 %************************************************************************
567 \subsubsection[COptJumps-OOPS]{Someone screwed up here, too...}
569 %************************************************************************
571 If one of the above machine-dependent sections wasn't triggered,
572 @JMP_@ won't be defined and you'll get link errors (if not
577 *???????* No JMP_ macro???
580 #endif /* __STG_TAILJUMPS__ */
583 If @FUNBEGIN@ and @FUNEND@ weren't defined, give them the default
584 (nothing). Also, define @FB_@ and @FE_@ (short forms).
/* Defaults: architectures that did not define FUNBEGIN/FUNEND above
   get empty markers.
   NOTE(review): the #endif closing each #if, and the matching
   "#define FE_ FUNEND" companion to FB_, are not visible in this
   excerpt -- confirm against the full source. */
586 #if ! defined(FUNBEGIN)
587 #define FUNBEGIN /* nothing */
589 #if ! defined(FUNEND)
590 #define FUNEND /* nothing */
593 #define FB_ FUNBEGIN /* short forms */
596 #endif /* ! that's all of... COPTJUMPS_H */