--- /dev/null
+\section[COptJumps]{Macros for tail-jumping}
+
+% this file is part of the C-as-assembler document
+
+\begin{code}
+#ifndef COPTJUMPS_H
+#define COPTJUMPS_H
+\end{code}
+
+%************************************************************************
+%* *
+\subsection[COptJumps-portable]{Tail-(non-)jumping in ``portable~C''}
+%* *
+%************************************************************************
+
+\begin{code}
+#if ! (defined(__STG_TAILJUMPS__) && defined(__GNUC__))
+
+#define JMP_(target) return((F_) (target))
+#define RESUME_(target) JMP_(target)
+\end{code}
+
+Don't need to do anything magical for the mini-interpreter, because
+we're really going to use the plain old C one (and the debugging
+variant, too, for that matter).
+
+%************************************************************************
+%* *
+\subsection[COptJumps-optimised]{Tail-jumping in ``optimised~C''}
+%* *
+%************************************************************************
+
+\begin{code}
+#else /* __STG_TAILJUMPS__ && __GNUC__ */
+\end{code}
+
+GCC will have assumed that pushing/popping of C-stack frames is going
+on when it generated its code, and used stack space accordingly.
+However, we actually {\em post-process away} all such stack-framery
+(see \tr{ghc/driver/ghc-asm-*.lprl}).
+Thing will be OK however, if we initially make sure there are
+@RESERVED_C_STACK_BYTES@ on the C-stack to begin with, for local
+variables.
+
+\begin{code}
+#define RESERVED_C_STACK_BYTES (512 * sizeof(I_)) /* MUST BE OF GENEROUS ALIGNMENT */
+\end{code}
+
+The platform-specific details are given in alphabetical order.
+
+%************************************************************************
+%* *
+\subsubsection[COptJumps-alpha]{Tail-jumping on Alphas}
+%* *
+%************************************************************************
+
+We have to set the procedure value register (\$27) before branching, so
+that the target function can load the gp (\$29) as appropriate.
+
+It seems that \tr{_procedure} can't be declared within the body of the
+\tr{JMP_} macro...at least, not if we want it to be \$27, which we do!
+
+\begin{code}
+#if alpha_dec_osf1_TARGET
+ /* ToDo: less specific? */
+
+/*
+ Jumping to a new block of code, we need to set up $27 to point
+ at the target, so that the callee can establish its gp (as an
+ offset from its own starting address). For some reason, gcc
+ refuses to give us $27 for _procedure if it's declared as a
+ local variable, so the workaround is to make it a global.
+
+ Note: The local variable works in gcc 2.6.2, but fails in 2.5.8.
+ */
+
+/* MOVED: to COptRegs.lh -- very unsatisfactorily.
+ Otherwise, we can get a "global register variable follows a
+ function definition" error.
+
+ Once we can take gcc 2.6.x as std, then we can use
+ the local variant, and the problem goes away. (WDP 95/02)
+
+register void *_procedure __asm__("$27");
+*/
+
+#define JMP_(cont) \
+ do { _procedure = (void *)(cont); \
+ goto *_procedure; \
+ } while(0)
+
+/*
+ When we resume at the point where a call was originally made,
+ we need to restore $26, so that gp can be reloaded appropriately.
+ However, sometimes we ``resume'' by entering a new function
+ (typically EnterNodeCode), so we need to set up $27 as well.
+ */
+
+#define RESUME_(cont) \
+ do { _procedure = (void *)(cont); \
+ __asm__ volatile("mov $27,$26"); \
+ goto *_procedure; \
+ } while(0);
+
+#define MINI_INTERPRETER_SETUP \
+ __asm__ volatile ("stq $9,-8($30)\n" \
+ "stq $10,-16($30)\n" \
+ "stq $11,-24($30)\n" \
+ "stq $12,-32($30)\n" \
+ "stq $13,-40($30)\n" \
+ "stq $14,-48($30)\n" \
+ "stq $15,-56($30)\n" \
+ "stt $f2,-64($30)\n" \
+ "stt $f3,-72($30)\n" \
+ "stt $f4,-80($30)\n" \
+ "stt $f5,-88($30)\n" \
+ "stt $f6,-96($30)\n" \
+ "stt $f7,-104($30)\n" \
+ "stt $f8,-112($30)\n" \
+ "stt $f9,-120($30)\n" \
+ "lda $30,-%0($30)" : : \
+ "K" (RESERVED_C_STACK_BYTES+8*sizeof(double)+8*sizeof(long)));
+
+#define MINI_INTERPRETER_END \
+ __asm__ volatile (".align 3\n" \
+ ".globl miniInterpretEnd\n" \
+ "miniInterpretEnd:\n" \
+ "lda $30,%0($30)\n" \
+ "ldq $9,-8($30)\n" \
+ "ldq $10,-16($30)\n" \
+ "ldq $11,-24($30)\n" \
+ "ldq $12,-32($30)\n" \
+ "ldq $13,-40($30)\n" \
+ "ldq $14,-48($30)\n" \
+ "ldq $15,-56($30)\n" \
+ "ldt $f2,-64($30)\n" \
+ "ldt $f3,-72($30)\n" \
+ "ldt $f4,-80($30)\n" \
+ "ldt $f5,-88($30)\n" \
+ "ldt $f6,-96($30)\n" \
+ "ldt $f7,-104($30)\n" \
+ "ldt $f8,-112($30)\n" \
+ "ldt $f9,-120($30)" : : \
+ "K" (RESERVED_C_STACK_BYTES+8*sizeof(double)+8*sizeof(long)));
+
+#endif /* __alpha */
+\end{code}
+
+%************************************************************************
+%* *
+\subsubsection[COptJumps-Hpux]{Tail-jumping on a HP-PA machine running HP-UX}
+%* *
+%************************************************************************
+
+\begin{code}
+#if hppa1_1_hp_hpux_TARGET
+
+/* do FUNBEGIN/END the easy way */
+#define FUNBEGIN __asm__ volatile ("--- BEGIN ---");
+#define FUNEND __asm__ volatile ("--- END ---");
+
+/* The stack grows up! Local variables are allocated just above the
+ frame pointer, and extra arguments are stashed just below the stack
+ pointer, so the safe space is again in the middle (cf. sparc).
+ */
+
+#define JMP_(cont) \
+ do { void *_procedure = (void *)(cont); \
+ goto *_procedure; \
+ } while(0)
+
+#define RESUME_(cont) JMP_(cont)
+
+#define MINI_INTERPRETER_SETUP \
+ StgChar space[RESERVED_C_STACK_BYTES+16*sizeof(long)+10*sizeof(double)]; \
+ __asm__ volatile ("ldo %0(%%r3),%%r19\n" \
+ "\tstw %%r3, 0(0,%%r19)\n" \
+ "\tstw %%r4, 4(0,%%r19)\n" \
+ "\tstw %%r5, 8(0,%%r19)\n" \
+ "\tstw %%r6,12(0,%%r19)\n" \
+ "\tstw %%r7,16(0,%%r19)\n" \
+ "\tstw %%r8,20(0,%%r19)\n" \
+ "\tstw %%r9,24(0,%%r19)\n" \
+ "\tstw %%r10,28(0,%%r19)\n" \
+ "\tstw %%r11,32(0,%%r19)\n" \
+ "\tstw %%r12,36(0,%%r19)\n" \
+ "\tstw %%r13,40(0,%%r19)\n" \
+ "\tstw %%r14,44(0,%%r19)\n" \
+ "\tstw %%r15,48(0,%%r19)\n" \
+ "\tstw %%r16,52(0,%%r19)\n" \
+ "\tstw %%r17,56(0,%%r19)\n" \
+ "\tstw %%r18,60(0,%%r19)\n" \
+ "\tldo 80(%%r19),%%r19\n" \
+ "\tfstds %%fr12,-16(0,%%r19)\n" \
+ "\tfstds %%fr13, -8(0,%%r19)\n" \
+ "\tfstds %%fr14, 0(0,%%r19)\n" \
+ "\tfstds %%fr15, 8(0,%%r19)\n" \
+ "\tldo 32(%%r19),%%r19\n" \
+ "\tfstds %%fr16,-16(0,%%r19)\n" \
+ "\tfstds %%fr17, -8(0,%%r19)\n" \
+ "\tfstds %%fr18, 0(0,%%r19)\n" \
+ "\tfstds %%fr19, 8(0,%%r19)\n" \
+ "\tldo 32(%%r19),%%r19\n" \
+ "\tfstds %%fr20,-16(0,%%r19)\n" \
+ "\tfstds %%fr21, -8(0,%%r19)\n" : : \
+ "n" (RESERVED_C_STACK_BYTES - (116 * sizeof(long) + 10 * sizeof(double))) : "%r19" );
+
+#define MINI_INTERPRETER_END \
+ __asm__ volatile (".align 4\n" \
+ "\t.EXPORT miniInterpretEnd,CODE\n" \
+ "\t.EXPORT miniInterpretEnd,ENTRY,PRIV_LEV=3\n" \
+ "miniInterpretEnd\n" \
+ "\tldo %0(%%r3),%%r19\n" \
+ "\tldw 0(0,%%r19),%%r3\n" \
+ "\tldw 4(0,%%r19),%%r4\n" \
+ "\tldw 8(0,%%r19),%%r5\n" \
+ "\tldw 12(0,%%r19),%%r6\n" \
+ "\tldw 16(0,%%r19),%%r7\n" \
+ "\tldw 20(0,%%r19),%%r8\n" \
+ "\tldw 24(0,%%r19),%%r9\n" \
+ "\tldw 28(0,%%r19),%%r10\n" \
+ "\tldw 32(0,%%r19),%%r11\n" \
+ "\tldw 36(0,%%r19),%%r12\n" \
+ "\tldw 40(0,%%r19),%%r13\n" \
+ "\tldw 44(0,%%r19),%%r14\n" \
+ "\tldw 48(0,%%r19),%%r15\n" \
+ "\tldw 52(0,%%r19),%%r16\n" \
+ "\tldw 56(0,%%r19),%%r17\n" \
+ "\tldw 60(0,%%r19),%%r18\n" \
+ "\tldo 80(%%r19),%%r19\n" \
+ "\tfldds -16(0,%%r19),%%fr12\n" \
+ "\tfldds -8(0,%%r19),%%fr13\n" \
+ "\tfldds 0(0,%%r19),%%fr14\n" \
+ "\tfldds 8(0,%%r19),%%fr15\n" \
+ "\tldo 32(%%r19),%%r19\n" \
+ "\tfldds -16(0,%%r19),%%fr16\n" \
+ "\tfldds -8(0,%%r19),%%fr17\n" \
+ "\tfldds 0(0,%%r19),%%fr18\n" \
+ "\tfldds 8(0,%%r19),%%fr19\n" \
+ "\tldo 32(%%r19),%%r19\n" \
+ "\tfldds -16(0,%%r19),%%fr20\n" \
+ "\tfldds -8(0,%%r19),%%fr21\n" : : \
+ "n" (RESERVED_C_STACK_BYTES - (116 * sizeof(long) + 10 * sizeof(double))) : "%r19");
+
+#endif /* hppa1.1-hp-hpux* */
+\end{code}
+
+%************************************************************************
+%* *
+\subsubsection[COptJumps-iX86]{Tail-jumping on a 386/486}
+%* *
+%************************************************************************
+
+\begin{code}
+#if i386_TARGET_ARCH || i486_TARGET_ARCH
+
+/* do FUNBEGIN/END the easy way */
+#define FUNBEGIN __asm__ volatile ("--- BEGIN ---");
+#define FUNEND __asm__ volatile ("--- END ---");
+
+/* try "m68k-style" for now */
+extern void __DISCARD__(STG_NO_ARGS);
+
+#define JMP_(cont) \
+ do { void *target; \
+ __DISCARD__(); \
+ target = (void *)(cont); \
+ goto *target; \
+ } while(0)
+
+#define RESUME_(target) JMP_(target)
+
+/* The safe part of the stack frame is near the top */
+
+extern P_ SP_stack[];
+extern I_ SP_stack_ptr;
+
+#define MINI_INTERPRETER_SETUP \
+ StgChar space[RESERVED_C_STACK_BYTES+4*sizeof(long)]; \
+ __asm__ volatile ("leal %c0(%%esp),%%eax\n" \
+ "\tmovl %%ebx,0(%%eax)\n" \
+ "\tmovl %%esi,4(%%eax)\n" \
+ "\tmovl %%edi,8(%%eax)\n" \
+ "\tmovl %%ebp,12(%%eax)\n" \
+ "\tmovl %%esp,_MainRegTable+100" \
+ : : "n" (RESERVED_C_STACK_BYTES) \
+ : "%eax"); \
+ __asm__ volatile ("movl %%esp,%0" \
+ : "=r" (SP_stack[++SP_stack_ptr]));
+
+#define MINI_INTERPRETER_END \
+ __asm__ volatile (".align 4\n" \
+ ".globl _miniInterpretEnd\n" \
+ "_miniInterpretEnd:\n" \
+ "\tnop" \
+ : : : "memory" ); \
+ __asm__ volatile ("movl %0,%%esp\n" \
+ "\tmovl %%esp,_MainRegTable+100" \
+ : : "m" (SP_stack[SP_stack_ptr--]) ); \
+ __asm__ volatile ("leal %c0(%%esp),%%eax\n" \
+ "\tmovl 0(%%eax),%%ebx\n" \
+ "\tmovl 4(%%eax),%%esi\n" \
+ "\tmovl 8(%%eax),%%edi\n" \
+ "\tmovl 12(%%eax),%%ebp" \
+ : : "n" (RESERVED_C_STACK_BYTES) : "%eax");
+
+#endif /* __i[34]86__ */
+\end{code}
+
+%************************************************************************
+%* *
+\subsubsection[COptJumps-m68k]{Tail-jumping on m68k boxes}
+%* *
+%************************************************************************
+
+For 680x0s, we use a quite-magic @JMP_@ macro, which includes
+beginning- and end-of-function markers.
+
+\begin{code}
+#if m68k_TARGET_ARCH
+
+#define FUNBEGIN __asm__ volatile ("--- BEGIN ---");
+#define FUNEND __asm__ volatile ("--- END ---");
+\end{code}
+
+The call to \tr{__DISCARD__} in @JMP_@ is fodder for GCC, to force it
+to pop arguments to previous function calls before the end of the
+current function. This is unnecessary if we can manage to compile
+with \tr{-fomit-frame-pointer} as well as \tr{-fno-defer-pop}. (WDP
+95/02: Either false or dodgy.) At the moment, the asm mangler removes
+these calls to \tr{__DISCARD__}.
+
+
+\begin{code}
+extern void __DISCARD__(STG_NO_ARGS);
+
+#define JMP_(cont) \
+ do { void *target; \
+ __DISCARD__(); \
+ target = (void *)(cont); \
+ goto *target; \
+ } while(0)
+
+#define RESUME_(target) JMP_(target)
+
+#define MINI_INTERPRETER_SETUP \
+ StgChar space[RESERVED_C_STACK_BYTES+11*sizeof(long)]; \
+ __asm__ volatile ("moveml a2-a6/d2-d7,sp@(%c0)\n" \
+ "\tlea sp@(%c0),a6" : : "J" (RESERVED_C_STACK_BYTES));
+
+#define MINI_INTERPRETER_END \
+ __asm__ volatile (".even\n" \
+ ".globl _miniInterpretEnd\n" \
+ "_miniInterpretEnd:\n" \
+ "\taddqw #4,sp\n" \
+ "\tmoveml sp@(%c0),a2-a6/d2-d7" : : "J" (RESERVED_C_STACK_BYTES));
+
+#endif /* __m68k__ */
+\end{code}
+
+%************************************************************************
+%* *
+\subsubsection[COptJumps-mips]{Tail-jumping on a MIPS box}
+%* *
+%************************************************************************
+
+\begin{code}
+#if mipseb_TARGET_ARCH || mipsel_TARGET_ARCH
+
+/* do FUNBEGIN/END the easy way */
+#define FUNBEGIN __asm__ volatile ("--- BEGIN ---");
+#define FUNEND __asm__ volatile ("--- END ---");
+
+/* try "m68k-style" for now */
+extern void __DISCARD__(STG_NO_ARGS);
+
+/* this is "alpha-style" */
+#define JMP_(cont) \
+ do { __DISCARD__(); \
+ _procedure = (void *)(cont); \
+ goto *_procedure; \
+ } while(0)
+
+#define RESUME_(target) JMP_(target)
+
+/* _All_ callee-saved regs, whether we steal them or not, must be saved
+ (and restored).
+*/
+
+#define MINI_INTERPRETER_SETUP \
+ StgChar space[RESERVED_C_STACK_BYTES+6*sizeof(double)+9*sizeof(long)]; \
+ __asm__ volatile ("addu $2,$sp,%0\n" \
+ "\ts.d $f20,0($2)\n" \
+ "\ts.d $f22,8($2)\n" \
+ "\ts.d $f24,16($2)\n" \
+ "\ts.d $f26,24($2)\n" \
+ "\ts.d $f28,32($2)\n" \
+ "\ts.d $f30,40($2)\n" \
+ "\tsw $16,48($2)\n" \
+ "\tsw $17,52($2)\n" \
+ "\tsw $18,56($2)\n" \
+ "\tsw $19,60($2)\n" \
+ "\tsw $20,64($2)\n" \
+ "\tsw $21,68($2)\n" \
+ "\tsw $22,72($2)\n" \
+ "\tsw $23,76($2)\n" \
+ "\tsw $fp,80($2)\n" \
+ : : "I" (RESERVED_C_STACK_BYTES+16) : "$2" );
+
+ /* the 16 bytes is for the argument-register save-area above $sp */
+
+#define MINI_INTERPRETER_END \
+ __asm__ volatile (".align 2\n" \
+ ".globl miniInterpretEnd\n" \
+ "miniInterpretEnd:\n" \
+ "\taddu $2,$sp,%0\n" \
+ "\tl.d $f20,0($2)\n" \
+ "\tl.d $f22,8($2)\n" \
+ "\tl.d $f24,16($2)\n" \
+ "\tl.d $f26,24($2)\n" \
+ "\tl.d $f28,32($2)\n" \
+ "\tl.d $f30,40($2)\n" \
+ "\tlw $16,48($2)\n" \
+ "\tlw $17,52($2)\n" \
+ "\tlw $18,56($2)\n" \
+ "\tlw $19,60($2)\n" \
+ "\tlw $20,64($2)\n" \
+ "\tlw $21,68($2)\n" \
+ "\tlw $22,72($2)\n" \
+ "\tlw $23,76($2)\n" \
+ "\tlw $fp,80($2)\n" \
+ : : "I" (RESERVED_C_STACK_BYTES+16) : "$2" );
+
+#endif /* mips */
+\end{code}
+
+%************************************************************************
+%* *
+\subsubsection[COptJumps-RS6000]{Tail-jumping on an IBM RS6000 running AIX}
+%* *
+%************************************************************************
+
+\begin{code}
+#if rs6000_ibm_aix_TARGET
+
+#define JMP_(cont) ((F_) (cont))()
+/* partain: untested */
+
+#endif /* rs6000-ibm-aix* */
+\end{code}
+
+%************************************************************************
+%* *
+\subsubsection[COptJumps-sparc]{Tail-jumping on Sun4s}
+%* *
+%************************************************************************
+
+We want tailjumps to be calls, because `call xxx' is the only Sparc branch
+that allows an arbitrary label as a target. (Gcc's ``goto *target'' construct
+ends up loading the label into a register and then jumping, at the cost of
+two extra instructions for the 32-bit load.)
+
+When entering the threaded world, we stash our return address in a known
+location so that \tr{%i7} is available as an extra callee-saves register.
+Of course, we have to restore this when coming out of the threaded world.
+
+I hate this god-forsaken architecture. Since the top of the reserved
+stack space is used for globals and the bottom is reserved for outgoing arguments,
+we have to stick our return address somewhere in the middle. Currently, I'm
+allowing 100 extra outgoing arguments beyond the first 6. --JSM
+
+\begin{code}
+#if sparc_TARGET_ARCH
+
+#ifdef solaris2_TARGET_OS
+#define MINI_INTERPRET_END "miniInterpretEnd"
+#else
+#define MINI_INTERPRET_END "_miniInterpretEnd"
+#endif
+
+#define JMP_(cont) ((F_) (cont))()
+ /* Oh so happily, the above turns into a "call" instruction,
+ which, on a SPARC, is nothing but a "jmpl" with the
+ return address in %o7 [which we don't care about].
+ */
+#define RESUME_(target) JMP_(target)
+
+#define MINI_INTERPRETER_SETUP \
+ StgChar space[RESERVED_C_STACK_BYTES+sizeof(void *)]; \
+ register void *i7 __asm__("%i7"); \
+ ((void **)(space))[100] = i7;
+
+#define MINI_INTERPRETER_END \
+ __asm__ volatile (".align 4\n" \
+ ".global " MINI_INTERPRET_END "\n" \
+ MINI_INTERPRET_END ":\n" \
+ "\tld %1,%0" : "=r" (i7) : "m" (((void **)(space))[100]));
+
+#endif /* __sparc__ */
+\end{code}
+
+%************************************************************************
+%* *
+\subsubsection[COptJumps-OOPS]{Someone screwed up here, too...}
+%* *
+%************************************************************************
+
+If one of the above machine-dependent sections wasn't triggered,
+@JMP_@ won't be defined and you'll get link errors (if not
+C-compiler errors).
+
+\begin{code}
+#if !defined(JMP_)
+*???????* No JMP_ macro???
+#endif
+
+#endif /* __STG_TAILJUMPS__ */
+\end{code}
+
+If @FUNBEGIN@ and @FUNEND@ weren't defined, give them the default
+(nothing). Also, define @FB_@ and @FE_@ (short forms).
+\begin{code}
+#if ! defined(FUNBEGIN)
+#define FUNBEGIN /* nothing */
+#endif
+#if ! defined(FUNEND)
+#define FUNEND /* nothing */
+#endif
+
+#define FB_ FUNBEGIN /* short forms */
+#define FE_ FUNEND
+
+#endif /* ! that's all of... COPTJUMPS_H */
+\end{code}