Make SpecConstr specialise for constant arguments again
[ghc-hetmet.git] / rts / StgCRun.c
1 /* -----------------------------------------------------------------------------
2  *
3  * (c) The GHC Team, 1998-2003
4  *
5  * STG-to-C glue.
6  *
7  * To run an STG function from C land, call
8  *
9  *              rv = StgRun(f,BaseReg);
10  *
11  * where "f" is the STG function to call, and BaseReg is the address of the
12  * RegTable for this run (we might have separate RegTables if we're running
13  * multiple threads on an SMP machine).
14  *
15  * In the end, "f" must JMP to StgReturn (defined below),
16  * passing the return-value "rv" in R1,
17  * to return to the caller of StgRun returning "rv" in
18  * whatever way C returns a value.
19  *
20  * NOTE: StgRun/StgReturn do *NOT* load or store Hp or any
21  * other registers (other than saving the C callee-saves
22  * registers).  Instead, the called function "f" must do that
23  * in STG land.
24  *
25  * GCC will have assumed that pushing/popping of C-stack frames is
26  * going on when it generated its code, and used stack space
27  * accordingly.  However, we actually {\em post-process away} all
28  * such stack-framery (see \tr{ghc/driver/ghc-asm.lprl}). Things will
29  * be OK however, if we initially make sure there are
30  * @RESERVED_C_STACK_BYTES@ on the C-stack to begin with, for local
31  * variables.
32  *
33  * -------------------------------------------------------------------------- */
34
35 #include "PosixSource.h"
36
37
38 /*
39  * We define the following (unused) global register variables, because for
40  * some reason gcc generates sub-optimal code for StgRun() on the Alpha
41  * (unnecessarily saving extra registers on the stack) if we don't.
42  *
43  * Why do it at the top of this file, rather than near StgRun() below?  Because
44  * gcc doesn't let us define global register variables after any function
45  * definition has been read.  Any point after #include "Stg.h" would be too
46  * late.
47  *
48  * We define alpha_EXTRA_CAREFUL here to save $s6, $f8 and $f9 -- registers
49  * that we don't use but which are callee-save registers.  The __divq() routine
50  * in libc.a clobbers $s6.
51  */
52 #include "ghcconfig.h"
53 #ifndef USE_MINIINTERPRETER
54 #ifdef alpha_HOST_ARCH
55 #define alpha_EXTRA_CAREFUL
56 register long   fake_ra __asm__("$26");
57 register long   fake_gp __asm__("$29");
58 #ifdef alpha_EXTRA_CAREFUL
59 register long   fake_s6 __asm__("$15");
60 register double fake_f8 __asm__("$f8");
61 register double fake_f9 __asm__("$f9");
62 #endif
63 #endif
64 #endif
65
66 /* include Stg.h first because we want real machine regs in here: we
67  * have to get the value of R1 back from Stg land to C land intact.
68  */
69 #include "Stg.h"
70 #include "Rts.h"
71 #include "StgRun.h"
72 #include "RtsFlags.h"
73 #include "OSThreads.h"
74 #include "Capability.h"
75
76 #ifdef DEBUG
77 #include "RtsUtils.h"
78 #include "Printer.h"
79 #endif
80
81 #ifdef USE_MINIINTERPRETER
82
83 /* -----------------------------------------------------------------------------
84    any architecture (using miniinterpreter)
85    -------------------------------------------------------------------------- */
86
/*
 * Mini-interpreter version of StgRun: call f, treat the value it returns as
 * the next function to call, and repeat until a null pointer comes back.
 * The result handed to the caller is then read from R1.p.  basereg is not
 * used in this variant (it is marked STG_UNUSED).
 */
87 StgRegTable * StgRun(StgFunPtr f, StgRegTable *basereg STG_UNUSED)
88 {
89     while (f) {
90         /* XXX Disabled due to RtsFlags[]/RtsFlags mismatch
91         IF_DEBUG(interpreter,
92             debugBelch("Jumping to ");
93             printPtr((P_)f); fflush(stdout);
94             debugBelch("\n");
95             );
96         */
97         f = (StgFunPtr) (f)();   /* the callee returns the next function to run */
98     }
99     return (StgRegTable *)R1.p;
100 }
101
/*
 * Mini-interpreter version of StgReturn: returns 0, which is the value that
 * makes the while (f) loop in the mini-interpreter StgRun above stop.
 */
102 StgFunPtr StgReturn(void)
103 {
104     return 0;
105 }
106
107 #else /* !USE_MINIINTERPRETER */
108
/*
 * Assembler-level names of the StgRun and StgReturn symbols, as string
 * literals that get pasted into the inline assembly further down.  An
 * underscore is prepended when LEADING_UNDERSCORE is defined.
 */
109 #ifdef LEADING_UNDERSCORE
110 #define STG_RUN "_StgRun"
111 #else
112 #define STG_RUN "StgRun"
113 #endif
114
115 #ifdef LEADING_UNDERSCORE
116 #define STG_RETURN "_StgReturn"
117 #else
118 #define STG_RETURN "StgReturn"
119 #endif
120
121 /* -----------------------------------------------------------------------------
122    x86 architecture
123    -------------------------------------------------------------------------- */
124
125 #ifdef i386_HOST_ARCH
126
/*
 * Assembler directive used to export the StgReturn label from the i386
 * inline assembly: ".globl " when darwin_TARGET_OS is defined, ".global "
 * otherwise.
 * NOTE(review): the PowerPC section of this file tests darwin_HOST_OS, not
 * darwin_TARGET_OS -- confirm which macro is intended here.
 */
127 #ifdef darwin_TARGET_OS
128 #define STG_GLOBAL ".globl "
129 #else
130 #define STG_GLOBAL ".global "
131 #endif
132
/*
 * i386 StgRun: enter STG code at f with basereg loaded into %ebx, and hand
 * back whatever the STG code left in %esi when it jumped to StgReturn.
 *
 * The StgReturn label is defined in the middle of the asm statement, so
 * control re-enters this function there rather than via a normal return.
 *
 * asm operands: %0 = r (in %eax, early-clobber), %1 = space, %2 = f,
 * %3 = basereg, %4 = RESERVED_C_STACK_BYTES (immediate).
 *
 * %ebx, %esi, %edi and %ebp are stored at %esp + RESERVED_C_STACK_BYTES
 * before the jump and reloaded from the same offset afterwards; `space`
 * is declared 4*sizeof(void *) bytes larger than RESERVED_C_STACK_BYTES.
 * NOTE(review): this presumably relies on `space` sitting at the bottom of
 * the frame, i.e. starting at %esp -- confirm against the generated code.
 */
133 StgRegTable *
134 StgRun(StgFunPtr f, StgRegTable *basereg) {
135
136     unsigned char space[ RESERVED_C_STACK_BYTES + 4*sizeof(void *) ];
137     StgRegTable * r;
138
139     __asm__ volatile (
140         /*
141          * save callee-saves registers on behalf of the STG code.
142          */
143         "movl %%esp, %%eax\n\t"
144         "addl %4, %%eax\n\t"          /* %eax = %esp + RESERVED_C_STACK_BYTES */
145         "movl %%ebx,0(%%eax)\n\t"
146         "movl %%esi,4(%%eax)\n\t"
147         "movl %%edi,8(%%eax)\n\t"
148         "movl %%ebp,12(%%eax)\n\t"
149         /*
150          * Set BaseReg
151          */
152         "movl %3,%%ebx\n\t"
153         /*
154          * grab the function argument from the stack
155          */
156         "movl %2,%%eax\n\t"
157         
158         /*
159          * Darwin note:
160          * The stack pointer has to be aligned to a multiple of 16 bytes at
161          * this point. This works out correctly with gcc 4.0.1, but it might
162          * break at any time in the future. TODO: Make this future-proof.
163          */
164
165         /*
166          * jump to it
167          */
168         "jmp *%%eax\n\t"
169
170         STG_GLOBAL STG_RETURN "\n"    /* export the StgReturn symbol ... */
171         STG_RETURN ":\n\t"            /* ... and define it right here */
172
173         "movl %%esi, %%eax\n\t"   /* Return value in R1  */
174
175         /*
176          * restore callee-saves registers.  (Don't stomp on %%eax!)
177          */
178         "movl %%esp, %%edx\n\t"
179         "addl %4, %%edx\n\t"          /* same save-area address as computed on entry */
180         "movl 0(%%edx),%%ebx\n\t"       /* restore the registers saved above */
181         "movl 4(%%edx),%%esi\n\t"
182         "movl 8(%%edx),%%edi\n\t"
183         "movl 12(%%edx),%%ebp\n\t"
184
185       : "=&a" (r), "=m" (space)
186       : "m" (f), "m" (basereg), "i" (RESERVED_C_STACK_BYTES)
187       : "edx" /* stomps on %edx */
188     );
189
190     return r;
191 }
192
193 #endif
194
195 /* ----------------------------------------------------------------------------
196    x86-64 is almost the same as plain x86.
197
198    I've done it using entirely inline assembler, because I couldn't
199    get gcc to generate the correct subtraction from %rsp by using
200    the local array variable trick.  It didn't seem to reserve
201    enough space.  Oh well, it's not much harder this way.
202
203    ------------------------------------------------------------------------- */
204
205 #ifdef x86_64_HOST_ARCH
206
207 extern StgRegTable * StgRun(StgFunPtr f, StgRegTable *basereg);
208
/*
 * x86-64: this C function is only a container for the asm statement below,
 * which defines the global symbols StgRun and StgReturn itself (hence the
 * extern declaration of StgRun above and the `used` attribute here).
 *
 * The single asm operand %0 is the frame size,
 * RESERVED_C_STACK_BYTES+48+8.  The top 48 bytes of the frame hold %rbx,
 * %rbp and %r12-%r15; the extra 8 bytes are explained in the "HACK alert"
 * comment at the end of the function.
 *
 * On entry the asm reads f from %rdi and basereg from %rsi; at StgReturn
 * the value in %rbx is moved to %rax as the result.
 */
209 static void GNUC3_ATTRIBUTE(used)
210 StgRunIsImplementedInAssembler(void)
211 {
212     __asm__ volatile (
213         /*
214          * save callee-saves registers on behalf of the STG code.
215          */
216         ".globl " STG_RUN "\n"
217         STG_RUN ":\n\t"
218         "subq %0, %%rsp\n\t"          /* allocate the frame */
219         "movq %%rsp, %%rax\n\t"
220         "addq %0-48, %%rax\n\t"       /* %rax = start of the 48-byte save area */
221         "movq %%rbx,0(%%rax)\n\t"
222         "movq %%rbp,8(%%rax)\n\t"
223         "movq %%r12,16(%%rax)\n\t"
224         "movq %%r13,24(%%rax)\n\t"
225         "movq %%r14,32(%%rax)\n\t"
226         "movq %%r15,40(%%rax)\n\t"
227         /*
228          * Set BaseReg
229          */
230         "movq %%rsi,%%r13\n\t"
231         /*
232          * grab the function argument from the stack, and jump to it.
233          */
234         "movq %%rdi,%%rax\n\t"
235         "jmp *%%rax\n\t"
236
237         ".globl " STG_RETURN "\n"
238         STG_RETURN ":\n\t"
239
240         "movq %%rbx, %%rax\n\t"   /* Return value in R1  */
241
242         /*
243          * restore callee-saves registers.  (Don't stomp on %%rax!)
244          */
245         "movq %%rsp, %%rdx\n\t"
246         "addq %0-48, %%rdx\n\t"
247         "movq 0(%%rdx),%%rbx\n\t"       /* restore the registers saved above */
248         "movq 8(%%rdx),%%rbp\n\t"
249         "movq 16(%%rdx),%%r12\n\t"
250         "movq 24(%%rdx),%%r13\n\t"
251         "movq 32(%%rdx),%%r14\n\t"
252         "movq 40(%%rdx),%%r15\n\t"
253         "addq %0, %%rsp\n\t"          /* release the frame */
254         "retq"
255
256         : : "i"(RESERVED_C_STACK_BYTES+48+8 /*stack frame size*/));
257     /* 
258        HACK alert!
259
260        The x86_64 ABI specifies that on a procedure call, %rsp is
261        aligned on a 16-byte boundary + 8.  That is, the first
262        argument on the stack after the return address will be
263        16-byte aligned.  
264        
265        Which should be fine: RESERVED_C_STACK_BYTES+48 is a multiple
266        of 16 bytes.  
267        
268        BUT... when we do a C-call from STG land, gcc likes to put the
269        stack alignment adjustment in the prolog.  eg. if we're calling
270        a function with arguments in regs, gcc will insert 'subq $8,%rsp'
271        in the prolog, to keep %rsp aligned (the return address is 8
272        bytes, remember).  The mangler throws away the prolog, so we
273        lose the stack alignment.
274
275        The hack is to add this extra 8 bytes to our %rsp adjustment
276        here, so that throughout STG code, %rsp is 16-byte aligned,
277        ready for a C-call.  
278
279        A quick way to see if this is wrong is to compile this code:
280
281           main = System.Exit.exitWith ExitSuccess
282
283        And run it with +RTS -sstderr.  The stats code in the RTS, in
284        particular statsPrintf(), relies on the stack alignment because
285        it saves the %xmm regs on the stack, so it'll fall over if the
286        stack isn't aligned, and calling exitWith from Haskell invokes
287        shutdownHaskellAndExit using a C call.
288
289        Future gcc releases will almost certainly break this hack...
290     */
291 }
292
293 #endif /* x86-64 */
294
295 /* -----------------------------------------------------------------------------
296    Sparc architecture
297
298    --
299    OLD COMMENT from GHC-3.02:
300
301    We want tailjumps to be calls, because `call xxx' is the only Sparc
302    branch that allows an arbitrary label as a target.  (Gcc's ``goto
303    *target'' construct ends up loading the label into a register and
304    then jumping, at the cost of two extra instructions for the 32-bit
305    load.)
306
307    When entering the threaded world, we stash our return address in a
308    known location so that \tr{%i7} is available as an extra
309    callee-saves register.  Of course, we have to restore this when
310    coming out of the threaded world.
311
312    I hate this god-forsaken architecture.  Since the top of the
313    reserved stack space is used for globals and the bottom is reserved
314    for outgoing arguments, we have to stick our return address
315    somewhere in the middle.  Currently, I'm allowing 100 extra
316    outgoing arguments beyond the first 6.  --JSM
317
318    Updated info (GHC 4.06): we don't appear to use %i7 any more, so
319    I'm not sure whether we still need to save it.  Incidentally, what
320    does the last paragraph above mean when it says "the top of the
321    stack is used for globals"?  What globals?  --SDM
322
323    Updated info (GHC 4.08.2): not saving %i7 any more (see below).
324    -------------------------------------------------------------------------- */
325
326 #ifdef sparc_HOST_ARCH
327
/*
 * Sparc StgRun: reserve RESERVED_C_STACK_BYTES of stack in `space`, call f,
 * and define the global StgReturn label immediately after the call.  The
 * result is read from R1.i once control is back in C.
 *
 * basereg is not referenced in the body.  Both #if 0 regions are the
 * disabled %i7 save/restore discussed in the comments.
 */
328 StgRegTable *
329 StgRun(StgFunPtr f, StgRegTable *basereg) {
330
331     unsigned char space[RESERVED_C_STACK_BYTES];
332 #if 0
333     register void *i7 __asm__("%i7");
334     ((void **)(space))[100] = i7;
335 #endif
336     f();
337     __asm__ volatile (
338             ".align 4\n"
339             ".global " STG_RETURN "\n"
340             STG_RETURN ":"
341             : : "p" (space) : "l0","l1","l2","l3","l4","l5","l6","l7");
342     /* we tell the C compiler that l0-l7 are clobbered on return to
343      * StgReturn, otherwise it tries to use these to save eg. the
344      * address of space[100] across the call.  The correct thing
345      * to do would be to save all the callee-saves regs, but we
346      * can't be bothered to do that.
347      *
348      * We also explicitly mark space as used since gcc eliminates it
349      * otherwise.
350      *
351      * The code that gcc generates for this little fragment is now
352      * terrible.  We could do much better by coding it directly in
353      * assembler.
354      */
355 #if 0
356     /* updated 4.08.2: we don't save %i7 in the middle of the reserved
357      * space any more, since gcc tries to save its address across the
358      * call to f(), this gets clobbered in STG land and we end up
359      * dereferencing a bogus pointer in StgReturn.
360      */
361     __asm__ volatile ("ld %1,%0"
362                       : "=r" (i7) : "m" (((void **)(space))[100]));
363 #endif
364     return (StgRegTable *)R1.i;
365 }
366
367 #endif
368
369 /* -----------------------------------------------------------------------------
370    alpha architecture
371
372    "The stack pointer (SP) must at all times denote an address that has octaword
373     alignment. (This restriction has the side effect that the in-memory portion
374     of the argument list, if any, will start on an octaword boundary.) Note that
375     the stack grows toward lower addresses. During a procedure invocation, SP
376     can never be set to a value that is higher than the value of SP at entry to
377     that procedure invocation.
378
379    "The contents of the stack, located above the portion of the argument list
380     (if any) that is passed in memory, belong to the calling procedure. Because
381     they are part of the calling procedure, they should not be read or written
382     by the called procedure, except as specified by indirect arguments or
383     language-controlled up-level references.
384
385    "The SP value might be used by the hardware when raising exceptions and
386     asynchronous interrupts. It must be assumed that the contents of the stack
387     below the current SP value and within the stack for the current thread are
388     continually and unpredictably modified, as specified in the _Alpha
389     Architecture Reference Manual_, and as a result of asynchronous software
390     actions."
391
392    -- Compaq Computer Corporation, Houston. Tru64 UNIX Calling Standard for
393       Alpha Systems, 5.1 edition, August 2000, section 3.2.1.  http://www.
394       tru64unix.compaq.com/docs/base_doc/DOCUMENTATION/V51_PDF/ARH9MBTE.PDF
395    -------------------------------------------------------------------------- */
396
397 #ifdef alpha_HOST_ARCH
398
/*
 * Alpha StgRun.  Each register that has to survive the trip through STG
 * code is bound to a `register` variable (real_*) and copied into a
 * volatile local (save_*) before the jump, then copied back afterwards.
 *
 * Sequence:
 *   1. copy $26 (ra), $29 (gp), $9-$14 (s0-s5) and $f2-$f7 into the save_*
 *      locals; $15 (s6), $f8 and $f9 too when alpha_EXTRA_CAREFUL is defined;
 *   2. put f in $27 and run the asm: lower $30 by RESERVED_C_STACK_BYTES,
 *      jump through $27, and -- at the global label StgReturn -- raise $30
 *      by the same amount;
 *   3. take the result from $14 (real_s5) before that register is restored;
 *   4. copy every save_* value back into its register.
 *
 * basereg is not referenced in the body.
 */
399 StgRegTable *
400 StgRun(StgFunPtr f, StgRegTable *basereg)
401 {
402     register long   real_ra __asm__("$26"); volatile long   save_ra;
403     register long   real_gp __asm__("$29"); volatile long   save_gp;
404
405     register long   real_s0 __asm__("$9" ); volatile long   save_s0;
406     register long   real_s1 __asm__("$10"); volatile long   save_s1;
407     register long   real_s2 __asm__("$11"); volatile long   save_s2;
408     register long   real_s3 __asm__("$12"); volatile long   save_s3;
409     register long   real_s4 __asm__("$13"); volatile long   save_s4;
410     register long   real_s5 __asm__("$14"); volatile long   save_s5;
411 #ifdef alpha_EXTRA_CAREFUL
412     register long   real_s6 __asm__("$15"); volatile long   save_s6;
413 #endif
414
415     register double real_f2 __asm__("$f2"); volatile double save_f2;
416     register double real_f3 __asm__("$f3"); volatile double save_f3;
417     register double real_f4 __asm__("$f4"); volatile double save_f4;
418     register double real_f5 __asm__("$f5"); volatile double save_f5;
419     register double real_f6 __asm__("$f6"); volatile double save_f6;
420     register double real_f7 __asm__("$f7"); volatile double save_f7;
421 #ifdef alpha_EXTRA_CAREFUL
422     register double real_f8 __asm__("$f8"); volatile double save_f8;
423     register double real_f9 __asm__("$f9"); volatile double save_f9;
424 #endif
425
426     register StgFunPtr real_pv __asm__("$27");
427
428     StgRegTable * ret;
429
430     save_ra = real_ra;
431     save_gp = real_gp;
432
433     save_s0 = real_s0;
434     save_s1 = real_s1;
435     save_s2 = real_s2;
436     save_s3 = real_s3;
437     save_s4 = real_s4;
438     save_s5 = real_s5;
439 #ifdef alpha_EXTRA_CAREFUL
440     save_s6 = real_s6;
441 #endif
442
443     save_f2 = real_f2;
444     save_f3 = real_f3;
445     save_f4 = real_f4;
446     save_f5 = real_f5;
447     save_f6 = real_f6;
448     save_f7 = real_f7;
449 #ifdef alpha_EXTRA_CAREFUL
450     save_f8 = real_f8;
451     save_f9 = real_f9;
452 #endif
453
454     real_pv = f;   /* the asm below jumps through $27 */
455
456     __asm__ volatile(   "lda $30,-%0($30)"      "\n"   /* $30 -= RESERVED_C_STACK_BYTES */
457                 "\t"    "jmp ($27)"             "\n"
458                 "\t"    ".align 3"              "\n"
459                 ".globl " STG_RETURN            "\n"
460                 STG_RETURN ":"                  "\n"
461                 "\t"    "lda $30,%0($30)"       "\n"   /* $30 += RESERVED_C_STACK_BYTES */
462                 : : "K" (RESERVED_C_STACK_BYTES));
463
464     ret = real_s5;   /* read the result out of $14 before it is restored below */
465
466     real_s0 = save_s0;
467     real_s1 = save_s1;
468     real_s2 = save_s2;
469     real_s3 = save_s3;
470     real_s4 = save_s4;
471     real_s5 = save_s5;
472 #ifdef alpha_EXTRA_CAREFUL
473     real_s6 = save_s6;
474 #endif
475
476     real_f2 = save_f2;
477     real_f3 = save_f3;
478     real_f4 = save_f4;
479     real_f5 = save_f5;
480     real_f6 = save_f6;
481     real_f7 = save_f7;
482 #ifdef alpha_EXTRA_CAREFUL
483     real_f8 = save_f8;
484     real_f9 = save_f9;
485 #endif
486
487     real_ra = save_ra;
488     real_gp = save_gp;
489
490     return ret;
491 }
492
493 #endif /* alpha_HOST_ARCH */
494
495 /* -----------------------------------------------------------------------------
496    HP-PA architecture
497    -------------------------------------------------------------------------- */
498
499 #ifdef hppa1_1_HOST_ARCH
500
/*
 * HP-PA StgRun.
 *
 * First asm: point %r19 at %r30 plus a fixed negative offset and store
 * %r3-%r18 there as 16 words, then (advancing %r19 with ldo) store
 * %fr12-%fr21 as 10 doubles.
 * Then f() is called.
 * Second asm: define and export the StgReturn label, copy %r11 into `ret`,
 * and reload the same registers from the same offsets.
 *
 * `space` is sized for RESERVED_C_STACK_BYTES plus 16 longs and 10 doubles,
 * matching the number of registers saved.  basereg is not referenced in the
 * body.
 * NOTE(review): the offset operand uses 116 * sizeof(long) while `space` is
 * declared with 16*sizeof(long); whether 116 is intentional cannot be told
 * from this file -- confirm against RESERVED_C_STACK_BYTES for this target.
 */
501 StgRegTable *
502 StgRun(StgFunPtr f, StgRegTable *basereg)
503 {
504     StgChar space[RESERVED_C_STACK_BYTES+16*sizeof(long)+10*sizeof(double)];
505     StgRegTable * ret;
506
507     __asm__ volatile ("ldo %0(%%r30),%%r19\n"
508                       "\tstw %%r3, 0(0,%%r19)\n"
509                       "\tstw %%r4, 4(0,%%r19)\n"
510                       "\tstw %%r5, 8(0,%%r19)\n"
511                       "\tstw %%r6,12(0,%%r19)\n"
512                       "\tstw %%r7,16(0,%%r19)\n"
513                       "\tstw %%r8,20(0,%%r19)\n"
514                       "\tstw %%r9,24(0,%%r19)\n"
515                       "\tstw %%r10,28(0,%%r19)\n"
516                       "\tstw %%r11,32(0,%%r19)\n"
517                       "\tstw %%r12,36(0,%%r19)\n"
518                       "\tstw %%r13,40(0,%%r19)\n"
519                       "\tstw %%r14,44(0,%%r19)\n"
520                       "\tstw %%r15,48(0,%%r19)\n"
521                       "\tstw %%r16,52(0,%%r19)\n"
522                       "\tstw %%r17,56(0,%%r19)\n"
523                       "\tstw %%r18,60(0,%%r19)\n"
524                       "\tldo 80(%%r19),%%r19\n"
525                       "\tfstds %%fr12,-16(0,%%r19)\n"
526                       "\tfstds %%fr13, -8(0,%%r19)\n"
527                       "\tfstds %%fr14,  0(0,%%r19)\n"
528                       "\tfstds %%fr15,  8(0,%%r19)\n"
529                       "\tldo 32(%%r19),%%r19\n"
530                       "\tfstds %%fr16,-16(0,%%r19)\n"
531                       "\tfstds %%fr17, -8(0,%%r19)\n"
532                       "\tfstds %%fr18,  0(0,%%r19)\n"
533                       "\tfstds %%fr19,  8(0,%%r19)\n"
534                       "\tldo 32(%%r19),%%r19\n"
535                       "\tfstds %%fr20,-16(0,%%r19)\n"
536                       "\tfstds %%fr21, -8(0,%%r19)\n" : :
537                       "n" (-(116 * sizeof(long) + 10 * sizeof(double))) : "%r19"
538                       );
539
540     f();
541
542     __asm__ volatile (".align 4\n"
543                       "\t.EXPORT " STG_RETURN ",CODE\n"
544                       "\t.EXPORT " STG_RETURN ",ENTRY,PRIV_LEV=3\n"
545                       STG_RETURN "\n"
546                       /* "\tldo %0(%%r3),%%r19\n" */
547                       "\tldo %1(%%r30),%%r19\n"
548                       "\tcopy %%r11, %0\n"  /* save R1 */
549                       "\tldw  0(0,%%r19),%%r3\n"
550                       "\tldw  4(0,%%r19),%%r4\n"
551                       "\tldw  8(0,%%r19),%%r5\n"
552                       "\tldw 12(0,%%r19),%%r6\n"
553                       "\tldw 16(0,%%r19),%%r7\n"
554                       "\tldw 20(0,%%r19),%%r8\n"
555                       "\tldw 24(0,%%r19),%%r9\n"
556                       "\tldw 28(0,%%r19),%%r10\n"
557                       "\tldw 32(0,%%r19),%%r11\n"
558                       "\tldw 36(0,%%r19),%%r12\n"
559                       "\tldw 40(0,%%r19),%%r13\n"
560                       "\tldw 44(0,%%r19),%%r14\n"
561                       "\tldw 48(0,%%r19),%%r15\n"
562                       "\tldw 52(0,%%r19),%%r16\n"
563                       "\tldw 56(0,%%r19),%%r17\n"
564                       "\tldw 60(0,%%r19),%%r18\n"
565                       "\tldo 80(%%r19),%%r19\n"
566                       "\tfldds -16(0,%%r19),%%fr12\n"
567                       "\tfldds  -8(0,%%r19),%%fr13\n"
568                       "\tfldds   0(0,%%r19),%%fr14\n"
569                       "\tfldds   8(0,%%r19),%%fr15\n"
570                       "\tldo 32(%%r19),%%r19\n"
571                       "\tfldds -16(0,%%r19),%%fr16\n"
572                       "\tfldds  -8(0,%%r19),%%fr17\n"
573                       "\tfldds   0(0,%%r19),%%fr18\n"
574                       "\tfldds   8(0,%%r19),%%fr19\n"
575                       "\tldo 32(%%r19),%%r19\n"
576                       "\tfldds -16(0,%%r19),%%fr20\n"
577                       "\tfldds  -8(0,%%r19),%%fr21\n"
578                          : "=r" (ret)
579                          : "n" (-(116 * sizeof(long) + 10 * sizeof(double)))
580                          : "%r19"
581                       );
582
583     return ret;
584 }
585
586 #endif /* hppa1_1_HOST_ARCH */
587
588 /* -----------------------------------------------------------------------------
589    PowerPC architecture
590
591    Everything is in assembler, so we don't have to deal with GCC...
592    
593    -------------------------------------------------------------------------- */
594
595 #ifdef powerpc_HOST_ARCH
596
597 extern StgRegTable * StgRun(StgFunPtr f, StgRegTable *basereg);
598
599 #ifdef darwin_HOST_OS
/*
 * PowerPC / Darwin: container for the asm that defines the global symbols
 * _StgRun and _StgReturn.  The single asm operand %0 is the frame size,
 * RESERVED_C_STACK_BYTES+224.
 *
 * _StgRun calls saveFP and stores r13 upwards with stmw before allocating
 * the frame, copies r4 (basereg) into r27, and branches to the address in
 * r3 (f) via CTR, also leaving that address in r12.
 * _StgReturn copies r14 into r3 as the result, releases the frame, reloads
 * the integer registers and branches to restFP.
 *
 * Unlike the other container functions in this file, this one is neither
 * static nor marked `used`; it is protected from dead-stripping by the
 * .no_dead_strip directive when HAVE_SUBSECTIONS_VIA_SYMBOLS is set.
 */
600 void StgRunIsImplementedInAssembler(void)
601 {
602 #if HAVE_SUBSECTIONS_VIA_SYMBOLS
603             // if the toolchain supports deadstripping, we have to
604             // prevent it here (it tends to get confused here).
605         __asm__ volatile (".no_dead_strip _StgRunIsImplementedInAssembler");
606 #endif
607         __asm__ volatile (
608                 "\n.globl _StgRun\n"
609                 "_StgRun:\n"
610                 "\tmflr r0\n"
611                 "\tbl saveFP # f14\n"
612                 "\tstmw r13,-220(r1)\n"
613                 "\tstwu r1,-%0(r1)\n"
614                 "\tmr r27,r4\n" // BaseReg == r27
615                 "\tmtctr r3\n"
616                 "\tmr r12,r3\n"
617                 "\tbctr\n"
618                 ".globl _StgReturn\n"
619                 "_StgReturn:\n"
620                 "\tmr r3,r14\n"
621                 "\tla r1,%0(r1)\n"
622                 "\tlmw r13,-220(r1)\n"
623                 "\tb restFP # f14\n"
624         : : "i"(RESERVED_C_STACK_BYTES+224 /*stack frame size*/));
625 }
626 #else
627
628 // This version is for PowerPC Linux.
629
630 // Differences from the Darwin/Mac OS X version:
631 // *) Different Assembler Syntax
632 // *) Doesn't use Register Saving Helper Functions (although they exist somewhere)
633 // *) We may not access positive stack offsets
634 //    (no "Red Zone" as in the Darwin ABI)
635 // *) The Link Register is saved to a different offset in the caller's stack frame
636 //    (Linux: 4(r1), Darwin 8(r1))
637
/*
 * PowerPC / Linux: container for the asm that defines the global symbols
 * StgRun and StgReturn.  The single asm operand %0 is the frame size,
 * RESERVED_C_STACK_BYTES+224.
 *
 * StgRun stores the link register at 4(r1), keeps the caller's stack
 * pointer in r5, allocates the frame, and saves r13 upwards (stmw) and
 * f14-f31 at negative offsets from r5.  It then copies r4 (basereg) into
 * r27 and branches to the address in r3 (f) via CTR, also leaving that
 * address in r12.
 * StgReturn copies r14 into r3 as the result, recomputes the caller's
 * stack pointer in r5, reloads everything saved above, restores r1 and the
 * link register, and returns with blr.
 */
638 static void GNUC3_ATTRIBUTE(used)
639 StgRunIsImplementedInAssembler(void)
640 {
641         __asm__ volatile (
642                 "\t.globl StgRun\n"
643                 "\t.type StgRun,@function\n"
644                 "StgRun:\n"
645                 "\tmflr 0\n"
646                 "\tstw 0,4(1)\n"
647                 "\tmr 5,1\n"
648                 "\tstwu 1,-%0(1)\n"
649                 "\tstmw 13,-220(5)\n"
650                 "\tstfd 14,-144(5)\n"
651                 "\tstfd 15,-136(5)\n"
652                 "\tstfd 16,-128(5)\n"
653                 "\tstfd 17,-120(5)\n"
654                 "\tstfd 18,-112(5)\n"
655                 "\tstfd 19,-104(5)\n"
656                 "\tstfd 20,-96(5)\n"
657                 "\tstfd 21,-88(5)\n"
658                 "\tstfd 22,-80(5)\n"
659                 "\tstfd 23,-72(5)\n"
660                 "\tstfd 24,-64(5)\n"
661                 "\tstfd 25,-56(5)\n"
662                 "\tstfd 26,-48(5)\n"
663                 "\tstfd 27,-40(5)\n"
664                 "\tstfd 28,-32(5)\n"
665                 "\tstfd 29,-24(5)\n"
666                 "\tstfd 30,-16(5)\n"
667                 "\tstfd 31,-8(5)\n"
668                 "\tmr 27,4\n"  // BaseReg == r27
669                 "\tmtctr 3\n"
670                 "\tmr 12,3\n"
671                 "\tbctr\n"
672                 ".globl StgReturn\n"
673                 "\t.type StgReturn,@function\n"
674                 "StgReturn:\n"
675                 "\tmr 3,14\n"
676                 "\tla 5,%0(1)\n"
677                 "\tlmw 13,-220(5)\n"
678                 "\tlfd 14,-144(5)\n"
679                 "\tlfd 15,-136(5)\n"
680                 "\tlfd 16,-128(5)\n"
681                 "\tlfd 17,-120(5)\n"
682                 "\tlfd 18,-112(5)\n"
683                 "\tlfd 19,-104(5)\n"
684                 "\tlfd 20,-96(5)\n"
685                 "\tlfd 21,-88(5)\n"
686                 "\tlfd 22,-80(5)\n"
687                 "\tlfd 23,-72(5)\n"
688                 "\tlfd 24,-64(5)\n"
689                 "\tlfd 25,-56(5)\n"
690                 "\tlfd 26,-48(5)\n"
691                 "\tlfd 27,-40(5)\n"
692                 "\tlfd 28,-32(5)\n"
693                 "\tlfd 29,-24(5)\n"
694                 "\tlfd 30,-16(5)\n"
695                 "\tlfd 31,-8(5)\n"
696                 "\tmr 1,5\n"
697                 "\tlwz 0,4(1)\n"
698                 "\tmtlr 0\n"
699                 "\tblr\n"
700         : : "i"(RESERVED_C_STACK_BYTES+224 /*stack frame size*/));
701 }
702 #endif
703
704 #endif
705
706 /* -----------------------------------------------------------------------------
707    PowerPC 64 architecture
708
709    Everything is in assembler, so we don't have to deal with GCC...
710    
711    -------------------------------------------------------------------------- */
712
713 #ifdef powerpc64_HOST_ARCH
714
715 #ifdef linux_HOST_OS
716 extern StgRegTable * StgRun(StgFunPtr f, StgRegTable *basereg);
717
/*
 * PowerPC64 / Linux: container for the asm that defines StgRun and
 * StgReturn.  The single asm operand %0 is the frame size,
 * RESERVED_C_STACK_BYTES+304.
 *
 * The asm first emits two 24-byte entries in the ".opd" section, named
 * StgRun and StgReturn, each holding the address of the dot-prefixed code
 * symbol (.StgRun / .StgReturn) and the TOC base.
 *
 * .StgRun stores the link register at 16(r1), keeps the caller's stack
 * pointer in r5, allocates the frame, and saves r2, r14-r31 and f14-f31 at
 * negative offsets from r5.  It copies r4 (basereg) into r27, then loads
 * r2 from 8(r3) and the jump target from 0(r3) -- so r3 is used as a
 * pointer to a two-word block, not as a code address -- and branches via
 * CTR.
 * .StgReturn copies r14 into r3 as the result, reloads everything saved
 * above, restores r1 and the link register, and returns with blr.
 */
718 static void GNUC3_ATTRIBUTE(used)
719 StgRunIsImplementedInAssembler(void)
720 {
721         // r0 volatile
722         // r1 stack pointer
723         // r2 toc - needs to be saved
724         // r3-r10 argument passing, volatile
725         // r11, r12 very volatile (not saved across cross-module calls)
726         // r13 thread local state (never modified, don't need to save)
727         // r14-r31 callee-save
728         __asm__ volatile (
729                 ".section \".opd\",\"aw\"\n"
730                 ".align 3\n"
731                 ".globl StgRun\n"
732                 "StgRun:\n"
733                         "\t.quad\t.StgRun,.TOC.@tocbase,0\n"
734                         "\t.size StgRun,24\n"
735                 ".globl StgReturn\n"
736                 "StgReturn:\n"
737                         "\t.quad\t.StgReturn,.TOC.@tocbase,0\n"
738                         "\t.size StgReturn,24\n"
739                 ".previous\n"
740                 ".globl .StgRun\n"
741                 ".type .StgRun,@function\n"
742                 ".StgRun:\n"
743                         "\tmflr 0\n"
744                         "\tmr 5, 1\n"
745                         "\tstd 0, 16(1)\n"
746                         "\tstdu 1, -%0(1)\n"
747                         "\tstd 2, -296(5)\n"
748                         "\tstd 14, -288(5)\n"
749                         "\tstd 15, -280(5)\n"
750                         "\tstd 16, -272(5)\n"
751                         "\tstd 17, -264(5)\n"
752                         "\tstd 18, -256(5)\n"
753                         "\tstd 19, -248(5)\n"
754                         "\tstd 20, -240(5)\n"
755                         "\tstd 21, -232(5)\n"
756                         "\tstd 22, -224(5)\n"
757                         "\tstd 23, -216(5)\n"
758                         "\tstd 24, -208(5)\n"
759                         "\tstd 25, -200(5)\n"
760                         "\tstd 26, -192(5)\n"
761                         "\tstd 27, -184(5)\n"
762                         "\tstd 28, -176(5)\n"
763                         "\tstd 29, -168(5)\n"
764                         "\tstd 30, -160(5)\n"
765                         "\tstd 31, -152(5)\n"
766                         "\tstfd 14, -144(5)\n"
767                         "\tstfd 15, -136(5)\n"
768                         "\tstfd 16, -128(5)\n"
769                         "\tstfd 17, -120(5)\n"
770                         "\tstfd 18, -112(5)\n"
771                         "\tstfd 19, -104(5)\n"
772                         "\tstfd 20, -96(5)\n"
773                         "\tstfd 21, -88(5)\n"
774                         "\tstfd 22, -80(5)\n"
775                         "\tstfd 23, -72(5)\n"
776                         "\tstfd 24, -64(5)\n"
777                         "\tstfd 25, -56(5)\n"
778                         "\tstfd 26, -48(5)\n"
779                         "\tstfd 27, -40(5)\n"
780                         "\tstfd 28, -32(5)\n"
781                         "\tstfd 29, -24(5)\n"
782                         "\tstfd 30, -16(5)\n"
783                         "\tstfd 31, -8(5)\n"
784                         "\tmr 27, 4\n"  // BaseReg == r27
785                         "\tld 2, 8(3)\n"
786                         "\tld 3, 0(3)\n"
787                         "\tmtctr 3\n"
788                         "\tbctr\n"
789                 ".globl .StgReturn\n"
790                 ".type .StgReturn,@function\n"
791                 ".StgReturn:\n"
792                         "\tmr 3,14\n"
793                         "\tla 5, %0(1)\n" // load address == addi r5, r1, %0
794                         "\tld 2, -296(5)\n"
795                         "\tld 14, -288(5)\n"
796                         "\tld 15, -280(5)\n"
797                         "\tld 16, -272(5)\n"
798                         "\tld 17, -264(5)\n"
799                         "\tld 18, -256(5)\n"
800                         "\tld 19, -248(5)\n"
801                         "\tld 20, -240(5)\n"
802                         "\tld 21, -232(5)\n"
803                         "\tld 22, -224(5)\n"
804                         "\tld 23, -216(5)\n"
805                         "\tld 24, -208(5)\n"
806                         "\tld 25, -200(5)\n"
807                         "\tld 26, -192(5)\n"
808                         "\tld 27, -184(5)\n"
809                         "\tld 28, -176(5)\n"
810                         "\tld 29, -168(5)\n"
811                         "\tld 30, -160(5)\n"
812                         "\tld 31, -152(5)\n"
813                         "\tlfd 14, -144(5)\n"
814                         "\tlfd 15, -136(5)\n"
815                         "\tlfd 16, -128(5)\n"
816                         "\tlfd 17, -120(5)\n"
817                         "\tlfd 18, -112(5)\n"
818                         "\tlfd 19, -104(5)\n"
819                         "\tlfd 20, -96(5)\n"
820                         "\tlfd 21, -88(5)\n"
821                         "\tlfd 22, -80(5)\n"
822                         "\tlfd 23, -72(5)\n"
823                         "\tlfd 24, -64(5)\n"
824                         "\tlfd 25, -56(5)\n"
825                         "\tlfd 26, -48(5)\n"
826                         "\tlfd 27, -40(5)\n"
827                         "\tlfd 28, -32(5)\n"
828                         "\tlfd 29, -24(5)\n"
829                         "\tlfd 30, -16(5)\n"
830                         "\tlfd 31, -8(5)\n"
831                         "\tmr 1, 5\n"
832                         "\tld 0, 16(1)\n"
833                         "\tmtlr 0\n"
834                         "\tblr\n"
835         : : "i"(RESERVED_C_STACK_BYTES+304 /*stack frame size*/));
836 }
837 #else // linux_HOST_OS
838 #error Only linux support for power64 right now.
839 #endif
840
841 #endif
842
843 /* -----------------------------------------------------------------------------
844    IA64 architecture
845
846    Again, in assembler - so we can fiddle with the register stack, and because
847    gcc doesn't handle asm-clobbered callee-saves correctly.
848
849    loc0  - loc15: preserved locals
850    loc16 - loc28: STG registers
851            loc29: saved ar.pfs
852            loc30: saved b0
853            loc31: saved gp (gcc 3.3 uses this slot)
854            loc32: saved ar.lc
855            loc33: saved pr
856        f2  -  f5: preserved floating-point registers
857        f16 - f23: preserved floating-point registers
858    -------------------------------------------------------------------------- */
859
860 #ifdef ia64_HOST_ARCH
861
862 /* The memory stack is rarely used, so 16K is excessive.  Discard the
 * value RESERVED_C_STACK_BYTES had up to this point and use 1024 bytes
 * for the IA64 code in this section instead. */
863 #undef RESERVED_C_STACK_BYTES
864 #define RESERVED_C_STACK_BYTES 1024
865
866 /* We don't spill all the callee-save FP registers, only the ones that
867  * gcc has been observed to use.
 *
 * This count has to agree with the stf.spill and ldf.fill sequences in
 * the asm below (f16-f23 plus f2-f5, i.e. 12 registers); each register
 * gets a 16-byte slot, which is where the "*16" in the asm operands
 * comes from. */
868 #define PRESERVED_FP_REGISTERS 12
869
870 /* We always allocate 34 local and 8 output registers.  As long as gcc used
871  * fewer than 32 locals, the mangler will adjust the stack frame accordingly.
 *
 * LOCALS is passed to the alloc instruction at the top of StgRun as its
 * number of locals; the 8 outputs are written literally in that
 * instruction. */
872 #define LOCALS 34
873
/*
 * Holder for the IA64 StgRun and StgReturn entry points.
 *
 * The C function only exists to carry the asm statement below, which
 * emits the global labels StgRun and StgReturn.  GNUC3_ATTRIBUTE(used) is
 * presumably there so that this otherwise unreferenced static function is
 * not discarded -- confirm against the macro's definition.
 *
 * asm operands (all immediates):
 *   %0 = RESERVED_C_STACK_BYTES + PRESERVED_FP_REGISTERS*16, the number of
 *        bytes by which StgRun lowers sp and StgReturn raises it again
 *   %1 = LOCALS, the number of local registers requested by alloc
 *   %2 = PRESERVED_FP_REGISTERS, used to locate the FP save area
 *
 * FP save area: r16 starts at sp + %0 - %2*16 and r17 one 16-byte slot
 * above it.  Every spill and fill post-increments its pointer by 32, so
 * the two pointers walk alternating 16-byte slots.  The register order
 * (f16..f23, then f2..f5) is the same in StgRun and StgReturn and has to
 * stay that way.
 *
 * Saved state kept in local registers between StgRun and StgReturn:
 *   loc29 = ar.pfs, loc30 = b0, loc32 = ar.lc, loc33 = pr.
 */
874 static void GNUC3_ATTRIBUTE(used)
875 StgRunIsImplementedInAssembler(void)
876 {
877     __asm__ volatile(
                /* ---- StgRun: save C state, then jump into STG code ---- */
878                 ".global StgRun\n"
879                 "StgRun:\n"
880                 "\talloc loc29 = ar.pfs, 0, %1, 8, 0\n" /* setup register frame */
                /* r32 points at two consecutive 8-byte words: the code
                 * address (read with post-increment) and then the gp value */
881                 "\tld8 r18 = [r32],8\n"                 /* get procedure address */
882                 "\tadds sp = -%0, sp ;;\n"              /* setup stack */
883                 "\tld8 gp = [r32]\n"                    /* get procedure GP */
                /* r16/r17 = first two slots of the FP save area */
884                 "\tadds r16 = %0-(%2*16), sp\n"
885                 "\tadds r17 = %0-((%2-1)*16), sp ;;\n"
886                 "\tstf.spill [r16] = f16,32\n"          /* spill callee-saved fp regs */
887                 "\tstf.spill [r17] = f17,32\n"
888                 "\tmov b6 = r18 ;;\n"                   /* set target address */
889                 "\tstf.spill [r16] = f18,32\n"
890                 "\tstf.spill [r17] = f19,32\n"
891                 "\tmov loc30 = b0 ;;\n"                 /* save return address */
892                 "\tstf.spill [r16] = f20,32\n"
893                 "\tstf.spill [r17] = f21,32 ;;\n"
894                 "\tstf.spill [r16] = f22,32\n"
895                 "\tstf.spill [r17] = f23,32\n"
896                 "\tmov loc32 = ar.lc ;;\n"              /* save loop counter */
897                 "\tstf.spill [r16] = f2,32\n"
898                 "\tstf.spill [r17] = f3,32\n"
899                 "\tmov loc33 = pr ;;\n"                 /* save predicate registers */
900                 "\tstf.spill [r16] = f4,32\n"
901                 "\tstf.spill [r17] = f5,32\n"
902                 "\tbr.few b6 ;;\n"                      /* branch to function */
                /* ---- StgReturn: undo everything StgRun did, in the same
                 * slot order, and return to StgRun's caller through b0 ---- */
903                 ".global StgReturn\n"
904                 "StgReturn:\n"
905                 "\tmov r8 = loc16\n"            /* return value in r8 */
                /* recompute the FP save area pointers; sp is still lowered */
906                 "\tadds r16 = %0-(%2*16), sp\n"
907                 "\tadds r17 = %0-((%2-1)*16), sp ;;\n"
908                 "\tldf.fill f16 = [r16],32\n"   /* start restoring fp regs */
909                 "\tldf.fill f17 = [r17],32\n"
910                 "\tmov ar.pfs = loc29 ;;\n"     /* restore register frame */
911                 "\tldf.fill f18 = [r16],32\n"
912                 "\tldf.fill f19 = [r17],32\n"
913                 "\tmov b0 = loc30 ;;\n"         /* restore return address */
914                 "\tldf.fill f20 = [r16],32\n"
915                 "\tldf.fill f21 = [r17],32\n"
916                 "\tmov ar.lc = loc32 ;;\n"      /* restore loop counter */
917                 "\tldf.fill f22 = [r16],32\n"
918                 "\tldf.fill f23 = [r17],32\n"
919                 "\tmov pr = loc33 ;;\n"         /* restore predicate registers */
920                 "\tldf.fill f2 = [r16],32\n"
921                 "\tldf.fill f3 = [r17],32\n"
922                 "\tadds sp = %0, sp ;;\n"       /* restore stack */
                /* NOTE(review): the last two fills are issued after sp has
                 * been raised, so they read slots that now lie below sp.
                 * r16/r17 were computed earlier, so the addresses are
                 * still the right ones -- confirm nothing can overwrite
                 * that memory in between. */
923                 "\tldf.fill f4 = [r16],32\n"
924                 "\tldf.fill f5 = [r17],32\n"
925                 "\tbr.ret.sptk.many b0 ;;\n"    /* return */
        /* no outputs; three immediate inputs: %0, %1, %2 as described above */
926         : : "i"(RESERVED_C_STACK_BYTES + PRESERVED_FP_REGISTERS*16),
927             "i"(LOCALS),
928             "i"(PRESERVED_FP_REGISTERS));
929 }
930
931 #endif
932
933 /* -----------------------------------------------------------------------------
934    MIPS architecture
935    -------------------------------------------------------------------------- */
936
937 #ifdef mips_HOST_ARCH
938
/*
 * StgRun for MIPS: jump to the STG function `f` and regain control at the
 * STG_RETURN label that is emitted in the middle of the asm statement.
 *
 *   f        - STG code to enter
 *   basereg  - register table pointer; placed in $30 before the jump
 *
 * Returns the value found in $16 when control arrives at STG_RETURN.  It
 * is copied into __v0, a register variable pinned to $2.
 *
 * GCC extended asm has the shape
 *     asm ( template : outputs : inputs : clobbers )
 * so the output list must not end in a comma and the clobber list needs
 * its own leading ':'.
 */
939 StgThreadReturnCode
940 StgRun(StgFunPtr f, StgRegTable *basereg)
941 {
942     register StgThreadReturnCode __v0 __asm__("$2");
943
944     __asm__ __volatile__(
        /* $25 = f (presumably because position-independent MIPS code
         * expects its own entry address there -- confirm), $30 = basereg,
         * then jump to f */
945         "       la      $25, %1                 \n"
946         "       move    $30, %2                 \n"
947         "       jr      %1                      \n"
        /* STG code comes back here: result = $16, and $3 = $17 */
948         "       .align 3                        \n"
949         "       .globl " STG_RETURN "           \n"
950         "       .aent " STG_RETURN "            \n"
951         STG_RETURN ":                           \n"
952         "       move    %0, $16                 \n"
953         "       move    $3, $17                 \n"
954         : "=r" (__v0)
955         : "r" (f), "r" (basereg)
        /* registers and memory the STG code may have changed */
956         : "$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23",
957           "$25", "$28", "$30",
958           "$f20", "$f22", "$f24", "$f26", "$f28", "$f30",
959           "memory");
960
961     return __v0;
962 }
963
964 #endif /* mips_HOST_ARCH */
965
966 #endif /* !USE_MINIINTERPRETER */