1 /* -----------------------------------------------------------------------------
2 * Foreign export adjustor thunks
6 * ---------------------------------------------------------------------------*/
8 /* A little bit of background...
10 An adjustor thunk is a dynamically allocated code snippet that allows
11 Haskell closures to be viewed as C function pointers.
13 Stable pointers provide a way for the outside world to get access to,
14 and evaluate, Haskell heap objects, with the RTS providing a small
15 range of ops for doing so. So, assuming we've got a stable pointer in
16 our hand in C, we can jump into the Haskell world and evaluate a callback
17 procedure, say. This works OK in some cases where callbacks are used, but
18 does require the external code to know about stable pointers and how to deal
19 with them. We'd like to hide the Haskell-nature of a callback and have it
20 be invoked just like any other C function pointer.
22 Enter adjustor thunks. An adjustor thunk is a little piece of code
23 that's generated on-the-fly (one per Haskell closure being exported)
24 that, when entered using some 'universal' calling convention (e.g., the
25 C calling convention on platform X), pushes an implicit stable pointer
26 (to the Haskell callback) before calling another (static) C function stub
27 which takes care of entering the Haskell code via its stable pointer.
29 An adjustor thunk is allocated on the C heap, and is called from within
30 Haskell just before handing out the function pointer to the Haskell (IO)
31 action. User code should never have to invoke it explicitly.
33 An adjustor thunk differs from a C function pointer in one respect: when
34 the code is through with it, it has to be freed in order to release Haskell
35 and C resources. Failure to do so result in memory leaks on both the C and
39 #include "PosixSource.h"
41 #include "RtsExternal.h"
49 #if defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
53 #ifdef LEADING_UNDERSCORE
54 #define UNDERSCORE "_"
58 #if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
60 Now here's something obscure for you:
62 When generating an adjustor thunk that uses the C calling
63 convention, we have to make sure that the thunk kicks off
64 the process of jumping into Haskell with a tail jump. Why?
65 Because as a result of jumping in into Haskell we may end
66 up freeing the very adjustor thunk we came from using
67 freeHaskellFunctionPtr(). Hence, we better not return to
68 the adjustor code on our way out, since it could by then
71 The fix is readily at hand, just include the opcodes
72 for the C stack fixup code that we need to perform when
73 returning in some static piece of memory and arrange
74 to return to it before tail jumping from the adjustor thunk.
76 static void GNUC3_ATTRIBUTE(used) obscure_ccall_wrapper(void)
79 ".globl " UNDERSCORE "obscure_ccall_ret_code\n"
80 UNDERSCORE "obscure_ccall_ret_code:\n\t"
85 extern void obscure_ccall_ret_code(void);
89 #if defined(x86_64_HOST_ARCH)
90 static void GNUC3_ATTRIBUTE(used) obscure_ccall_wrapper(void)
93 ".globl " UNDERSCORE "obscure_ccall_ret_code\n"
94 UNDERSCORE "obscure_ccall_ret_code:\n\t"
99 extern void obscure_ccall_ret_code(void);
102 #if defined(alpha_HOST_ARCH)
103 /* To get the definition of PAL_imb: */
104 # if defined(linux_HOST_OS)
105 # include <asm/pal.h>
107 # include <machine/pal.h>
111 #if defined(ia64_HOST_ARCH)
113 /* Layout of a function descriptor */
114 typedef struct _IA64FunDesc {
120 stgAllocStable(size_t size_in_bytes, StgStablePtr *stable)
123 nat data_size_in_words, total_size_in_words;
125 /* round up to a whole number of words */
126 data_size_in_words = (size_in_bytes + sizeof(W_) + 1) / sizeof(W_);
127 total_size_in_words = sizeofW(StgArrWords) + data_size_in_words;
129 /* allocate and fill it in */
130 arr = (StgArrWords *)allocate(total_size_in_words);
131 SET_ARR_HDR(arr, &stg_ARR_WORDS_info, CCCS, data_size_in_words);
133 /* obtain a stable ptr */
134 *stable = getStablePtr((StgPtr)arr);
136 /* and return a ptr to the goods inside the array */
137 return(&(arr->payload));
141 #if defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
142 __asm__("obscure_ccall_ret_code:\n\t"
147 extern void obscure_ccall_ret_code(void);
150 #if defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
151 #if !(defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS))
153 /* !!! !!! WARNING: !!! !!!
154 * This structure is accessed from AdjustorAsm.s
155 * Any changes here have to be mirrored in the offsets there.
158 typedef struct AdjustorStub {
159 #if defined(powerpc_HOST_ARCH) && defined(darwin_HOST_OS)
166 #elif defined(powerpc64_HOST_ARCH) && defined(darwin_HOST_OS)
167 /* powerpc64-darwin: just guessing that it won't use fundescs. */
178 /* fundesc-based ABIs */
187 StgInt negative_framesize;
188 StgInt extrawords_plus_one;
194 #if defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
196 /* !!! !!! WARNING: !!! !!!
197 * This structure is accessed from AdjustorAsm.s
198 * Any changes here have to be mirrored in the offsets there.
201 typedef struct AdjustorStub {
202 unsigned char call[8];
206 StgInt argument_size;
210 #if defined(darwin_HOST_OS) || defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
211 static int totalArgumentSize(char *typeString)
216 char t = *typeString++;
220 // on 32-bit platforms, Double and Int64 occupy two words.
223 if(sizeof(void*) == 4)
228 // everything else is one word.
238 createAdjustor(int cconv, StgStablePtr hptr,
241 #if !defined(powerpc_HOST_ARCH) && !defined(powerpc64_HOST_ARCH) && !defined(x86_64_HOST_ARCH)
246 void *adjustor = NULL;
250 case 0: /* _stdcall */
251 #if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
252 /* Magic constant computed by inspecting the code length of
253 the following assembly language snippet
254 (offset and machine code prefixed):
256 <0>: 58 popl %eax # temp. remove ret addr..
257 <1>: 68 fd fc fe fa pushl 0xfafefcfd # constant is large enough to
258 # hold a StgStablePtr
259 <6>: 50 pushl %eax # put back ret. addr
260 <7>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
261 <c>: ff e0 jmp %eax # and jump to it.
262 # the callee cleans up the stack
264 adjustor = allocateExec(14);
266 unsigned char *const adj_code = (unsigned char *)adjustor;
267 adj_code[0x00] = (unsigned char)0x58; /* popl %eax */
269 adj_code[0x01] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
270 *((StgStablePtr*)(adj_code + 0x02)) = (StgStablePtr)hptr;
272 adj_code[0x06] = (unsigned char)0x50; /* pushl %eax */
274 adj_code[0x07] = (unsigned char)0xb8; /* movl $wptr, %eax */
275 *((StgFunPtr*)(adj_code + 0x08)) = (StgFunPtr)wptr;
277 adj_code[0x0c] = (unsigned char)0xff; /* jmp %eax */
278 adj_code[0x0d] = (unsigned char)0xe0;
284 #if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
285 /* Magic constant computed by inspecting the code length of
286 the following assembly language snippet
287 (offset and machine code prefixed):
289 <00>: 68 ef be ad de pushl $0xdeadbeef # constant is large enough to
290 # hold a StgStablePtr
291 <05>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
292 <0a>: 68 ef be ad de pushl $obscure_ccall_ret_code # push the return address
293 <0f>: ff e0 jmp *%eax # jump to wptr
295 The ccall'ing version is a tad different, passing in the return
296 address of the caller to the auto-generated C stub (which enters
297 via the stable pointer.) (The auto-generated C stub is in on this
298 game, don't worry :-)
300 See the comment next to obscure_ccall_ret_code why we need to
301 perform a tail jump instead of a call, followed by some C stack
304 Note: The adjustor makes the assumption that any return value
305 coming back from the C stub is not stored on the stack.
306 That's (thankfully) the case here with the restricted set of
307 return types that we support.
309 adjustor = allocateExec(17);
311 unsigned char *const adj_code = (unsigned char *)adjustor;
313 adj_code[0x00] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
314 *((StgStablePtr*)(adj_code+0x01)) = (StgStablePtr)hptr;
316 adj_code[0x05] = (unsigned char)0xb8; /* movl $wptr, %eax */
317 *((StgFunPtr*)(adj_code + 0x06)) = (StgFunPtr)wptr;
319 adj_code[0x0a] = (unsigned char)0x68; /* pushl obscure_ccall_ret_code */
320 *((StgFunPtr*)(adj_code + 0x0b)) =
321 (StgFunPtr)obscure_ccall_ret_code;
323 adj_code[0x0f] = (unsigned char)0xff; /* jmp *%eax */
324 adj_code[0x10] = (unsigned char)0xe0;
326 #elif defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
329 What's special about Darwin/Mac OS X on i386?
330 It wants the stack to stay 16-byte aligned.
332 We offload most of the work to AdjustorAsm.S.
334 AdjustorStub *adjustorStub = allocateExec(sizeof(AdjustorStub));
335 adjustor = adjustorStub;
337 extern void adjustorCode(void);
338 int sz = totalArgumentSize(typeString);
340 adjustorStub->call[0] = 0xe8;
341 *(long*)&adjustorStub->call[1] = ((char*)&adjustorCode) - ((char*)adjustorStub + 5);
342 adjustorStub->hptr = hptr;
343 adjustorStub->wptr = wptr;
345 // The adjustor puts the following things on the stack:
347 // 2.) padding and (a copy of) the arguments
348 // 3.) a dummy argument
350 // 5.) return address (for returning to the adjustor)
351 // All these have to add up to a multiple of 16.
353 // first, include everything in frame_size
354 adjustorStub->frame_size = sz * 4 + 16;
356 adjustorStub->frame_size = (adjustorStub->frame_size + 15) & ~15;
357 // only count 2.) and 3.) as part of frame_size
358 adjustorStub->frame_size -= 12;
359 adjustorStub->argument_size = sz;
362 #elif defined(x86_64_HOST_ARCH)
369 %rdi,%rsi,%rdx,%rcx,%r8,%r9 = arg0..arg6
371 if there are <6 integer args, then we can just push the
372 StablePtr into %edi and shuffle the other args up.
374 If there are >=6 integer args, then we have to flush one arg
375 to the stack, and arrange to adjust the stack ptr on return.
376 The stack will be rearranged to this:
381 return address *** <-- dummy arg in stub fn.
383 obscure_ccall_ret_code
385 This unfortunately means that the type of the stub function
386 must have a dummy argument for the original return address
387 pointer inserted just after the 6th integer argument.
389 Code for the simple case:
391 0: 4d 89 c1 mov %r8,%r9
392 3: 49 89 c8 mov %rcx,%r8
393 6: 48 89 d1 mov %rdx,%rcx
394 9: 48 89 f2 mov %rsi,%rdx
395 c: 48 89 fe mov %rdi,%rsi
396 f: 48 8b 3d 0a 00 00 00 mov 10(%rip),%rdi
397 16: ff 25 0c 00 00 00 jmpq *12(%rip)
399 20: .quad 0 # aligned on 8-byte boundary
400 28: .quad 0 # aligned on 8-byte boundary
403 And the version for >=6 integer arguments:
406 2: ff 35 20 00 00 00 pushq 32(%rip) # 28 <ccall_adjustor+0x28>
407 8: 4d 89 c1 mov %r8,%r9
408 b: 49 89 c8 mov %rcx,%r8
409 e: 48 89 d1 mov %rdx,%rcx
410 11: 48 89 f2 mov %rsi,%rdx
411 14: 48 89 fe mov %rdi,%rsi
412 17: 48 8b 3d 12 00 00 00 mov 18(%rip),%rdi # 30 <ccall_adjustor+0x30>
413 1e: ff 25 14 00 00 00 jmpq *20(%rip) # 38 <ccall_adjustor+0x38>
415 28: .quad 0 # aligned on 8-byte boundary
416 30: .quad 0 # aligned on 8-byte boundary
417 38: .quad 0 # aligned on 8-byte boundary
424 // determine whether we have 6 or more integer arguments,
425 // and therefore need to flush one to the stack.
426 for (c = typeString; *c != '\0'; c++) {
427 if (*c == 'i' || *c == 'l') i++;
432 adjustor = allocateExec(0x30);
434 *(StgInt32 *)adjustor = 0x49c1894d;
435 *(StgInt32 *)(adjustor+0x4) = 0x8948c889;
436 *(StgInt32 *)(adjustor+0x8) = 0xf28948d1;
437 *(StgInt32 *)(adjustor+0xc) = 0x48fe8948;
438 *(StgInt32 *)(adjustor+0x10) = 0x000a3d8b;
439 *(StgInt32 *)(adjustor+0x14) = 0x25ff0000;
440 *(StgInt32 *)(adjustor+0x18) = 0x0000000c;
441 *(StgInt64 *)(adjustor+0x20) = (StgInt64)hptr;
442 *(StgInt64 *)(adjustor+0x28) = (StgInt64)wptr;
446 adjustor = allocateExec(0x40);
448 *(StgInt32 *)adjustor = 0x35ff5141;
449 *(StgInt32 *)(adjustor+0x4) = 0x00000020;
450 *(StgInt32 *)(adjustor+0x8) = 0x49c1894d;
451 *(StgInt32 *)(adjustor+0xc) = 0x8948c889;
452 *(StgInt32 *)(adjustor+0x10) = 0xf28948d1;
453 *(StgInt32 *)(adjustor+0x14) = 0x48fe8948;
454 *(StgInt32 *)(adjustor+0x18) = 0x00123d8b;
455 *(StgInt32 *)(adjustor+0x1c) = 0x25ff0000;
456 *(StgInt32 *)(adjustor+0x20) = 0x00000014;
458 *(StgInt64 *)(adjustor+0x28) = (StgInt64)obscure_ccall_ret_code;
459 *(StgInt64 *)(adjustor+0x30) = (StgInt64)hptr;
460 *(StgInt64 *)(adjustor+0x38) = (StgInt64)wptr;
463 #elif defined(sparc_HOST_ARCH)
464 /* Magic constant computed by inspecting the code length of the following
465 assembly language snippet (offset and machine code prefixed):
467 <00>: 9C23A008 sub %sp, 8, %sp ! make room for %o4/%o5 in caller's frame
468 <04>: DA23A060 st %o5, [%sp + 96] ! shift registers by 2 positions
469 <08>: D823A05C st %o4, [%sp + 92]
470 <0C>: 9A10000B mov %o3, %o5
471 <10>: 9810000A mov %o2, %o4
472 <14>: 96100009 mov %o1, %o3
473 <18>: 94100008 mov %o0, %o2
474 <1C>: 13000000 sethi %hi(wptr), %o1 ! load up wptr (1 of 2)
475 <20>: 11000000 sethi %hi(hptr), %o0 ! load up hptr (1 of 2)
476 <24>: 81C26000 jmp %o1 + %lo(wptr) ! jump to wptr (load 2 of 2)
477 <28>: 90122000 or %o0, %lo(hptr), %o0 ! load up hptr (2 of 2, delay slot)
478 <2C> 00000000 ! place for getting hptr back easily
480 ccall'ing on SPARC is easy, because we are quite lucky to push a
481 multiple of 8 bytes (1 word hptr + 1 word dummy arg) in front of the
482 existing arguments (note that %sp must stay double-word aligned at
483 all times, see ABI spec at http://www.sparc.org/standards/psABI3rd.pdf).
484 To do this, we extend the *caller's* stack frame by 2 words and shift
485 the output registers used for argument passing (%o0 - %o5, we are a *leaf*
486 procedure because of the tail-jump) by 2 positions. This makes room in
487 %o0 and %o1 for the additinal arguments, namely hptr and a dummy (used
488 for destination addr of jump on SPARC, return address on x86, ...). This
489 shouldn't cause any problems for a C-like caller: alloca is implemented
490 similarly, and local variables should be accessed via %fp, not %sp. In a
491 nutshell: This should work! (Famous last words! :-)
493 adjustor = allocateExec(4*(11+1));
495 unsigned long *const adj_code = (unsigned long *)adjustor;
497 adj_code[ 0] = 0x9C23A008UL; /* sub %sp, 8, %sp */
498 adj_code[ 1] = 0xDA23A060UL; /* st %o5, [%sp + 96] */
499 adj_code[ 2] = 0xD823A05CUL; /* st %o4, [%sp + 92] */
500 adj_code[ 3] = 0x9A10000BUL; /* mov %o3, %o5 */
501 adj_code[ 4] = 0x9810000AUL; /* mov %o2, %o4 */
502 adj_code[ 5] = 0x96100009UL; /* mov %o1, %o3 */
503 adj_code[ 6] = 0x94100008UL; /* mov %o0, %o2 */
504 adj_code[ 7] = 0x13000000UL; /* sethi %hi(wptr), %o1 */
505 adj_code[ 7] |= ((unsigned long)wptr) >> 10;
506 adj_code[ 8] = 0x11000000UL; /* sethi %hi(hptr), %o0 */
507 adj_code[ 8] |= ((unsigned long)hptr) >> 10;
508 adj_code[ 9] = 0x81C26000UL; /* jmp %o1 + %lo(wptr) */
509 adj_code[ 9] |= ((unsigned long)wptr) & 0x000003FFUL;
510 adj_code[10] = 0x90122000UL; /* or %o0, %lo(hptr), %o0 */
511 adj_code[10] |= ((unsigned long)hptr) & 0x000003FFUL;
513 adj_code[11] = (unsigned long)hptr;
516 asm("flush %0" : : "r" (adj_code ));
517 asm("flush %0" : : "r" (adj_code + 2));
518 asm("flush %0" : : "r" (adj_code + 4));
519 asm("flush %0" : : "r" (adj_code + 6));
520 asm("flush %0" : : "r" (adj_code + 10));
522 /* max. 5 instructions latency, and we need at >= 1 for returning */
528 #elif defined(alpha_HOST_ARCH)
529 /* Magic constant computed by inspecting the code length of
530 the following assembly language snippet
531 (offset and machine code prefixed; note that the machine code
532 shown is longwords stored in little-endian order):
534 <00>: 46520414 mov a2, a4
535 <04>: 46100412 mov a0, a2
536 <08>: a61b0020 ldq a0, 0x20(pv) # load up hptr
537 <0c>: 46730415 mov a3, a5
538 <10>: a77b0028 ldq pv, 0x28(pv) # load up wptr
539 <14>: 46310413 mov a1, a3
540 <18>: 6bfb---- jmp (pv), <hint> # jump to wptr (with hint)
541 <1c>: 00000000 # padding for alignment
542 <20>: [8 bytes for hptr quadword]
543 <28>: [8 bytes for wptr quadword]
545 The "computed" jump at <08> above is really a jump to a fixed
546 location. Accordingly, we place an always-correct hint in the
547 jump instruction, namely the address offset from <0c> to wptr,
548 divided by 4, taking the lowest 14 bits.
550 We only support passing 4 or fewer argument words, for the same
551 reason described under sparc_HOST_ARCH above by JRS, 21 Aug 01.
552 On the Alpha the first 6 integer arguments are in a0 through a5,
553 and the rest on the stack. Hence we want to shuffle the original
554 caller's arguments by two.
556 On the Alpha the calling convention is so complex and dependent
557 on the callee's signature -- for example, the stack pointer has
558 to be a multiple of 16 -- that it seems impossible to me [ccshan]
559 to handle the general case correctly without changing how the
560 adjustor is called from C. For now, our solution of shuffling
561 registers only and ignoring the stack only works if the original
562 caller passed 4 or fewer argument words.
564 TODO: Depending on how much allocation overhead stgMallocBytes uses for
565 header information (more precisely, if the overhead is no more than
566 4 bytes), we should move the first three instructions above down by
567 4 bytes (getting rid of the nop), hence saving memory. [ccshan]
569 ASSERT(((StgWord64)wptr & 3) == 0);
570 adjustor = allocateExec(48);
572 StgWord64 *const code = (StgWord64 *)adjustor;
574 code[0] = 0x4610041246520414L;
575 code[1] = 0x46730415a61b0020L;
576 code[2] = 0x46310413a77b0028L;
577 code[3] = 0x000000006bfb0000L
578 | (((StgWord32*)(wptr) - (StgWord32*)(code) - 3) & 0x3fff);
580 code[4] = (StgWord64)hptr;
581 code[5] = (StgWord64)wptr;
583 /* Ensure that instruction cache is consistent with our new code */
584 __asm__ volatile("call_pal %0" : : "i" (PAL_imb));
586 #elif defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
588 #define OP_LO(op,lo) ((((unsigned)(op)) << 16) | (((unsigned)(lo)) & 0xFFFF))
589 #define OP_HI(op,hi) ((((unsigned)(op)) << 16) | (((unsigned)(hi)) >> 16))
591 /* The PowerPC Linux (32-bit) calling convention is annoyingly complex.
592 We need to calculate all the details of the stack frame layout,
593 taking into account the types of all the arguments, and then
594 generate code on the fly. */
596 int src_gpr = 3, dst_gpr = 5;
598 int src_offset = 0, dst_offset = 0;
599 int n = strlen(typeString),i;
600 int src_locs[n], dst_locs[n];
605 Calculate where the arguments should go.
606 src_locs[] will contain the locations of the arguments in the
607 original stack frame passed to the adjustor.
608 dst_locs[] will contain the locations of the arguments after the
609 adjustor runs, on entry to the wrapper proc pointed to by wptr.
611 This algorithm is based on the one described on page 3-19 of the
612 System V ABI PowerPC Processor Supplement.
614 for(i=0;typeString[i];i++)
616 char t = typeString[i];
617 if((t == 'f' || t == 'd') && fpr <= 8)
618 src_locs[i] = dst_locs[i] = -32-(fpr++);
621 if(t == 'l' && src_gpr <= 9)
623 if((src_gpr & 1) == 0)
625 src_locs[i] = -src_gpr;
628 else if(t == 'i' && src_gpr <= 10)
630 src_locs[i] = -(src_gpr++);
634 if(t == 'l' || t == 'd')
639 src_locs[i] = src_offset;
640 src_offset += (t == 'l' || t == 'd') ? 8 : 4;
643 if(t == 'l' && dst_gpr <= 9)
645 if((dst_gpr & 1) == 0)
647 dst_locs[i] = -dst_gpr;
650 else if(t == 'i' && dst_gpr <= 10)
652 dst_locs[i] = -(dst_gpr++);
656 if(t == 'l' || t == 'd')
661 dst_locs[i] = dst_offset;
662 dst_offset += (t == 'l' || t == 'd') ? 8 : 4;
667 frameSize = dst_offset + 8;
668 frameSize = (frameSize+15) & ~0xF;
673 // allocate space for at most 4 insns per parameter
674 // plus 14 more instructions.
675 adjustor = allocateExec(4 * (4*n + 14));
676 code = (unsigned*)adjustor;
678 *code++ = 0x48000008; // b *+8
679 // * Put the hptr in a place where freeHaskellFunctionPtr
681 *code++ = (unsigned) hptr;
683 // * save the link register
684 *code++ = 0x7c0802a6; // mflr r0;
685 *code++ = 0x90010004; // stw r0, 4(r1);
686 // * and build a new stack frame
687 *code++ = OP_LO(0x9421, -frameSize); // stwu r1, -frameSize(r1)
689 // * now generate instructions to copy arguments
690 // from the old stack frame into the new stack frame.
693 if(src_locs[i] < -32)
694 ASSERT(dst_locs[i] == src_locs[i]);
695 else if(src_locs[i] < 0)
698 ASSERT(typeString[i] != 'f' && typeString[i] != 'd');
701 ASSERT(dst_locs[i] > -32);
702 // dst is in GPR, too.
704 if(typeString[i] == 'l')
708 | ((-dst_locs[i]+1) << 16)
709 | ((-src_locs[i]+1) << 11)
710 | ((-src_locs[i]+1) << 21);
714 | ((-dst_locs[i]) << 16)
715 | ((-src_locs[i]) << 11)
716 | ((-src_locs[i]) << 21);
720 if(typeString[i] == 'l')
722 // stw src+1, dst_offset+4(r1)
724 | ((-src_locs[i]+1) << 21)
728 // stw src, dst_offset(r1)
730 | ((-src_locs[i]) << 21)
736 ASSERT(dst_locs[i] >= 0);
737 ASSERT(typeString[i] != 'f' && typeString[i] != 'd');
739 if(typeString[i] == 'l')
741 // lwz r0, src_offset(r1)
743 | (src_locs[i] + frameSize + 8 + 4);
744 // stw r0, dst_offset(r1)
746 | (dst_locs[i] + 8 + 4);
748 // lwz r0, src_offset(r1)
750 | (src_locs[i] + frameSize + 8);
751 // stw r0, dst_offset(r1)
757 // * hptr will be the new first argument.
759 *code++ = OP_HI(0x3c60, hptr);
760 // ori r3,r3,lo(hptr)
761 *code++ = OP_LO(0x6063, hptr);
763 // * we need to return to a piece of code
764 // which will tear down the stack frame.
765 // lis r11,hi(obscure_ccall_ret_code)
766 *code++ = OP_HI(0x3d60, obscure_ccall_ret_code);
767 // ori r11,r11,lo(obscure_ccall_ret_code)
768 *code++ = OP_LO(0x616b, obscure_ccall_ret_code);
770 *code++ = 0x7d6803a6;
774 *code++ = OP_HI(0x3d60, wptr);
775 // ori r11,r11,lo(wptr)
776 *code++ = OP_LO(0x616b, wptr);
778 *code++ = 0x7d6903a6;
780 *code++ = 0x4e800420;
782 // Flush the Instruction cache:
784 unsigned *p = adjustor;
787 __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
791 __asm__ volatile ("sync\n\tisync");
795 #elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
797 #define OP_LO(op,lo) ((((unsigned)(op)) << 16) | (((unsigned)(lo)) & 0xFFFF))
798 #define OP_HI(op,hi) ((((unsigned)(op)) << 16) | (((unsigned)(hi)) >> 16))
800 /* The following code applies to all PowerPC and PowerPC64 platforms
801 whose stack layout is based on the AIX ABI.
803 Besides (obviously) AIX, this includes
804 Mac OS 9 and BeOS/PPC (may they rest in peace),
805 which use the 32-bit AIX ABI
807 which uses the 64-bit AIX ABI
808 and Darwin (Mac OS X),
809 which uses the same stack layout as AIX,
810 but no function descriptors.
812 The actual stack-frame shuffling is implemented out-of-line
813 in the function adjustorCode, in AdjustorAsm.S.
814 Here, we set up an AdjustorStub structure, which
815 is a function descriptor (on platforms that have function
816 descriptors) or a short piece of stub code (on Darwin) to call
817 adjustorCode with a pointer to the AdjustorStub struct loaded
820 One nice thing about this is that there is _no_ code generated at
821 runtime on the platforms that have function descriptors.
823 AdjustorStub *adjustorStub;
824 int sz = 0, extra_sz, total_sz;
826 // from AdjustorAsm.s
827 // not declared as a function so that AIX-style
828 // fundescs can never get in the way.
829 extern void *adjustorCode;
832 adjustorStub = stgMallocBytes(sizeof(AdjustorStub), "createAdjustor");
834 adjustorStub = allocateExec(sizeof(AdjustorStub));
836 adjustor = adjustorStub;
838 adjustorStub->code = (void*) &adjustorCode;
841 // function descriptors are a cool idea.
842 // We don't need to generate any code at runtime.
843 adjustorStub->toc = adjustorStub;
846 // no function descriptors :-(
847 // We need to do things "by hand".
848 #if defined(powerpc_HOST_ARCH)
849 // lis r2, hi(adjustorStub)
850 adjustorStub->lis = OP_HI(0x3c40, adjustorStub);
851 // ori r2, r2, lo(adjustorStub)
852 adjustorStub->ori = OP_LO(0x6042, adjustorStub);
854 adjustorStub->lwz = OP_LO(0x8002, (char*)(&adjustorStub->code)
855 - (char*)adjustorStub);
857 adjustorStub->mtctr = 0x7c0903a6;
859 adjustorStub->bctr = 0x4e800420;
861 barf("adjustor creation not supported on this platform");
864 // Flush the Instruction cache:
866 int n = sizeof(AdjustorStub)/sizeof(unsigned);
867 unsigned *p = (unsigned*)adjustor;
870 __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
874 __asm__ volatile ("sync\n\tisync");
878 // Calculate the size of the stack frame, in words.
879 sz = totalArgumentSize(typeString);
881 // The first eight words of the parameter area
882 // are just "backing store" for the parameters passed in
883 // the GPRs. extra_sz is the number of words beyond those first
889 // Calculate the total size of the stack frame.
890 total_sz = (6 /* linkage area */
891 + 8 /* minimum parameter area */
892 + 2 /* two extra arguments */
893 + extra_sz)*sizeof(StgWord);
895 // align to 16 bytes.
896 // AIX only requires 8 bytes, but who cares?
897 total_sz = (total_sz+15) & ~0xF;
899 // Fill in the information that adjustorCode in AdjustorAsm.S
900 // will use to create a new stack frame with the additional args.
901 adjustorStub->hptr = hptr;
902 adjustorStub->wptr = wptr;
903 adjustorStub->negative_framesize = -total_sz;
904 adjustorStub->extrawords_plus_one = extra_sz + 1;
907 #elif defined(ia64_HOST_ARCH)
909 Up to 8 inputs are passed in registers. We flush the last two inputs to
910 the stack, initially into the 16-byte scratch region left by the caller.
911 We then shuffle the others along by 4 (taking 2 registers for ourselves
912 to save return address and previous function state - we need to come back
913 here on the way out to restore the stack, so this is a real function
914 rather than just a trampoline).
916 The function descriptor we create contains the gp of the target function
917 so gp is already loaded correctly.
919 [MLX] alloc r16=ar.pfs,10,2,0
921 [MII] st8.spill [r12]=r38,8 // spill in6 (out4)
922 mov r41=r37 // out7 = in5 (out3)
923 mov r40=r36;; // out6 = in4 (out2)
924 [MII] st8.spill [r12]=r39 // spill in7 (out5)
926 mov r38=r34;; // out4 = in2 (out0)
927 [MII] mov r39=r35 // out5 = in3 (out1)
928 mov r37=r33 // out3 = in1 (loc1)
929 mov r36=r32 // out2 = in0 (loc0)
930 [MLX] adds r12=-24,r12 // update sp
931 movl r34=hptr;; // out0 = hptr
932 [MIB] mov r33=r16 // loc1 = ar.pfs
933 mov r32=b0 // loc0 = retaddr
934 br.call.sptk.many b0=b6;;
936 [MII] adds r12=-16,r12
941 br.ret.sptk.many b0;;
944 /* These macros distribute a long constant into the two words of an MLX bundle */
945 #define BITS(val,start,count) (((val) >> (start)) & ((1 << (count))-1))
946 #define MOVL_LOWORD(val) (BITS(val,22,18) << 46)
947 #define MOVL_HIWORD(val) ( (BITS(val,0,7) << 36) \
948 | (BITS(val,7,9) << 50) \
949 | (BITS(val,16,5) << 45) \
950 | (BITS(val,21,1) << 44) \
951 | (BITS(val,40,23)) \
952 | (BITS(val,63,1) << 59))
956 IA64FunDesc *wdesc = (IA64FunDesc *)wptr;
957 StgWord64 wcode = wdesc->ip;
961 /* we allocate on the Haskell heap since malloc'd memory isn't
962 * executable - argh */
963 /* Allocated memory is word-aligned (8 bytes) but functions on ia64
964 * must be aligned to 16 bytes. We allocate an extra 8 bytes of
965 * wiggle room so that we can put the code on a 16 byte boundary. */
966 adjustor = stgAllocStable(sizeof(IA64FunDesc)+18*8+8, &stable);
968 fdesc = (IA64FunDesc *)adjustor;
969 code = (StgWord64 *)(fdesc + 1);
970 /* add 8 bytes to code if needed to align to a 16-byte boundary */
971 if ((StgWord64)code & 15) code++;
972 fdesc->ip = (StgWord64)code;
973 fdesc->gp = wdesc->gp;
975 code[0] = 0x0000058004288004 | MOVL_LOWORD(wcode);
976 code[1] = 0x6000000220000000 | MOVL_HIWORD(wcode);
977 code[2] = 0x029015d818984001;
978 code[3] = 0x8401200500420094;
979 code[4] = 0x886011d8189c0001;
980 code[5] = 0x84011004c00380c0;
981 code[6] = 0x0250210046013800;
982 code[7] = 0x8401000480420084;
983 code[8] = 0x0000233f19a06005 | MOVL_LOWORD((StgWord64)hptr);
984 code[9] = 0x6000000440000000 | MOVL_HIWORD((StgWord64)hptr);
985 code[10] = 0x0200210020010811;
986 code[11] = 0x1080006800006200;
987 code[12] = 0x0000210018406000;
988 code[13] = 0x00aa021000038005;
989 code[14] = 0x000000010000001d;
990 code[15] = 0x0084000880000200;
992 /* save stable pointers in convenient form */
993 code[16] = (StgWord64)hptr;
994 code[17] = (StgWord64)stable;
997 barf("adjustor creation not supported on this platform");
1012 freeHaskellFunctionPtr(void* ptr)
1014 #if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
1015 if ( *(unsigned char*)ptr != 0x68 &&
1016 *(unsigned char*)ptr != 0x58 ) {
1017 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1021 /* Free the stable pointer first..*/
1022 if (*(unsigned char*)ptr == 0x68) { /* Aha, a ccall adjustor! */
1023 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x01)));
1025 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x02)));
1027 #elif defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
1028 if ( *(unsigned char*)ptr != 0xe8 ) {
1029 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1032 freeStablePtr(((AdjustorStub*)ptr)->hptr);
1033 #elif defined(x86_64_HOST_ARCH)
1034 if ( *(StgWord16 *)ptr == 0x894d ) {
1035 freeStablePtr(*(StgStablePtr*)(ptr+0x20));
1036 } else if ( *(StgWord16 *)ptr == 0x5141 ) {
1037 freeStablePtr(*(StgStablePtr*)(ptr+0x30));
1039 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1042 #elif defined(sparc_HOST_ARCH)
1043 if ( *(unsigned long*)ptr != 0x9C23A008UL ) {
1044 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1048 /* Free the stable pointer first..*/
1049 freeStablePtr(*((StgStablePtr*)((unsigned long*)ptr + 11)));
1050 #elif defined(alpha_HOST_ARCH)
1051 if ( *(StgWord64*)ptr != 0xa77b0018a61b0010L ) {
1052 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1056 /* Free the stable pointer first..*/
1057 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x10)));
1058 #elif defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
1059 if ( *(StgWord*)ptr != 0x48000008 ) {
1060 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1063 freeStablePtr(((StgStablePtr*)ptr)[1]);
1064 #elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
1065 extern void* adjustorCode;
1066 if ( ((AdjustorStub*)ptr)->code != (StgFunPtr) &adjustorCode ) {
1067 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1070 freeStablePtr(((AdjustorStub*)ptr)->hptr);
1071 #elif defined(ia64_HOST_ARCH)
1072 IA64FunDesc *fdesc = (IA64FunDesc *)ptr;
1073 StgWord64 *code = (StgWord64 *)(fdesc+1);
1075 if (fdesc->ip != (StgWord64)code) {
1076 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1079 freeStablePtr((StgStablePtr)code[16]);
1080 freeStablePtr((StgStablePtr)code[17]);
1085 *((unsigned char*)ptr) = '\0';