1 /* -----------------------------------------------------------------------------
2 * Foreign export adjustor thunks
6 * ---------------------------------------------------------------------------*/
8 /* A little bit of background...
10 An adjustor thunk is a dynamically allocated code snippet that allows
11 Haskell closures to be viewed as C function pointers.
13 Stable pointers provide a way for the outside world to get access to,
14 and evaluate, Haskell heap objects, with the RTS providing a small
15 range of ops for doing so. So, assuming we've got a stable pointer in
16 our hand in C, we can jump into the Haskell world and evaluate a callback
17 procedure, say. This works OK in some cases where callbacks are used, but
18 does require the external code to know about stable pointers and how to deal
19 with them. We'd like to hide the Haskell-nature of a callback and have it
20 be invoked just like any other C function pointer.
22 Enter adjustor thunks. An adjustor thunk is a little piece of code
23 that's generated on-the-fly (one per Haskell closure being exported)
24 that, when entered using some 'universal' calling convention (e.g., the
25 C calling convention on platform X), pushes an implicit stable pointer
26 (to the Haskell callback) before calling another (static) C function stub
27 which takes care of entering the Haskell code via its stable pointer.
29 An adjustor thunk is allocated on the C heap, and is called from within
30 Haskell just before handing out the function pointer to the Haskell (IO)
31 action. User code should never have to invoke it explicitly.
33 An adjustor thunk differs from a C function pointer in one respect: when
34 the code is through with it, it has to be freed in order to release Haskell
35 and C resources. Failure to do so will result in memory leaks on both the C and
39 #include "PosixSource.h"
45 #if defined(USE_LIBFFI_FOR_ADJUSTORS)
51 freeHaskellFunctionPtr(void* ptr)
55 cl = (ffi_closure*)ptr;
56 freeStablePtr(cl->user_data);
57 stgFree(cl->cif->arg_types);
62 static ffi_type * char_to_ffi_type(char c)
65 case 'v': return &ffi_type_void;
66 case 'f': return &ffi_type_float;
67 case 'd': return &ffi_type_double;
68 case 'L': return &ffi_type_sint64;
69 case 'l': return &ffi_type_uint64;
70 case 'W': return &ffi_type_sint32;
71 case 'w': return &ffi_type_uint32;
72 case 'S': return &ffi_type_sint16;
73 case 's': return &ffi_type_uint16;
74 case 'B': return &ffi_type_sint8;
75 case 'b': return &ffi_type_uint8;
76 case 'p': return &ffi_type_pointer;
77 default: barf("char_to_ffi_type: unknown type '%c'", c);
82 createAdjustor (int cconv,
90 ffi_type *result_type;
95 n_args = strlen(typeString) - 1;
96 cif = stgMallocBytes(sizeof(ffi_cif), "createAdjustor");
97 arg_types = stgMallocBytes(n_args * sizeof(ffi_type*), "createAdjustor");
99 result_type = char_to_ffi_type(typeString[0]);
100 for (i=0; i < n_args; i++) {
101 arg_types[i] = char_to_ffi_type(typeString[i+1]);
104 #ifdef mingw32_TARGET_OS
105 case 0: /* stdcall */
110 abi = FFI_DEFAULT_ABI;
113 barf("createAdjustor: convention %d not supported on this platform", cconv);
116 r = ffi_prep_cif(cif, abi, n_args, result_type, arg_types);
117 if (r != FFI_OK) barf("ffi_prep_cif failed: %d", r);
119 cl = allocateExec(sizeof(ffi_closure), &code);
121 barf("createAdjustor: failed to allocate memory");
124 r = ffi_prep_closure(cl, cif, (void*)wptr, hptr/*userdata*/);
125 if (r != FFI_OK) barf("ffi_prep_closure failed: %d", r);
130 #else // To end of file...
136 #if defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
140 #ifdef LEADING_UNDERSCORE
141 #define UNDERSCORE "_"
143 #define UNDERSCORE ""
145 #if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
147 Now here's something obscure for you:
149 When generating an adjustor thunk that uses the C calling
150 convention, we have to make sure that the thunk kicks off
151 the process of jumping into Haskell with a tail jump. Why?
152 Because as a result of jumping in into Haskell we may end
153 up freeing the very adjustor thunk we came from using
154 freeHaskellFunctionPtr(). Hence, we better not return to
155 the adjustor code on our way out, since it could by then
158 The fix is readily at hand, just include the opcodes
159 for the C stack fixup code that we need to perform when
160 returning in some static piece of memory and arrange
161 to return to it before tail jumping from the adjustor thunk.
163 static void GNUC3_ATTRIBUTE(used) obscure_ccall_wrapper(void)
166 ".globl " UNDERSCORE "obscure_ccall_ret_code\n"
167 UNDERSCORE "obscure_ccall_ret_code:\n\t"
168 "addl $0x4, %esp\n\t"
172 extern void obscure_ccall_ret_code(void);
176 #if defined(x86_64_HOST_ARCH)
177 static void GNUC3_ATTRIBUTE(used) obscure_ccall_wrapper(void)
180 ".globl " UNDERSCORE "obscure_ccall_ret_code\n"
181 UNDERSCORE "obscure_ccall_ret_code:\n\t"
182 "addq $0x8, %rsp\n\t"
186 extern void obscure_ccall_ret_code(void);
189 #if defined(alpha_HOST_ARCH)
190 /* To get the definition of PAL_imb: */
191 # if defined(linux_HOST_OS)
192 # include <asm/pal.h>
194 # include <machine/pal.h>
198 #if defined(ia64_HOST_ARCH)
200 /* Layout of a function descriptor */
201 typedef struct _IA64FunDesc {
207 stgAllocStable(size_t size_in_bytes, StgStablePtr *stable)
210 nat data_size_in_words, total_size_in_words;
212 /* round up to a whole number of words */
213 data_size_in_words = (size_in_bytes + sizeof(W_) + 1) / sizeof(W_);
214 total_size_in_words = sizeofW(StgArrWords) + data_size_in_words;
216 /* allocate and fill it in */
217 arr = (StgArrWords *)allocate(total_size_in_words);
218 SET_ARR_HDR(arr, &stg_ARR_WORDS_info, CCCS, data_size_in_words);
220 /* obtain a stable ptr */
221 *stable = getStablePtr((StgPtr)arr);
223 /* and return a ptr to the goods inside the array */
224 return(&(arr->payload));
228 #if defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
229 __asm__("obscure_ccall_ret_code:\n\t"
234 extern void obscure_ccall_ret_code(void);
237 #if defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
238 #if !(defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS))
240 /* !!! !!! WARNING: !!! !!!
241 * This structure is accessed from AdjustorAsm.s
242 * Any changes here have to be mirrored in the offsets there.
245 typedef struct AdjustorStub {
246 #if defined(powerpc_HOST_ARCH) && defined(darwin_HOST_OS)
253 #elif defined(powerpc64_HOST_ARCH) && defined(darwin_HOST_OS)
254 /* powerpc64-darwin: just guessing that it won't use fundescs. */
265 /* fundesc-based ABIs */
274 StgInt negative_framesize;
275 StgInt extrawords_plus_one;
281 #if defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
283 /* !!! !!! WARNING: !!! !!!
284 * This structure is accessed from AdjustorAsm.s
285 * Any changes here have to be mirrored in the offsets there.
288 typedef struct AdjustorStub {
289 unsigned char call[8];
293 StgInt argument_size;
297 #if (defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)) || defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
298 static int totalArgumentSize(char *typeString)
303 char t = *typeString++;
307 // on 32-bit platforms, Double and Int64 occupy two words.
311 if(sizeof(void*) == 4)
316 // everything else is one word.
326 createAdjustor(int cconv, StgStablePtr hptr,
329 #if !defined(powerpc_HOST_ARCH) && !defined(powerpc64_HOST_ARCH) && !defined(x86_64_HOST_ARCH)
334 void *adjustor = NULL;
339 case 0: /* _stdcall */
340 #if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
341 /* Magic constant computed by inspecting the code length of
342 the following assembly language snippet
343 (offset and machine code prefixed):
345 <0>: 58 popl %eax # temp. remove ret addr..
346 <1>: 68 fd fc fe fa pushl 0xfafefcfd # constant is large enough to
347 # hold a StgStablePtr
348 <6>: 50 pushl %eax # put back ret. addr
349 <7>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
350 <c>: ff e0 jmp %eax # and jump to it.
351 # the callee cleans up the stack
353 adjustor = allocateExec(14,&code);
355 unsigned char *const adj_code = (unsigned char *)adjustor;
356 adj_code[0x00] = (unsigned char)0x58; /* popl %eax */
358 adj_code[0x01] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
359 *((StgStablePtr*)(adj_code + 0x02)) = (StgStablePtr)hptr;
361 adj_code[0x06] = (unsigned char)0x50; /* pushl %eax */
363 adj_code[0x07] = (unsigned char)0xb8; /* movl $wptr, %eax */
364 *((StgFunPtr*)(adj_code + 0x08)) = (StgFunPtr)wptr;
366 adj_code[0x0c] = (unsigned char)0xff; /* jmp %eax */
367 adj_code[0x0d] = (unsigned char)0xe0;
373 #if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
374 /* Magic constant computed by inspecting the code length of
375 the following assembly language snippet
376 (offset and machine code prefixed):
378 <00>: 68 ef be ad de pushl $0xdeadbeef # constant is large enough to
379 # hold a StgStablePtr
380 <05>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
381 <0a>: 68 ef be ad de pushl $obscure_ccall_ret_code # push the return address
382 <0f>: ff e0 jmp *%eax # jump to wptr
384 The ccall'ing version is a tad different, passing in the return
385 address of the caller to the auto-generated C stub (which enters
386 via the stable pointer.) (The auto-generated C stub is in on this
387 game, don't worry :-)
389 See the comment next to obscure_ccall_ret_code why we need to
390 perform a tail jump instead of a call, followed by some C stack
393 Note: The adjustor makes the assumption that any return value
394 coming back from the C stub is not stored on the stack.
395 That's (thankfully) the case here with the restricted set of
396 return types that we support.
398 adjustor = allocateExec(17,&code);
400 unsigned char *const adj_code = (unsigned char *)adjustor;
402 adj_code[0x00] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
403 *((StgStablePtr*)(adj_code+0x01)) = (StgStablePtr)hptr;
405 adj_code[0x05] = (unsigned char)0xb8; /* movl $wptr, %eax */
406 *((StgFunPtr*)(adj_code + 0x06)) = (StgFunPtr)wptr;
408 adj_code[0x0a] = (unsigned char)0x68; /* pushl obscure_ccall_ret_code */
409 *((StgFunPtr*)(adj_code + 0x0b)) =
410 (StgFunPtr)obscure_ccall_ret_code;
412 adj_code[0x0f] = (unsigned char)0xff; /* jmp *%eax */
413 adj_code[0x10] = (unsigned char)0xe0;
415 #elif defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
418 What's special about Darwin/Mac OS X on i386?
419 It wants the stack to stay 16-byte aligned.
421 We offload most of the work to AdjustorAsm.S.
423 AdjustorStub *adjustorStub = allocateExec(sizeof(AdjustorStub),&code);
424 adjustor = adjustorStub;
426 extern void adjustorCode(void);
427 int sz = totalArgumentSize(typeString);
429 adjustorStub->call[0] = 0xe8;
430 *(long*)&adjustorStub->call[1] = ((char*)&adjustorCode) - ((char*)adjustorStub + 5);
431 adjustorStub->hptr = hptr;
432 adjustorStub->wptr = wptr;
434 // The adjustor puts the following things on the stack:
436 // 2.) padding and (a copy of) the arguments
437 // 3.) a dummy argument
439 // 5.) return address (for returning to the adjustor)
440 // All these have to add up to a multiple of 16.
442 // first, include everything in frame_size
443 adjustorStub->frame_size = sz * 4 + 16;
445 adjustorStub->frame_size = (adjustorStub->frame_size + 15) & ~15;
446 // only count 2.) and 3.) as part of frame_size
447 adjustorStub->frame_size -= 12;
448 adjustorStub->argument_size = sz;
451 #elif defined(x86_64_HOST_ARCH)
458 %rdi,%rsi,%rdx,%rcx,%r8,%r9 = arg0..arg6
460 if there are <6 integer args, then we can just push the
461 StablePtr into %edi and shuffle the other args up.
463 If there are >=6 integer args, then we have to flush one arg
464 to the stack, and arrange to adjust the stack ptr on return.
465 The stack will be rearranged to this:
470 return address *** <-- dummy arg in stub fn.
472 obscure_ccall_ret_code
474 This unfortunately means that the type of the stub function
475 must have a dummy argument for the original return address
476 pointer inserted just after the 6th integer argument.
478 Code for the simple case:
480 0: 4d 89 c1 mov %r8,%r9
481 3: 49 89 c8 mov %rcx,%r8
482 6: 48 89 d1 mov %rdx,%rcx
483 9: 48 89 f2 mov %rsi,%rdx
484 c: 48 89 fe mov %rdi,%rsi
485 f: 48 8b 3d 0a 00 00 00 mov 10(%rip),%rdi
486 16: ff 25 0c 00 00 00 jmpq *12(%rip)
488 20: .quad 0 # aligned on 8-byte boundary
489 28: .quad 0 # aligned on 8-byte boundary
492 And the version for >=6 integer arguments:
495 2: ff 35 20 00 00 00 pushq 32(%rip) # 28 <ccall_adjustor+0x28>
496 8: 4d 89 c1 mov %r8,%r9
497 b: 49 89 c8 mov %rcx,%r8
498 e: 48 89 d1 mov %rdx,%rcx
499 11: 48 89 f2 mov %rsi,%rdx
500 14: 48 89 fe mov %rdi,%rsi
501 17: 48 8b 3d 12 00 00 00 mov 18(%rip),%rdi # 30 <ccall_adjustor+0x30>
502 1e: ff 25 14 00 00 00 jmpq *20(%rip) # 38 <ccall_adjustor+0x38>
504 28: .quad 0 # aligned on 8-byte boundary
505 30: .quad 0 # aligned on 8-byte boundary
506 38: .quad 0 # aligned on 8-byte boundary
514 // determine whether we have 6 or more integer arguments,
515 // and therefore need to flush one to the stack.
516 for (c = typeString; *c != '\0'; c++) {
517 if (*c != 'f' && *c != 'd') i++;
522 adjustor = allocateExec(0x30,&code);
523 adj_code = (StgWord8*)adjustor;
525 *(StgInt32 *)adj_code = 0x49c1894d;
526 *(StgInt32 *)(adj_code+0x4) = 0x8948c889;
527 *(StgInt32 *)(adj_code+0x8) = 0xf28948d1;
528 *(StgInt32 *)(adj_code+0xc) = 0x48fe8948;
529 *(StgInt32 *)(adj_code+0x10) = 0x000a3d8b;
530 *(StgInt32 *)(adj_code+0x14) = 0x25ff0000;
531 *(StgInt32 *)(adj_code+0x18) = 0x0000000c;
532 *(StgInt64 *)(adj_code+0x20) = (StgInt64)hptr;
533 *(StgInt64 *)(adj_code+0x28) = (StgInt64)wptr;
537 adjustor = allocateExec(0x40,&code);
538 adj_code = (StgWord8*)adjustor;
540 *(StgInt32 *)adj_code = 0x35ff5141;
541 *(StgInt32 *)(adj_code+0x4) = 0x00000020;
542 *(StgInt32 *)(adj_code+0x8) = 0x49c1894d;
543 *(StgInt32 *)(adj_code+0xc) = 0x8948c889;
544 *(StgInt32 *)(adj_code+0x10) = 0xf28948d1;
545 *(StgInt32 *)(adj_code+0x14) = 0x48fe8948;
546 *(StgInt32 *)(adj_code+0x18) = 0x00123d8b;
547 *(StgInt32 *)(adj_code+0x1c) = 0x25ff0000;
548 *(StgInt32 *)(adj_code+0x20) = 0x00000014;
550 *(StgInt64 *)(adj_code+0x28) = (StgInt64)obscure_ccall_ret_code;
551 *(StgInt64 *)(adj_code+0x30) = (StgInt64)hptr;
552 *(StgInt64 *)(adj_code+0x38) = (StgInt64)wptr;
555 #elif defined(sparc_HOST_ARCH)
556 /* Magic constant computed by inspecting the code length of the following
557 assembly language snippet (offset and machine code prefixed):
559 <00>: 9C23A008 sub %sp, 8, %sp ! make room for %o4/%o5 in caller's frame
560 <04>: DA23A060 st %o5, [%sp + 96] ! shift registers by 2 positions
561 <08>: D823A05C st %o4, [%sp + 92]
562 <0C>: 9A10000B mov %o3, %o5
563 <10>: 9810000A mov %o2, %o4
564 <14>: 96100009 mov %o1, %o3
565 <18>: 94100008 mov %o0, %o2
566 <1C>: 13000000 sethi %hi(wptr), %o1 ! load up wptr (1 of 2)
567 <20>: 11000000 sethi %hi(hptr), %o0 ! load up hptr (1 of 2)
568 <24>: 81C26000 jmp %o1 + %lo(wptr) ! jump to wptr (load 2 of 2)
569 <28>: 90122000 or %o0, %lo(hptr), %o0 ! load up hptr (2 of 2, delay slot)
570 <2C> 00000000 ! place for getting hptr back easily
572 ccall'ing on SPARC is easy, because we are quite lucky to push a
573 multiple of 8 bytes (1 word hptr + 1 word dummy arg) in front of the
574 existing arguments (note that %sp must stay double-word aligned at
575 all times, see ABI spec at http://www.sparc.org/standards/psABI3rd.pdf).
576 To do this, we extend the *caller's* stack frame by 2 words and shift
577 the output registers used for argument passing (%o0 - %o5, we are a *leaf*
578 procedure because of the tail-jump) by 2 positions. This makes room in
579 %o0 and %o1 for the additinal arguments, namely hptr and a dummy (used
580 for destination addr of jump on SPARC, return address on x86, ...). This
581 shouldn't cause any problems for a C-like caller: alloca is implemented
582 similarly, and local variables should be accessed via %fp, not %sp. In a
583 nutshell: This should work! (Famous last words! :-)
585 adjustor = allocateExec(4*(11+1),&code);
587 unsigned long *const adj_code = (unsigned long *)adjustor;
589 adj_code[ 0] = 0x9C23A008UL; /* sub %sp, 8, %sp */
590 adj_code[ 1] = 0xDA23A060UL; /* st %o5, [%sp + 96] */
591 adj_code[ 2] = 0xD823A05CUL; /* st %o4, [%sp + 92] */
592 adj_code[ 3] = 0x9A10000BUL; /* mov %o3, %o5 */
593 adj_code[ 4] = 0x9810000AUL; /* mov %o2, %o4 */
594 adj_code[ 5] = 0x96100009UL; /* mov %o1, %o3 */
595 adj_code[ 6] = 0x94100008UL; /* mov %o0, %o2 */
596 adj_code[ 7] = 0x13000000UL; /* sethi %hi(wptr), %o1 */
597 adj_code[ 7] |= ((unsigned long)wptr) >> 10;
598 adj_code[ 8] = 0x11000000UL; /* sethi %hi(hptr), %o0 */
599 adj_code[ 8] |= ((unsigned long)hptr) >> 10;
600 adj_code[ 9] = 0x81C26000UL; /* jmp %o1 + %lo(wptr) */
601 adj_code[ 9] |= ((unsigned long)wptr) & 0x000003FFUL;
602 adj_code[10] = 0x90122000UL; /* or %o0, %lo(hptr), %o0 */
603 adj_code[10] |= ((unsigned long)hptr) & 0x000003FFUL;
605 adj_code[11] = (unsigned long)hptr;
608 asm("flush %0" : : "r" (adj_code ));
609 asm("flush %0" : : "r" (adj_code + 2));
610 asm("flush %0" : : "r" (adj_code + 4));
611 asm("flush %0" : : "r" (adj_code + 6));
612 asm("flush %0" : : "r" (adj_code + 10));
614 /* max. 5 instructions latency, and we need at >= 1 for returning */
620 #elif defined(alpha_HOST_ARCH)
621 /* Magic constant computed by inspecting the code length of
622 the following assembly language snippet
623 (offset and machine code prefixed; note that the machine code
624 shown is longwords stored in little-endian order):
626 <00>: 46520414 mov a2, a4
627 <04>: 46100412 mov a0, a2
628 <08>: a61b0020 ldq a0, 0x20(pv) # load up hptr
629 <0c>: 46730415 mov a3, a5
630 <10>: a77b0028 ldq pv, 0x28(pv) # load up wptr
631 <14>: 46310413 mov a1, a3
632 <18>: 6bfb---- jmp (pv), <hint> # jump to wptr (with hint)
633 <1c>: 00000000 # padding for alignment
634 <20>: [8 bytes for hptr quadword]
635 <28>: [8 bytes for wptr quadword]
637 The "computed" jump at <08> above is really a jump to a fixed
638 location. Accordingly, we place an always-correct hint in the
639 jump instruction, namely the address offset from <0c> to wptr,
640 divided by 4, taking the lowest 14 bits.
642 We only support passing 4 or fewer argument words, for the same
643 reason described under sparc_HOST_ARCH above by JRS, 21 Aug 01.
644 On the Alpha the first 6 integer arguments are in a0 through a5,
645 and the rest on the stack. Hence we want to shuffle the original
646 caller's arguments by two.
648 On the Alpha the calling convention is so complex and dependent
649 on the callee's signature -- for example, the stack pointer has
650 to be a multiple of 16 -- that it seems impossible to me [ccshan]
651 to handle the general case correctly without changing how the
652 adjustor is called from C. For now, our solution of shuffling
653 registers only and ignoring the stack only works if the original
654 caller passed 4 or fewer argument words.
656 TODO: Depending on how much allocation overhead stgMallocBytes uses for
657 header information (more precisely, if the overhead is no more than
658 4 bytes), we should move the first three instructions above down by
659 4 bytes (getting rid of the nop), hence saving memory. [ccshan]
661 ASSERT(((StgWord64)wptr & 3) == 0);
662 adjustor = allocateExec(48,&code);
664 StgWord64 *const code = (StgWord64 *)adjustor;
666 code[0] = 0x4610041246520414L;
667 code[1] = 0x46730415a61b0020L;
668 code[2] = 0x46310413a77b0028L;
669 code[3] = 0x000000006bfb0000L
670 | (((StgWord32*)(wptr) - (StgWord32*)(code) - 3) & 0x3fff);
672 code[4] = (StgWord64)hptr;
673 code[5] = (StgWord64)wptr;
675 /* Ensure that instruction cache is consistent with our new code */
676 __asm__ volatile("call_pal %0" : : "i" (PAL_imb));
678 #elif defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
680 #define OP_LO(op,lo) ((((unsigned)(op)) << 16) | (((unsigned)(lo)) & 0xFFFF))
681 #define OP_HI(op,hi) ((((unsigned)(op)) << 16) | (((unsigned)(hi)) >> 16))
683 /* The PowerPC Linux (32-bit) calling convention is annoyingly complex.
684 We need to calculate all the details of the stack frame layout,
685 taking into account the types of all the arguments, and then
686 generate code on the fly. */
688 int src_gpr = 3, dst_gpr = 5;
690 int src_offset = 0, dst_offset = 0;
691 int n = strlen(typeString),i;
692 int src_locs[n], dst_locs[n];
697 Calculate where the arguments should go.
698 src_locs[] will contain the locations of the arguments in the
699 original stack frame passed to the adjustor.
700 dst_locs[] will contain the locations of the arguments after the
701 adjustor runs, on entry to the wrapper proc pointed to by wptr.
703 This algorithm is based on the one described on page 3-19 of the
704 System V ABI PowerPC Processor Supplement.
706 for(i=0;typeString[i];i++)
708 char t = typeString[i];
709 if((t == 'f' || t == 'd') && fpr <= 8)
710 src_locs[i] = dst_locs[i] = -32-(fpr++);
713 if((t == 'l' || t == 'L') && src_gpr <= 9)
715 if((src_gpr & 1) == 0)
717 src_locs[i] = -src_gpr;
720 else if((t == 'w' || t == 'W') && src_gpr <= 10)
722 src_locs[i] = -(src_gpr++);
726 if(t == 'l' || t == 'L' || t == 'd')
731 src_locs[i] = src_offset;
732 src_offset += (t == 'l' || t == 'L' || t == 'd') ? 8 : 4;
735 if((t == 'l' || t == 'L') && dst_gpr <= 9)
737 if((dst_gpr & 1) == 0)
739 dst_locs[i] = -dst_gpr;
742 else if((t == 'w' || t == 'W') && dst_gpr <= 10)
744 dst_locs[i] = -(dst_gpr++);
748 if(t == 'l' || t == 'L' || t == 'd')
753 dst_locs[i] = dst_offset;
754 dst_offset += (t == 'l' || t == 'L' || t == 'd') ? 8 : 4;
759 frameSize = dst_offset + 8;
760 frameSize = (frameSize+15) & ~0xF;
765 // allocate space for at most 4 insns per parameter
766 // plus 14 more instructions.
767 adjustor = allocateExec(4 * (4*n + 14),&code);
768 code = (unsigned*)adjustor;
770 *code++ = 0x48000008; // b *+8
771 // * Put the hptr in a place where freeHaskellFunctionPtr
773 *code++ = (unsigned) hptr;
775 // * save the link register
776 *code++ = 0x7c0802a6; // mflr r0;
777 *code++ = 0x90010004; // stw r0, 4(r1);
778 // * and build a new stack frame
779 *code++ = OP_LO(0x9421, -frameSize); // stwu r1, -frameSize(r1)
781 // * now generate instructions to copy arguments
782 // from the old stack frame into the new stack frame.
785 if(src_locs[i] < -32)
786 ASSERT(dst_locs[i] == src_locs[i]);
787 else if(src_locs[i] < 0)
790 ASSERT(typeString[i] != 'f' && typeString[i] != 'd');
793 ASSERT(dst_locs[i] > -32);
794 // dst is in GPR, too.
796 if(typeString[i] == 'l' || typeString[i] == 'L')
800 | ((-dst_locs[i]+1) << 16)
801 | ((-src_locs[i]+1) << 11)
802 | ((-src_locs[i]+1) << 21);
806 | ((-dst_locs[i]) << 16)
807 | ((-src_locs[i]) << 11)
808 | ((-src_locs[i]) << 21);
812 if(typeString[i] == 'l' || typeString[i] == 'L')
814 // stw src+1, dst_offset+4(r1)
816 | ((-src_locs[i]+1) << 21)
820 // stw src, dst_offset(r1)
822 | ((-src_locs[i]) << 21)
828 ASSERT(dst_locs[i] >= 0);
829 ASSERT(typeString[i] != 'f' && typeString[i] != 'd');
831 if(typeString[i] == 'l' || typeString[i] == 'L')
833 // lwz r0, src_offset(r1)
835 | (src_locs[i] + frameSize + 8 + 4);
836 // stw r0, dst_offset(r1)
838 | (dst_locs[i] + 8 + 4);
840 // lwz r0, src_offset(r1)
842 | (src_locs[i] + frameSize + 8);
843 // stw r0, dst_offset(r1)
849 // * hptr will be the new first argument.
851 *code++ = OP_HI(0x3c60, hptr);
852 // ori r3,r3,lo(hptr)
853 *code++ = OP_LO(0x6063, hptr);
855 // * we need to return to a piece of code
856 // which will tear down the stack frame.
857 // lis r11,hi(obscure_ccall_ret_code)
858 *code++ = OP_HI(0x3d60, obscure_ccall_ret_code);
859 // ori r11,r11,lo(obscure_ccall_ret_code)
860 *code++ = OP_LO(0x616b, obscure_ccall_ret_code);
862 *code++ = 0x7d6803a6;
866 *code++ = OP_HI(0x3d60, wptr);
867 // ori r11,r11,lo(wptr)
868 *code++ = OP_LO(0x616b, wptr);
870 *code++ = 0x7d6903a6;
872 *code++ = 0x4e800420;
874 // Flush the Instruction cache:
876 unsigned *p = adjustor;
879 __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
883 __asm__ volatile ("sync\n\tisync");
887 #elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
889 #define OP_LO(op,lo) ((((unsigned)(op)) << 16) | (((unsigned)(lo)) & 0xFFFF))
890 #define OP_HI(op,hi) ((((unsigned)(op)) << 16) | (((unsigned)(hi)) >> 16))
892 /* The following code applies to all PowerPC and PowerPC64 platforms
893 whose stack layout is based on the AIX ABI.
895 Besides (obviously) AIX, this includes
896 Mac OS 9 and BeOS/PPC (may they rest in peace),
897 which use the 32-bit AIX ABI
899 which uses the 64-bit AIX ABI
900 and Darwin (Mac OS X),
901 which uses the same stack layout as AIX,
902 but no function descriptors.
904 The actual stack-frame shuffling is implemented out-of-line
905 in the function adjustorCode, in AdjustorAsm.S.
906 Here, we set up an AdjustorStub structure, which
907 is a function descriptor (on platforms that have function
908 descriptors) or a short piece of stub code (on Darwin) to call
909 adjustorCode with a pointer to the AdjustorStub struct loaded
912 One nice thing about this is that there is _no_ code generated at
913 runtime on the platforms that have function descriptors.
915 AdjustorStub *adjustorStub;
916 int sz = 0, extra_sz, total_sz;
918 // from AdjustorAsm.s
919 // not declared as a function so that AIX-style
920 // fundescs can never get in the way.
921 extern void *adjustorCode;
924 adjustorStub = stgMallocBytes(sizeof(AdjustorStub), "createAdjustor");
926 adjustorStub = allocateExec(sizeof(AdjustorStub),&code);
928 adjustor = adjustorStub;
930 adjustorStub->code = (void*) &adjustorCode;
933 // function descriptors are a cool idea.
934 // We don't need to generate any code at runtime.
935 adjustorStub->toc = adjustorStub;
938 // no function descriptors :-(
939 // We need to do things "by hand".
940 #if defined(powerpc_HOST_ARCH)
941 // lis r2, hi(adjustorStub)
942 adjustorStub->lis = OP_HI(0x3c40, adjustorStub);
943 // ori r2, r2, lo(adjustorStub)
944 adjustorStub->ori = OP_LO(0x6042, adjustorStub);
946 adjustorStub->lwz = OP_LO(0x8002, (char*)(&adjustorStub->code)
947 - (char*)adjustorStub);
949 adjustorStub->mtctr = 0x7c0903a6;
951 adjustorStub->bctr = 0x4e800420;
953 barf("adjustor creation not supported on this platform");
956 // Flush the Instruction cache:
958 int n = sizeof(AdjustorStub)/sizeof(unsigned);
959 unsigned *p = (unsigned*)adjustor;
962 __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
966 __asm__ volatile ("sync\n\tisync");
970 // Calculate the size of the stack frame, in words.
971 sz = totalArgumentSize(typeString);
973 // The first eight words of the parameter area
974 // are just "backing store" for the parameters passed in
975 // the GPRs. extra_sz is the number of words beyond those first
981 // Calculate the total size of the stack frame.
982 total_sz = (6 /* linkage area */
983 + 8 /* minimum parameter area */
984 + 2 /* two extra arguments */
985 + extra_sz)*sizeof(StgWord);
987 // align to 16 bytes.
988 // AIX only requires 8 bytes, but who cares?
989 total_sz = (total_sz+15) & ~0xF;
991 // Fill in the information that adjustorCode in AdjustorAsm.S
992 // will use to create a new stack frame with the additional args.
993 adjustorStub->hptr = hptr;
994 adjustorStub->wptr = wptr;
995 adjustorStub->negative_framesize = -total_sz;
996 adjustorStub->extrawords_plus_one = extra_sz + 1;
999 #elif defined(ia64_HOST_ARCH)
1001 Up to 8 inputs are passed in registers. We flush the last two inputs to
1002 the stack, initially into the 16-byte scratch region left by the caller.
1003 We then shuffle the others along by 4 (taking 2 registers for ourselves
1004 to save return address and previous function state - we need to come back
1005 here on the way out to restore the stack, so this is a real function
1006 rather than just a trampoline).
1008 The function descriptor we create contains the gp of the target function
1009 so gp is already loaded correctly.
1011 [MLX] alloc r16=ar.pfs,10,2,0
1013 [MII] st8.spill [r12]=r38,8 // spill in6 (out4)
1014 mov r41=r37 // out7 = in5 (out3)
1015 mov r40=r36;; // out6 = in4 (out2)
1016 [MII] st8.spill [r12]=r39 // spill in7 (out5)
1018 mov r38=r34;; // out4 = in2 (out0)
1019 [MII] mov r39=r35 // out5 = in3 (out1)
1020 mov r37=r33 // out3 = in1 (loc1)
1021 mov r36=r32 // out2 = in0 (loc0)
1022 [MLX] adds r12=-24,r12 // update sp
1023 movl r34=hptr;; // out0 = hptr
1024 [MIB] mov r33=r16 // loc1 = ar.pfs
1025 mov r32=b0 // loc0 = retaddr
1026 br.call.sptk.many b0=b6;;
1028 [MII] adds r12=-16,r12
1033 br.ret.sptk.many b0;;
1036 /* These macros distribute a long constant into the two words of an MLX bundle */
1037 #define BITS(val,start,count) (((val) >> (start)) & ((1 << (count))-1))
1038 #define MOVL_LOWORD(val) (BITS(val,22,18) << 46)
1039 #define MOVL_HIWORD(val) ( (BITS(val,0,7) << 36) \
1040 | (BITS(val,7,9) << 50) \
1041 | (BITS(val,16,5) << 45) \
1042 | (BITS(val,21,1) << 44) \
1043 | (BITS(val,40,23)) \
1044 | (BITS(val,63,1) << 59))
1047 StgStablePtr stable;
1048 IA64FunDesc *wdesc = (IA64FunDesc *)wptr;
1049 StgWord64 wcode = wdesc->ip;
1053 /* we allocate on the Haskell heap since malloc'd memory isn't
1054 * executable - argh */
1055 /* Allocated memory is word-aligned (8 bytes) but functions on ia64
1056 * must be aligned to 16 bytes. We allocate an extra 8 bytes of
1057 * wiggle room so that we can put the code on a 16 byte boundary. */
1058 adjustor = stgAllocStable(sizeof(IA64FunDesc)+18*8+8, &stable);
1060 fdesc = (IA64FunDesc *)adjustor;
1061 code = (StgWord64 *)(fdesc + 1);
1062 /* add 8 bytes to code if needed to align to a 16-byte boundary */
1063 if ((StgWord64)code & 15) code++;
1064 fdesc->ip = (StgWord64)code;
1065 fdesc->gp = wdesc->gp;
1067 code[0] = 0x0000058004288004 | MOVL_LOWORD(wcode);
1068 code[1] = 0x6000000220000000 | MOVL_HIWORD(wcode);
1069 code[2] = 0x029015d818984001;
1070 code[3] = 0x8401200500420094;
1071 code[4] = 0x886011d8189c0001;
1072 code[5] = 0x84011004c00380c0;
1073 code[6] = 0x0250210046013800;
1074 code[7] = 0x8401000480420084;
1075 code[8] = 0x0000233f19a06005 | MOVL_LOWORD((StgWord64)hptr);
1076 code[9] = 0x6000000440000000 | MOVL_HIWORD((StgWord64)hptr);
1077 code[10] = 0x0200210020010811;
1078 code[11] = 0x1080006800006200;
1079 code[12] = 0x0000210018406000;
1080 code[13] = 0x00aa021000038005;
1081 code[14] = 0x000000010000001d;
1082 code[15] = 0x0084000880000200;
1084 /* save stable pointers in convenient form */
1085 code[16] = (StgWord64)hptr;
1086 code[17] = (StgWord64)stable;
1089 barf("adjustor creation not supported on this platform");
1104 freeHaskellFunctionPtr(void* ptr)
1106 #if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
1107 if ( *(unsigned char*)ptr != 0x68 &&
1108 *(unsigned char*)ptr != 0x58 ) {
1109 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1113 /* Free the stable pointer first..*/
1114 if (*(unsigned char*)ptr == 0x68) { /* Aha, a ccall adjustor! */
1115 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x01)));
1117 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x02)));
1119 #elif defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
1120 if ( *(unsigned char*)ptr != 0xe8 ) {
1121 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1124 freeStablePtr(((AdjustorStub*)ptr)->hptr);
1125 #elif defined(x86_64_HOST_ARCH)
1126 if ( *(StgWord16 *)ptr == 0x894d ) {
1127 freeStablePtr(*(StgStablePtr*)((StgWord8*)ptr+0x20));
1128 } else if ( *(StgWord16 *)ptr == 0x5141 ) {
1129 freeStablePtr(*(StgStablePtr*)((StgWord8*)ptr+0x30));
1131 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1134 #elif defined(sparc_HOST_ARCH)
1135 if ( *(unsigned long*)ptr != 0x9C23A008UL ) {
1136 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1140 /* Free the stable pointer first..*/
1141 freeStablePtr(*((StgStablePtr*)((unsigned long*)ptr + 11)));
1142 #elif defined(alpha_HOST_ARCH)
1143 if ( *(StgWord64*)ptr != 0xa77b0018a61b0010L ) {
1144 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1148 /* Free the stable pointer first..*/
1149 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x10)));
1150 #elif defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
1151 if ( *(StgWord*)ptr != 0x48000008 ) {
1152 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1155 freeStablePtr(((StgStablePtr*)ptr)[1]);
1156 #elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
1157 extern void* adjustorCode;
1158 if ( ((AdjustorStub*)ptr)->code != (StgFunPtr) &adjustorCode ) {
1159 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1162 freeStablePtr(((AdjustorStub*)ptr)->hptr);
1163 #elif defined(ia64_HOST_ARCH)
1164 IA64FunDesc *fdesc = (IA64FunDesc *)ptr;
1165 StgWord64 *code = (StgWord64 *)(fdesc+1);
1167 if (fdesc->ip != (StgWord64)code) {
1168 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1171 freeStablePtr((StgStablePtr)code[16]);
1172 freeStablePtr((StgStablePtr)code[17]);
1177 // Can't write to this memory, it is only executable:
1178 // *((unsigned char*)ptr) = '\0';
1183 #endif // !USE_LIBFFI_FOR_ADJUSTORS