1 /* -----------------------------------------------------------------------------
2 * Foreign export adjustor thunks
6 * ---------------------------------------------------------------------------*/
8 /* A little bit of background...
10 An adjustor thunk is a dynamically allocated code snippet that allows
11 Haskell closures to be viewed as C function pointers.
13 Stable pointers provide a way for the outside world to get access to,
14 and evaluate, Haskell heap objects, with the RTS providing a small
15 range of ops for doing so. So, assuming we've got a stable pointer in
16 our hand in C, we can jump into the Haskell world and evaluate a callback
17 procedure, say. This works OK in some cases where callbacks are used, but
18 does require the external code to know about stable pointers and how to deal
19 with them. We'd like to hide the Haskell-nature of a callback and have it
20 be invoked just like any other C function pointer.
22 Enter adjustor thunks. An adjustor thunk is a little piece of code
23 that's generated on-the-fly (one per Haskell closure being exported)
24 that, when entered using some 'universal' calling convention (e.g., the
25 C calling convention on platform X), pushes an implicit stable pointer
26 (to the Haskell callback) before calling another (static) C function stub
27 which takes care of entering the Haskell code via its stable pointer.
29 An adjustor thunk is allocated on the C heap, and is called from within
30 Haskell just before handing out the function pointer to the Haskell (IO)
31 action. User code should never have to invoke it explicitly.
33 An adjustor thunk differs from a C function pointer in one respect: when
34 the code is through with it, it has to be freed in order to release Haskell
35 and C resources. Failure to do so will result in memory leaks on both the C and
   Haskell sides.
39 #include "PosixSource.h"
41 #include "RtsExternal.h"
45 #if defined(USE_LIBFFI)
/* libffi variant: release an adjustor previously built by createAdjustor.
 * The void* handed out to foreign code is really an ffi_closure; we free the
 * stable pointer stashed as its user_data and the argument-type array we
 * allocated for its cif.
 * NOTE(review): this listing is elided here — the declaration of 'cl', the
 * freeing of the cif itself and of the closure are not visible; confirm
 * against the unelided file. */
51 freeHaskellFunctionPtr(void* ptr)
55     cl = (ffi_closure*)ptr;
56     freeStablePtr(cl->user_data);           /* releases the Haskell side */
57     stgFree(cl->cif->arg_types);            /* array allocated in createAdjustor */
/* Map one character of the adjustor type string to the corresponding libffi
 * type descriptor.  Convention visible below: uppercase letters are the
 * signed variants, lowercase the unsigned ones ('L'/'l' = 64-bit,
 * 'W'/'w' = 32-bit, 'S'/'s' = 16-bit, 'B'/'b' = 8-bit); 'v' = void,
 * 'f'/'d' = float/double, 'p' = pointer.  Any other character aborts via
 * barf(), so callers must pass a well-formed type string. */
62 static ffi_type * char_to_ffi_type(char c)
65     case 'v': return &ffi_type_void;
66     case 'f': return &ffi_type_float;
67     case 'd': return &ffi_type_double;
68     case 'L': return &ffi_type_sint64;
69     case 'l': return &ffi_type_uint64;
70     case 'W': return &ffi_type_sint32;
71     case 'w': return &ffi_type_uint32;
72     case 'S': return &ffi_type_sint16;
73     case 's': return &ffi_type_uint16;
74     case 'B': return &ffi_type_sint8;
75     case 'b': return &ffi_type_uint8;
76     case 'p': return &ffi_type_pointer;
77     default: barf("char_to_ffi_type: unknown type '%c'", c);
/* libffi variant of createAdjustor: build a closure that, when called with
 * calling convention 'cconv', enters the Haskell stub 'wptr' with the stable
 * pointer 'hptr' as user data.
 * typeString[0] encodes the result type; the remaining characters encode the
 * argument types (hence n_args = strlen(typeString) - 1 below).
 * Both the cif and the arg_types array are heap-allocated here and are
 * released again in freeHaskellFunctionPtr.
 * NOTE(review): the switch scaffolding around 'case 0:' and the stdcall ABI
 * selection are elided from this listing — confirm against the full file. */
82 createAdjustor (int cconv,
90     ffi_type *result_type;
94     n_args = strlen(typeString) - 1;
95     cif = stgMallocBytes(sizeof(ffi_cif), "createAdjustor");
96     arg_types = stgMallocBytes(n_args * sizeof(ffi_type*), "createAdjustor");
98     result_type = char_to_ffi_type(typeString[0]);
99     for (i=0; i < n_args; i++) {
100         arg_types[i] = char_to_ffi_type(typeString[i+1]);
    /* Choose the ABI: on mingw32, cconv 0 selects stdcall; elsewhere the
       default ABI is used and unknown conventions abort. */
103 #ifdef mingw32_TARGET_OS
104     case 0: /* stdcall */
109             abi = FFI_DEFAULT_ABI;
112             barf("createAdjustor: convention %d not supported on this platform", cconv);
115     r = ffi_prep_cif(cif, abi, n_args, result_type, arg_types);
116     if (r != FFI_OK) barf("ffi_prep_cif failed: %d", r);
118     // ToDo: use ffi_closure_alloc()
    /* The closure must live in executable memory, hence allocateExec. */
119     cl = allocateExec(sizeof(ffi_closure));
121     r = ffi_prep_closure(cl, cif, (void*)wptr, hptr/*userdata*/);
122     if (r != FFI_OK) barf("ffi_prep_closure failed: %d", r);
127 #else // To end of file...
133 #if defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
137 #ifdef LEADING_UNDERSCORE
138 #define UNDERSCORE "_"
140 #define UNDERSCORE ""
142 #if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
144 Now here's something obscure for you:
146 When generating an adjustor thunk that uses the C calling
147 convention, we have to make sure that the thunk kicks off
148 the process of jumping into Haskell with a tail jump. Why?
149 Because as a result of jumping in into Haskell we may end
150 up freeing the very adjustor thunk we came from using
151 freeHaskellFunctionPtr(). Hence, we better not return to
152 the adjustor code on our way out, since it could by then
155 The fix is readily at hand, just include the opcodes
156 for the C stack fixup code that we need to perform when
157 returning in some static piece of memory and arrange
158 to return to it before tail jumping from the adjustor thunk.
/* i386: static landing pad that ccall adjustors push as their return
 * address.  It pops the hptr word the adjustor pushed (addl $0x4, %esp)
 * before returning to the original caller — see the long comment above on
 * why we must not return through the (possibly already freed) adjustor.
 * The wrapper function only exists to hold the asm; only the
 * obscure_ccall_ret_code label inside it is ever jumped to. */
160 static void GNUC3_ATTRIBUTE(used) obscure_ccall_wrapper(void)
163 ".globl " UNDERSCORE "obscure_ccall_ret_code\n"
164 UNDERSCORE "obscure_ccall_ret_code:\n\t"
165 "addl $0x4, %esp\n\t"
169 extern void obscure_ccall_ret_code(void);
173 #if defined(x86_64_HOST_ARCH)
/* x86_64 analogue of the i386 wrapper above: pops one 8-byte word
 * (addq $0x8, %rsp) — the argument the >=6-int-arg adjustor flushed to the
 * stack — before returning to the original caller. */
174 static void GNUC3_ATTRIBUTE(used) obscure_ccall_wrapper(void)
177 ".globl " UNDERSCORE "obscure_ccall_ret_code\n"
178 UNDERSCORE "obscure_ccall_ret_code:\n\t"
179 "addq $0x8, %rsp\n\t"
183 extern void obscure_ccall_ret_code(void);
186 #if defined(alpha_HOST_ARCH)
187 /* To get the definition of PAL_imb: */
188 # if defined(linux_HOST_OS)
189 # include <asm/pal.h>
191 # include <machine/pal.h>
195 #if defined(ia64_HOST_ARCH)
197 /* Layout of a function descriptor */
198 typedef struct _IA64FunDesc {
/* Allocate size_in_bytes of storage inside an ARR_WORDS heap object and
 * return a pointer to its payload; *stable receives a stable pointer that
 * keeps the object (and hence the returned memory) alive until freed.
 * Used on ia64, where malloc'd memory is not executable.
 * NOTE(review): the rounding below uses "+ sizeof(W_) + 1"; a conventional
 * round-up would be "+ sizeof(W_) - 1".  As written it merely over-allocates
 * by up to one word (harmless, never under-allocates) — confirm intent
 * against the unelided file before changing. */
204 stgAllocStable(size_t size_in_bytes, StgStablePtr *stable)
207     nat data_size_in_words, total_size_in_words;
209     /* round up to a whole number of words */
210     data_size_in_words = (size_in_bytes + sizeof(W_) + 1) / sizeof(W_);
211     total_size_in_words = sizeofW(StgArrWords) + data_size_in_words;
213     /* allocate and fill it in */
214     arr = (StgArrWords *)allocate(total_size_in_words);
215     SET_ARR_HDR(arr, &stg_ARR_WORDS_info, CCCS, data_size_in_words);
217     /* obtain a stable ptr */
218     *stable = getStablePtr((StgPtr)arr);
220     /* and return a ptr to the goods inside the array */
221     return(&(arr->payload));
/* powerpc-linux: out-of-line return stub that generated adjustors arrange to
 * return through; it tears down the stack frame the adjustor built (body of
 * the asm is elided from this listing). */
225 #if defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
226 __asm__("obscure_ccall_ret_code:\n\t"
231 extern void obscure_ccall_ret_code(void);
/* AdjustorStub for the AIX-ABI PowerPC family (everything except
 * powerpc-linux, which generates code inline instead).  The struct doubles
 * as either a function descriptor (fundesc ABIs) or a short code stub
 * (Darwin); its field offsets are hard-coded in AdjustorAsm.s — keep them
 * in sync.  Several per-ABI fields are elided from this listing. */
234 #if defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
235 #if !(defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS))
237 /* !!! !!! WARNING: !!! !!!
238 * This structure is accessed from AdjustorAsm.s
239 * Any changes here have to be mirrored in the offsets there.
242 typedef struct AdjustorStub {
243 #if defined(powerpc_HOST_ARCH) && defined(darwin_HOST_OS)
250 #elif defined(powerpc64_HOST_ARCH) && defined(darwin_HOST_OS)
251 /* powerpc64-darwin: just guessing that it won't use fundescs. */
262 /* fundesc-based ABIs */
271 StgInt negative_framesize;    /* filled in as -total_sz by createAdjustor */
272 StgInt extrawords_plus_one;   /* extra_sz + 1, consumed by adjustorCode */
/* AdjustorStub for i386-darwin: an 8-byte 'call adjustorCode' snippet plus
 * bookkeeping (hptr/wptr/frame_size fields are elided from this listing).
 * Offsets are mirrored in AdjustorAsm.s — keep in sync. */
278 #if defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
280 /* !!! !!! WARNING: !!! !!!
281 * This structure is accessed from AdjustorAsm.s
282 * Any changes here have to be mirrored in the offsets there.
285 typedef struct AdjustorStub {
286 unsigned char call[8];        /* call[0]=0xe8 plus rel32 to adjustorCode */
290 StgInt argument_size;         /* total argument words, from totalArgumentSize */
/* Walk the type string and return the total argument size in words:
 * 'd' (double) and 'l'/'L' (64-bit ints) count as two words on 32-bit
 * platforms (sizeof(void*) == 4), everything else as one.
 * NOTE(review): the loop body and accumulator are partly elided from this
 * listing. */
294 #if defined(darwin_HOST_OS) || defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
295 static int totalArgumentSize(char *typeString)
300 char t = *typeString++;
304 // on 32-bit platforms, Double and Int64 occupy two words.
308 if(sizeof(void*) == 4)
313 // everything else is one word.
/* Non-libffi createAdjustor: hand-rolled machine-code thunks, one
 * implementation per architecture, selected by #elif chain below.
 * Each thunk arranges for hptr (the stable pointer to the Haskell closure)
 * to become the first argument and then tail-jumps to the C stub wptr.
 * NOTE(review): this listing is heavily elided — switch scaffolding,
 * variable declarations and closing braces are missing throughout; the
 * comments added here describe only what is visible. */
323 createAdjustor(int cconv, StgStablePtr hptr,
326 #if !defined(powerpc_HOST_ARCH) && !defined(powerpc64_HOST_ARCH) && !defined(x86_64_HOST_ARCH)
331 void *adjustor = NULL;
    /* ---- stdcall convention (cconv == 0), i386 only ---- */
335 case 0: /* _stdcall */
336 #if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
337 /* Magic constant computed by inspecting the code length of
338 the following assembly language snippet
339 (offset and machine code prefixed):
341 <0>: 58 popl %eax # temp. remove ret addr..
342 <1>: 68 fd fc fe fa pushl 0xfafefcfd # constant is large enough to
343 # hold a StgStablePtr
344 <6>: 50 pushl %eax # put back ret. addr
345 <7>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
346 <c>: ff e0 jmp %eax # and jump to it.
347 # the callee cleans up the stack
349 adjustor = allocateExec(14);
351 unsigned char *const adj_code = (unsigned char *)adjustor;
352 adj_code[0x00] = (unsigned char)0x58; /* popl %eax */
354 adj_code[0x01] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
355 *((StgStablePtr*)(adj_code + 0x02)) = (StgStablePtr)hptr;
357 adj_code[0x06] = (unsigned char)0x50; /* pushl %eax */
359 adj_code[0x07] = (unsigned char)0xb8; /* movl $wptr, %eax */
360 *((StgFunPtr*)(adj_code + 0x08)) = (StgFunPtr)wptr;
362 adj_code[0x0c] = (unsigned char)0xff; /* jmp %eax */
363 adj_code[0x0d] = (unsigned char)0xe0;
    /* ---- ccall convention, i386 (non-darwin): 17-byte thunk that pushes
       hptr, pushes obscure_ccall_ret_code as the return address, and
       tail-jumps to wptr ---- */
369 #if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
370 /* Magic constant computed by inspecting the code length of
371 the following assembly language snippet
372 (offset and machine code prefixed):
374 <00>: 68 ef be ad de pushl $0xdeadbeef # constant is large enough to
375 # hold a StgStablePtr
376 <05>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
377 <0a>: 68 ef be ad de pushl $obscure_ccall_ret_code # push the return address
378 <0f>: ff e0 jmp *%eax # jump to wptr
380 The ccall'ing version is a tad different, passing in the return
381 address of the caller to the auto-generated C stub (which enters
382 via the stable pointer.) (The auto-generated C stub is in on this
383 game, don't worry :-)
385 See the comment next to obscure_ccall_ret_code why we need to
386 perform a tail jump instead of a call, followed by some C stack
389 Note: The adjustor makes the assumption that any return value
390 coming back from the C stub is not stored on the stack.
391 That's (thankfully) the case here with the restricted set of
392 return types that we support.
394 adjustor = allocateExec(17);
396 unsigned char *const adj_code = (unsigned char *)adjustor;
398 adj_code[0x00] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
399 *((StgStablePtr*)(adj_code+0x01)) = (StgStablePtr)hptr;
401 adj_code[0x05] = (unsigned char)0xb8; /* movl $wptr, %eax */
402 *((StgFunPtr*)(adj_code + 0x06)) = (StgFunPtr)wptr;
404 adj_code[0x0a] = (unsigned char)0x68; /* pushl obscure_ccall_ret_code */
405 *((StgFunPtr*)(adj_code + 0x0b)) =
406 (StgFunPtr)obscure_ccall_ret_code;
408 adj_code[0x0f] = (unsigned char)0xff; /* jmp *%eax */
409 adj_code[0x10] = (unsigned char)0xe0;
    /* ---- i386-darwin: 16-byte stack alignment required, so the heavy
       lifting is done out-of-line in adjustorCode (AdjustorAsm.S); we only
       fill in an AdjustorStub ---- */
411 #elif defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
414 What's special about Darwin/Mac OS X on i386?
415 It wants the stack to stay 16-byte aligned.
417 We offload most of the work to AdjustorAsm.S.
419 AdjustorStub *adjustorStub = allocateExec(sizeof(AdjustorStub));
420 adjustor = adjustorStub;
422 extern void adjustorCode(void);
423 int sz = totalArgumentSize(typeString);
    /* call[0..4] = E8 rel32: a relative call into adjustorCode */
425 adjustorStub->call[0] = 0xe8;
426 *(long*)&adjustorStub->call[1] = ((char*)&adjustorCode) - ((char*)adjustorStub + 5);
427 adjustorStub->hptr = hptr;
428 adjustorStub->wptr = wptr;
430 // The adjustor puts the following things on the stack:
432 // 2.) padding and (a copy of) the arguments
433 // 3.) a dummy argument
435 // 5.) return address (for returning to the adjustor)
436 // All these have to add up to a multiple of 16.
438 // first, include everything in frame_size
439 adjustorStub->frame_size = sz * 4 + 16;
441 adjustorStub->frame_size = (adjustorStub->frame_size + 15) & ~15;
442 // only count 2.) and 3.) as part of frame_size
443 adjustorStub->frame_size -= 12;
444 adjustorStub->argument_size = sz;
    /* ---- x86_64 SysV: two fixed code templates, chosen by whether a sixth
       integer register is free for hptr ---- */
447 #elif defined(x86_64_HOST_ARCH)
454 %rdi,%rsi,%rdx,%rcx,%r8,%r9 = arg0..arg6
456 if there are <6 integer args, then we can just push the
457 StablePtr into %edi and shuffle the other args up.
459 If there are >=6 integer args, then we have to flush one arg
460 to the stack, and arrange to adjust the stack ptr on return.
461 The stack will be rearranged to this:
466 return address *** <-- dummy arg in stub fn.
468 obscure_ccall_ret_code
470 This unfortunately means that the type of the stub function
471 must have a dummy argument for the original return address
472 pointer inserted just after the 6th integer argument.
474 Code for the simple case:
476 0: 4d 89 c1 mov %r8,%r9
477 3: 49 89 c8 mov %rcx,%r8
478 6: 48 89 d1 mov %rdx,%rcx
479 9: 48 89 f2 mov %rsi,%rdx
480 c: 48 89 fe mov %rdi,%rsi
481 f: 48 8b 3d 0a 00 00 00 mov 10(%rip),%rdi
482 16: ff 25 0c 00 00 00 jmpq *12(%rip)
484 20: .quad 0 # aligned on 8-byte boundary
485 28: .quad 0 # aligned on 8-byte boundary
488 And the version for >=6 integer arguments:
491 2: ff 35 20 00 00 00 pushq 32(%rip) # 28 <ccall_adjustor+0x28>
492 8: 4d 89 c1 mov %r8,%r9
493 b: 49 89 c8 mov %rcx,%r8
494 e: 48 89 d1 mov %rdx,%rcx
495 11: 48 89 f2 mov %rsi,%rdx
496 14: 48 89 fe mov %rdi,%rsi
497 17: 48 8b 3d 12 00 00 00 mov 18(%rip),%rdi # 30 <ccall_adjustor+0x30>
498 1e: ff 25 14 00 00 00 jmpq *20(%rip) # 38 <ccall_adjustor+0x38>
500 28: .quad 0 # aligned on 8-byte boundary
501 30: .quad 0 # aligned on 8-byte boundary
502 38: .quad 0 # aligned on 8-byte boundary
509 // determine whether we have 6 or more integer arguments,
510 // and therefore need to flush one to the stack.
511 for (c = typeString; *c != '\0'; c++) {
512 if (*c != 'f' && *c != 'd') i++;
    /* simple case (<6 integer args): hptr lands at +0x20, wptr at +0x28 —
       freeHaskellFunctionPtr reads hptr back from +0x20 */
517 adjustor = allocateExec(0x30);
519 *(StgInt32 *)adjustor = 0x49c1894d;
520 *(StgInt32 *)(adjustor+0x4) = 0x8948c889;
521 *(StgInt32 *)(adjustor+0x8) = 0xf28948d1;
522 *(StgInt32 *)(adjustor+0xc) = 0x48fe8948;
523 *(StgInt32 *)(adjustor+0x10) = 0x000a3d8b;
524 *(StgInt32 *)(adjustor+0x14) = 0x25ff0000;
525 *(StgInt32 *)(adjustor+0x18) = 0x0000000c;
526 *(StgInt64 *)(adjustor+0x20) = (StgInt64)hptr;
527 *(StgInt64 *)(adjustor+0x28) = (StgInt64)wptr;
    /* >=6-int-arg case: also embeds obscure_ccall_ret_code (+0x28); hptr at
       +0x30, wptr at +0x38 — freeHaskellFunctionPtr reads hptr from +0x30 */
531 adjustor = allocateExec(0x40);
533 *(StgInt32 *)adjustor = 0x35ff5141;
534 *(StgInt32 *)(adjustor+0x4) = 0x00000020;
535 *(StgInt32 *)(adjustor+0x8) = 0x49c1894d;
536 *(StgInt32 *)(adjustor+0xc) = 0x8948c889;
537 *(StgInt32 *)(adjustor+0x10) = 0xf28948d1;
538 *(StgInt32 *)(adjustor+0x14) = 0x48fe8948;
539 *(StgInt32 *)(adjustor+0x18) = 0x00123d8b;
540 *(StgInt32 *)(adjustor+0x1c) = 0x25ff0000;
541 *(StgInt32 *)(adjustor+0x20) = 0x00000014;
543 *(StgInt64 *)(adjustor+0x28) = (StgInt64)obscure_ccall_ret_code;
544 *(StgInt64 *)(adjustor+0x30) = (StgInt64)hptr;
545 *(StgInt64 *)(adjustor+0x38) = (StgInt64)wptr;
    /* ---- SPARC: 12-word thunk; word 11 stores hptr in the clear so
       freeHaskellFunctionPtr can retrieve it ---- */
548 #elif defined(sparc_HOST_ARCH)
549 /* Magic constant computed by inspecting the code length of the following
550 assembly language snippet (offset and machine code prefixed):
552 <00>: 9C23A008 sub %sp, 8, %sp ! make room for %o4/%o5 in caller's frame
553 <04>: DA23A060 st %o5, [%sp + 96] ! shift registers by 2 positions
554 <08>: D823A05C st %o4, [%sp + 92]
555 <0C>: 9A10000B mov %o3, %o5
556 <10>: 9810000A mov %o2, %o4
557 <14>: 96100009 mov %o1, %o3
558 <18>: 94100008 mov %o0, %o2
559 <1C>: 13000000 sethi %hi(wptr), %o1 ! load up wptr (1 of 2)
560 <20>: 11000000 sethi %hi(hptr), %o0 ! load up hptr (1 of 2)
561 <24>: 81C26000 jmp %o1 + %lo(wptr) ! jump to wptr (load 2 of 2)
562 <28>: 90122000 or %o0, %lo(hptr), %o0 ! load up hptr (2 of 2, delay slot)
563 <2C> 00000000 ! place for getting hptr back easily
565 ccall'ing on SPARC is easy, because we are quite lucky to push a
566 multiple of 8 bytes (1 word hptr + 1 word dummy arg) in front of the
567 existing arguments (note that %sp must stay double-word aligned at
568 all times, see ABI spec at http://www.sparc.org/standards/psABI3rd.pdf).
569 To do this, we extend the *caller's* stack frame by 2 words and shift
570 the output registers used for argument passing (%o0 - %o5, we are a *leaf*
571 procedure because of the tail-jump) by 2 positions. This makes room in
572 %o0 and %o1 for the additinal arguments, namely hptr and a dummy (used
573 for destination addr of jump on SPARC, return address on x86, ...). This
574 shouldn't cause any problems for a C-like caller: alloca is implemented
575 similarly, and local variables should be accessed via %fp, not %sp. In a
576 nutshell: This should work! (Famous last words! :-)
578 adjustor = allocateExec(4*(11+1));
580 unsigned long *const adj_code = (unsigned long *)adjustor;
582 adj_code[ 0] = 0x9C23A008UL; /* sub %sp, 8, %sp */
583 adj_code[ 1] = 0xDA23A060UL; /* st %o5, [%sp + 96] */
584 adj_code[ 2] = 0xD823A05CUL; /* st %o4, [%sp + 92] */
585 adj_code[ 3] = 0x9A10000BUL; /* mov %o3, %o5 */
586 adj_code[ 4] = 0x9810000AUL; /* mov %o2, %o4 */
587 adj_code[ 5] = 0x96100009UL; /* mov %o1, %o3 */
588 adj_code[ 6] = 0x94100008UL; /* mov %o0, %o2 */
589 adj_code[ 7] = 0x13000000UL; /* sethi %hi(wptr), %o1 */
590 adj_code[ 7] |= ((unsigned long)wptr) >> 10;
591 adj_code[ 8] = 0x11000000UL; /* sethi %hi(hptr), %o0 */
592 adj_code[ 8] |= ((unsigned long)hptr) >> 10;
593 adj_code[ 9] = 0x81C26000UL; /* jmp %o1 + %lo(wptr) */
594 adj_code[ 9] |= ((unsigned long)wptr) & 0x000003FFUL;
595 adj_code[10] = 0x90122000UL; /* or %o0, %lo(hptr), %o0 */
596 adj_code[10] |= ((unsigned long)hptr) & 0x000003FFUL;
598 adj_code[11] = (unsigned long)hptr;
    /* flush the instruction cache for the freshly written thunk */
601 asm("flush %0" : : "r" (adj_code ));
602 asm("flush %0" : : "r" (adj_code + 2));
603 asm("flush %0" : : "r" (adj_code + 4));
604 asm("flush %0" : : "r" (adj_code + 6));
605 asm("flush %0" : : "r" (adj_code + 10));
607 /* max. 5 instructions latency, and we need at >= 1 for returning */
    /* ---- Alpha: 6 quadwords of code/data; hptr at byte offset 0x20,
       wptr at 0x28 (see the ldq displacements 0x20/0x28 below) ---- */
613 #elif defined(alpha_HOST_ARCH)
614 /* Magic constant computed by inspecting the code length of
615 the following assembly language snippet
616 (offset and machine code prefixed; note that the machine code
617 shown is longwords stored in little-endian order):
619 <00>: 46520414 mov a2, a4
620 <04>: 46100412 mov a0, a2
621 <08>: a61b0020 ldq a0, 0x20(pv) # load up hptr
622 <0c>: 46730415 mov a3, a5
623 <10>: a77b0028 ldq pv, 0x28(pv) # load up wptr
624 <14>: 46310413 mov a1, a3
625 <18>: 6bfb---- jmp (pv), <hint> # jump to wptr (with hint)
626 <1c>: 00000000 # padding for alignment
627 <20>: [8 bytes for hptr quadword]
628 <28>: [8 bytes for wptr quadword]
630 The "computed" jump at <08> above is really a jump to a fixed
631 location. Accordingly, we place an always-correct hint in the
632 jump instruction, namely the address offset from <0c> to wptr,
633 divided by 4, taking the lowest 14 bits.
635 We only support passing 4 or fewer argument words, for the same
636 reason described under sparc_HOST_ARCH above by JRS, 21 Aug 01.
637 On the Alpha the first 6 integer arguments are in a0 through a5,
638 and the rest on the stack. Hence we want to shuffle the original
639 caller's arguments by two.
641 On the Alpha the calling convention is so complex and dependent
642 on the callee's signature -- for example, the stack pointer has
643 to be a multiple of 16 -- that it seems impossible to me [ccshan]
644 to handle the general case correctly without changing how the
645 adjustor is called from C. For now, our solution of shuffling
646 registers only and ignoring the stack only works if the original
647 caller passed 4 or fewer argument words.
649 TODO: Depending on how much allocation overhead stgMallocBytes uses for
650 header information (more precisely, if the overhead is no more than
651 4 bytes), we should move the first three instructions above down by
652 4 bytes (getting rid of the nop), hence saving memory. [ccshan]
654 ASSERT(((StgWord64)wptr & 3) == 0);
655 adjustor = allocateExec(48);
657 StgWord64 *const code = (StgWord64 *)adjustor;
659 code[0] = 0x4610041246520414L;
660 code[1] = 0x46730415a61b0020L;
661 code[2] = 0x46310413a77b0028L;
662 code[3] = 0x000000006bfb0000L
663 | (((StgWord32*)(wptr) - (StgWord32*)(code) - 3) & 0x3fff);
665 code[4] = (StgWord64)hptr;
666 code[5] = (StgWord64)wptr;
668 /* Ensure that instruction cache is consistent with our new code */
669 __asm__ volatile("call_pal %0" : : "i" (PAL_imb));
    /* ---- powerpc-linux (SysV ABI): fully dynamic code generation driven by
       the type string; hptr is stored in the clear as the second word of the
       thunk so freeHaskellFunctionPtr can find it ---- */
671 #elif defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
673 #define OP_LO(op,lo) ((((unsigned)(op)) << 16) | (((unsigned)(lo)) & 0xFFFF))
674 #define OP_HI(op,hi) ((((unsigned)(op)) << 16) | (((unsigned)(hi)) >> 16))
676 /* The PowerPC Linux (32-bit) calling convention is annoyingly complex.
677 We need to calculate all the details of the stack frame layout,
678 taking into account the types of all the arguments, and then
679 generate code on the fly. */
681 int src_gpr = 3, dst_gpr = 5;
683 int src_offset = 0, dst_offset = 0;
684 int n = strlen(typeString),i;
685 int src_locs[n], dst_locs[n];
690 Calculate where the arguments should go.
691 src_locs[] will contain the locations of the arguments in the
692 original stack frame passed to the adjustor.
693 dst_locs[] will contain the locations of the arguments after the
694 adjustor runs, on entry to the wrapper proc pointed to by wptr.
696 This algorithm is based on the one described on page 3-19 of the
697 System V ABI PowerPC Processor Supplement.
    /* Location encoding (visible below): -32-n = FPR n, -n = GPR n,
       non-negative = stack offset. */
699 for(i=0;typeString[i];i++)
701 char t = typeString[i];
702 if((t == 'f' || t == 'd') && fpr <= 8)
703 src_locs[i] = dst_locs[i] = -32-(fpr++);
706 if((t == 'l' || t == 'L') && src_gpr <= 9)
708 if((src_gpr & 1) == 0)
710 src_locs[i] = -src_gpr;
713 else if((t == 'w' || t == 'W') && src_gpr <= 10)
715 src_locs[i] = -(src_gpr++);
719 if((t == 'l' || t == 'L' || t == 'd')
724 src_locs[i] = src_offset;
725 src_offset += (t == 'l' || t == 'L' || t == 'd') ? 8 : 4;
728 if((t == 'l' || t == 'L') && dst_gpr <= 9)
730 if((dst_gpr & 1) == 0)
732 dst_locs[i] = -dst_gpr;
735 else if((t == 'w' || t == 'W') && dst_gpr <= 10)
737 dst_locs[i] = -(dst_gpr++);
741 if(t == 'l' || t == 'L' || t == 'd')
746 dst_locs[i] = dst_offset;
747 dst_offset += (t == 'l' || t == 'L' || t == 'd') ? 8 : 4;
752 frameSize = dst_offset + 8;
753 frameSize = (frameSize+15) & ~0xF;  /* keep the stack 16-byte aligned */
758 // allocate space for at most 4 insns per parameter
759 // plus 14 more instructions.
760 adjustor = allocateExec(4 * (4*n + 14));
761 code = (unsigned*)adjustor;
763 *code++ = 0x48000008; // b *+8
764 // * Put the hptr in a place where freeHaskellFunctionPtr
766 *code++ = (unsigned) hptr;
768 // * save the link register
769 *code++ = 0x7c0802a6; // mflr r0;
770 *code++ = 0x90010004; // stw r0, 4(r1);
771 // * and build a new stack frame
772 *code++ = OP_LO(0x9421, -frameSize); // stwu r1, -frameSize(r1)
774 // * now generate instructions to copy arguments
775 // from the old stack frame into the new stack frame.
778 if(src_locs[i] < -32)
779 ASSERT(dst_locs[i] == src_locs[i]);  /* FPR args never move */
780 else if(src_locs[i] < 0)
783 ASSERT(typeString[i] != 'f' && typeString[i] != 'd');
786 ASSERT(dst_locs[i] > -32);
787 // dst is in GPR, too.
789 if(typeString[i] == 'l' || typeString[i] == 'L')
793 | ((-dst_locs[i]+1) << 16)
794 | ((-src_locs[i]+1) << 11)
795 | ((-src_locs[i]+1) << 21);
799 | ((-dst_locs[i]) << 16)
800 | ((-src_locs[i]) << 11)
801 | ((-src_locs[i]) << 21);
805 if(typeString[i] == 'l' || typeString[i] == 'L')
807 // stw src+1, dst_offset+4(r1)
809 | ((-src_locs[i]+1) << 21)
813 // stw src, dst_offset(r1)
815 | ((-src_locs[i]) << 21)
821 ASSERT(dst_locs[i] >= 0);  /* stack-to-stack copy via r0 */
822 ASSERT(typeString[i] != 'f' && typeString[i] != 'd');
824 if(typeString[i] == 'l' || typeString[i] == 'L')
826 // lwz r0, src_offset(r1)
828 | (src_locs[i] + frameSize + 8 + 4);
829 // stw r0, dst_offset(r1)
831 | (dst_locs[i] + 8 + 4);
833 // lwz r0, src_offset(r1)
835 | (src_locs[i] + frameSize + 8);
836 // stw r0, dst_offset(r1)
842 // * hptr will be the new first argument.
844 *code++ = OP_HI(0x3c60, hptr);
845 // ori r3,r3,lo(hptr)
846 *code++ = OP_LO(0x6063, hptr);
848 // * we need to return to a piece of code
849 // which will tear down the stack frame.
850 // lis r11,hi(obscure_ccall_ret_code)
851 *code++ = OP_HI(0x3d60, obscure_ccall_ret_code);
852 // ori r11,r11,lo(obscure_ccall_ret_code)
853 *code++ = OP_LO(0x616b, obscure_ccall_ret_code);
855 *code++ = 0x7d6803a6;  // mtlr r11
859 *code++ = OP_HI(0x3d60, wptr);
860 // ori r11,r11,lo(wptr)
861 *code++ = OP_LO(0x616b, wptr);
863 *code++ = 0x7d6903a6;  // mtctr r11
865 *code++ = 0x4e800420;  // bctr
867 // Flush the Instruction cache:
869 unsigned *p = adjustor;
872 __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
876 __asm__ volatile ("sync\n\tisync");
    /* ---- AIX-ABI PowerPC family: no runtime codegen on fundesc platforms,
       a tiny stub on Darwin; real work happens in adjustorCode ---- */
880 #elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
882 #define OP_LO(op,lo) ((((unsigned)(op)) << 16) | (((unsigned)(lo)) & 0xFFFF))
883 #define OP_HI(op,hi) ((((unsigned)(op)) << 16) | (((unsigned)(hi)) >> 16))
885 /* The following code applies to all PowerPC and PowerPC64 platforms
886 whose stack layout is based on the AIX ABI.
888 Besides (obviously) AIX, this includes
889 Mac OS 9 and BeOS/PPC (may they rest in peace),
890 which use the 32-bit AIX ABI
892 which uses the 64-bit AIX ABI
893 and Darwin (Mac OS X),
894 which uses the same stack layout as AIX,
895 but no function descriptors.
897 The actual stack-frame shuffling is implemented out-of-line
898 in the function adjustorCode, in AdjustorAsm.S.
899 Here, we set up an AdjustorStub structure, which
900 is a function descriptor (on platforms that have function
901 descriptors) or a short piece of stub code (on Darwin) to call
902 adjustorCode with a pointer to the AdjustorStub struct loaded
905 One nice thing about this is that there is _no_ code generated at
906 runtime on the platforms that have function descriptors.
908 AdjustorStub *adjustorStub;
909 int sz = 0, extra_sz, total_sz;
911 // from AdjustorAsm.s
912 // not declared as a function so that AIX-style
913 // fundescs can never get in the way.
914 extern void *adjustorCode;
917 adjustorStub = stgMallocBytes(sizeof(AdjustorStub), "createAdjustor");
919 adjustorStub = allocateExec(sizeof(AdjustorStub));
921 adjustor = adjustorStub;
923 adjustorStub->code = (void*) &adjustorCode;
926 // function descriptors are a cool idea.
927 // We don't need to generate any code at runtime.
928 adjustorStub->toc = adjustorStub;
931 // no function descriptors :-(
932 // We need to do things "by hand".
933 #if defined(powerpc_HOST_ARCH)
934 // lis r2, hi(adjustorStub)
935 adjustorStub->lis = OP_HI(0x3c40, adjustorStub);
936 // ori r2, r2, lo(adjustorStub)
937 adjustorStub->ori = OP_LO(0x6042, adjustorStub);
939 adjustorStub->lwz = OP_LO(0x8002, (char*)(&adjustorStub->code)
940 - (char*)adjustorStub);
942 adjustorStub->mtctr = 0x7c0903a6;
944 adjustorStub->bctr = 0x4e800420;
946 barf("adjustor creation not supported on this platform");
949 // Flush the Instruction cache:
951 int n = sizeof(AdjustorStub)/sizeof(unsigned);
952 unsigned *p = (unsigned*)adjustor;
955 __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
959 __asm__ volatile ("sync\n\tisync");
963 // Calculate the size of the stack frame, in words.
964 sz = totalArgumentSize(typeString);
966 // The first eight words of the parameter area
967 // are just "backing store" for the parameters passed in
968 // the GPRs. extra_sz is the number of words beyond those first
974 // Calculate the total size of the stack frame.
975 total_sz = (6 /* linkage area */
976 + 8 /* minimum parameter area */
977 + 2 /* two extra arguments */
978 + extra_sz)*sizeof(StgWord);
980 // align to 16 bytes.
981 // AIX only requires 8 bytes, but who cares?
982 total_sz = (total_sz+15) & ~0xF;
984 // Fill in the information that adjustorCode in AdjustorAsm.S
985 // will use to create a new stack frame with the additional args.
986 adjustorStub->hptr = hptr;
987 adjustorStub->wptr = wptr;
988 adjustorStub->negative_framesize = -total_sz;
989 adjustorStub->extrawords_plus_one = extra_sz + 1;
    /* ---- ia64: build a function descriptor plus 16 bundles of code on the
       Haskell heap (kept alive by a stable pointer, stored at code[17]);
       hptr is kept in the clear at code[16] for freeHaskellFunctionPtr ---- */
992 #elif defined(ia64_HOST_ARCH)
994 Up to 8 inputs are passed in registers. We flush the last two inputs to
995 the stack, initially into the 16-byte scratch region left by the caller.
996 We then shuffle the others along by 4 (taking 2 registers for ourselves
997 to save return address and previous function state - we need to come back
998 here on the way out to restore the stack, so this is a real function
999 rather than just a trampoline).
1001 The function descriptor we create contains the gp of the target function
1002 so gp is already loaded correctly.
1004 [MLX] alloc r16=ar.pfs,10,2,0
1006 [MII] st8.spill [r12]=r38,8 // spill in6 (out4)
1007 mov r41=r37 // out7 = in5 (out3)
1008 mov r40=r36;; // out6 = in4 (out2)
1009 [MII] st8.spill [r12]=r39 // spill in7 (out5)
1011 mov r38=r34;; // out4 = in2 (out0)
1012 [MII] mov r39=r35 // out5 = in3 (out1)
1013 mov r37=r33 // out3 = in1 (loc1)
1014 mov r36=r32 // out2 = in0 (loc0)
1015 [MLX] adds r12=-24,r12 // update sp
1016 movl r34=hptr;; // out0 = hptr
1017 [MIB] mov r33=r16 // loc1 = ar.pfs
1018 mov r32=b0 // loc0 = retaddr
1019 br.call.sptk.many b0=b6;;
1021 [MII] adds r12=-16,r12
1026 br.ret.sptk.many b0;;
1029 /* These macros distribute a long constant into the two words of an MLX bundle */
1030 #define BITS(val,start,count) (((val) >> (start)) & ((1 << (count))-1))
1031 #define MOVL_LOWORD(val) (BITS(val,22,18) << 46)
1032 #define MOVL_HIWORD(val) ( (BITS(val,0,7) << 36) \
1033 | (BITS(val,7,9) << 50) \
1034 | (BITS(val,16,5) << 45) \
1035 | (BITS(val,21,1) << 44) \
1036 | (BITS(val,40,23)) \
1037 | (BITS(val,63,1) << 59))
1040 StgStablePtr stable;
1041 IA64FunDesc *wdesc = (IA64FunDesc *)wptr;
1042 StgWord64 wcode = wdesc->ip;
1046 /* we allocate on the Haskell heap since malloc'd memory isn't
1047 * executable - argh */
1048 /* Allocated memory is word-aligned (8 bytes) but functions on ia64
1049 * must be aligned to 16 bytes. We allocate an extra 8 bytes of
1050 * wiggle room so that we can put the code on a 16 byte boundary. */
1051 adjustor = stgAllocStable(sizeof(IA64FunDesc)+18*8+8, &stable);
1053 fdesc = (IA64FunDesc *)adjustor;
1054 code = (StgWord64 *)(fdesc + 1);
1055 /* add 8 bytes to code if needed to align to a 16-byte boundary */
1056 if ((StgWord64)code & 15) code++;
1057 fdesc->ip = (StgWord64)code;
1058 fdesc->gp = wdesc->gp;
1060 code[0] = 0x0000058004288004 | MOVL_LOWORD(wcode);
1061 code[1] = 0x6000000220000000 | MOVL_HIWORD(wcode);
1062 code[2] = 0x029015d818984001;
1063 code[3] = 0x8401200500420094;
1064 code[4] = 0x886011d8189c0001;
1065 code[5] = 0x84011004c00380c0;
1066 code[6] = 0x0250210046013800;
1067 code[7] = 0x8401000480420084;
1068 code[8] = 0x0000233f19a06005 | MOVL_LOWORD((StgWord64)hptr);
1069 code[9] = 0x6000000440000000 | MOVL_HIWORD((StgWord64)hptr);
1070 code[10] = 0x0200210020010811;
1071 code[11] = 0x1080006800006200;
1072 code[12] = 0x0000210018406000;
1073 code[13] = 0x00aa021000038005;
1074 code[14] = 0x000000010000001d;
1075 code[15] = 0x0084000880000200;
1077 /* save stable pointers in convenient form */
1078 code[16] = (StgWord64)hptr;
1079 code[17] = (StgWord64)stable;
1082 barf("adjustor creation not supported on this platform");
/* Non-libffi freeHaskellFunctionPtr: recognise the per-architecture thunk
 * by its first instruction bytes, free the embedded stable pointer, and
 * (in elided code below) release the executable memory.  Each branch must
 * mirror the layout written by the matching createAdjustor branch. */
1097 freeHaskellFunctionPtr(void* ptr)
1099 #if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
     /* 0x68 = pushl imm32 (ccall thunk), 0x58 = popl %eax (stdcall thunk) */
1100 if ( *(unsigned char*)ptr != 0x68 &&
1101 *(unsigned char*)ptr != 0x58 ) {
1102 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1106 /* Free the stable pointer first..*/
1107 if (*(unsigned char*)ptr == 0x68) { /* Aha, a ccall adjustor! */
1108 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x01)));
1110 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x02)));
1112 #elif defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
     /* 0xe8 = the 'call adjustorCode' opcode written into AdjustorStub.call */
1113 if ( *(unsigned char*)ptr != 0xe8 ) {
1114 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1117 freeStablePtr(((AdjustorStub*)ptr)->hptr);
1118 #elif defined(x86_64_HOST_ARCH)
     /* 0x894d = first bytes of the <6-arg template (hptr at +0x20);
        0x5141 = first bytes of the >=6-arg template (hptr at +0x30).
        Both offsets match what createAdjustor writes. */
1119 if ( *(StgWord16 *)ptr == 0x894d ) {
1120 freeStablePtr(*(StgStablePtr*)(ptr+0x20));
1121 } else if ( *(StgWord16 *)ptr == 0x5141 ) {
1122 freeStablePtr(*(StgStablePtr*)(ptr+0x30));
1124 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1127 #elif defined(sparc_HOST_ARCH)
     /* first word of the sparc thunk; hptr is in the clear at word 11 */
1128 if ( *(unsigned long*)ptr != 0x9C23A008UL ) {
1129 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1133 /* Free the stable pointer first..*/
1134 freeStablePtr(*((StgStablePtr*)((unsigned long*)ptr + 11)));
1135 #elif defined(alpha_HOST_ARCH)
     /* NOTE(review): this signature (0xa77b0018a61b0010) and the hptr offset
        0x10 do NOT match the alpha createAdjustor visible above, which
        writes code[0] = 0x4610041246520414 and stores hptr at byte offset
        0x20 (code[4]).  As written, a valid alpha adjustor would be rejected
        here and, if it were accepted, the wrong quadword would be freed.
        The listing is elided, so confirm against the unelided file before
        fixing. */
1136 if ( *(StgWord64*)ptr != 0xa77b0018a61b0010L ) {
1137 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1141 /* Free the stable pointer first..*/
1142 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x10)));
1143 #elif defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
     /* 0x48000008 = the leading 'b *+8' instruction; hptr is word [1] */
1144 if ( *(StgWord*)ptr != 0x48000008 ) {
1145 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1148 freeStablePtr(((StgStablePtr*)ptr)[1]);
1149 #elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
1150 extern void* adjustorCode;
1151 if ( ((AdjustorStub*)ptr)->code != (StgFunPtr) &adjustorCode ) {
1152 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1155 freeStablePtr(((AdjustorStub*)ptr)->hptr);
1156 #elif defined(ia64_HOST_ARCH)
1157 IA64FunDesc *fdesc = (IA64FunDesc *)ptr;
1158 StgWord64 *code = (StgWord64 *)(fdesc+1);
     /* a genuine ia64 adjustor's descriptor points at the code right after it */
1160 if (fdesc->ip != (StgWord64)code) {
1161 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1164 freeStablePtr((StgStablePtr)code[16]);  /* the Haskell closure */
1165 freeStablePtr((StgStablePtr)code[17]);  /* keeps the code array alive */
     /* Scrub the first byte so a stale pointer can no longer masquerade as a
        live adjustor. */
1170 *((unsigned char*)ptr) = '\0';
1175 #endif // !USE_LIBFFI