1 /* -----------------------------------------------------------------------------
2 * Foreign export adjustor thunks
6 * ---------------------------------------------------------------------------*/
8 /* A little bit of background...
10 An adjustor thunk is a dynamically allocated code snippet that allows
11 Haskell closures to be viewed as C function pointers.
13 Stable pointers provide a way for the outside world to get access to,
14 and evaluate, Haskell heap objects, with the RTS providing a small
15 range of ops for doing so. So, assuming we've got a stable pointer in
16 our hand in C, we can jump into the Haskell world and evaluate a callback
17 procedure, say. This works OK in some cases where callbacks are used, but
18 does require the external code to know about stable pointers and how to deal
19 with them. We'd like to hide the Haskell-nature of a callback and have it
20 be invoked just like any other C function pointer.
22 Enter adjustor thunks. An adjustor thunk is a little piece of code
23 that's generated on-the-fly (one per Haskell closure being exported)
24 that, when entered using some 'universal' calling convention (e.g., the
25 C calling convention on platform X), pushes an implicit stable pointer
26 (to the Haskell callback) before calling another (static) C function stub
27 which takes care of entering the Haskell code via its stable pointer.
An adjustor thunk is allocated on the C heap; it is created from within
Haskell just before the function pointer for the Haskell (IO) action is
handed out. User code should never have to invoke it explicitly.
33 An adjustor thunk differs from a C function pointer in one respect: when
34 the code is through with it, it has to be freed in order to release Haskell
and C resources. Failure to do so will result in memory leaks on both the C and
Haskell sides.
*/
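/* For illustration only (not part of the RTS): from the C side, an adjustor
   looks like an ordinary function pointer, but it must be released with
   freeHaskellFunctionPtr() once the callback is no longer needed.  A minimal
   sketch, assuming the Haskell program exported a callback of C type
   void (*)(int) (the names below are made up):

       typedef void (*Callback)(int);

       static void use_callback(Callback cb)
       {
           cb(42);                              /* an ordinary-looking C call */
           freeHaskellFunctionPtr((void*)cb);   /* frees the thunk and its stable ptr */
       }
*/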
39 #include "PosixSource.h"
41 #include "RtsExternal.h"
45 #if defined(USE_LIBFFI_FOR_ADJUSTORS)
51 freeHaskellFunctionPtr(void* ptr)
55 cl = (ffi_closure*)ptr;
56 freeStablePtr(cl->user_data);
57 stgFree(cl->cif->arg_types);
62 static ffi_type * char_to_ffi_type(char c)
65 case 'v': return &ffi_type_void;
66 case 'f': return &ffi_type_float;
67 case 'd': return &ffi_type_double;
68 case 'L': return &ffi_type_sint64;
69 case 'l': return &ffi_type_uint64;
70 case 'W': return &ffi_type_sint32;
71 case 'w': return &ffi_type_uint32;
72 case 'S': return &ffi_type_sint16;
73 case 's': return &ffi_type_uint16;
74 case 'B': return &ffi_type_sint8;
75 case 'b': return &ffi_type_uint8;
76 case 'p': return &ffi_type_pointer;
77 default: barf("char_to_ffi_type: unknown type '%c'", c);
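/* In this libffi path, createAdjustor() receives a typeString whose first
   character encodes the result type and whose remaining characters encode
   the argument types, using the letters understood by char_to_ffi_type()
   above.  For example (illustrative only), a callback with C type

       HsInt32 cb(HsPtr env, double x);

   would be described by the string "Wpd": 'W' (sint32) result, then 'p'
   (pointer) and 'd' (double) arguments.
*/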
82 createAdjustor (int cconv,
90 ffi_type *result_type;
95 n_args = strlen(typeString) - 1;
96 cif = stgMallocBytes(sizeof(ffi_cif), "createAdjustor");
97 arg_types = stgMallocBytes(n_args * sizeof(ffi_type*), "createAdjustor");
99 result_type = char_to_ffi_type(typeString[0]);
100 for (i=0; i < n_args; i++) {
101 arg_types[i] = char_to_ffi_type(typeString[i+1]);
104 #ifdef mingw32_TARGET_OS
105 case 0: /* stdcall */
110 abi = FFI_DEFAULT_ABI;
113 barf("createAdjustor: convention %d not supported on this platform", cconv);
116 r = ffi_prep_cif(cif, abi, n_args, result_type, arg_types);
117 if (r != FFI_OK) barf("ffi_prep_cif failed: %d", r);
119 cl = allocateExec(sizeof(ffi_closure), &code);
121 barf("createAdjustor: failed to allocate memory");
124 r = ffi_prep_closure(cl, cif, (void*)wptr, hptr/*userdata*/);
125 if (r != FFI_OK) barf("ffi_prep_closure failed: %d", r);
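/* At this point 'cl' is a live libffi closure: calling the returned code
   pointer with the argument types described by 'cif' enters the static stub
   'wptr', and 'hptr' is recoverable as cl->user_data, which is how
   freeHaskellFunctionPtr() above releases it. */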
130 #else // To end of file...
136 #if defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
140 #ifdef LEADING_UNDERSCORE
141 #define UNDERSCORE "_"
143 #define UNDERSCORE ""
145 #if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
147 Now here's something obscure for you:
149 When generating an adjustor thunk that uses the C calling
150 convention, we have to make sure that the thunk kicks off
151 the process of jumping into Haskell with a tail jump. Why?
Because as a result of jumping into Haskell we may end
153 up freeing the very adjustor thunk we came from using
154 freeHaskellFunctionPtr(). Hence, we better not return to
the adjustor code on our way out, since it could by then
have already been freed.
158 The fix is readily at hand, just include the opcodes
159 for the C stack fixup code that we need to perform when
160 returning in some static piece of memory and arrange
161 to return to it before tail jumping from the adjustor thunk.
163 static void GNUC3_ATTRIBUTE(used) obscure_ccall_wrapper(void)
166 ".globl " UNDERSCORE "obscure_ccall_ret_code\n"
167 UNDERSCORE "obscure_ccall_ret_code:\n\t"
168 "addl $0x4, %esp\n\t"
172 extern void obscure_ccall_ret_code(void);
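/* obscure_ccall_ret_code simply pops the stable pointer that the ccall
   adjustor pushed (the "addl $0x4, %esp" above) and then returns straight to
   the adjustor's original caller, so the (possibly already freed) adjustor
   code is never re-entered on the way out. */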
176 #if defined(x86_64_HOST_ARCH)
177 static void GNUC3_ATTRIBUTE(used) obscure_ccall_wrapper(void)
180 ".globl " UNDERSCORE "obscure_ccall_ret_code\n"
181 UNDERSCORE "obscure_ccall_ret_code:\n\t"
182 "addq $0x8, %rsp\n\t"
186 extern void obscure_ccall_ret_code(void);
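/* The x86_64 variant does the same job: it discards the argument word that
   the adjustor flushed to the stack ("addq $0x8, %rsp" above) and returns
   directly to the original caller, bypassing the adjustor code. */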
189 #if defined(alpha_HOST_ARCH)
190 /* To get the definition of PAL_imb: */
191 # if defined(linux_HOST_OS)
192 # include <asm/pal.h>
194 # include <machine/pal.h>
198 #if defined(ia64_HOST_ARCH)
200 /* Layout of a function descriptor */
201 typedef struct _IA64FunDesc {
207 stgAllocStable(size_t size_in_bytes, StgStablePtr *stable)
210 nat data_size_in_words, total_size_in_words;
212 /* round up to a whole number of words */
data_size_in_words = (size_in_bytes + sizeof(W_) - 1) / sizeof(W_);
214 total_size_in_words = sizeofW(StgArrWords) + data_size_in_words;
216 /* allocate and fill it in */
217 arr = (StgArrWords *)allocate(total_size_in_words);
218 SET_ARR_HDR(arr, &stg_ARR_WORDS_info, CCCS, data_size_in_words);
220 /* obtain a stable ptr */
221 *stable = getStablePtr((StgPtr)arr);
223 /* and return a ptr to the goods inside the array */
224 return(&(arr->payload));
228 #if defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
229 __asm__("obscure_ccall_ret_code:\n\t"
234 extern void obscure_ccall_ret_code(void);
237 #if defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
238 #if !(defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS))
240 /* !!! !!! WARNING: !!! !!!
241 * This structure is accessed from AdjustorAsm.s
242 * Any changes here have to be mirrored in the offsets there.
245 typedef struct AdjustorStub {
246 #if defined(powerpc_HOST_ARCH) && defined(darwin_HOST_OS)
253 #elif defined(powerpc64_HOST_ARCH) && defined(darwin_HOST_OS)
254 /* powerpc64-darwin: just guessing that it won't use fundescs. */
265 /* fundesc-based ABIs */
274 StgInt negative_framesize;
275 StgInt extrawords_plus_one;
281 #if defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
283 /* !!! !!! WARNING: !!! !!!
284 * This structure is accessed from AdjustorAsm.s
285 * Any changes here have to be mirrored in the offsets there.
288 typedef struct AdjustorStub {
289 unsigned char call[8];
293 StgInt argument_size;
297 #if defined(darwin_HOST_OS) || defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
298 static int totalArgumentSize(char *typeString)
303 char t = *typeString++;
307 // on 32-bit platforms, Double and Int64 occupy two words.
311 if(sizeof(void*) == 4)
316 // everything else is one word.
326 createAdjustor(int cconv, StgStablePtr hptr,
329 #if !defined(powerpc_HOST_ARCH) && !defined(powerpc64_HOST_ARCH) && !defined(x86_64_HOST_ARCH)
334 void *adjustor = NULL;
339 case 0: /* _stdcall */
340 #if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
341 /* Magic constant computed by inspecting the code length of
342 the following assembly language snippet
343 (offset and machine code prefixed):
345 <0>: 58 popl %eax # temp. remove ret addr..
346 <1>: 68 fd fc fe fa pushl 0xfafefcfd # constant is large enough to
347 # hold a StgStablePtr
348 <6>: 50 pushl %eax # put back ret. addr
349 <7>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
350 <c>: ff e0 jmp %eax # and jump to it.
351 # the callee cleans up the stack
353 adjustor = allocateExec(14,&code);
355 unsigned char *const adj_code = (unsigned char *)adjustor;
356 adj_code[0x00] = (unsigned char)0x58; /* popl %eax */
358 adj_code[0x01] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
359 *((StgStablePtr*)(adj_code + 0x02)) = (StgStablePtr)hptr;
361 adj_code[0x06] = (unsigned char)0x50; /* pushl %eax */
363 adj_code[0x07] = (unsigned char)0xb8; /* movl $wptr, %eax */
364 *((StgFunPtr*)(adj_code + 0x08)) = (StgFunPtr)wptr;
366 adj_code[0x0c] = (unsigned char)0xff; /* jmp %eax */
367 adj_code[0x0d] = (unsigned char)0xe0;
373 #if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
374 /* Magic constant computed by inspecting the code length of
375 the following assembly language snippet
376 (offset and machine code prefixed):
378 <00>: 68 ef be ad de pushl $0xdeadbeef # constant is large enough to
379 # hold a StgStablePtr
380 <05>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
381 <0a>: 68 ef be ad de pushl $obscure_ccall_ret_code # push the return address
382 <0f>: ff e0 jmp *%eax # jump to wptr
384 The ccall'ing version is a tad different, passing in the return
385 address of the caller to the auto-generated C stub (which enters
386 via the stable pointer.) (The auto-generated C stub is in on this
387 game, don't worry :-)
See the comment next to obscure_ccall_ret_code for why we need to
perform a tail jump instead of a call, followed by some C stack
fixup.
393 Note: The adjustor makes the assumption that any return value
394 coming back from the C stub is not stored on the stack.
395 That's (thankfully) the case here with the restricted set of
396 return types that we support.
398 adjustor = allocateExec(17,&code);
400 unsigned char *const adj_code = (unsigned char *)adjustor;
402 adj_code[0x00] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
403 *((StgStablePtr*)(adj_code+0x01)) = (StgStablePtr)hptr;
405 adj_code[0x05] = (unsigned char)0xb8; /* movl $wptr, %eax */
406 *((StgFunPtr*)(adj_code + 0x06)) = (StgFunPtr)wptr;
408 adj_code[0x0a] = (unsigned char)0x68; /* pushl obscure_ccall_ret_code */
409 *((StgFunPtr*)(adj_code + 0x0b)) =
410 (StgFunPtr)obscure_ccall_ret_code;
412 adj_code[0x0f] = (unsigned char)0xff; /* jmp *%eax */
413 adj_code[0x10] = (unsigned char)0xe0;
415 #elif defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
418 What's special about Darwin/Mac OS X on i386?
419 It wants the stack to stay 16-byte aligned.
421 We offload most of the work to AdjustorAsm.S.
423 AdjustorStub *adjustorStub = allocateExec(sizeof(AdjustorStub),&code);
424 adjustor = adjustorStub;
426 extern void adjustorCode(void);
427 int sz = totalArgumentSize(typeString);
429 adjustorStub->call[0] = 0xe8;
430 *(long*)&adjustorStub->call[1] = ((char*)&adjustorCode) - ((char*)adjustorStub + 5);
431 adjustorStub->hptr = hptr;
432 adjustorStub->wptr = wptr;
// The adjustor puts the following things on the stack:
// 1.) %ebp link
// 2.) padding and (a copy of) the arguments
// 3.) a dummy argument
// 4.) hptr
// 5.) return address (for returning to the adjustor)
440 // All these have to add up to a multiple of 16.
442 // first, include everything in frame_size
443 adjustorStub->frame_size = sz * 4 + 16;
445 adjustorStub->frame_size = (adjustorStub->frame_size + 15) & ~15;
446 // only count 2.) and 3.) as part of frame_size
447 adjustorStub->frame_size -= 12;
448 adjustorStub->argument_size = sz;
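// Example (illustrative): if totalArgumentSize returns sz = 4, frame_size
// starts at 4*4 + 16 = 32, is still 32 after rounding up to a multiple of
// 16, and ends up as 32 - 12 = 20, with argument_size = 4.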
451 #elif defined(x86_64_HOST_ARCH)
%rdi,%rsi,%rdx,%rcx,%r8,%r9 = arg1..arg6

if there are <6 integer args, then we can just load the
StablePtr into %rdi and shuffle the other args up.
463 If there are >=6 integer args, then we have to flush one arg
464 to the stack, and arrange to adjust the stack ptr on return.
465 The stack will be rearranged to this:
470 return address *** <-- dummy arg in stub fn.
472 obscure_ccall_ret_code
474 This unfortunately means that the type of the stub function
475 must have a dummy argument for the original return address
476 pointer inserted just after the 6th integer argument.
478 Code for the simple case:
480 0: 4d 89 c1 mov %r8,%r9
481 3: 49 89 c8 mov %rcx,%r8
482 6: 48 89 d1 mov %rdx,%rcx
483 9: 48 89 f2 mov %rsi,%rdx
484 c: 48 89 fe mov %rdi,%rsi
485 f: 48 8b 3d 0a 00 00 00 mov 10(%rip),%rdi
486 16: ff 25 0c 00 00 00 jmpq *12(%rip)
488 20: .quad 0 # aligned on 8-byte boundary
489 28: .quad 0 # aligned on 8-byte boundary
492 And the version for >=6 integer arguments:
495 2: ff 35 20 00 00 00 pushq 32(%rip) # 28 <ccall_adjustor+0x28>
496 8: 4d 89 c1 mov %r8,%r9
497 b: 49 89 c8 mov %rcx,%r8
498 e: 48 89 d1 mov %rdx,%rcx
499 11: 48 89 f2 mov %rsi,%rdx
500 14: 48 89 fe mov %rdi,%rsi
501 17: 48 8b 3d 12 00 00 00 mov 18(%rip),%rdi # 30 <ccall_adjustor+0x30>
502 1e: ff 25 14 00 00 00 jmpq *20(%rip) # 38 <ccall_adjustor+0x38>
504 28: .quad 0 # aligned on 8-byte boundary
505 30: .quad 0 # aligned on 8-byte boundary
506 38: .quad 0 # aligned on 8-byte boundary
513 // determine whether we have 6 or more integer arguments,
514 // and therefore need to flush one to the stack.
515 for (c = typeString; *c != '\0'; c++) {
516 if (*c != 'f' && *c != 'd') i++;
521 adjustor = allocateExec(0x30,&code);
523 *(StgInt32 *)adjustor = 0x49c1894d;
524 *(StgInt32 *)(adjustor+0x4) = 0x8948c889;
525 *(StgInt32 *)(adjustor+0x8) = 0xf28948d1;
526 *(StgInt32 *)(adjustor+0xc) = 0x48fe8948;
527 *(StgInt32 *)(adjustor+0x10) = 0x000a3d8b;
528 *(StgInt32 *)(adjustor+0x14) = 0x25ff0000;
529 *(StgInt32 *)(adjustor+0x18) = 0x0000000c;
530 *(StgInt64 *)(adjustor+0x20) = (StgInt64)hptr;
531 *(StgInt64 *)(adjustor+0x28) = (StgInt64)wptr;
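// Note: freeHaskellFunctionPtr() below recognises this adjustor by its
// first 16-bit word (0x894d) and frees the hptr stored at offset 0x20.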
535 adjustor = allocateExec(0x40,&code);
537 *(StgInt32 *)adjustor = 0x35ff5141;
538 *(StgInt32 *)(adjustor+0x4) = 0x00000020;
539 *(StgInt32 *)(adjustor+0x8) = 0x49c1894d;
540 *(StgInt32 *)(adjustor+0xc) = 0x8948c889;
541 *(StgInt32 *)(adjustor+0x10) = 0xf28948d1;
542 *(StgInt32 *)(adjustor+0x14) = 0x48fe8948;
543 *(StgInt32 *)(adjustor+0x18) = 0x00123d8b;
544 *(StgInt32 *)(adjustor+0x1c) = 0x25ff0000;
545 *(StgInt32 *)(adjustor+0x20) = 0x00000014;
547 *(StgInt64 *)(adjustor+0x28) = (StgInt64)obscure_ccall_ret_code;
548 *(StgInt64 *)(adjustor+0x30) = (StgInt64)hptr;
549 *(StgInt64 *)(adjustor+0x38) = (StgInt64)wptr;
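// Likewise, the >=6-argument adjustor is recognised by its first 16-bit
// word (0x5141) and its hptr lives at offset 0x30.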
552 #elif defined(sparc_HOST_ARCH)
553 /* Magic constant computed by inspecting the code length of the following
554 assembly language snippet (offset and machine code prefixed):
556 <00>: 9C23A008 sub %sp, 8, %sp ! make room for %o4/%o5 in caller's frame
557 <04>: DA23A060 st %o5, [%sp + 96] ! shift registers by 2 positions
558 <08>: D823A05C st %o4, [%sp + 92]
559 <0C>: 9A10000B mov %o3, %o5
560 <10>: 9810000A mov %o2, %o4
561 <14>: 96100009 mov %o1, %o3
562 <18>: 94100008 mov %o0, %o2
563 <1C>: 13000000 sethi %hi(wptr), %o1 ! load up wptr (1 of 2)
564 <20>: 11000000 sethi %hi(hptr), %o0 ! load up hptr (1 of 2)
565 <24>: 81C26000 jmp %o1 + %lo(wptr) ! jump to wptr (load 2 of 2)
566 <28>: 90122000 or %o0, %lo(hptr), %o0 ! load up hptr (2 of 2, delay slot)
<2C>: 00000000                                  ! place for getting hptr back easily
ccall'ing on SPARC is easy, because we happen to push a
570 multiple of 8 bytes (1 word hptr + 1 word dummy arg) in front of the
571 existing arguments (note that %sp must stay double-word aligned at
572 all times, see ABI spec at http://www.sparc.org/standards/psABI3rd.pdf).
573 To do this, we extend the *caller's* stack frame by 2 words and shift
574 the output registers used for argument passing (%o0 - %o5, we are a *leaf*
575 procedure because of the tail-jump) by 2 positions. This makes room in
%o0 and %o1 for the additional arguments, namely hptr and a dummy (used
577 for destination addr of jump on SPARC, return address on x86, ...). This
578 shouldn't cause any problems for a C-like caller: alloca is implemented
579 similarly, and local variables should be accessed via %fp, not %sp. In a
580 nutshell: This should work! (Famous last words! :-)
582 adjustor = allocateExec(4*(11+1),&code);
584 unsigned long *const adj_code = (unsigned long *)adjustor;
586 adj_code[ 0] = 0x9C23A008UL; /* sub %sp, 8, %sp */
587 adj_code[ 1] = 0xDA23A060UL; /* st %o5, [%sp + 96] */
588 adj_code[ 2] = 0xD823A05CUL; /* st %o4, [%sp + 92] */
589 adj_code[ 3] = 0x9A10000BUL; /* mov %o3, %o5 */
590 adj_code[ 4] = 0x9810000AUL; /* mov %o2, %o4 */
591 adj_code[ 5] = 0x96100009UL; /* mov %o1, %o3 */
592 adj_code[ 6] = 0x94100008UL; /* mov %o0, %o2 */
593 adj_code[ 7] = 0x13000000UL; /* sethi %hi(wptr), %o1 */
594 adj_code[ 7] |= ((unsigned long)wptr) >> 10;
595 adj_code[ 8] = 0x11000000UL; /* sethi %hi(hptr), %o0 */
596 adj_code[ 8] |= ((unsigned long)hptr) >> 10;
597 adj_code[ 9] = 0x81C26000UL; /* jmp %o1 + %lo(wptr) */
598 adj_code[ 9] |= ((unsigned long)wptr) & 0x000003FFUL;
599 adj_code[10] = 0x90122000UL; /* or %o0, %lo(hptr), %o0 */
600 adj_code[10] |= ((unsigned long)hptr) & 0x000003FFUL;
602 adj_code[11] = (unsigned long)hptr;
605 asm("flush %0" : : "r" (adj_code ));
606 asm("flush %0" : : "r" (adj_code + 2));
607 asm("flush %0" : : "r" (adj_code + 4));
608 asm("flush %0" : : "r" (adj_code + 6));
609 asm("flush %0" : : "r" (adj_code + 10));
/* max. 5 instructions of latency, and we need at least 1 for returning */
617 #elif defined(alpha_HOST_ARCH)
618 /* Magic constant computed by inspecting the code length of
619 the following assembly language snippet
620 (offset and machine code prefixed; note that the machine code
621 shown is longwords stored in little-endian order):
623 <00>: 46520414 mov a2, a4
624 <04>: 46100412 mov a0, a2
625 <08>: a61b0020 ldq a0, 0x20(pv) # load up hptr
626 <0c>: 46730415 mov a3, a5
627 <10>: a77b0028 ldq pv, 0x28(pv) # load up wptr
628 <14>: 46310413 mov a1, a3
629 <18>: 6bfb---- jmp (pv), <hint> # jump to wptr (with hint)
630 <1c>: 00000000 # padding for alignment
631 <20>: [8 bytes for hptr quadword]
632 <28>: [8 bytes for wptr quadword]
634 The "computed" jump at <08> above is really a jump to a fixed
635 location. Accordingly, we place an always-correct hint in the
636 jump instruction, namely the address offset from <0c> to wptr,
637 divided by 4, taking the lowest 14 bits.
639 We only support passing 4 or fewer argument words, for the same
640 reason described under sparc_HOST_ARCH above by JRS, 21 Aug 01.
641 On the Alpha the first 6 integer arguments are in a0 through a5,
642 and the rest on the stack. Hence we want to shuffle the original
643 caller's arguments by two.
645 On the Alpha the calling convention is so complex and dependent
646 on the callee's signature -- for example, the stack pointer has
647 to be a multiple of 16 -- that it seems impossible to me [ccshan]
648 to handle the general case correctly without changing how the
649 adjustor is called from C. For now, our solution of shuffling
650 registers only and ignoring the stack only works if the original
651 caller passed 4 or fewer argument words.
653 TODO: Depending on how much allocation overhead stgMallocBytes uses for
654 header information (more precisely, if the overhead is no more than
655 4 bytes), we should move the first three instructions above down by
656 4 bytes (getting rid of the nop), hence saving memory. [ccshan]
658 ASSERT(((StgWord64)wptr & 3) == 0);
659 adjustor = allocateExec(48,&code);
661 StgWord64 *const code = (StgWord64 *)adjustor;
663 code[0] = 0x4610041246520414L;
664 code[1] = 0x46730415a61b0020L;
665 code[2] = 0x46310413a77b0028L;
666 code[3] = 0x000000006bfb0000L
667 | (((StgWord32*)(wptr) - (StgWord32*)(code) - 3) & 0x3fff);
669 code[4] = (StgWord64)hptr;
670 code[5] = (StgWord64)wptr;
672 /* Ensure that instruction cache is consistent with our new code */
673 __asm__ volatile("call_pal %0" : : "i" (PAL_imb));
675 #elif defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
677 #define OP_LO(op,lo) ((((unsigned)(op)) << 16) | (((unsigned)(lo)) & 0xFFFF))
678 #define OP_HI(op,hi) ((((unsigned)(op)) << 16) | (((unsigned)(hi)) >> 16))
680 /* The PowerPC Linux (32-bit) calling convention is annoyingly complex.
681 We need to calculate all the details of the stack frame layout,
682 taking into account the types of all the arguments, and then
683 generate code on the fly. */
685 int src_gpr = 3, dst_gpr = 5;
687 int src_offset = 0, dst_offset = 0;
688 int n = strlen(typeString),i;
689 int src_locs[n], dst_locs[n];
694 Calculate where the arguments should go.
695 src_locs[] will contain the locations of the arguments in the
696 original stack frame passed to the adjustor.
697 dst_locs[] will contain the locations of the arguments after the
698 adjustor runs, on entry to the wrapper proc pointed to by wptr.
700 This algorithm is based on the one described on page 3-19 of the
701 System V ABI PowerPC Processor Supplement.
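A note on the encoding used in src_locs[]/dst_locs[] below: a location
less than -32 denotes FPR number (-loc - 32); a location between -3 and
-10 denotes GPR number -loc; a non-negative location is a byte offset
into the stack parameter area.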
703 for(i=0;typeString[i];i++)
705 char t = typeString[i];
706 if((t == 'f' || t == 'd') && fpr <= 8)
707 src_locs[i] = dst_locs[i] = -32-(fpr++);
710 if((t == 'l' || t == 'L') && src_gpr <= 9)
712 if((src_gpr & 1) == 0)
714 src_locs[i] = -src_gpr;
717 else if((t == 'w' || t == 'W') && src_gpr <= 10)
719 src_locs[i] = -(src_gpr++);
723 if(t == 'l' || t == 'L' || t == 'd')
728 src_locs[i] = src_offset;
729 src_offset += (t == 'l' || t == 'L' || t == 'd') ? 8 : 4;
732 if((t == 'l' || t == 'L') && dst_gpr <= 9)
734 if((dst_gpr & 1) == 0)
736 dst_locs[i] = -dst_gpr;
739 else if((t == 'w' || t == 'W') && dst_gpr <= 10)
741 dst_locs[i] = -(dst_gpr++);
745 if(t == 'l' || t == 'L' || t == 'd')
750 dst_locs[i] = dst_offset;
751 dst_offset += (t == 'l' || t == 'L' || t == 'd') ? 8 : 4;
756 frameSize = dst_offset + 8;
757 frameSize = (frameSize+15) & ~0xF;
762 // allocate space for at most 4 insns per parameter
763 // plus 14 more instructions.
764 adjustor = allocateExec(4 * (4*n + 14),&code);
765 code = (unsigned*)adjustor;
767 *code++ = 0x48000008; // b *+8
// * Put the hptr in a place where freeHaskellFunctionPtr
//   can easily find it.
770 *code++ = (unsigned) hptr;
772 // * save the link register
773 *code++ = 0x7c0802a6; // mflr r0;
774 *code++ = 0x90010004; // stw r0, 4(r1);
775 // * and build a new stack frame
776 *code++ = OP_LO(0x9421, -frameSize); // stwu r1, -frameSize(r1)
778 // * now generate instructions to copy arguments
779 // from the old stack frame into the new stack frame.
782 if(src_locs[i] < -32)
783 ASSERT(dst_locs[i] == src_locs[i]);
784 else if(src_locs[i] < 0)
787 ASSERT(typeString[i] != 'f' && typeString[i] != 'd');
790 ASSERT(dst_locs[i] > -32);
791 // dst is in GPR, too.
793 if(typeString[i] == 'l' || typeString[i] == 'L')
797 | ((-dst_locs[i]+1) << 16)
798 | ((-src_locs[i]+1) << 11)
799 | ((-src_locs[i]+1) << 21);
803 | ((-dst_locs[i]) << 16)
804 | ((-src_locs[i]) << 11)
805 | ((-src_locs[i]) << 21);
809 if(typeString[i] == 'l' || typeString[i] == 'L')
811 // stw src+1, dst_offset+4(r1)
813 | ((-src_locs[i]+1) << 21)
817 // stw src, dst_offset(r1)
819 | ((-src_locs[i]) << 21)
825 ASSERT(dst_locs[i] >= 0);
826 ASSERT(typeString[i] != 'f' && typeString[i] != 'd');
828 if(typeString[i] == 'l' || typeString[i] == 'L')
830 // lwz r0, src_offset(r1)
832 | (src_locs[i] + frameSize + 8 + 4);
833 // stw r0, dst_offset(r1)
835 | (dst_locs[i] + 8 + 4);
837 // lwz r0, src_offset(r1)
839 | (src_locs[i] + frameSize + 8);
840 // stw r0, dst_offset(r1)
846 // * hptr will be the new first argument.
848 *code++ = OP_HI(0x3c60, hptr);
849 // ori r3,r3,lo(hptr)
850 *code++ = OP_LO(0x6063, hptr);
852 // * we need to return to a piece of code
853 // which will tear down the stack frame.
854 // lis r11,hi(obscure_ccall_ret_code)
855 *code++ = OP_HI(0x3d60, obscure_ccall_ret_code);
856 // ori r11,r11,lo(obscure_ccall_ret_code)
857 *code++ = OP_LO(0x616b, obscure_ccall_ret_code);
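// mtlr r11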
859 *code++ = 0x7d6803a6;
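// * jump to wptr
// lis r11,hi(wptr)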
863 *code++ = OP_HI(0x3d60, wptr);
864 // ori r11,r11,lo(wptr)
865 *code++ = OP_LO(0x616b, wptr);
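// mtctr r11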
867 *code++ = 0x7d6903a6;
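// bctr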
869 *code++ = 0x4e800420;
871 // Flush the Instruction cache:
873 unsigned *p = adjustor;
876 __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
880 __asm__ volatile ("sync\n\tisync");
884 #elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
886 #define OP_LO(op,lo) ((((unsigned)(op)) << 16) | (((unsigned)(lo)) & 0xFFFF))
887 #define OP_HI(op,hi) ((((unsigned)(op)) << 16) | (((unsigned)(hi)) >> 16))
889 /* The following code applies to all PowerPC and PowerPC64 platforms
890 whose stack layout is based on the AIX ABI.
892 Besides (obviously) AIX, this includes
893 Mac OS 9 and BeOS/PPC (may they rest in peace),
894 which use the 32-bit AIX ABI
powerpc64-linux,
which uses the 64-bit AIX ABI
897 and Darwin (Mac OS X),
898 which uses the same stack layout as AIX,
899 but no function descriptors.
901 The actual stack-frame shuffling is implemented out-of-line
902 in the function adjustorCode, in AdjustorAsm.S.
903 Here, we set up an AdjustorStub structure, which
904 is a function descriptor (on platforms that have function
905 descriptors) or a short piece of stub code (on Darwin) to call
adjustorCode with a pointer to the AdjustorStub struct loaded
into register r2.
909 One nice thing about this is that there is _no_ code generated at
910 runtime on the platforms that have function descriptors.
912 AdjustorStub *adjustorStub;
913 int sz = 0, extra_sz, total_sz;
915 // from AdjustorAsm.s
916 // not declared as a function so that AIX-style
917 // fundescs can never get in the way.
918 extern void *adjustorCode;
921 adjustorStub = stgMallocBytes(sizeof(AdjustorStub), "createAdjustor");
923 adjustorStub = allocateExec(sizeof(AdjustorStub),&code);
925 adjustor = adjustorStub;
927 adjustorStub->code = (void*) &adjustorCode;
930 // function descriptors are a cool idea.
931 // We don't need to generate any code at runtime.
932 adjustorStub->toc = adjustorStub;
935 // no function descriptors :-(
936 // We need to do things "by hand".
937 #if defined(powerpc_HOST_ARCH)
938 // lis r2, hi(adjustorStub)
939 adjustorStub->lis = OP_HI(0x3c40, adjustorStub);
940 // ori r2, r2, lo(adjustorStub)
941 adjustorStub->ori = OP_LO(0x6042, adjustorStub);
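// lwz r0, (offset of the code field)(r2)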
943 adjustorStub->lwz = OP_LO(0x8002, (char*)(&adjustorStub->code)
944 - (char*)adjustorStub);
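// mtctr r0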
946 adjustorStub->mtctr = 0x7c0903a6;
948 adjustorStub->bctr = 0x4e800420;
950 barf("adjustor creation not supported on this platform");
953 // Flush the Instruction cache:
955 int n = sizeof(AdjustorStub)/sizeof(unsigned);
956 unsigned *p = (unsigned*)adjustor;
959 __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
963 __asm__ volatile ("sync\n\tisync");
967 // Calculate the size of the stack frame, in words.
968 sz = totalArgumentSize(typeString);
970 // The first eight words of the parameter area
971 // are just "backing store" for the parameters passed in
// the GPRs. extra_sz is the number of words beyond those first
// eight.
extra_sz = sz > 8 ? sz - 8 : 0;
978 // Calculate the total size of the stack frame.
979 total_sz = (6 /* linkage area */
980 + 8 /* minimum parameter area */
981 + 2 /* two extra arguments */
982 + extra_sz)*sizeof(StgWord);
984 // align to 16 bytes.
985 // AIX only requires 8 bytes, but who cares?
986 total_sz = (total_sz+15) & ~0xF;
988 // Fill in the information that adjustorCode in AdjustorAsm.S
989 // will use to create a new stack frame with the additional args.
990 adjustorStub->hptr = hptr;
991 adjustorStub->wptr = wptr;
992 adjustorStub->negative_framesize = -total_sz;
993 adjustorStub->extrawords_plus_one = extra_sz + 1;
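// Example (illustrative, 32-bit): a callback with sz = 10 argument words
// has extra_sz = 2, so total_sz = (6+8+2+2)*4 = 72, rounded up to 80;
// negative_framesize is then -80 and extrawords_plus_one is 3.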
996 #elif defined(ia64_HOST_ARCH)
998 Up to 8 inputs are passed in registers. We flush the last two inputs to
999 the stack, initially into the 16-byte scratch region left by the caller.
1000 We then shuffle the others along by 4 (taking 2 registers for ourselves
1001 to save return address and previous function state - we need to come back
1002 here on the way out to restore the stack, so this is a real function
1003 rather than just a trampoline).
1005 The function descriptor we create contains the gp of the target function
1006 so gp is already loaded correctly.
1008 [MLX] alloc r16=ar.pfs,10,2,0
1010 [MII] st8.spill [r12]=r38,8 // spill in6 (out4)
1011 mov r41=r37 // out7 = in5 (out3)
1012 mov r40=r36;; // out6 = in4 (out2)
1013 [MII] st8.spill [r12]=r39 // spill in7 (out5)
1015 mov r38=r34;; // out4 = in2 (out0)
1016 [MII] mov r39=r35 // out5 = in3 (out1)
1017 mov r37=r33 // out3 = in1 (loc1)
1018 mov r36=r32 // out2 = in0 (loc0)
1019 [MLX] adds r12=-24,r12 // update sp
1020 movl r34=hptr;; // out0 = hptr
1021 [MIB] mov r33=r16 // loc1 = ar.pfs
1022 mov r32=b0 // loc0 = retaddr
1023 br.call.sptk.many b0=b6;;
1025 [MII] adds r12=-16,r12
1030 br.ret.sptk.many b0;;
1033 /* These macros distribute a long constant into the two words of an MLX bundle */
1034 #define BITS(val,start,count) (((val) >> (start)) & ((1 << (count))-1))
1035 #define MOVL_LOWORD(val) (BITS(val,22,18) << 46)
1036 #define MOVL_HIWORD(val) ( (BITS(val,0,7) << 36) \
1037 | (BITS(val,7,9) << 50) \
1038 | (BITS(val,16,5) << 45) \
1039 | (BITS(val,21,1) << 44) \
1040 | (BITS(val,40,23)) \
1041 | (BITS(val,63,1) << 59))
1044 StgStablePtr stable;
1045 IA64FunDesc *wdesc = (IA64FunDesc *)wptr;
1046 StgWord64 wcode = wdesc->ip;
1050 /* we allocate on the Haskell heap since malloc'd memory isn't
1051 * executable - argh */
1052 /* Allocated memory is word-aligned (8 bytes) but functions on ia64
1053 * must be aligned to 16 bytes. We allocate an extra 8 bytes of
1054 * wiggle room so that we can put the code on a 16 byte boundary. */
1055 adjustor = stgAllocStable(sizeof(IA64FunDesc)+18*8+8, &stable);
1057 fdesc = (IA64FunDesc *)adjustor;
1058 code = (StgWord64 *)(fdesc + 1);
1059 /* add 8 bytes to code if needed to align to a 16-byte boundary */
1060 if ((StgWord64)code & 15) code++;
1061 fdesc->ip = (StgWord64)code;
1062 fdesc->gp = wdesc->gp;
1064 code[0] = 0x0000058004288004 | MOVL_LOWORD(wcode);
1065 code[1] = 0x6000000220000000 | MOVL_HIWORD(wcode);
1066 code[2] = 0x029015d818984001;
1067 code[3] = 0x8401200500420094;
1068 code[4] = 0x886011d8189c0001;
1069 code[5] = 0x84011004c00380c0;
1070 code[6] = 0x0250210046013800;
1071 code[7] = 0x8401000480420084;
1072 code[8] = 0x0000233f19a06005 | MOVL_LOWORD((StgWord64)hptr);
1073 code[9] = 0x6000000440000000 | MOVL_HIWORD((StgWord64)hptr);
1074 code[10] = 0x0200210020010811;
1075 code[11] = 0x1080006800006200;
1076 code[12] = 0x0000210018406000;
1077 code[13] = 0x00aa021000038005;
1078 code[14] = 0x000000010000001d;
1079 code[15] = 0x0084000880000200;
1081 /* save stable pointers in convenient form */
1082 code[16] = (StgWord64)hptr;
1083 code[17] = (StgWord64)stable;
1086 barf("adjustor creation not supported on this platform");
1101 freeHaskellFunctionPtr(void* ptr)
1103 #if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
1104 if ( *(unsigned char*)ptr != 0x68 &&
1105 *(unsigned char*)ptr != 0x58 ) {
1106 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1110 /* Free the stable pointer first..*/
1111 if (*(unsigned char*)ptr == 0x68) { /* Aha, a ccall adjustor! */
1112 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x01)));
1114 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x02)));
1116 #elif defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
1117 if ( *(unsigned char*)ptr != 0xe8 ) {
1118 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1121 freeStablePtr(((AdjustorStub*)ptr)->hptr);
1122 #elif defined(x86_64_HOST_ARCH)
1123 if ( *(StgWord16 *)ptr == 0x894d ) {
1124 freeStablePtr(*(StgStablePtr*)(ptr+0x20));
1125 } else if ( *(StgWord16 *)ptr == 0x5141 ) {
1126 freeStablePtr(*(StgStablePtr*)(ptr+0x30));
1128 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1131 #elif defined(sparc_HOST_ARCH)
1132 if ( *(unsigned long*)ptr != 0x9C23A008UL ) {
1133 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1137 /* Free the stable pointer first..*/
1138 freeStablePtr(*((StgStablePtr*)((unsigned long*)ptr + 11)));
1139 #elif defined(alpha_HOST_ARCH)
if ( *(StgWord64*)ptr != 0x4610041246520414L ) { /* first quadword written by createAdjustor above */
1141 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1145 /* Free the stable pointer first..*/
freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x20))); /* hptr is stored at code[4], byte offset 0x20 */
1147 #elif defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
1148 if ( *(StgWord*)ptr != 0x48000008 ) {
1149 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1152 freeStablePtr(((StgStablePtr*)ptr)[1]);
1153 #elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
1154 extern void* adjustorCode;
1155 if ( ((AdjustorStub*)ptr)->code != (StgFunPtr) &adjustorCode ) {
1156 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1159 freeStablePtr(((AdjustorStub*)ptr)->hptr);
1160 #elif defined(ia64_HOST_ARCH)
1161 IA64FunDesc *fdesc = (IA64FunDesc *)ptr;
1162 StgWord64 *code = (StgWord64 *)(fdesc+1);
1164 if (fdesc->ip != (StgWord64)code) {
1165 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
1168 freeStablePtr((StgStablePtr)code[16]);
1169 freeStablePtr((StgStablePtr)code[17]);
1174 // Can't write to this memory, it is only executable:
1175 // *((unsigned char*)ptr) = '\0';
1180 #endif // !USE_LIBFFI_FOR_ADJUSTORS