1 /* -----------------------------------------------------------------------------
2 * Foreign export adjustor thunks
6 * ---------------------------------------------------------------------------*/
8 /* A little bit of background...
10 An adjustor thunk is a dynamically allocated code snippet that allows
11 Haskell closures to be viewed as C function pointers.
13 Stable pointers provide a way for the outside world to get access to,
14 and evaluate, Haskell heap objects, with the RTS providing a small
15 range of ops for doing so. So, assuming we've got a stable pointer in
16 our hand in C, we can jump into the Haskell world and evaluate a callback
17 procedure, say. This works OK in some cases where callbacks are used, but
18 does require the external code to know about stable pointers and how to deal
19 with them. We'd like to hide the Haskell-nature of a callback and have it
20 be invoked just like any other C function pointer.
22 Enter adjustor thunks. An adjustor thunk is a little piece of code
23 that's generated on-the-fly (one per Haskell closure being exported)
24 that, when entered using some 'universal' calling convention (e.g., the
25 C calling convention on platform X), pushes an implicit stable pointer
26 (to the Haskell callback) before calling another (static) C function stub
27 which takes care of entering the Haskell code via its stable pointer.
29 An adjustor thunk is allocated on the C heap, and is called from within
30 Haskell just before handing out the function pointer to the Haskell (IO)
31 action. User code should never have to invoke it explicitly.
33 An adjustor thunk differs from a C function pointer in one respect: when
34 the code is through with it, it has to be freed in order to release Haskell
and C resources. Failure to do so will result in memory leaks on both the C and
Haskell sides.
----------------------------------------------------------------------------- */
39 #include "PosixSource.h"
41 #include "RtsExternal.h"
45 #if defined(USE_LIBFFI)
51 freeHaskellFunctionPtr(void* ptr)
55 cl = (ffi_closure*)ptr;
56 freeStablePtr(cl->user_data);
57 stgFree(cl->cif->arg_types);
62 static ffi_type * char_to_ffi_type(char c)
65 case 'v': return &ffi_type_void;
66 case 'f': return &ffi_type_float;
67 case 'd': return &ffi_type_double;
68 case 'L': return &ffi_type_sint64;
69 case 'l': return &ffi_type_uint64;
70 case 'W': return &ffi_type_sint32;
71 case 'w': return &ffi_type_uint32;
72 case 'S': return &ffi_type_sint16;
73 case 's': return &ffi_type_uint16;
74 case 'B': return &ffi_type_sint8;
75 case 'b': return &ffi_type_uint8;
76 default: barf("char_to_ffi_type: unknown type '%c'", c);
/* createAdjustor (libffi version): build an ffi_closure that, when
 * called through the returned code pointer, enters the Haskell RTS
 * via the wrapper 'wptr' with 'hptr' as closure user data.
 * typeString[0] encodes the result type; the remaining characters
 * encode the argument types (see char_to_ffi_type).
 *
 * NOTE(review): declarations, the cconv switch and surrounding braces
 * appear elided in this excerpt; fragments kept verbatim. */
createAdjustor (int cconv,
    ffi_type *result_type;
    /* first character is the result type, so one fewer argument */
    n_args = strlen(typeString) - 1;
    cif = stgMallocBytes(sizeof(ffi_cif), "createAdjustor");
    arg_types = stgMallocBytes(n_args * sizeof(ffi_type*), "createAdjustor");
    result_type = char_to_ffi_type(typeString[0]);
    for (i=0; i < n_args; i++) {
        arg_types[i] = char_to_ffi_type(typeString[i+1]);
#ifdef mingw32_TARGET_OS
    case 0: /* stdcall */
        abi = FFI_DEFAULT_ABI;
        barf("createAdjustor: convention %d not supported on this platform", cconv);
    r = ffi_prep_cif(cif, abi, n_args, result_type, arg_types);
    if (r != FFI_OK) barf("ffi_prep_cif failed: %d", r);
    // ToDo: use ffi_closure_alloc()
    /* the closure must live in executable memory */
    cl = allocateExec(sizeof(ffi_closure));
    r = ffi_prep_closure(cl, cif, (void*)wptr, hptr/*userdata*/);
    if (r != FFI_OK) barf("ffi_prep_closure failed: %d", r);
126 #else // To end of file...
/* ------------------------------------------------------------------
 * Hand-written (non-libffi) adjustor machinery, per architecture.
 * NOTE(review): several lines (asm() wrappers, #else/#endif) appear
 * elided in this excerpt; the fragments below are kept verbatim.
 * ------------------------------------------------------------------ */
#if defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
/* Symbols get a leading underscore on ABIs that mangle C names. */
#ifdef LEADING_UNDERSCORE
#define UNDERSCORE "_"
#define UNDERSCORE ""
#if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
/*
  Now here's something obscure for you:

  When generating an adjustor thunk that uses the C calling
  convention, we have to make sure that the thunk kicks off
  the process of jumping into Haskell with a tail jump. Why?
  Because as a result of jumping in into Haskell we may end
  up freeing the very adjustor thunk we came from using
  freeHaskellFunctionPtr(). Hence, we better not return to
  the adjustor code on our way out, since it could by then

  The fix is readily at hand, just include the opcodes
  for the C stack fixup code that we need to perform when
  returning in some static piece of memory and arrange
  to return to it before tail jumping from the adjustor thunk.
*/
static void GNUC3_ATTRIBUTE(used) obscure_ccall_wrapper(void)
".globl " UNDERSCORE "obscure_ccall_ret_code\n"
UNDERSCORE "obscure_ccall_ret_code:\n\t"
/* pop the hptr the adjustor pushed, then return to the real caller */
"addl $0x4, %esp\n\t"
extern void obscure_ccall_ret_code(void);
#if defined(x86_64_HOST_ARCH)
static void GNUC3_ATTRIBUTE(used) obscure_ccall_wrapper(void)
".globl " UNDERSCORE "obscure_ccall_ret_code\n"
UNDERSCORE "obscure_ccall_ret_code:\n\t"
/* pop the quadword the adjustor flushed to the stack, then return */
"addq $0x8, %rsp\n\t"
extern void obscure_ccall_ret_code(void);
#if defined(alpha_HOST_ARCH)
/* To get the definition of PAL_imb: */
# if defined(linux_HOST_OS)
# include <asm/pal.h>
# include <machine/pal.h>
194 #if defined(ia64_HOST_ARCH)
196 /* Layout of a function descriptor */
197 typedef struct _IA64FunDesc {
203 stgAllocStable(size_t size_in_bytes, StgStablePtr *stable)
206 nat data_size_in_words, total_size_in_words;
208 /* round up to a whole number of words */
209 data_size_in_words = (size_in_bytes + sizeof(W_) + 1) / sizeof(W_);
210 total_size_in_words = sizeofW(StgArrWords) + data_size_in_words;
212 /* allocate and fill it in */
213 arr = (StgArrWords *)allocate(total_size_in_words);
214 SET_ARR_HDR(arr, &stg_ARR_WORDS_info, CCCS, data_size_in_words);
216 /* obtain a stable ptr */
217 *stable = getStablePtr((StgPtr)arr);
219 /* and return a ptr to the goods inside the array */
220 return(&(arr->payload));
/* NOTE(review): the AdjustorStub definitions below are heavily elided
 * (field lists and #endif lines are missing); fragments verbatim. */
#if defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
/* 32-bit PowerPC/Linux static return-path stub, analogous to the x86
 * obscure_ccall_ret_code above. */
__asm__("obscure_ccall_ret_code:\n\t"
extern void obscure_ccall_ret_code(void);
#if defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
#if !(defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS))

/* !!! !!! WARNING: !!! !!!
 * This structure is accessed from AdjustorAsm.s
 * Any changes here have to be mirrored in the offsets there.
 */
typedef struct AdjustorStub {
#if defined(powerpc_HOST_ARCH) && defined(darwin_HOST_OS)
#elif defined(powerpc64_HOST_ARCH) && defined(darwin_HOST_OS)
    /* powerpc64-darwin: just guessing that it won't use fundescs. */
    /* fundesc-based ABIs */
    StgInt negative_framesize;
    StgInt extrawords_plus_one;
#if defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)

/* !!! !!! WARNING: !!! !!!
 * This structure is accessed from AdjustorAsm.s
 * Any changes here have to be mirrored in the offsets there.
 */
typedef struct AdjustorStub {
    unsigned char call[8];
    StgInt argument_size;
#if defined(darwin_HOST_OS) || defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
/* Count the number of argument words described by typeString (one
 * character per argument; see char_to_ffi_type for the letter
 * encoding).  On 32-bit platforms doubles and 64-bit integers occupy
 * two words; everything else counts as one.
 *
 * NOTE(review): the loop/switch skeleton was elided from the excerpt
 * and has been reconstructed from the surviving comments -- confirm
 * against the unabridged file. */
static int totalArgumentSize(char *typeString)
{
    int sz = 0;
    while (*typeString)
    {
        char t = *typeString++;

        switch (t)
        {
            // on 32-bit platforms, Double and Int64 occupy two words.
            case 'd':
            case 'l':
            case 'L':
                if (sizeof(void*) == 4)
                {
                    sz += 2;
                    break;
                }
                /* fallthrough: 64-bit platforms use one word */
            // everything else is one word.
            default:
                sz += 1;
                break;
        }
    }
    return sz;
}
/* createAdjustor (hand-written version): emit a small machine-code
 * thunk that pushes hptr and tail-jumps to wptr.
 * NOTE(review): declarations, the switch over cconv, braces and
 * #endif lines are elided in this excerpt; fragments verbatim. */
createAdjustor(int cconv, StgStablePtr hptr,
#if !defined(powerpc_HOST_ARCH) && !defined(powerpc64_HOST_ARCH) && !defined(x86_64_HOST_ARCH)
  void *adjustor = NULL;

  case 0: /* _stdcall */
#if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
    /* Magic constant computed by inspecting the code length of
       the following assembly language snippet
       (offset and machine code prefixed):

     <0>: 58                popl %eax              # temp. remove ret addr..
     <1>: 68 fd fc fe fa    pushl 0xfafefcfd       # constant is large enough to
                                                   # hold a StgStablePtr
     <6>: 50                pushl %eax             # put back ret. addr
     <7>: b8 fa ef ff 00    movl $0x00ffeffa, %eax # load up wptr
     <c>: ff e0             jmp %eax               # and jump to it.
              # the callee cleans up the stack
    */
    adjustor = allocateExec(14);
    unsigned char *const adj_code = (unsigned char *)adjustor;
    adj_code[0x00] = (unsigned char)0x58; /* popl %eax */

    adj_code[0x01] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
    *((StgStablePtr*)(adj_code + 0x02)) = (StgStablePtr)hptr;

    adj_code[0x06] = (unsigned char)0x50; /* pushl %eax */

    adj_code[0x07] = (unsigned char)0xb8; /* movl $wptr, %eax */
    *((StgFunPtr*)(adj_code + 0x08)) = (StgFunPtr)wptr;

    adj_code[0x0c] = (unsigned char)0xff; /* jmp %eax */
    adj_code[0x0d] = (unsigned char)0xe0;

#if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
    /* Magic constant computed by inspecting the code length of
       the following assembly language snippet
       (offset and machine code prefixed):

  <00>: 68 ef be ad de    pushl $0xdeadbeef      # constant is large enough to
                                                 # hold a StgStablePtr
  <05>: b8 fa ef ff 00    movl $0x00ffeffa, %eax # load up wptr
  <0a>: 68 ef be ad de    pushl $obscure_ccall_ret_code # push the return address
  <0f>: ff e0             jmp *%eax              # jump to wptr

    The ccall'ing version is a tad different, passing in the return
    address of the caller to the auto-generated C stub (which enters
    via the stable pointer.) (The auto-generated C stub is in on this
    game, don't worry :-)

    See the comment next to obscure_ccall_ret_code why we need to
    perform a tail jump instead of a call, followed by some C stack

    Note: The adjustor makes the assumption that any return value
    coming back from the C stub is not stored on the stack.
    That's (thankfully) the case here with the restricted set of
    return types that we support.
    */
    adjustor = allocateExec(17);
    unsigned char *const adj_code = (unsigned char *)adjustor;

    adj_code[0x00] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
    *((StgStablePtr*)(adj_code+0x01)) = (StgStablePtr)hptr;

    adj_code[0x05] = (unsigned char)0xb8; /* movl $wptr, %eax */
    *((StgFunPtr*)(adj_code + 0x06)) = (StgFunPtr)wptr;

    adj_code[0x0a] = (unsigned char)0x68; /* pushl obscure_ccall_ret_code */
    *((StgFunPtr*)(adj_code + 0x0b)) =
        (StgFunPtr)obscure_ccall_ret_code;

    adj_code[0x0f] = (unsigned char)0xff; /* jmp *%eax */
    adj_code[0x10] = (unsigned char)0xe0;

#elif defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
    /*
      What's special about Darwin/Mac OS X on i386?
      It wants the stack to stay 16-byte aligned.

      We offload most of the work to AdjustorAsm.S.
    */
    AdjustorStub *adjustorStub = allocateExec(sizeof(AdjustorStub));
    adjustor = adjustorStub;

    extern void adjustorCode(void);
    int sz = totalArgumentSize(typeString);

    adjustorStub->call[0] = 0xe8;   /* e8 = relative call to adjustorCode */
    *(long*)&adjustorStub->call[1] = ((char*)&adjustorCode) - ((char*)adjustorStub + 5);
    adjustorStub->hptr = hptr;
    adjustorStub->wptr = wptr;

    // The adjustor puts the following things on the stack:
    // 2.) padding and (a copy of) the arguments
    // 3.) a dummy argument
    // 5.) return address (for returning to the adjustor)
    // All these have to add up to a multiple of 16.

    // first, include everything in frame_size
    adjustorStub->frame_size = sz * 4 + 16;
    // round up to a multiple of 16
    adjustorStub->frame_size = (adjustorStub->frame_size + 15) & ~15;
    // only count 2.) and 3.) as part of frame_size
    adjustorStub->frame_size -= 12;
    adjustorStub->argument_size = sz;
#elif defined(x86_64_HOST_ARCH)
    /*
      NOTE(review): parts of this case (locals, the if/else over the
      argument count, #endif lines) are elided; fragments verbatim.

    %rdi,%rsi,%rdx,%rcx,%r8,%r9 = arg0..arg6

    if there are <6 integer args, then we can just push the
    StablePtr into %edi and shuffle the other args up.

    If there are >=6 integer args, then we have to flush one arg
    to the stack, and arrange to adjust the stack ptr on return.
    The stack will be rearranged to this:

    return address *** <-- dummy arg in stub fn.
    obscure_ccall_ret_code

    This unfortunately means that the type of the stub function
    must have a dummy argument for the original return address
    pointer inserted just after the 6th integer argument.

    Code for the simple case:

   0: 4d 89 c1             mov %r8,%r9
   3: 49 89 c8             mov %rcx,%r8
   6: 48 89 d1             mov %rdx,%rcx
   9: 48 89 f2             mov %rsi,%rdx
   c: 48 89 fe             mov %rdi,%rsi
   f: 48 8b 3d 0a 00 00 00 mov 10(%rip),%rdi
  16: ff 25 0c 00 00 00    jmpq *12(%rip)
  20: .quad 0 # aligned on 8-byte boundary
  28: .quad 0 # aligned on 8-byte boundary

    And the version for >=6 integer arguments:

   2: ff 35 20 00 00 00    pushq 32(%rip) # 28 <ccall_adjustor+0x28>
   8: 4d 89 c1             mov %r8,%r9
   b: 49 89 c8             mov %rcx,%r8
   e: 48 89 d1             mov %rdx,%rcx
  11: 48 89 f2             mov %rsi,%rdx
  14: 48 89 fe             mov %rdi,%rsi
  17: 48 8b 3d 12 00 00 00 mov 18(%rip),%rdi # 30 <ccall_adjustor+0x30>
  1e: ff 25 14 00 00 00    jmpq *20(%rip) # 38 <ccall_adjustor+0x38>
  28: .quad 0 # aligned on 8-byte boundary
  30: .quad 0 # aligned on 8-byte boundary
  38: .quad 0 # aligned on 8-byte boundary
    */

    // determine whether we have 6 or more integer arguments,
    // and therefore need to flush one to the stack.
    for (c = typeString; *c != '\0'; c++) {
        if (*c != 'f' && *c != 'd') i++;

        /* simple case: machine code words for the first listing above */
        adjustor = allocateExec(0x30);

        *(StgInt32 *)adjustor = 0x49c1894d;
        *(StgInt32 *)(adjustor+0x4) = 0x8948c889;
        *(StgInt32 *)(adjustor+0x8) = 0xf28948d1;
        *(StgInt32 *)(adjustor+0xc) = 0x48fe8948;
        *(StgInt32 *)(adjustor+0x10) = 0x000a3d8b;
        *(StgInt32 *)(adjustor+0x14) = 0x25ff0000;
        *(StgInt32 *)(adjustor+0x18) = 0x0000000c;
        *(StgInt64 *)(adjustor+0x20) = (StgInt64)hptr;
        *(StgInt64 *)(adjustor+0x28) = (StgInt64)wptr;

        /* >=6 integer args: flush one to the stack; return through
           obscure_ccall_ret_code to pop it again */
        adjustor = allocateExec(0x40);

        *(StgInt32 *)adjustor = 0x35ff5141;
        *(StgInt32 *)(adjustor+0x4) = 0x00000020;
        *(StgInt32 *)(adjustor+0x8) = 0x49c1894d;
        *(StgInt32 *)(adjustor+0xc) = 0x8948c889;
        *(StgInt32 *)(adjustor+0x10) = 0xf28948d1;
        *(StgInt32 *)(adjustor+0x14) = 0x48fe8948;
        *(StgInt32 *)(adjustor+0x18) = 0x00123d8b;
        *(StgInt32 *)(adjustor+0x1c) = 0x25ff0000;
        *(StgInt32 *)(adjustor+0x20) = 0x00000014;

        *(StgInt64 *)(adjustor+0x28) = (StgInt64)obscure_ccall_ret_code;
        *(StgInt64 *)(adjustor+0x30) = (StgInt64)hptr;
        *(StgInt64 *)(adjustor+0x38) = (StgInt64)wptr;
#elif defined(sparc_HOST_ARCH)
    /* Magic constant computed by inspecting the code length of the following
       assembly language snippet (offset and machine code prefixed):

  <00>: 9C23A008 sub %sp, 8, %sp ! make room for %o4/%o5 in caller's frame
  <04>: DA23A060 st %o5, [%sp + 96] ! shift registers by 2 positions
  <08>: D823A05C st %o4, [%sp + 92]
  <0C>: 9A10000B mov %o3, %o5
  <10>: 9810000A mov %o2, %o4
  <14>: 96100009 mov %o1, %o3
  <18>: 94100008 mov %o0, %o2
  <1C>: 13000000 sethi %hi(wptr), %o1 ! load up wptr (1 of 2)
  <20>: 11000000 sethi %hi(hptr), %o0 ! load up hptr (1 of 2)
  <24>: 81C26000 jmp %o1 + %lo(wptr) ! jump to wptr (load 2 of 2)
  <28>: 90122000 or %o0, %lo(hptr), %o0 ! load up hptr (2 of 2, delay slot)
  <2C> 00000000 ! place for getting hptr back easily

       ccall'ing on SPARC is easy, because we are quite lucky to push a
       multiple of 8 bytes (1 word hptr + 1 word dummy arg) in front of the
       existing arguments (note that %sp must stay double-word aligned at
       all times, see ABI spec at http://www.sparc.org/standards/psABI3rd.pdf).
       To do this, we extend the *caller's* stack frame by 2 words and shift
       the output registers used for argument passing (%o0 - %o5, we are a *leaf*
       procedure because of the tail-jump) by 2 positions. This makes room in
       %o0 and %o1 for the additinal arguments, namely hptr and a dummy (used
       for destination addr of jump on SPARC, return address on x86, ...). This
       shouldn't cause any problems for a C-like caller: alloca is implemented
       similarly, and local variables should be accessed via %fp, not %sp. In a
       nutshell: This should work! (Famous last words! :-)
    */
    adjustor = allocateExec(4*(11+1));
    unsigned long *const adj_code = (unsigned long *)adjustor;

    adj_code[ 0] = 0x9C23A008UL; /* sub %sp, 8, %sp */
    adj_code[ 1] = 0xDA23A060UL; /* st %o5, [%sp + 96] */
    adj_code[ 2] = 0xD823A05CUL; /* st %o4, [%sp + 92] */
    adj_code[ 3] = 0x9A10000BUL; /* mov %o3, %o5 */
    adj_code[ 4] = 0x9810000AUL; /* mov %o2, %o4 */
    adj_code[ 5] = 0x96100009UL; /* mov %o1, %o3 */
    adj_code[ 6] = 0x94100008UL; /* mov %o0, %o2 */
    adj_code[ 7] = 0x13000000UL; /* sethi %hi(wptr), %o1 */
    adj_code[ 7] |= ((unsigned long)wptr) >> 10;
    adj_code[ 8] = 0x11000000UL; /* sethi %hi(hptr), %o0 */
    adj_code[ 8] |= ((unsigned long)hptr) >> 10;
    adj_code[ 9] = 0x81C26000UL; /* jmp %o1 + %lo(wptr) */
    adj_code[ 9] |= ((unsigned long)wptr) & 0x000003FFUL;
    adj_code[10] = 0x90122000UL; /* or %o0, %lo(hptr), %o0 */
    adj_code[10] |= ((unsigned long)hptr) & 0x000003FFUL;

    /* word 11 holds hptr so freeHaskellFunctionPtr can recover it */
    adj_code[11] = (unsigned long)hptr;

    /* keep the i-cache coherent with the freshly written code */
    asm("flush %0" : : "r" (adj_code ));
    asm("flush %0" : : "r" (adj_code + 2));
    asm("flush %0" : : "r" (adj_code + 4));
    asm("flush %0" : : "r" (adj_code + 6));
    asm("flush %0" : : "r" (adj_code + 10));

    /* max. 5 instructions latency, and we need at >= 1 for returning */
#elif defined(alpha_HOST_ARCH)
    /* Magic constant computed by inspecting the code length of
       the following assembly language snippet
       (offset and machine code prefixed; note that the machine code
       shown is longwords stored in little-endian order):

  <00>: 46520414 mov a2, a4
  <04>: 46100412 mov a0, a2
  <08>: a61b0020 ldq a0, 0x20(pv) # load up hptr
  <0c>: 46730415 mov a3, a5
  <10>: a77b0028 ldq pv, 0x28(pv) # load up wptr
  <14>: 46310413 mov a1, a3
  <18>: 6bfb---- jmp (pv), <hint> # jump to wptr (with hint)
  <1c>: 00000000 # padding for alignment
  <20>: [8 bytes for hptr quadword]
  <28>: [8 bytes for wptr quadword]

       The "computed" jump at <08> above is really a jump to a fixed
       location. Accordingly, we place an always-correct hint in the
       jump instruction, namely the address offset from <0c> to wptr,
       divided by 4, taking the lowest 14 bits.

       We only support passing 4 or fewer argument words, for the same
       reason described under sparc_HOST_ARCH above by JRS, 21 Aug 01.
       On the Alpha the first 6 integer arguments are in a0 through a5,
       and the rest on the stack. Hence we want to shuffle the original
       caller's arguments by two.

       On the Alpha the calling convention is so complex and dependent
       on the callee's signature -- for example, the stack pointer has
       to be a multiple of 16 -- that it seems impossible to me [ccshan]
       to handle the general case correctly without changing how the
       adjustor is called from C. For now, our solution of shuffling
       registers only and ignoring the stack only works if the original
       caller passed 4 or fewer argument words.

       TODO: Depending on how much allocation overhead stgMallocBytes uses for
       header information (more precisely, if the overhead is no more than
       4 bytes), we should move the first three instructions above down by
       4 bytes (getting rid of the nop), hence saving memory. [ccshan]
    */
    ASSERT(((StgWord64)wptr & 3) == 0);
    adjustor = allocateExec(48);
    StgWord64 *const code = (StgWord64 *)adjustor;

    code[0] = 0x4610041246520414L;
    code[1] = 0x46730415a61b0020L;
    code[2] = 0x46310413a77b0028L;
    /* jmp (pv) with a hint pointing at wptr (lowest 14 bits of the
       longword offset from the instruction after the jump) */
    code[3] = 0x000000006bfb0000L
        | (((StgWord32*)(wptr) - (StgWord32*)(code) - 3) & 0x3fff);

    code[4] = (StgWord64)hptr;
    code[5] = (StgWord64)wptr;

    /* Ensure that instruction cache is consistent with our new code */
    __asm__ volatile("call_pal %0" : : "i" (PAL_imb));
#elif defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)

/* Build a 32-bit PowerPC opcode from a 16-bit opcode half and the
   low/high half of a 32-bit operand. */
#define OP_LO(op,lo) ((((unsigned)(op)) << 16) | (((unsigned)(lo)) & 0xFFFF))
#define OP_HI(op,hi) ((((unsigned)(op)) << 16) | (((unsigned)(hi)) >> 16))

    /* The PowerPC Linux (32-bit) calling convention is annoyingly complex.
       We need to calculate all the details of the stack frame layout,
       taking into account the types of all the arguments, and then
       generate code on the fly. */

    /* NOTE(review): braces, else-branches and some statements of this
       case are elided in this excerpt; fragments kept verbatim. */
    int src_gpr = 3, dst_gpr = 5;
    int src_offset = 0, dst_offset = 0;
    int n = strlen(typeString),i;
    int src_locs[n], dst_locs[n];

    /*
       Calculate where the arguments should go.
       src_locs[] will contain the locations of the arguments in the
       original stack frame passed to the adjustor.
       dst_locs[] will contain the locations of the arguments after the
       adjustor runs, on entry to the wrapper proc pointed to by wptr.

       This algorithm is based on the one described on page 3-19 of the
       System V ABI PowerPC Processor Supplement.
    */
    for(i=0;typeString[i];i++)
        char t = typeString[i];
        /* negative locations <= -32 denote FPRs, -1..-31 denote GPRs,
           non-negative values are stack offsets */
        if((t == 'f' || t == 'd') && fpr <= 8)
            src_locs[i] = dst_locs[i] = -32-(fpr++);
            if((t == 'l' || t == 'L') && src_gpr <= 9)
                /* 64-bit values go in an even/odd GPR pair */
                if((src_gpr & 1) == 0)
                src_locs[i] = -src_gpr;
            else if((t == 'w' || t == 'W') && src_gpr <= 10)
                src_locs[i] = -(src_gpr++);
                if((t == 'l' || t == 'L' || t == 'd')
                src_locs[i] = src_offset;
                src_offset += (t == 'l' || t == 'L' || t == 'd') ? 8 : 4;
            if((t == 'l' || t == 'L') && dst_gpr <= 9)
                if((dst_gpr & 1) == 0)
                dst_locs[i] = -dst_gpr;
            else if((t == 'w' || t == 'W') && dst_gpr <= 10)
                dst_locs[i] = -(dst_gpr++);
                if(t == 'l' || t == 'L' || t == 'd')
                dst_locs[i] = dst_offset;
                dst_offset += (t == 'l' || t == 'L' || t == 'd') ? 8 : 4;

    frameSize = dst_offset + 8;
    frameSize = (frameSize+15) & ~0xF;

    // allocate space for at most 4 insns per parameter
    // plus 14 more instructions.
    adjustor = allocateExec(4 * (4*n + 14));
    code = (unsigned*)adjustor;

    *code++ = 0x48000008; // b *+8
    // * Put the hptr in a place where freeHaskellFunctionPtr
    *code++ = (unsigned) hptr;

    // * save the link register
    *code++ = 0x7c0802a6; // mflr r0;
    *code++ = 0x90010004; // stw r0, 4(r1);
    // * and build a new stack frame
    *code++ = OP_LO(0x9421, -frameSize); // stwu r1, -frameSize(r1)

    // * now generate instructions to copy arguments
    //   from the old stack frame into the new stack frame.
        if(src_locs[i] < -32)
            ASSERT(dst_locs[i] == src_locs[i]);
        else if(src_locs[i] < 0)
            ASSERT(typeString[i] != 'f' && typeString[i] != 'd');
            ASSERT(dst_locs[i] > -32);
            // dst is in GPR, too.

            if(typeString[i] == 'l' || typeString[i] == 'L')
                | ((-dst_locs[i]+1) << 16)
                | ((-src_locs[i]+1) << 11)
                | ((-src_locs[i]+1) << 21);
                | ((-dst_locs[i]) << 16)
                | ((-src_locs[i]) << 11)
                | ((-src_locs[i]) << 21);
            if(typeString[i] == 'l' || typeString[i] == 'L')
                // stw src+1, dst_offset+4(r1)
                | ((-src_locs[i]+1) << 21)
                // stw src, dst_offset(r1)
                | ((-src_locs[i]) << 21)
            ASSERT(dst_locs[i] >= 0);
            ASSERT(typeString[i] != 'f' && typeString[i] != 'd');

            if(typeString[i] == 'l' || typeString[i] == 'L')
                // lwz r0, src_offset(r1)
                | (src_locs[i] + frameSize + 8 + 4);
                // stw r0, dst_offset(r1)
                | (dst_locs[i] + 8 + 4);
                // lwz r0, src_offset(r1)
                | (src_locs[i] + frameSize + 8);
                // stw r0, dst_offset(r1)

    // * hptr will be the new first argument.
    *code++ = OP_HI(0x3c60, hptr);
    // ori r3,r3,lo(hptr)
    *code++ = OP_LO(0x6063, hptr);

    // * we need to return to a piece of code
    //   which will tear down the stack frame.
    // lis r11,hi(obscure_ccall_ret_code)
    *code++ = OP_HI(0x3d60, obscure_ccall_ret_code);
    // ori r11,r11,lo(obscure_ccall_ret_code)
    *code++ = OP_LO(0x616b, obscure_ccall_ret_code);
    *code++ = 0x7d6803a6;   // mtlr r11

    *code++ = OP_HI(0x3d60, wptr);
    // ori r11,r11,lo(wptr)
    *code++ = OP_LO(0x616b, wptr);
    *code++ = 0x7d6903a6;   // mtctr r11
    *code++ = 0x4e800420;   // bctr

    // Flush the Instruction cache:
    unsigned *p = adjustor;
    __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
    __asm__ volatile ("sync\n\tisync");
#elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)

#define OP_LO(op,lo) ((((unsigned)(op)) << 16) | (((unsigned)(lo)) & 0xFFFF))
#define OP_HI(op,hi) ((((unsigned)(op)) << 16) | (((unsigned)(hi)) >> 16))

    /* The following code applies to all PowerPC and PowerPC64 platforms
       whose stack layout is based on the AIX ABI.

       Besides (obviously) AIX, this includes
       Mac OS 9 and BeOS/PPC (may they rest in peace),
       which use the 32-bit AIX ABI
       which uses the 64-bit AIX ABI
       and Darwin (Mac OS X),
       which uses the same stack layout as AIX,
       but no function descriptors.

       The actual stack-frame shuffling is implemented out-of-line
       in the function adjustorCode, in AdjustorAsm.S.
       Here, we set up an AdjustorStub structure, which
       is a function descriptor (on platforms that have function
       descriptors) or a short piece of stub code (on Darwin) to call
       adjustorCode with a pointer to the AdjustorStub struct loaded

       One nice thing about this is that there is _no_ code generated at
       runtime on the platforms that have function descriptors.
    */
    /* NOTE(review): #if/#else branches and braces are elided in this
       excerpt; fragments kept verbatim. */
    AdjustorStub *adjustorStub;
    int sz = 0, extra_sz, total_sz;

    // from AdjustorAsm.s
    // not declared as a function so that AIX-style
    // fundescs can never get in the way.
    extern void *adjustorCode;

    adjustorStub = stgMallocBytes(sizeof(AdjustorStub), "createAdjustor");
    adjustorStub = allocateExec(sizeof(AdjustorStub));
    adjustor = adjustorStub;

    adjustorStub->code = (void*) &adjustorCode;

    // function descriptors are a cool idea.
    // We don't need to generate any code at runtime.
    adjustorStub->toc = adjustorStub;

    // no function descriptors :-(
    // We need to do things "by hand".
#if defined(powerpc_HOST_ARCH)
    // lis r2, hi(adjustorStub)
    adjustorStub->lis = OP_HI(0x3c40, adjustorStub);
    // ori r2, r2, lo(adjustorStub)
    adjustorStub->ori = OP_LO(0x6042, adjustorStub);
    adjustorStub->lwz = OP_LO(0x8002, (char*)(&adjustorStub->code)
        - (char*)adjustorStub);
    adjustorStub->mtctr = 0x7c0903a6;
    adjustorStub->bctr = 0x4e800420;
    barf("adjustor creation not supported on this platform");

    // Flush the Instruction cache:
    int n = sizeof(AdjustorStub)/sizeof(unsigned);
    unsigned *p = (unsigned*)adjustor;
    __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
    __asm__ volatile ("sync\n\tisync");

    // Calculate the size of the stack frame, in words.
    sz = totalArgumentSize(typeString);

    // The first eight words of the parameter area
    // are just "backing store" for the parameters passed in
    // the GPRs. extra_sz is the number of words beyond those first

    // Calculate the total size of the stack frame.
    total_sz = (6 /* linkage area */
        + 8 /* minimum parameter area */
        + 2 /* two extra arguments */
        + extra_sz)*sizeof(StgWord);

    // align to 16 bytes.
    // AIX only requires 8 bytes, but who cares?
    total_sz = (total_sz+15) & ~0xF;

    // Fill in the information that adjustorCode in AdjustorAsm.S
    // will use to create a new stack frame with the additional args.
    adjustorStub->hptr = hptr;
    adjustorStub->wptr = wptr;
    adjustorStub->negative_framesize = -total_sz;
    adjustorStub->extrawords_plus_one = extra_sz + 1;
#elif defined(ia64_HOST_ARCH)
    /*
      Up to 8 inputs are passed in registers. We flush the last two inputs to
      the stack, initially into the 16-byte scratch region left by the caller.
      We then shuffle the others along by 4 (taking 2 registers for ourselves
      to save return address and previous function state - we need to come back
      here on the way out to restore the stack, so this is a real function
      rather than just a trampoline).

      The function descriptor we create contains the gp of the target function
      so gp is already loaded correctly.

      [MLX] alloc r16=ar.pfs,10,2,0
      [MII] st8.spill [r12]=r38,8 // spill in6 (out4)
      mov r41=r37 // out7 = in5 (out3)
      mov r40=r36;; // out6 = in4 (out2)
      [MII] st8.spill [r12]=r39 // spill in7 (out5)
      mov r38=r34;; // out4 = in2 (out0)
      [MII] mov r39=r35 // out5 = in3 (out1)
      mov r37=r33 // out3 = in1 (loc1)
      mov r36=r32 // out2 = in0 (loc0)
      [MLX] adds r12=-24,r12 // update sp
      movl r34=hptr;; // out0 = hptr
      [MIB] mov r33=r16 // loc1 = ar.pfs
      mov r32=b0 // loc0 = retaddr
      br.call.sptk.many b0=b6;;

      [MII] adds r12=-16,r12
      br.ret.sptk.many b0;;
    */

/* These macros distribute a long constant into the two words of an MLX bundle */
#define BITS(val,start,count) (((val) >> (start)) & ((1 << (count))-1))
#define MOVL_LOWORD(val) (BITS(val,22,18) << 46)
#define MOVL_HIWORD(val) ( (BITS(val,0,7) << 36) \
    | (BITS(val,7,9) << 50) \
    | (BITS(val,16,5) << 45) \
    | (BITS(val,21,1) << 44) \
    | (BITS(val,40,23)) \
    | (BITS(val,63,1) << 59))

    StgStablePtr stable;
    IA64FunDesc *wdesc = (IA64FunDesc *)wptr;
    StgWord64 wcode = wdesc->ip;

    /* we allocate on the Haskell heap since malloc'd memory isn't
     * executable - argh */
    /* Allocated memory is word-aligned (8 bytes) but functions on ia64
     * must be aligned to 16 bytes. We allocate an extra 8 bytes of
     * wiggle room so that we can put the code on a 16 byte boundary. */
    adjustor = stgAllocStable(sizeof(IA64FunDesc)+18*8+8, &stable);

    fdesc = (IA64FunDesc *)adjustor;
    code = (StgWord64 *)(fdesc + 1);
    /* add 8 bytes to code if needed to align to a 16-byte boundary */
    if ((StgWord64)code & 15) code++;
    fdesc->ip = (StgWord64)code;
    fdesc->gp = wdesc->gp;

    /* pre-assembled instruction bundles for the listing above, with
       the target ip and hptr spliced into the movl bundles */
    code[0] = 0x0000058004288004 | MOVL_LOWORD(wcode);
    code[1] = 0x6000000220000000 | MOVL_HIWORD(wcode);
    code[2] = 0x029015d818984001;
    code[3] = 0x8401200500420094;
    code[4] = 0x886011d8189c0001;
    code[5] = 0x84011004c00380c0;
    code[6] = 0x0250210046013800;
    code[7] = 0x8401000480420084;
    code[8] = 0x0000233f19a06005 | MOVL_LOWORD((StgWord64)hptr);
    code[9] = 0x6000000440000000 | MOVL_HIWORD((StgWord64)hptr);
    code[10] = 0x0200210020010811;
    code[11] = 0x1080006800006200;
    code[12] = 0x0000210018406000;
    code[13] = 0x00aa021000038005;
    code[14] = 0x000000010000001d;
    code[15] = 0x0084000880000200;

    /* save stable pointers in convenient form */
    code[16] = (StgWord64)hptr;
    code[17] = (StgWord64)stable;
    barf("adjustor creation not supported on this platform");
/* Free a hand-written adjustor: verify that ptr really points at one
 * of our generated thunks (per-architecture signature check), free the
 * embedded stable pointer, then scrub and release the thunk.
 * NOTE(review): braces, returns and the final freeExec/stgFree are
 * elided in this excerpt; fragments kept verbatim. */
freeHaskellFunctionPtr(void* ptr)
#if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
    /* 0x68 = pushl imm32 (ccall thunk), 0x58 = popl %eax (stdcall) */
    if ( *(unsigned char*)ptr != 0x68 &&
        *(unsigned char*)ptr != 0x58 ) {
        errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);

    /* Free the stable pointer first..*/
    if (*(unsigned char*)ptr == 0x68) { /* Aha, a ccall adjustor! */
        freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x01)));
        freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x02)));
#elif defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
    if ( *(unsigned char*)ptr != 0xe8 ) {   /* 0xe8 = call rel32, see AdjustorStub */
        errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
    freeStablePtr(((AdjustorStub*)ptr)->hptr);
#elif defined(x86_64_HOST_ARCH)
    /* first two code bytes distinguish the two x86_64 thunk layouts */
    if ( *(StgWord16 *)ptr == 0x894d ) {
        freeStablePtr(*(StgStablePtr*)(ptr+0x20));
    } else if ( *(StgWord16 *)ptr == 0x5141 ) {
        freeStablePtr(*(StgStablePtr*)(ptr+0x30));
        errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
#elif defined(sparc_HOST_ARCH)
    if ( *(unsigned long*)ptr != 0x9C23A008UL ) {
        errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);

    /* Free the stable pointer first..*/
    /* hptr lives in word 11 of the thunk (see createAdjustor) */
    freeStablePtr(*((StgStablePtr*)((unsigned long*)ptr + 11)));
#elif defined(alpha_HOST_ARCH)
    /* NOTE(review): this signature (and the +0x10 hptr offset below)
     * does not match the code emitted by the visible alpha branch of
     * createAdjustor, which stores hptr at offset 0x20 -- possibly a
     * stale check for an older layout; confirm against the full file. */
    if ( *(StgWord64*)ptr != 0xa77b0018a61b0010L ) {
        errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);

    /* Free the stable pointer first..*/
    freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x10)));
#elif defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
    if ( *(StgWord*)ptr != 0x48000008 ) {   /* first insn: b *+8 */
        errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
    /* hptr was stashed in the word skipped by the initial branch */
    freeStablePtr(((StgStablePtr*)ptr)[1]);
#elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
    extern void* adjustorCode;
    if ( ((AdjustorStub*)ptr)->code != (StgFunPtr) &adjustorCode ) {
        errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
    freeStablePtr(((AdjustorStub*)ptr)->hptr);
#elif defined(ia64_HOST_ARCH)
    IA64FunDesc *fdesc = (IA64FunDesc *)ptr;
    StgWord64 *code = (StgWord64 *)(fdesc+1);

    if (fdesc->ip != (StgWord64)code) {
        errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
    /* both the hptr and the array's own stable ptr (see stgAllocStable) */
    freeStablePtr((StgStablePtr)code[16]);
    freeStablePtr((StgStablePtr)code[17]);
    /* scrub the signature byte so a double free is detected */
    *((unsigned char*)ptr) = '\0';
1174 #endif // !USE_LIBFFI