1 /* -----------------------------------------------------------------------------
2 * Foreign export adjustor thunks
6 * ---------------------------------------------------------------------------*/
8 /* A little bit of background...
10 An adjustor thunk is a dynamically allocated code snippet that allows
11 Haskell closures to be viewed as C function pointers.
13 Stable pointers provide a way for the outside world to get access to,
14 and evaluate, Haskell heap objects, with the RTS providing a small
15 range of ops for doing so. So, assuming we've got a stable pointer in
16 our hand in C, we can jump into the Haskell world and evaluate a callback
17 procedure, say. This works OK in some cases where callbacks are used, but
18 does require the external code to know about stable pointers and how to deal
19 with them. We'd like to hide the Haskell-nature of a callback and have it
20 be invoked just like any other C function pointer.
22 Enter adjustor thunks. An adjustor thunk is a little piece of code
23 that's generated on-the-fly (one per Haskell closure being exported)
24 that, when entered using some 'universal' calling convention (e.g., the
25 C calling convention on platform X), pushes an implicit stable pointer
26 (to the Haskell callback) before calling another (static) C function stub
27 which takes care of entering the Haskell code via its stable pointer.
29 An adjustor thunk is allocated on the C heap, and is called from within
30 Haskell just before handing out the function pointer to the Haskell (IO)
31 action. User code should never have to invoke it explicitly.
33 An adjustor thunk differs from a C function pointer in one respect: when
34 the code is through with it, it has to be freed in order to release Haskell
and C resources. Failure to do so results in memory leaks on both the C and
39 #include "PosixSource.h"
#if defined(i386_TARGET_ARCH)
/* Forward declaration: mark the page containing 'addr' as executable
   (and optionally writable); the definition is at the end of this file. */
static rtsBool execPage (void* addr, int writable);
53 /* Heavily arch-specific, I'm afraid.. */
#if defined(i386_TARGET_ARCH)
/* Now here's something obscure for you:

   When generating an adjustor thunk that uses the C calling
   convention, we have to make sure that the thunk kicks off
   the process of jumping into Haskell with a tail jump. Why?
   Because as a result of jumping into Haskell we may end
   up freeing the very adjustor thunk we came from using
   freeHaskellFunctionPtr(). Hence, we better not return to
   the adjustor code on our way out, since it could by then
   point to deallocated memory.

   The fix is readily at hand, just include the opcodes
   for the C stack fixup code that we need to perform when
   returning in some static piece of memory and arrange
   to return to it before tail jumping from the adjustor thunk.

   For this to work we make the assumption that bytes in .data
   are considered executable.
*/
/* Static return stub shared by all ccall adjustors: pops the pushed
   hptr off the C stack and then returns to the original caller.
   (NOTE(review): only the "addl $0x4,%esp" byte sequence is visible
   here; the trailing 'ret' opcode and closing brace are presumably on
   the following lines -- confirm.) */
static unsigned char __obscure_ccall_ret_code [] =
  { 0x83, 0xc4, 0x04 /* addl $0x4, %esp */
81 #if defined(alpha_TARGET_ARCH)
82 /* To get the definition of PAL_imb: */
83 # if defined(linux_TARGET_OS)
86 # include <machine/pal.h>
90 #if defined(ia64_TARGET_ARCH)
93 /* Layout of a function descriptor */
94 typedef struct _IA64FunDesc {
/* Allocate 'size_in_bytes' bytes of garbage-collected storage as an
   StgArrWords, pin it with a stable pointer (returned through *stable),
   and return a pointer to the payload.  Used where malloc'd memory is
   not executable (ia64).  The caller must eventually free the stable
   pointer to let the array be collected. */
stgAllocStable(size_t size_in_bytes, StgStablePtr *stable)
  nat data_size_in_words, total_size_in_words;

  /* round up to a whole number of words */
  /* NOTE(review): the conventional round-up is
     (size_in_bytes + sizeof(W_) - 1) / sizeof(W_); the '+ 1' below
     still rounds up but can over-allocate by one word when
     size_in_bytes is already word-aligned -- confirm intent. */
  data_size_in_words = (size_in_bytes + sizeof(W_) + 1) / sizeof(W_);
  total_size_in_words = sizeofW(StgArrWords) + data_size_in_words;

  /* allocate and fill it in */
  arr = (StgArrWords *)allocate(total_size_in_words);
  SET_ARR_HDR(arr, &stg_ARR_WORDS_info, CCCS, data_size_in_words);

  /* obtain a stable ptr */
  *stable = getStablePtr((StgPtr)arr);

  /* and return a ptr to the goods inside the array */
  return(BYTE_ARR_CTS(arr));
/* createAdjustor: build a freshly allocated machine-code thunk that
   pushes the stable pointer 'hptr' as a hidden extra argument and then
   tail-jumps to the static C stub 'wptr'.  'cconv' selects the calling
   convention (0 = _stdcall; see the ccall case further down).
   Returns the thunk (to be handed out as a C function pointer), or
   NULL if allocation failed. */
createAdjustor(int cconv, StgStablePtr hptr, StgFunPtr wptr)
  void *adjustor = NULL;

  case 0: /* _stdcall */
#if defined(i386_TARGET_ARCH)
    /* Magic constant computed by inspecting the code length of
       the following assembly language snippet
       (offset and machine code prefixed):

     <0>:  58                popl %eax              # temp. remove ret addr..
     <1>:  68 fd fc fe fa    pushl 0xfafefcfd       # constant is large enough to
                                                    # hold a StgStablePtr
     <6>:  50                pushl %eax             # put back ret. addr
     <7>:  b8 fa ef ff 00    movl $0x00ffeffa, %eax # load up wptr
     <c>:  ff e0             jmp %eax               # and jump to it.
                                                    # the callee cleans up the stack
    */
    if ((adjustor = stgMallocBytes(14, "createAdjustor")) != NULL) {
	unsigned char *const adj_code = (unsigned char *)adjustor;
	adj_code[0x00] = (unsigned char)0x58;  /* popl %eax */

	adj_code[0x01] = (unsigned char)0x68;  /* pushl hptr (which is a dword immediate ) */
	/* unaligned 4-byte immediate store -- tolerated on x86 */
	*((StgStablePtr*)(adj_code + 0x02)) = (StgStablePtr)hptr;

	adj_code[0x06] = (unsigned char)0x50; /* pushl %eax */

	adj_code[0x07] = (unsigned char)0xb8; /* movl $wptr, %eax */
	*((StgFunPtr*)(adj_code + 0x08)) = (StgFunPtr)wptr;

	adj_code[0x0c] = (unsigned char)0xff; /* jmp %eax */
	adj_code[0x0d] = (unsigned char)0xe0;

	/* freshly written code must be made executable before use */
	execPage(adjustor,rtsTrue);
#if defined(i386_TARGET_ARCH)
    /* Magic constant computed by inspecting the code length of
       the following assembly language snippet
       (offset and machine code prefixed):

     <00>: 68 ef be ad de    pushl $0xdeadbeef              # constant is large enough to
                                                            # hold a StgStablePtr
     <05>: b8 fa ef ff 00    movl $0x00ffeffa, %eax         # load up wptr
     <0a>: 68 ef be ad de    pushl $__obscure_ccall_ret_code # push the return address
     <0f>: ff e0             jmp *%eax                      # jump to wptr

    The ccall'ing version is a tad different, passing in the return
    address of the caller to the auto-generated C stub (which enters
    via the stable pointer.) (The auto-generated C stub is in on this
    game, don't worry :-)

    See the comment next to __obscure_ccall_ret_code why we need to
    perform a tail jump instead of a call, followed by some C stack
    fixup.

    Note: The adjustor makes the assumption that any return value
    coming back from the C stub is not stored on the stack.
    That's (thankfully) the case here with the restricted set of
    return types that we support.
    */
    if ((adjustor = stgMallocBytes(17, "createAdjustor")) != NULL) {
	unsigned char *const adj_code = (unsigned char *)adjustor;

	adj_code[0x00] = (unsigned char)0x68;  /* pushl hptr (which is a dword immediate ) */
	*((StgStablePtr*)(adj_code+0x01)) = (StgStablePtr)hptr;

	adj_code[0x05] = (unsigned char)0xb8;  /* movl $wptr, %eax */
	*((StgFunPtr*)(adj_code + 0x06)) = (StgFunPtr)wptr;

	/* fake return address: the shared static stub pops hptr and rets
	   to the real caller (we must not return through this adjustor,
	   it may already have been freed by then) */
	adj_code[0x0a] = (unsigned char)0x68;  /* pushl __obscure_ccall_ret_code */
	*((StgFunPtr*)(adj_code + 0x0b)) = (StgFunPtr)__obscure_ccall_ret_code;

	adj_code[0x0f] = (unsigned char)0xff; /* jmp *%eax */
	adj_code[0x10] = (unsigned char)0xe0;

	/* freshly written code must be made executable before use */
	execPage(adjustor,rtsTrue);
#elif defined(sparc_TARGET_ARCH)
    /* Magic constant computed by inspecting the code length of the following
       assembly language snippet (offset and machine code prefixed):

     <00>: 9C23A008   sub   %sp, 8, %sp         ! make room for %o4/%o5 in caller's frame
     <04>: DA23A060   st    %o5, [%sp + 96]     ! shift registers by 2 positions
     <08>: D823A05C   st    %o4, [%sp + 92]
     <0C>: 9A10000B   mov   %o3, %o5
     <10>: 9810000A   mov   %o2, %o4
     <14>: 96100009   mov   %o1, %o3
     <18>: 94100008   mov   %o0, %o2
     <1C>: 13000000   sethi %hi(wptr), %o1      ! load up wptr (1 of 2)
     <20>: 11000000   sethi %hi(hptr), %o0      ! load up hptr (1 of 2)
     <24>: 81C26000   jmp   %o1 + %lo(wptr)     ! jump to wptr (load 2 of 2)
     <28>: 90122000   or    %o0, %lo(hptr), %o0 ! load up hptr (2 of 2, delay slot)
     <2C>  00000000                             ! place for getting hptr back easily

     ccall'ing on SPARC is easy, because we are quite lucky to push a
     multiple of 8 bytes (1 word hptr + 1 word dummy arg) in front of the
     existing arguments (note that %sp must stay double-word aligned at
     all times, see ABI spec at http://www.sparc.org/standards/psABI3rd.pdf).
     To do this, we extend the *caller's* stack frame by 2 words and shift
     the output registers used for argument passing (%o0 - %o5, we are a *leaf*
     procedure because of the tail-jump) by 2 positions. This makes room in
     %o0 and %o1 for the additional arguments, namely hptr and a dummy (used
     for destination addr of jump on SPARC, return address on x86, ...). This
     shouldn't cause any problems for a C-like caller: alloca is implemented
     similarly, and local variables should be accessed via %fp, not %sp. In a
     nutshell: This should work! (Famous last words! :-)
    */
    if ((adjustor = stgMallocBytes(4*(11+1), "createAdjustor")) != NULL) {
	unsigned long *const adj_code = (unsigned long *)adjustor;

	adj_code[ 0] = 0x9C23A008UL;   /* sub   %sp, 8, %sp         */
	adj_code[ 1] = 0xDA23A060UL;   /* st    %o5, [%sp + 96]     */
	adj_code[ 2] = 0xD823A05CUL;   /* st    %o4, [%sp + 92]     */
	adj_code[ 3] = 0x9A10000BUL;   /* mov   %o3, %o5            */
	adj_code[ 4] = 0x9810000AUL;   /* mov   %o2, %o4            */
	adj_code[ 5] = 0x96100009UL;   /* mov   %o1, %o3            */
	adj_code[ 6] = 0x94100008UL;   /* mov   %o0, %o2            */
	/* immediates are patched into the sethi/or/jmp fields below */
	adj_code[ 7] = 0x13000000UL;   /* sethi %hi(wptr), %o1      */
	adj_code[ 7] |= ((unsigned long)wptr) >> 10;
	adj_code[ 8] = 0x11000000UL;   /* sethi %hi(hptr), %o0      */
	adj_code[ 8] |= ((unsigned long)hptr) >> 10;
	adj_code[ 9] = 0x81C26000UL;   /* jmp   %o1 + %lo(wptr)     */
	adj_code[ 9] |= ((unsigned long)wptr) & 0x000003FFUL;
	adj_code[10] = 0x90122000UL;   /* or    %o0, %lo(hptr), %o0 */
	adj_code[10] |= ((unsigned long)hptr) & 0x000003FFUL;

	/* raw copy of hptr so freeHaskellFunctionPtr can retrieve it */
	adj_code[11] = (unsigned long)hptr;

	/* flush the generated instructions out of the data cache /
	   into the instruction stream (one flush per doubleword) */
	asm("flush %0" : : "r" (adj_code     ));
	asm("flush %0" : : "r" (adj_code +  2));
	asm("flush %0" : : "r" (adj_code +  4));
	asm("flush %0" : : "r" (adj_code +  6));
	asm("flush %0" : : "r" (adj_code + 10));

	/* max. 5 instructions latency, and we need at >= 1 for returning */
#elif defined(alpha_TARGET_ARCH)
  /* Magic constant computed by inspecting the code length of
     the following assembly language snippet
     (offset and machine code prefixed; note that the machine code
     shown is longwords stored in little-endian order):

  <00>: 46520414        mov     a2, a4
  <04>: 46100412        mov     a0, a2
  <08>: a61b0020        ldq     a0, 0x20(pv)    # load up hptr
  <0c>: 46730415        mov     a3, a5
  <10>: a77b0028        ldq     pv, 0x28(pv)    # load up wptr
  <14>: 46310413        mov     a1, a3
  <18>: 6bfb----        jmp     (pv), <hint>    # jump to wptr (with hint)
  <1c>: 00000000                                # padding for alignment
  <20>: [8 bytes for hptr quadword]
  <28>: [8 bytes for wptr quadword]

     The "computed" jump at <08> above is really a jump to a fixed
     location.  Accordingly, we place an always-correct hint in the
     jump instruction, namely the address offset from <0c> to wptr,
     divided by 4, taking the lowest 14 bits.

     We only support passing 4 or fewer argument words, for the same
     reason described under sparc_TARGET_ARCH above by JRS, 21 Aug 01.
     On the Alpha the first 6 integer arguments are in a0 through a5,
     and the rest on the stack.  Hence we want to shuffle the original
     caller's arguments by two.

     On the Alpha the calling convention is so complex and dependent
     on the callee's signature -- for example, the stack pointer has
     to be a multiple of 16 -- that it seems impossible to me [ccshan]
     to handle the general case correctly without changing how the
     adjustor is called from C.  For now, our solution of shuffling
     registers only and ignoring the stack only works if the original
     caller passed 4 or fewer argument words.

     TODO: Depending on how much allocation overhead stgMallocBytes uses for
     header information (more precisely, if the overhead is no more than
     4 bytes), we should move the first three instructions above down by
     4 bytes (getting rid of the nop), hence saving memory. [ccshan]
  */
    /* the ldq/jmp encodings above require wptr to be 4-byte aligned */
    ASSERT(((StgWord64)wptr & 3) == 0);
    if ((adjustor = stgMallocBytes(48, "createAdjustor")) != NULL) {
	StgWord64 *const code = (StgWord64 *)adjustor;

	code[0] = 0x4610041246520414L;
	code[1] = 0x46730415a61b0020L;
	code[2] = 0x46310413a77b0028L;
	/* patch the 14-bit jump hint into the jmp instruction */
	code[3] = 0x000000006bfb0000L
		| (((StgWord32*)(wptr) - (StgWord32*)(code) - 3) & 0x3fff);

	/* data quadwords read by the code above (and by the free routine) */
	code[4] = (StgWord64)hptr;
	code[5] = (StgWord64)wptr;

	/* Ensure that instruction cache is consistent with our new code */
	__asm__ volatile("call_pal %0" : : "i" (PAL_imb));
#elif defined(powerpc_TARGET_ARCH)
/*
	For PowerPC, the following code is used:

	lis r0,0xDEAD ;hi(wptr)
	lis r3,0xDEAF ;hi(hptr)
	ori r0,r0,0xBEEF ; lo(wptr)
	ori r3,r3,0xFACE ; lo(hptr)

	The arguments (passed in registers r3 - r10) are shuffled along by two to
	make room for hptr and a dummy argument. As r9 and r10 are overwritten by
	this code, it only works for up to 6 arguments (when floating point arguments
	are involved, this may be more or less, depending on the exact situation).
*/
	if ((adjustor = stgMallocBytes(4*13, "createAdjustor")) != NULL) {
		unsigned long *const adj_code = (unsigned long *)adjustor;

		// make room for extra arguments
		adj_code[0] = 0x7d0a4378;	//mr r10,r8
		adj_code[1] = 0x7ce93b78;	//mr r9,r7
		adj_code[2] = 0x7cc83378;	//mr r8,r6
		adj_code[3] = 0x7ca72b78;	//mr r7,r5
		adj_code[4] = 0x7c862378;	//mr r6,r4
		adj_code[5] = 0x7c651b78;	//mr r5,r3

		// load wptr into r0 and hptr into r3 (16 bits at a time)
		adj_code[6] = 0x3c000000;	//lis r0,hi(wptr)
		adj_code[6] |= ((unsigned long)wptr) >> 16;

		adj_code[7] = 0x3c600000;	//lis r3,hi(hptr)
		adj_code[7] |= ((unsigned long)hptr) >> 16;

		adj_code[8] = 0x60000000;	//ori r0,r0,lo(wptr)
		adj_code[8] |= ((unsigned long)wptr) & 0xFFFF;

		adj_code[9] = 0x60630000;	//ori r3,r3,lo(hptr)
		adj_code[9] |= ((unsigned long)hptr) & 0xFFFF;

		// tail-jump to wptr through the count register
		adj_code[10] = 0x7c0903a6;	//mtctr r0
		adj_code[11] = 0x4e800420;	//bctr
		// raw copy of hptr so freeHaskellFunctionPtr can retrieve it
		adj_code[12] = (unsigned long)hptr;

		// Flush the Instruction cache:
		//	MakeDataExecutable(adjustor,4*13);
		/* This would require us to link with CoreServices.framework */
		{ /* this should do the same: */
			/* NOTE(review): the loop header driving these flushes
			   (and the asm operand list) is not visible here --
			   verify it covers all 13 words of adj_code. */
			unsigned long *p = adj_code;
				__asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
			__asm__ volatile ("sync\n\tisync");
#elif defined(ia64_TARGET_ARCH)
/*
    Up to 8 inputs are passed in registers.  We flush the last two inputs to
    the stack, initially into the 16-byte scratch region left by the caller.
    We then shuffle the others along by 4 (taking 2 registers for ourselves
    to save return address and previous function state - we need to come back
    here on the way out to restore the stack, so this is a real function
    rather than just a trampoline).

    The function descriptor we create contains the gp of the target function
    so gp is already loaded correctly.

	[MLX]       alloc r16=ar.pfs,10,2,0
	[MII]       st8.spill [r12]=r38,8		// spill in6 (out4)
		    mov r41=r37				// out7 = in5 (out3)
		    mov r40=r36;;			// out6 = in4 (out2)
	[MII]       st8.spill [r12]=r39			// spill in7 (out5)
		    mov r38=r34;;			// out4 = in2 (out0)
	[MII]       mov r39=r35				// out5 = in3 (out1)
		    mov r37=r33				// out3 = in1 (loc1)
		    mov r36=r32				// out2 = in0 (loc0)
	[MLX]       adds r12=-24,r12			// update sp
		    movl r34=hptr;;			// out0 = hptr
	[MIB]       mov r33=r16				// loc1 = ar.pfs
		    mov r32=b0				// loc0 = retaddr
		    br.call.sptk.many b0=b6;;
	[MII]       adds r12=-16,r12
		    br.ret.sptk.many b0;;
*/

/* These macros distribute a long constant into the two words of an MLX bundle */
#define BITS(val,start,count)	(((val) >> (start)) & ((1 << (count))-1))
#define MOVL_LOWORD(val)	(BITS(val,22,18) << 46)
#define MOVL_HIWORD(val)	(BITS(val,40,23) | (BITS(val,0,7) << 36) | (BITS(val,7,9) << 50) \
				| (BITS(val,16,5) << 55) | (BITS(val,21,1) << 44) | BITS(val,63,1) << 59)

	/* wptr is itself a function descriptor; pull out the real entry point */
	IA64FunDesc *wdesc = (IA64FunDesc *)wptr;
	StgWord64 wcode = wdesc->ip;

	/* we allocate on the Haskell heap since malloc'd memory isn't executable - argh */
	adjustor = stgAllocStable(sizeof(IA64FunDesc)+18*8, &stable);

	/* the adjustor itself must be called through a function descriptor, too */
	fdesc = (IA64FunDesc *)adjustor;
	code = (StgWord64 *)(fdesc + 1);
	fdesc->ip = (StgWord64)code;
	fdesc->gp = wdesc->gp;

	/* pre-assembled bundles for the code shown above; the movl
	   immediates (wcode, hptr) are distributed in via the macros */
	code[0]  = 0x0000058004288004 | MOVL_LOWORD(wcode);
	code[1]  = 0x6000000220000000 | MOVL_HIWORD(wcode);
	code[2]  = 0x029015d818984001;
	code[3]  = 0x8401200500420094;
	code[4]  = 0x886011d8189c0001;
	code[5]  = 0x84011004c00380c0;
	code[6]  = 0x0250210046013800;
	code[7]  = 0x8401000480420084;
	code[8]  = 0x0000233f19a06005 | MOVL_LOWORD((StgWord64)hptr);
	code[9]  = 0x6000000440000000 | MOVL_HIWORD((StgWord64)hptr);
	code[10] = 0x0200210020010811;
	code[11] = 0x1080006800006200;
	code[12] = 0x0000210018406000;
	code[13] = 0x00aa021000038005;
	code[14] = 0x000000010000001d;
	code[15] = 0x0084000880000200;

	/* save stable pointers in convenient form */
	code[16] = (StgWord64)hptr;
	code[17] = (StgWord64)stable;
    barf("adjustor creation not supported on this platform");
/* Free an adjustor thunk previously created by createAdjustor():
   release its stable pointer (so the Haskell closure can be collected)
   and then the C-side storage.  Each arch branch first sanity-checks
   that 'ptr' really points at adjustor code by inspecting its first
   instruction bytes/words. */
freeHaskellFunctionPtr(void* ptr)
#if defined(i386_TARGET_ARCH)
 /* first byte is 0x68 (pushl imm32) for ccall, 0x58 (popl %eax) for stdcall */
 if ( *(unsigned char*)ptr != 0x68 &&
      *(unsigned char*)ptr != 0x58 ) {
   fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);

 /* Free the stable pointer first..*/
 if (*(unsigned char*)ptr == 0x68) { /* Aha, a ccall adjustor! */
    /* hptr is the pushl immediate at offset 1 */
    freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x01)));
    /* stdcall: hptr immediate sits after the leading popl at offset 2 */
    freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x02)));
#elif defined(sparc_TARGET_ARCH)
 if ( *(unsigned long*)ptr != 0x9C23A008UL ) {
   fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);

 /* Free the stable pointer first..*/
 /* hptr was stashed raw in word 11 of the adjustor */
 freeStablePtr(*((StgStablePtr*)((unsigned long*)ptr + 11)));
#elif defined(alpha_TARGET_ARCH)
 /* NOTE(review): this magic word and the 0x10 offset below do NOT match
    what createAdjustor currently emits (code[0] = 0x4610041246520414L,
    hptr stored at offset 0x20) -- this looks like a stale check from an
    older adjustor layout; verify and update. */
 if ( *(StgWord64*)ptr != 0xa77b0018a61b0010L ) {
   fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);

 /* Free the stable pointer first..*/
 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x10)));
#elif defined(powerpc_TARGET_ARCH)
 if ( *(StgWord*)ptr != 0x7d0a4378 ) {
   fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);

 /* hptr was stashed raw in word 12 (byte offset 4*12) of the adjustor */
 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 4*12)));
#elif defined(ia64_TARGET_ARCH)
 IA64FunDesc *fdesc = (IA64FunDesc *)ptr;
 StgWord64 *code = (StgWord64 *)(fdesc+1);

 /* a genuine adjustor descriptor points at its own trailing code block */
 if (fdesc->ip != (StgWord64)code) {
   fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
 /* free both the Haskell closure and the stable pointer pinning the
    adjustor's own heap-allocated storage (see stgAllocStable) */
 freeStablePtr((StgStablePtr)code[16]);
 freeStablePtr((StgStablePtr)code[17]);
 /* scribble on the first byte so a dangling pointer to this adjustor
    will fail the magic-byte checks above rather than double-free */
 *((unsigned char*)ptr) = '\0';
#if defined(i386_TARGET_ARCH)
/* -----------------------------------------------------------------------------
 * Function: execPage()
 *
 * Set the executable bit on the page containing 'addr', so dynamically
 * generated adjustor code can be run.  If 'writable' is non-zero the
 * page stays writable as well (needed while we are still patching the
 * thunk); otherwise it becomes execute/read-only.
 *
 * Returns: rtsTrue on success, rtsFalse on failure.
 * -------------------------------------------------------------------------- */
static rtsBool
execPage (void* addr, int writable)
{
    SYSTEM_INFO sInfo;
    DWORD dwOldProtect = 0;

    /* doesn't return a result, so presumably it can't fail... */
    GetSystemInfo(&sInfo);

    /* BUG FIX: round 'addr' *down* to the base of its page with
     * ~(dwPageSize - 1).  The previous mask, (dwPageSize - 1), yields
     * the offset *within* the page -- a bogus address to hand to
     * VirtualProtect(). */
    if ( VirtualProtect ( (void*)((unsigned long)addr & ~(sInfo.dwPageSize - 1)),
			  sInfo.dwPageSize,
			  ( writable ? PAGE_EXECUTE_READWRITE : PAGE_EXECUTE_READ),
			  &dwOldProtect) == 0 ) {
	DWORD rc = GetLastError();
	fprintf(stderr, "execPage: failed to protect 0x%p; error=%lu; old protection: %lu\n", addr, rc, dwOldProtect);
	return rtsFalse;
    }
    return rtsTrue;
}
#endif
/* -----------------------------------------------------------------------------
 * Function: initAdjustor()
 * Perform initialisation of adjustor thunk layer (if needed.)
 * On Win32/x86 this makes the static __obscure_ccall_ret_code stub
 * executable (read-only, since it is never patched at run time).
 * ---------------------------------------------------------------------------*/
#if defined(i386_TARGET_ARCH) && defined(_WIN32)
  return execPage(__obscure_ccall_ret_code, rtsFalse);