1 /* -----------------------------------------------------------------------------
2 * Foreign export adjustor thunks
6 * ---------------------------------------------------------------------------*/
8 /* A little bit of background...
10 An adjustor thunk is a dynamically allocated code snippet that allows
11 Haskell closures to be viewed as C function pointers.
13 Stable pointers provide a way for the outside world to get access to,
14 and evaluate, Haskell heap objects, with the RTS providing a small
15 range of ops for doing so. So, assuming we've got a stable pointer in
16 our hand in C, we can jump into the Haskell world and evaluate a callback
17 procedure, say. This works OK in some cases where callbacks are used, but
18 does require the external code to know about stable pointers and how to deal
19 with them. We'd like to hide the Haskell-nature of a callback and have it
20 be invoked just like any other C function pointer.
22 Enter adjustor thunks. An adjustor thunk is a little piece of code
23 that's generated on-the-fly (one per Haskell closure being exported)
24 that, when entered using some 'universal' calling convention (e.g., the
25 C calling convention on platform X), pushes an implicit stable pointer
26 (to the Haskell callback) before calling another (static) C function stub
27 which takes care of entering the Haskell code via its stable pointer.
29 An adjustor thunk is allocated on the C heap, and is called from within
30 Haskell just before handing out the function pointer to the Haskell (IO)
31 action. User code should never have to invoke it explicitly.
33 An adjustor thunk differs from a C function pointer in one respect: when
34 the code is through with it, it has to be freed in order to release Haskell
35 and C resources. Failure to do so results in memory leaks on both the C and
39 #include "PosixSource.h"
41 #include "RtsExternal.h"
49 #if defined(openbsd_TARGET_OS)
54 /* Heavily arch-specific, I'm afraid.. */
62 * Function: execPage()
64 * Set the executable bit on page containing addr.
66 * TODO: Can the code span more than one page? If yes, we need to make two
70 execPage (void* addr, pageMode mode)
/* Win32 path: currently compiled out by the trailing "&& 0" in the guard. */
72 #if defined(i386_TARGET_ARCH) && defined(_WIN32) && 0
74 DWORD dwOldProtect = 0;
76 /* doesn't return a result, so presumably it can't fail... */
77 GetSystemInfo(&sInfo);
/* NOTE(review): "(unsigned long)addr & (sInfo.dwPageSize - 1)" yields the
   offset *within* the page, not the page base; rounding down to the page
   start would be "& ~(sInfo.dwPageSize - 1)".  Harmless today because this
   branch is disabled, but must be fixed before it is ever re-enabled. */
79 if ( VirtualProtect ( (void*)((unsigned long)addr & (sInfo.dwPageSize - 1)),
81 ( mode == pageExecuteReadWrite ? PAGE_EXECUTE_READWRITE : PAGE_EXECUTE_READ),
82 &dwOldProtect) == 0 ) {
83 DWORD rc = GetLastError();
84 barf("execPage: failed to protect 0x%p; error=%lu; old protection: %lu\n", addr, rc, dwOldProtect);
88 #if defined(openbsd_TARGET_OS)
89 /* malloc memory isn't executable by default on OpenBSD */
90 unsigned long pagesize = sysconf(_SC_PAGESIZE);
91 unsigned long round = (unsigned long)addr & (pagesize - 1);
/* NOTE(review): arithmetic on a void* ("addr - round") is a GNU extension;
   strict ISO C would need a cast to char*.  The page is left writable as
   well as executable here, regardless of the requested mode. */
92 if (mprotect(addr - round, pagesize, PROT_EXEC|PROT_READ|PROT_WRITE) == -1)
93 barf("execPage: failed to protect 0x%p\n", addr);
/* Fallback for platforms where heap memory is already executable: no-op. */
96 (void)addr; (void)mode; /* keep gcc -Wall happy */
100 #if defined(i386_TARGET_ARCH)
/* Shared return-path stub for C-calling-convention adjustors on x86.
   Allocated and made executable in initAdjustor(); see the long comment
   there for why ccall adjustors tail-jump and return through this stub
   instead of returning into (possibly already freed) adjustor code. */
101 static unsigned char *obscure_ccall_ret_code;
104 #if defined(alpha_TARGET_ARCH)
105 /* To get the definition of PAL_imb: */
106 # if defined(linux_TARGET_OS)
107 # include <asm/pal.h>
109 # include <machine/pal.h>
113 #if defined(ia64_TARGET_ARCH)
116 /* Layout of a function descriptor */
117 typedef struct _IA64FunDesc {
/* Allocate size_in_bytes bytes of storage inside an StgArrWords heap
   object, register a stable pointer to that object in *stable (so the GC
   keeps it alive until the stable pointer is freed), and return a pointer
   to the payload.  Used where malloc'd memory is not executable (see the
   ia64 branch of createAdjustor below). */
123 stgAllocStable(size_t size_in_bytes, StgStablePtr *stable)
126 nat data_size_in_words, total_size_in_words;
128 /* round up to a whole number of words */
/* NOTE(review): the canonical round-up idiom is
   (size_in_bytes + sizeof(W_) - 1) / sizeof(W_); the "+ sizeof(W_) + 1"
   used here can over-allocate by one word for some sizes.  Benign, but
   confirm whether the extra slack is intentional. */
129 data_size_in_words = (size_in_bytes + sizeof(W_) + 1) / sizeof(W_);
130 total_size_in_words = sizeofW(StgArrWords) + data_size_in_words;
132 /* allocate and fill it in */
133 arr = (StgArrWords *)allocate(total_size_in_words);
134 SET_ARR_HDR(arr, &stg_ARR_WORDS_info, CCCS, data_size_in_words);
136 /* obtain a stable ptr */
137 *stable = getStablePtr((StgPtr)arr);
139 /* and return a ptr to the goods inside the array */
140 return(BYTE_ARR_CTS(arr));
/* Build a fresh adjustor thunk for the given calling convention
   (cconv 0 = _stdcall, otherwise ccall), Haskell stable pointer hptr and
   static C stub wptr.  The thunk is machine code emitted by hand, one
   variant per architecture; each branch's embedded assembly listing
   documents the exact bytes written below it.  (The enclosing switch on
   cconv is not visible in this excerpt — confirm the case structure
   against the full file.) */
145 createAdjustor(int cconv, StgStablePtr hptr, StgFunPtr wptr)
147 void *adjustor = NULL;
151 case 0: /* _stdcall */
152 #if defined(i386_TARGET_ARCH)
153 /* Magic constant computed by inspecting the code length of
154 the following assembly language snippet
155 (offset and machine code prefixed):
157 <0>: 58 popl %eax # temp. remove ret addr..
158 <1>: 68 fd fc fe fa pushl 0xfafefcfd # constant is large enough to
159 # hold a StgStablePtr
160 <6>: 50 pushl %eax # put back ret. addr
161 <7>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
162 <c>: ff e0 jmp %eax # and jump to it.
163 # the callee cleans up the stack
165 adjustor = stgMallocBytes(14, "createAdjustor");
167 unsigned char *const adj_code = (unsigned char *)adjustor;
168 adj_code[0x00] = (unsigned char)0x58; /* popl %eax */
170 adj_code[0x01] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
171 *((StgStablePtr*)(adj_code + 0x02)) = (StgStablePtr)hptr;
173 adj_code[0x06] = (unsigned char)0x50; /* pushl %eax */
175 adj_code[0x07] = (unsigned char)0xb8; /* movl $wptr, %eax */
176 *((StgFunPtr*)(adj_code + 0x08)) = (StgFunPtr)wptr;
178 adj_code[0x0c] = (unsigned char)0xff; /* jmp %eax */
179 adj_code[0x0d] = (unsigned char)0xe0;
/* Freshly written bytes must be made executable before handing them out. */
181 execPage(adjustor, pageExecuteReadWrite);
/* --- ccall flavour below (intervening switch labels not shown here) --- */
187 #if defined(i386_TARGET_ARCH)
188 /* Magic constant computed by inspecting the code length of
189 the following assembly language snippet
190 (offset and machine code prefixed):
192 <00>: 68 ef be ad de pushl $0xdeadbeef # constant is large enough to
193 # hold a StgStablePtr
194 <05>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
195 <0a>: 68 ef be ad de pushl $obscure_ccall_ret_code # push the return address
196 <0f>: ff e0 jmp *%eax # jump to wptr
198 The ccall'ing version is a tad different, passing in the return
199 address of the caller to the auto-generated C stub (which enters
200 via the stable pointer.) (The auto-generated C stub is in on this
201 game, don't worry :-)
203 See the comment next to obscure_ccall_ret_code why we need to
204 perform a tail jump instead of a call, followed by some C stack
207 Note: The adjustor makes the assumption that any return value
208 coming back from the C stub is not stored on the stack.
209 That's (thankfully) the case here with the restricted set of
210 return types that we support.
212 adjustor = stgMallocBytes(17, "createAdjustor");
214 unsigned char *const adj_code = (unsigned char *)adjustor;
216 adj_code[0x00] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
217 *((StgStablePtr*)(adj_code+0x01)) = (StgStablePtr)hptr;
219 adj_code[0x05] = (unsigned char)0xb8; /* movl $wptr, %eax */
220 *((StgFunPtr*)(adj_code + 0x06)) = (StgFunPtr)wptr;
222 adj_code[0x0a] = (unsigned char)0x68; /* pushl obscure_ccall_ret_code */
223 *((StgFunPtr*)(adj_code + 0x0b)) = (StgFunPtr)obscure_ccall_ret_code;
225 adj_code[0x0f] = (unsigned char)0xff; /* jmp *%eax */
226 adj_code[0x10] = (unsigned char)0xe0;
228 execPage(adjustor, pageExecuteReadWrite);
230 #elif defined(sparc_TARGET_ARCH)
231 /* Magic constant computed by inspecting the code length of the following
232 assembly language snippet (offset and machine code prefixed):
234 <00>: 9C23A008 sub %sp, 8, %sp ! make room for %o4/%o5 in caller's frame
235 <04>: DA23A060 st %o5, [%sp + 96] ! shift registers by 2 positions
236 <08>: D823A05C st %o4, [%sp + 92]
237 <0C>: 9A10000B mov %o3, %o5
238 <10>: 9810000A mov %o2, %o4
239 <14>: 96100009 mov %o1, %o3
240 <18>: 94100008 mov %o0, %o2
241 <1C>: 13000000 sethi %hi(wptr), %o1 ! load up wptr (1 of 2)
242 <20>: 11000000 sethi %hi(hptr), %o0 ! load up hptr (1 of 2)
243 <24>: 81C26000 jmp %o1 + %lo(wptr) ! jump to wptr (load 2 of 2)
244 <28>: 90122000 or %o0, %lo(hptr), %o0 ! load up hptr (2 of 2, delay slot)
245 <2C> 00000000 ! place for getting hptr back easily
247 ccall'ing on SPARC is easy, because we are quite lucky to push a
248 multiple of 8 bytes (1 word hptr + 1 word dummy arg) in front of the
249 existing arguments (note that %sp must stay double-word aligned at
250 all times, see ABI spec at http://www.sparc.org/standards/psABI3rd.pdf).
251 To do this, we extend the *caller's* stack frame by 2 words and shift
252 the output registers used for argument passing (%o0 - %o5, we are a *leaf*
253 procedure because of the tail-jump) by 2 positions. This makes room in
254 %o0 and %o1 for the additional arguments, namely hptr and a dummy (used
255 for destination addr of jump on SPARC, return address on x86, ...). This
256 shouldn't cause any problems for a C-like caller: alloca is implemented
257 similarly, and local variables should be accessed via %fp, not %sp. In a
258 nutshell: This should work! (Famous last words! :-)
260 adjustor = stgMallocBytes(4*(11+1), "createAdjustor");
262 unsigned long *const adj_code = (unsigned long *)adjustor;
264 adj_code[ 0] = 0x9C23A008UL; /* sub %sp, 8, %sp */
265 adj_code[ 1] = 0xDA23A060UL; /* st %o5, [%sp + 96] */
266 adj_code[ 2] = 0xD823A05CUL; /* st %o4, [%sp + 92] */
267 adj_code[ 3] = 0x9A10000BUL; /* mov %o3, %o5 */
268 adj_code[ 4] = 0x9810000AUL; /* mov %o2, %o4 */
269 adj_code[ 5] = 0x96100009UL; /* mov %o1, %o3 */
270 adj_code[ 6] = 0x94100008UL; /* mov %o0, %o2 */
271 adj_code[ 7] = 0x13000000UL; /* sethi %hi(wptr), %o1 */
272 adj_code[ 7] |= ((unsigned long)wptr) >> 10;
273 adj_code[ 8] = 0x11000000UL; /* sethi %hi(hptr), %o0 */
274 adj_code[ 8] |= ((unsigned long)hptr) >> 10;
275 adj_code[ 9] = 0x81C26000UL; /* jmp %o1 + %lo(wptr) */
276 adj_code[ 9] |= ((unsigned long)wptr) & 0x000003FFUL;
277 adj_code[10] = 0x90122000UL; /* or %o0, %lo(hptr), %o0 */
278 adj_code[10] |= ((unsigned long)hptr) & 0x000003FFUL;
/* Trailing word <2C> stores hptr verbatim so freeHaskellFunctionPtr can
   recover and free the stable pointer (read back at word index 11). */
280 adj_code[11] = (unsigned long)hptr;
/* Flush the written words out of the data cache / into the instruction
   stream; one flush per doubleword covers the 12-word thunk. */
283 asm("flush %0" : : "r" (adj_code ));
284 asm("flush %0" : : "r" (adj_code + 2));
285 asm("flush %0" : : "r" (adj_code + 4));
286 asm("flush %0" : : "r" (adj_code + 6));
287 asm("flush %0" : : "r" (adj_code + 10));
289 /* max. 5 instructions latency, and we need at >= 1 for returning */
295 #elif defined(alpha_TARGET_ARCH)
296 /* Magic constant computed by inspecting the code length of
297 the following assembly language snippet
298 (offset and machine code prefixed; note that the machine code
299 shown is longwords stored in little-endian order):
301 <00>: 46520414 mov a2, a4
302 <04>: 46100412 mov a0, a2
303 <08>: a61b0020 ldq a0, 0x20(pv) # load up hptr
304 <0c>: 46730415 mov a3, a5
305 <10>: a77b0028 ldq pv, 0x28(pv) # load up wptr
306 <14>: 46310413 mov a1, a3
307 <18>: 6bfb---- jmp (pv), <hint> # jump to wptr (with hint)
308 <1c>: 00000000 # padding for alignment
309 <20>: [8 bytes for hptr quadword]
310 <28>: [8 bytes for wptr quadword]
312 The "computed" jump at <08> above is really a jump to a fixed
313 location. Accordingly, we place an always-correct hint in the
314 jump instruction, namely the address offset from <0c> to wptr,
315 divided by 4, taking the lowest 14 bits.
317 We only support passing 4 or fewer argument words, for the same
318 reason described under sparc_TARGET_ARCH above by JRS, 21 Aug 01.
319 On the Alpha the first 6 integer arguments are in a0 through a5,
320 and the rest on the stack. Hence we want to shuffle the original
321 caller's arguments by two.
323 On the Alpha the calling convention is so complex and dependent
324 on the callee's signature -- for example, the stack pointer has
325 to be a multiple of 16 -- that it seems impossible to me [ccshan]
326 to handle the general case correctly without changing how the
327 adjustor is called from C. For now, our solution of shuffling
328 registers only and ignoring the stack only works if the original
329 caller passed 4 or fewer argument words.
331 TODO: Depending on how much allocation overhead stgMallocBytes uses for
332 header information (more precisely, if the overhead is no more than
333 4 bytes), we should move the first three instructions above down by
334 4 bytes (getting rid of the nop), hence saving memory. [ccshan]
/* The jmp hint below requires wptr to be instruction-aligned. */
336 ASSERT(((StgWord64)wptr & 3) == 0);
337 adjustor = stgMallocBytes(48, "createAdjustor");
339 StgWord64 *const code = (StgWord64 *)adjustor;
341 code[0] = 0x4610041246520414L;
342 code[1] = 0x46730415a61b0020L;
343 code[2] = 0x46310413a77b0028L;
344 code[3] = 0x000000006bfb0000L
345 | (((StgWord32*)(wptr) - (StgWord32*)(code) - 3) & 0x3fff);
/* hptr/wptr data quadwords at <20>/<28>, loaded pc-relative via pv. */
347 code[4] = (StgWord64)hptr;
348 code[5] = (StgWord64)wptr;
350 /* Ensure that instruction cache is consistent with our new code */
351 __asm__ volatile("call_pal %0" : : "i" (PAL_imb));
353 #elif defined(powerpc_TARGET_ARCH)
355 For PowerPC, the following code is used:
363 lis r0,0xDEAD ;hi(wptr)
364 lis r3,0xDEAF ;hi(hptr)
365 ori r0,r0,0xBEEF ; lo(wptr)
366 ori r3,r3,0xFACE ; lo(hptr)
370 The arguments (passed in registers r3 - r10) are shuffled along by two to
371 make room for hptr and a dummy argument. As r9 and r10 are overwritten by
372 this code, it only works for up to 6 arguments (when floating point arguments
373 are involved, this may be more or less, depending on the exact situation).
375 adjustor = stgMallocBytes(4*13, "createAdjustor");
377 unsigned long *const adj_code = (unsigned long *)adjustor;
379 // make room for extra arguments
380 adj_code[0] = 0x7d0a4378; //mr r10,r8
381 adj_code[1] = 0x7ce93b78; //mr r9,r7
382 adj_code[2] = 0x7cc83378; //mr r8,r6
383 adj_code[3] = 0x7ca72b78; //mr r7,r5
384 adj_code[4] = 0x7c862378; //mr r6,r4
385 adj_code[5] = 0x7c651b78; //mr r5,r3
387 adj_code[6] = 0x3c000000; //lis r0,hi(wptr)
388 adj_code[6] |= ((unsigned long)wptr) >> 16;
390 adj_code[7] = 0x3c600000; //lis r3,hi(hptr)
391 adj_code[7] |= ((unsigned long)hptr) >> 16;
393 adj_code[8] = 0x60000000; //ori r0,r0,lo(wptr)
394 adj_code[8] |= ((unsigned long)wptr) & 0xFFFF;
396 adj_code[9] = 0x60630000; //ori r3,r3,lo(hptr)
397 adj_code[9] |= ((unsigned long)hptr) & 0xFFFF;
399 adj_code[10] = 0x7c0903a6; //mtctr r0
400 adj_code[11] = 0x4e800420; //bctr
/* Word 12 stores hptr verbatim so freeHaskellFunctionPtr can recover it. */
401 adj_code[12] = (unsigned long)hptr;
403 // Flush the Instruction cache:
404 // MakeDataExecutable(adjustor,4*13);
405 /* This would require us to link with CoreServices.framework */
406 { /* this should do the same: */
408 unsigned long *p = adj_code;
411 __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
415 __asm__ volatile ("sync\n\tisync");
418 #elif defined(ia64_TARGET_ARCH)
420 Up to 8 inputs are passed in registers. We flush the last two inputs to
421 the stack, initially into the 16-byte scratch region left by the caller.
422 We then shuffle the others along by 4 (taking 2 registers for ourselves
423 to save return address and previous function state - we need to come back
424 here on the way out to restore the stack, so this is a real function
425 rather than just a trampoline).
427 The function descriptor we create contains the gp of the target function
428 so gp is already loaded correctly.
430 [MLX] alloc r16=ar.pfs,10,2,0
432 [MII] st8.spill [r12]=r38,8 // spill in6 (out4)
433 mov r41=r37 // out7 = in5 (out3)
434 mov r40=r36;; // out6 = in4 (out2)
435 [MII] st8.spill [r12]=r39 // spill in7 (out5)
437 mov r38=r34;; // out4 = in2 (out0)
438 [MII] mov r39=r35 // out5 = in3 (out1)
439 mov r37=r33 // out3 = in1 (loc1)
440 mov r36=r32 // out2 = in0 (loc0)
441 [MLX] adds r12=-24,r12 // update sp
442 movl r34=hptr;; // out0 = hptr
443 [MIB] mov r33=r16 // loc1 = ar.pfs
444 mov r32=b0 // loc0 = retaddr
445 br.call.sptk.many b0=b6;;
447 [MII] adds r12=-16,r12
452 br.ret.sptk.many b0;;
455 /* These macros distribute a long constant into the two words of an MLX bundle */
456 #define BITS(val,start,count) (((val) >> (start)) & ((1 << (count))-1))
457 #define MOVL_LOWORD(val) (BITS(val,22,18) << 46)
458 #define MOVL_HIWORD(val) (BITS(val,40,23) | (BITS(val,0,7) << 36) | (BITS(val,7,9) << 50) \
459 | (BITS(val,16,5) << 55) | (BITS(val,21,1) << 44) | BITS(val,63,1) << 59)
/* wptr is an ia64 function descriptor: {entry ip, gp}.  We build our own
   descriptor pointing at the generated bundles, reusing the target's gp. */
463 IA64FunDesc *wdesc = (IA64FunDesc *)wptr;
464 StgWord64 wcode = wdesc->ip;
468 /* we allocate on the Haskell heap since malloc'd memory isn't executable - argh */
469 adjustor = stgAllocStable(sizeof(IA64FunDesc)+18*8, &stable);
471 fdesc = (IA64FunDesc *)adjustor;
472 code = (StgWord64 *)(fdesc + 1);
473 fdesc->ip = (StgWord64)code;
474 fdesc->gp = wdesc->gp;
/* Pre-encoded bundles for the assembly above; the MOVL_* patches splice
   the 64-bit wcode/hptr immediates into the two movl MLX bundles. */
476 code[0] = 0x0000058004288004 | MOVL_LOWORD(wcode);
477 code[1] = 0x6000000220000000 | MOVL_HIWORD(wcode);
478 code[2] = 0x029015d818984001;
479 code[3] = 0x8401200500420094;
480 code[4] = 0x886011d8189c0001;
481 code[5] = 0x84011004c00380c0;
482 code[6] = 0x0250210046013800;
483 code[7] = 0x8401000480420084;
484 code[8] = 0x0000233f19a06005 | MOVL_LOWORD((StgWord64)hptr);
485 code[9] = 0x6000000440000000 | MOVL_HIWORD((StgWord64)hptr);
486 code[10] = 0x0200210020010811;
487 code[11] = 0x1080006800006200;
488 code[12] = 0x0000210018406000;
489 code[13] = 0x00aa021000038005;
490 code[14] = 0x000000010000001d;
491 code[15] = 0x0084000880000200;
493 /* save stable pointers in convenient form */
494 code[16] = (StgWord64)hptr;
495 code[17] = (StgWord64)stable;
/* No emitter matched this target: fail loudly rather than hand out NULL. */
498 barf("adjustor creation not supported on this platform");
/* Free an adjustor thunk previously built by createAdjustor: verify the
   per-architecture code signature at ptr, release the stable pointer(s)
   embedded in the thunk, then release the thunk's storage. */
513 freeHaskellFunctionPtr(void* ptr)
515 #if defined(i386_TARGET_ARCH)
/* On x86 the first byte distinguishes the two thunk flavours:
   0x68 (pushl imm32) => ccall adjustor, hptr stored at offset 0x01;
   0x58 (popl %eax)   => stdcall adjustor, hptr stored at offset 0x02. */
516 if ( *(unsigned char*)ptr != 0x68 &&
517 *(unsigned char*)ptr != 0x58 ) {
518 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
522 /* Free the stable pointer first..*/
523 if (*(unsigned char*)ptr == 0x68) { /* Aha, a ccall adjustor! */
524 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x01)));
526 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x02)));
528 #elif defined(sparc_TARGET_ARCH)
/* First instruction of the SPARC thunk is "sub %sp, 8, %sp"; hptr was
   stashed verbatim in word 11 by createAdjustor. */
529 if ( *(unsigned long*)ptr != 0x9C23A008UL ) {
530 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
534 /* Free the stable pointer first..*/
535 freeStablePtr(*((StgStablePtr*)((unsigned long*)ptr + 11)));
536 #elif defined(alpha_TARGET_ARCH)
/* NOTE(review): this signature and offset look stale.  The alpha emitter
   in createAdjustor above writes code[0] = 0x4610041246520414 and stores
   hptr at offset 0x20 (code[4]), yet this check expects
   0xa77b0018a61b0010 and reads the stable pointer from offset 0x10 --
   valid alpha adjustors would be rejected as "not for me", and the hptr
   would leak.  Confirm against the emitter and update both constants. */
537 if ( *(StgWord64*)ptr != 0xa77b0018a61b0010L ) {
538 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
542 /* Free the stable pointer first..*/
543 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x10)));
544 #elif defined(powerpc_TARGET_ARCH)
/* First instruction of the PowerPC thunk is "mr r10,r8"; hptr was stored
   verbatim in word 12 (byte offset 4*12) by createAdjustor. */
545 if ( *(StgWord*)ptr != 0x7d0a4378 ) {
546 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
549 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 4*12)));
550 #elif defined(ia64_TARGET_ARCH)
551 IA64FunDesc *fdesc = (IA64FunDesc *)ptr;
552 StgWord64 *code = (StgWord64 *)(fdesc+1);
/* A genuine ia64 adjustor descriptor points at its own trailing code. */
554 if (fdesc->ip != (StgWord64)code) {
555 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
/* code[16] holds hptr; code[17] holds the stable pointer that keeps the
   thunk's own heap storage alive (see stgAllocStable). */
558 freeStablePtr((StgStablePtr)code[16]);
559 freeStablePtr((StgStablePtr)code[17]);
/* Zero the first byte -- presumably so the signature checks above fail
   fast on a double free or a stale call; confirm against full source. */
564 *((unsigned char*)ptr) = '\0';
571 * Function: initAdjustor()
573 * Perform initialisation of adjustor thunk layer (if needed.)
578 #if defined(i386_TARGET_ARCH)
579 /* Now here's something obscure for you:
581 When generating an adjustor thunk that uses the C calling
582 convention, we have to make sure that the thunk kicks off
583 the process of jumping into Haskell with a tail jump. Why?
584 Because as a result of jumping in into Haskell we may end
585 up freeing the very adjustor thunk we came from using
586 freeHaskellFunctionPtr(). Hence, we better not return to
587 the adjustor code on our way out, since it could by then
590 The fix is readily at hand, just include the opcodes
591 for the C stack fixup code that we need to perform when
592 returning in some static piece of memory and arrange
593 to return to it before tail jumping from the adjustor thunk.
/* The stub is exactly four bytes: "addl $4,%esp; ret".  The ccall
   adjustor pushes hptr and then this stub's address as the return
   address; when the C stub returns here, the addl discards hptr and the
   ret transfers control back to the original (external) caller. */
596 obscure_ccall_ret_code = stgMallocBytes(4, "initAdjustor");
598 obscure_ccall_ret_code[0x00] = (unsigned char)0x83; /* addl $0x4, %esp */
599 obscure_ccall_ret_code[0x01] = (unsigned char)0xc4;
600 obscure_ccall_ret_code[0x02] = (unsigned char)0x04;
602 obscure_ccall_ret_code[0x03] = (unsigned char)0xc3; /* ret */
/* Make the freshly written stub executable before any adjustor uses it. */
604 execPage(obscure_ccall_ret_code, pageExecuteRead)