1 /* -----------------------------------------------------------------------------
2 * Foreign export adjustor thunks
6 * ---------------------------------------------------------------------------*/
8 /* A little bit of background...
10 An adjustor thunk is a dynamically allocated code snippet that allows
11 Haskell closures to be viewed as C function pointers.
13 Stable pointers provide a way for the outside world to get access to,
14 and evaluate, Haskell heap objects, with the RTS providing a small
15 range of ops for doing so. So, assuming we've got a stable pointer in
16 our hand in C, we can jump into the Haskell world and evaluate a callback
17 procedure, say. This works OK in some cases where callbacks are used, but
18 does require the external code to know about stable pointers and how to deal
19 with them. We'd like to hide the Haskell-nature of a callback and have it
20 be invoked just like any other C function pointer.
22 Enter adjustor thunks. An adjustor thunk is a little piece of code
23 that's generated on-the-fly (one per Haskell closure being exported)
24 that, when entered using some 'universal' calling convention (e.g., the
25 C calling convention on platform X), pushes an implicit stable pointer
26 (to the Haskell callback) before calling another (static) C function stub
27 which takes care of entering the Haskell code via its stable pointer.
29 An adjustor thunk is allocated on the C heap, and is called from within
30 Haskell just before handing out the function pointer to the Haskell (IO)
31 action. User code should never have to invoke it explicitly.
33 An adjustor thunk differs from a C function pointer in one respect: when
34 the code is through with it, it has to be freed in order to release Haskell
35 and C resources. Failure to do so results in memory leaks on both the C and
39 #include "PosixSource.h"
/* Heavily arch-specific, I'm afraid.. */

/*
 * Function: execPage()
 *
 * Set the executable bit on the page containing addr.  CURRENTLY DISABLED:
 * the only implementation below is compiled out by the trailing "&& 0",
 * so on every platform this is effectively a no-op.
 *
 * TODO: Can the code span more than one page? If yes, we need to make two
 */
execPage (void* addr, int writable)
#if defined(i386_TARGET_ARCH) && defined(_WIN32) && 0
  DWORD dwOldProtect = 0;

  /* doesn't return a result, so presumably it can't fail... */
  GetSystemInfo(&sInfo);

  /* NOTE(review): masking with (dwPageSize - 1) yields the OFFSET of addr
     within its page, not the page's base address; a page base would be
     addr & ~(dwPageSize - 1).  Harmless while this path is disabled by the
     "&& 0" above, but confirm before re-enabling. */
  if ( VirtualProtect ( (void*)((unsigned long)addr & (sInfo.dwPageSize - 1)),
			( writable ? PAGE_EXECUTE_READWRITE : PAGE_EXECUTE_READ),
			&dwOldProtect) == 0 ) {
    DWORD rc = GetLastError();
    fprintf(stderr, "execPage: failed to protect 0x%p; error=%lu; old protection: %lu\n", addr, rc, dwOldProtect);
  (void)addr; (void)writable; /* keep gcc -Wall happy */
/* Static x86 return stub shared by every ccall adjustor: it fixes up the C
   stack (popping the implicit hptr we pushed) before returning to the
   original caller.  See the comment below for why it must live in static
   memory rather than in the adjustor itself. */
static unsigned char __obscure_ccall_ret_code [] =
#if defined(i386_TARGET_ARCH)
/* Now here's something obscure for you:

   When generating an adjustor thunk that uses the C calling
   convention, we have to make sure that the thunk kicks off
   the process of jumping into Haskell with a tail jump. Why?
   Because as a result of jumping into Haskell we may end
   up freeing the very adjustor thunk we came from using
   freeHaskellFunctionPtr(). Hence, we better not return to
   the adjustor code on our way out, since it could by then

   The fix is readily at hand, just include the opcodes
   for the C stack fixup code that we need to perform when
   returning in some static piece of memory and arrange
   to return to it before tail jumping from the adjustor thunk.

   For this to work we make the assumption that bytes in .data
   are considered executable.
*/
  { 0x83, 0xc4, 0x04 /* addl $0x4, %esp */
/* No such mind-twisters on non-Intel platforms */
116 #if defined(alpha_TARGET_ARCH)
117 /* To get the definition of PAL_imb: */
118 # if defined(linux_TARGET_OS)
119 # include <asm/pal.h>
121 # include <machine/pal.h>
125 #if defined(ia64_TARGET_ARCH)
128 /* Layout of a function descriptor */
129 typedef struct _IA64FunDesc {
135 stgAllocStable(size_t size_in_bytes, StgStablePtr *stable)
138 nat data_size_in_words, total_size_in_words;
140 /* round up to a whole number of words */
141 data_size_in_words = (size_in_bytes + sizeof(W_) + 1) / sizeof(W_);
142 total_size_in_words = sizeofW(StgArrWords) + data_size_in_words;
144 /* allocate and fill it in */
145 arr = (StgArrWords *)allocate(total_size_in_words);
146 SET_ARR_HDR(arr, &stg_ARR_WORDS_info, CCCS, data_size_in_words);
148 /* obtain a stable ptr */
149 *stable = getStablePtr((StgPtr)arr);
151 /* and return a ptr to the goods inside the array */
152 return(BYTE_ARR_CTS(arr));
/*
 * Function: createAdjustor()
 *
 * Build a fresh adjustor thunk: a small, dynamically generated piece of
 * machine code that smuggles the stable pointer 'hptr' in as an extra
 * argument and then tail-jumps to the static C stub 'wptr'.
 *
 * cconv: calling-convention selector; 'case 0' below is _stdcall.
 *        (NOTE(review): the enclosing switch header and the other case
 *        label(s) are in lines elided from this chunk.)
 *
 * Returns: pointer to the thunk, or NULL if allocation failed.
 */
createAdjustor(int cconv, StgStablePtr hptr, StgFunPtr wptr)
  void *adjustor = NULL;

  case 0: /* _stdcall */
#if defined(i386_TARGET_ARCH)
    /* Magic constant computed by inspecting the code length of
       the following assembly language snippet
       (offset and machine code prefixed):

     <0>:  58                popl   %eax              # temp. remove ret addr..
     <1>:  68 fd fc fe fa    pushl  0xfafefcfd        # constant is large enough to
                                                      # hold a StgStablePtr
     <6>:  50                pushl  %eax              # put back ret. addr
     <7>:  b8 fa ef ff 00    movl   $0x00ffeffa, %eax # load up wptr
     <c>:  ff e0             jmp    %eax              # and jump to it.
               # the callee cleans up the stack
    */
    if ((adjustor = stgMallocBytes(14, "createAdjustor")) != NULL) {
	unsigned char *const adj_code = (unsigned char *)adjustor;
	adj_code[0x00] = (unsigned char)0x58;  /* popl %eax  */

	adj_code[0x01] = (unsigned char)0x68;  /* pushl hptr (which is a dword immediate ) */
	*((StgStablePtr*)(adj_code + 0x02)) = (StgStablePtr)hptr;

	adj_code[0x06] = (unsigned char)0x50; /* pushl %eax */

	adj_code[0x07] = (unsigned char)0xb8; /* movl  $wptr, %eax */
	*((StgFunPtr*)(adj_code + 0x08)) = (StgFunPtr)wptr;

	adj_code[0x0c] = (unsigned char)0xff; /* jmp %eax */
	adj_code[0x0d] = (unsigned char)0xe0;

	execPage(adjustor,rtsTrue);
#if defined(i386_TARGET_ARCH)
    /* Magic constant computed by inspecting the code length of
       the following assembly language snippet
       (offset and machine code prefixed):

  <00>: 68 ef be ad de     pushl $0xdeadbeef           # constant is large enough to
                                                       # hold a StgStablePtr
  <05>: b8 fa ef ff 00     movl $0x00ffeffa, %eax      # load up wptr
  <0a>: 68 ef be ad de     pushl $__obscure_ccall_ret_code # push the return address
  <0f>: ff e0              jmp *%eax                   # jump to wptr

    The ccall'ing version is a tad different, passing in the return
    address of the caller to the auto-generated C stub (which enters
    via the stable pointer.) (The auto-generated C stub is in on this
    game, don't worry :-)

    See the comment next to __obscure_ccall_ret_code why we need to
    perform a tail jump instead of a call, followed by some C stack

    Note: The adjustor makes the assumption that any return value
    coming back from the C stub is not stored on the stack.
    That's (thankfully) the case here with the restricted set of
    return types that we support.
    */
    if ((adjustor = stgMallocBytes(17, "createAdjustor")) != NULL) {
	unsigned char *const adj_code = (unsigned char *)adjustor;

	adj_code[0x00] = (unsigned char)0x68;  /* pushl hptr (which is a dword immediate ) */
	*((StgStablePtr*)(adj_code+0x01)) = (StgStablePtr)hptr;

	adj_code[0x05] = (unsigned char)0xb8;  /* movl  $wptr, %eax */
	*((StgFunPtr*)(adj_code + 0x06)) = (StgFunPtr)wptr;

	adj_code[0x0a] = (unsigned char)0x68;  /* pushl __obscure_ccall_ret_code */
	*((StgFunPtr*)(adj_code + 0x0b)) = (StgFunPtr)__obscure_ccall_ret_code;

	adj_code[0x0f] = (unsigned char)0xff; /* jmp *%eax */
	adj_code[0x10] = (unsigned char)0xe0;

	execPage(adjustor,rtsTrue);
#elif defined(sparc_TARGET_ARCH)
  /* Magic constant computed by inspecting the code length of the following
     assembly language snippet (offset and machine code prefixed):

     <00>: 9C23A008   sub   %sp, 8, %sp         ! make room for %o4/%o5 in caller's frame
     <04>: DA23A060   st    %o5, [%sp + 96]     ! shift registers by 2 positions
     <08>: D823A05C   st    %o4, [%sp + 92]
     <0C>: 9A10000B   mov   %o3, %o5
     <10>: 9810000A   mov   %o2, %o4
     <14>: 96100009   mov   %o1, %o3
     <18>: 94100008   mov   %o0, %o2
     <1C>: 13000000   sethi %hi(wptr), %o1      ! load up wptr (1 of 2)
     <20>: 11000000   sethi %hi(hptr), %o0      ! load up hptr (1 of 2)
     <24>: 81C26000   jmp   %o1 + %lo(wptr)     ! jump to wptr (load 2 of 2)
     <28>: 90122000   or    %o0, %lo(hptr), %o0 ! load up hptr (2 of 2, delay slot)
     <2C>  00000000                             ! place for getting hptr back easily

     ccall'ing on SPARC is easy, because we are quite lucky to push a
     multiple of 8 bytes (1 word hptr + 1 word dummy arg) in front of the
     existing arguments (note that %sp must stay double-word aligned at
     all times, see ABI spec at http://www.sparc.org/standards/psABI3rd.pdf).
     To do this, we extend the *caller's* stack frame by 2 words and shift
     the output registers used for argument passing (%o0 - %o5, we are a *leaf*
     procedure because of the tail-jump) by 2 positions. This makes room in
     %o0 and %o1 for the additional arguments, namely hptr and a dummy (used
     for destination addr of jump on SPARC, return address on x86, ...). This
     shouldn't cause any problems for a C-like caller: alloca is implemented
     similarly, and local variables should be accessed via %fp, not %sp. In a
     nutshell: This should work! (Famous last words! :-)
  */
    if ((adjustor = stgMallocBytes(4*(11+1), "createAdjustor")) != NULL) {
	unsigned long *const adj_code = (unsigned long *)adjustor;

	adj_code[ 0]  = 0x9C23A008UL;   /* sub   %sp, 8, %sp         */
	adj_code[ 1]  = 0xDA23A060UL;   /* st    %o5, [%sp + 96]     */
	adj_code[ 2]  = 0xD823A05CUL;   /* st    %o4, [%sp + 92]     */
	adj_code[ 3]  = 0x9A10000BUL;   /* mov   %o3, %o5            */
	adj_code[ 4]  = 0x9810000AUL;   /* mov   %o2, %o4            */
	adj_code[ 5]  = 0x96100009UL;   /* mov   %o1, %o3            */
	adj_code[ 6]  = 0x94100008UL;   /* mov   %o0, %o2            */
	adj_code[ 7]  = 0x13000000UL;   /* sethi %hi(wptr), %o1      */
	adj_code[ 7] |= ((unsigned long)wptr) >> 10;
	adj_code[ 8]  = 0x11000000UL;   /* sethi %hi(hptr), %o0      */
	adj_code[ 8] |= ((unsigned long)hptr) >> 10;
	adj_code[ 9]  = 0x81C26000UL;   /* jmp   %o1 + %lo(wptr)     */
	adj_code[ 9] |= ((unsigned long)wptr) & 0x000003FFUL;
	adj_code[10]  = 0x90122000UL;   /* or    %o0, %lo(hptr), %o0 */
	adj_code[10] |= ((unsigned long)hptr) & 0x000003FFUL;

	/* hptr stashed verbatim so freeHaskellFunctionPtr can recover it */
	adj_code[11]  = (unsigned long)hptr;

	/* flush the instruction cache lines we just wrote */
	asm("flush %0" : : "r" (adj_code ));
	asm("flush %0" : : "r" (adj_code + 2));
	asm("flush %0" : : "r" (adj_code + 4));
	asm("flush %0" : : "r" (adj_code + 6));
	asm("flush %0" : : "r" (adj_code + 10));

	/* max. 5 instructions latency, and we need at >= 1 for returning */
#elif defined(alpha_TARGET_ARCH)
  /* Magic constant computed by inspecting the code length of
     the following assembly language snippet
     (offset and machine code prefixed; note that the machine code
     shown is longwords stored in little-endian order):

  <00>: 46520414        mov     a2, a4
  <04>: 46100412        mov     a0, a2
  <08>: a61b0020        ldq     a0, 0x20(pv)    # load up hptr
  <0c>: 46730415        mov     a3, a5
  <10>: a77b0028        ldq     pv, 0x28(pv)    # load up wptr
  <14>: 46310413        mov     a1, a3
  <18>: 6bfb----        jmp     (pv), <hint>    # jump to wptr (with hint)
  <1c>: 00000000                                # padding for alignment
  <20>: [8 bytes for hptr quadword]
  <28>: [8 bytes for wptr quadword]

     The "computed" jump at <08> above is really a jump to a fixed
     location.  Accordingly, we place an always-correct hint in the
     jump instruction, namely the address offset from <0c> to wptr,
     divided by 4, taking the lowest 14 bits.

     We only support passing 4 or fewer argument words, for the same
     reason described under sparc_TARGET_ARCH above by JRS, 21 Aug 01.
     On the Alpha the first 6 integer arguments are in a0 through a5,
     and the rest on the stack.  Hence we want to shuffle the original
     caller's arguments by two.

     On the Alpha the calling convention is so complex and dependent
     on the callee's signature -- for example, the stack pointer has
     to be a multiple of 16 -- that it seems impossible to me [ccshan]
     to handle the general case correctly without changing how the
     adjustor is called from C.  For now, our solution of shuffling
     registers only and ignoring the stack only works if the original
     caller passed 4 or fewer argument words.

     TODO: Depending on how much allocation overhead stgMallocBytes uses for
     header information (more precisely, if the overhead is no more than
     4 bytes), we should move the first three instructions above down by
     4 bytes (getting rid of the nop), hence saving memory. [ccshan]
  */
    /* the jmp-hint arithmetic below requires a longword-aligned target */
    ASSERT(((StgWord64)wptr & 3) == 0);
    if ((adjustor = stgMallocBytes(48, "createAdjustor")) != NULL) {
	StgWord64 *const code = (StgWord64 *)adjustor;

	code[0] = 0x4610041246520414L;  /* mov a2,a4 ; mov a0,a2 */
	code[1] = 0x46730415a61b0020L;  /* ldq a0,0x20(pv) ; mov a3,a5 */
	code[2] = 0x46310413a77b0028L;  /* ldq pv,0x28(pv) ; mov a1,a3 */
	code[3] = 0x000000006bfb0000L
		| (((StgWord32*)(wptr) - (StgWord32*)(code) - 3) & 0x3fff);

	code[4] = (StgWord64)hptr;
	code[5] = (StgWord64)wptr;

	/* Ensure that instruction cache is consistent with our new code */
	__asm__ volatile("call_pal %0" : : "i" (PAL_imb));
#elif defined(powerpc_TARGET_ARCH)
/*
	For PowerPC, the following code is used:

	lis r0,0xDEAD ;hi(wptr)
	lis r3,0xDEAF ;hi(hptr)
	ori r0,r0,0xBEEF ; lo(wptr)
	ori r3,r3,0xFACE ; lo(hptr)

	The arguments (passed in registers r3 - r10) are shuffled along by two to
	make room for hptr and a dummy argument. As r9 and r10 are overwritten by
	this code, it only works for up to 6 arguments (when floating point arguments
	are involved, this may be more or less, depending on the exact situation).
*/
    if ((adjustor = stgMallocBytes(4*13, "createAdjustor")) != NULL) {
	unsigned long *const adj_code = (unsigned long *)adjustor;

	// make room for extra arguments
	adj_code[0] = 0x7d0a4378;	//mr r10,r8
	adj_code[1] = 0x7ce93b78;	//mr r9,r7
	adj_code[2] = 0x7cc83378;	//mr r8,r6
	adj_code[3] = 0x7ca72b78;	//mr r7,r5
	adj_code[4] = 0x7c862378;	//mr r6,r4
	adj_code[5] = 0x7c651b78;	//mr r5,r3

	adj_code[6] = 0x3c000000;	//lis r0,hi(wptr)
	adj_code[6] |= ((unsigned long)wptr) >> 16;

	adj_code[7] = 0x3c600000;	//lis r3,hi(hptr)
	adj_code[7] |= ((unsigned long)hptr) >> 16;

	adj_code[8] = 0x60000000;	//ori r0,r0,lo(wptr)
	adj_code[8] |= ((unsigned long)wptr) & 0xFFFF;

	adj_code[9] = 0x60630000;	//ori r3,r3,lo(hptr)
	adj_code[9] |= ((unsigned long)hptr) & 0xFFFF;

	adj_code[10] = 0x7c0903a6;	//mtctr r0
	adj_code[11] = 0x4e800420;	//bctr
	/* hptr stashed verbatim so freeHaskellFunctionPtr can recover it */
	adj_code[12] = (unsigned long)hptr;

	// Flush the Instruction cache:
	//	MakeDataExecutable(adjustor,4*13);
	/* This would require us to link with CoreServices.framework */
	{ /* this should do the same: */
	    unsigned long *p = adj_code;
		/* flush data cache line, then invalidate the i-cache line */
		__asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
	    __asm__ volatile ("sync\n\tisync");
#elif defined(ia64_TARGET_ARCH)
/*
    Up to 8 inputs are passed in registers.  We flush the last two inputs to
    the stack, initially into the 16-byte scratch region left by the caller.
    We then shuffle the others along by 4 (taking 2 registers for ourselves
    to save return address and previous function state - we need to come back
    here on the way out to restore the stack, so this is a real function
    rather than just a trampoline).

    The function descriptor we create contains the gp of the target function
    so gp is already loaded correctly.

	[MLX]       alloc r16=ar.pfs,10,2,0
	[MII]       st8.spill [r12]=r38,8	// spill in6 (out4)
		    mov r41=r37			// out7 = in5 (out3)
		    mov r40=r36;;		// out6 = in4 (out2)
	[MII]       st8.spill [r12]=r39		// spill in7 (out5)
		    mov r38=r34;;		// out4 = in2 (out0)
	[MII]       mov r39=r35		// out5 = in3 (out1)
		    mov r37=r33		// out3 = in1 (loc1)
		    mov r36=r32		// out2 = in0 (loc0)
	[MLX]       adds r12=-24,r12		// update sp
		    movl r34=hptr;;		// out0 = hptr
	[MIB]       mov r33=r16		// loc1 = ar.pfs
		    mov r32=b0			// loc0 = retaddr
		    br.call.sptk.many b0=b6;;

	[MII]       adds r12=-16,r12
		    br.ret.sptk.many b0;;
*/

/* These macros distribute a long constant into the two words of an MLX bundle */
#define BITS(val,start,count)	(((val) >> (start)) & ((1 << (count))-1))
#define MOVL_LOWORD(val)	(BITS(val,22,18) << 46)
#define MOVL_HIWORD(val)	(BITS(val,40,23) | (BITS(val,0,7) << 36) | (BITS(val,7,9) << 50) \
				| (BITS(val,16,5) << 55) | (BITS(val,21,1) << 44) | BITS(val,63,1) << 59)

    IA64FunDesc *wdesc = (IA64FunDesc *)wptr;
    StgWord64 wcode = wdesc->ip;

    /* we allocate on the Haskell heap since malloc'd memory isn't executable - argh */
    adjustor = stgAllocStable(sizeof(IA64FunDesc)+18*8, &stable);

    /* our own function descriptor points just past itself, at the code;
       gp is copied from the target so its globals resolve correctly */
    fdesc = (IA64FunDesc *)adjustor;
    code = (StgWord64 *)(fdesc + 1);
    fdesc->ip = (StgWord64)code;
    fdesc->gp = wdesc->gp;

    /* pre-assembled instruction bundles; wcode/hptr are patched in via
       the MOVL_* macros above */
    code[0] = 0x0000058004288004 | MOVL_LOWORD(wcode);
    code[1] = 0x6000000220000000 | MOVL_HIWORD(wcode);
    code[2] = 0x029015d818984001;
    code[3] = 0x8401200500420094;
    code[4] = 0x886011d8189c0001;
    code[5] = 0x84011004c00380c0;
    code[6] = 0x0250210046013800;
    code[7] = 0x8401000480420084;
    code[8] = 0x0000233f19a06005 | MOVL_LOWORD((StgWord64)hptr);
    code[9] = 0x6000000440000000 | MOVL_HIWORD((StgWord64)hptr);
    code[10] = 0x0200210020010811;
    code[11] = 0x1080006800006200;
    code[12] = 0x0000210018406000;
    code[13] = 0x00aa021000038005;
    code[14] = 0x000000010000001d;
    code[15] = 0x0084000880000200;

    /* save stable pointers in convenient form */
    code[16] = (StgWord64)hptr;
    code[17] = (StgWord64)stable;
    barf("adjustor creation not supported on this platform");
/*
 * Function: freeHaskellFunctionPtr()
 *
 * Release an adjustor thunk built by createAdjustor(): sanity-check the
 * leading machine-code word(s) to make sure 'ptr' really is one of ours,
 * free the embedded stable pointer (so the Haskell closure can be GC'd),
 * then scrub the thunk.
 */
freeHaskellFunctionPtr(void* ptr)
#if defined(i386_TARGET_ARCH)
 /* a thunk starts with 0x68 (pushl; ccall) or 0x58 (popl; stdcall) */
 if ( *(unsigned char*)ptr != 0x68 &&
      *(unsigned char*)ptr != 0x58 ) {
   fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);

 /* Free the stable pointer first..*/
 if (*(unsigned char*)ptr == 0x68) { /* Aha, a ccall adjustor! */
    /* ccall layout: the hptr immediate starts at byte 1 */
    freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x01)));
    /* stdcall layout: the hptr immediate starts at byte 2 */
    freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x02)));
#elif defined(sparc_TARGET_ARCH)
 /* first instruction of a SPARC adjustor is "sub %sp, 8, %sp" */
 if ( *(unsigned long*)ptr != 0x9C23A008UL ) {
   fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);

 /* Free the stable pointer first..*/
 /* hptr is stashed verbatim in word 11 (see createAdjustor) */
 freeStablePtr(*((StgStablePtr*)((unsigned long*)ptr + 11)));
#elif defined(alpha_TARGET_ARCH)
 /* NOTE(review): this magic value and the 0x10 offset below do not match
    what the Alpha branch of createAdjustor() writes: there, code[0] is
    0x4610041246520414L and hptr is stored at byte offset 0x20 (code[4]).
    This check and offset look stale relative to the generator — verify
    before relying on this path. */
 if ( *(StgWord64*)ptr != 0xa77b0018a61b0010L ) {
   fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);

 /* Free the stable pointer first..*/
 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x10)));
#elif defined(powerpc_TARGET_ARCH)
 /* first instruction of a PowerPC adjustor is "mr r10,r8" */
 if ( *(StgWord*)ptr != 0x7d0a4378 ) {
   fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
 /* hptr is stashed verbatim in word 12 (see createAdjustor) */
 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 4*12)));
#elif defined(ia64_TARGET_ARCH)
 IA64FunDesc *fdesc = (IA64FunDesc *)ptr;
 StgWord64 *code = (StgWord64 *)(fdesc+1);

 /* our own descriptors point at the code immediately following them */
 if (fdesc->ip != (StgWord64)code) {
   fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
 /* code[16]/code[17] hold hptr and the array's own stable pointer */
 freeStablePtr((StgStablePtr)code[16]);
 freeStablePtr((StgStablePtr)code[17]);
 /* clobber the first byte so a dangling copy of the pointer fails the
    magic-byte checks above instead of being freed a second time */
 *((unsigned char*)ptr) = '\0';
 * Function: initAdjustor()
 *
 * Perform initialisation of adjustor thunk layer (if needed.)
 * Currently this just asks execPage() to make the static ccall return
 * stub executable; note that execPage() is presently disabled, so this
 * is effectively a no-op.
 *
 * TODO: Call this at RTS initialisation time.
  return execPage(__obscure_ccall_ret_code, rtsFalse);