1 /* -----------------------------------------------------------------------------
2 * Foreign export adjustor thunks
6 * ---------------------------------------------------------------------------*/
8 /* A little bit of background...
10 An adjustor thunk is a dynamically allocated code snippet that allows
11 Haskell closures to be viewed as C function pointers.
13 Stable pointers provide a way for the outside world to get access to,
14 and evaluate, Haskell heap objects, with the RTS providing a small
15 range of ops for doing so. So, assuming we've got a stable pointer in
16 our hand in C, we can jump into the Haskell world and evaluate a callback
17 procedure, say. This works OK in some cases where callbacks are used, but
18 does require the external code to know about stable pointers and how to deal
19 with them. We'd like to hide the Haskell-nature of a callback and have it
20 be invoked just like any other C function pointer.
22 Enter adjustor thunks. An adjustor thunk is a little piece of code
23 that's generated on-the-fly (one per Haskell closure being exported)
24 that, when entered using some 'universal' calling convention (e.g., the
25 C calling convention on platform X), pushes an implicit stable pointer
26 (to the Haskell callback) before calling another (static) C function stub
27 which takes care of entering the Haskell code via its stable pointer.
29 An adjustor thunk is allocated on the C heap, and is called from within
30 Haskell just before handing out the function pointer to the Haskell (IO)
31 action. User code should never have to invoke it explicitly.
33 An adjustor thunk differs from a C function pointer in one respect: when
34 the code is through with it, it has to be freed in order to release Haskell
35 and C resources. Failure to do so result in memory leaks on both the C and
39 #include "PosixSource.h"
41 #include "RtsExternal.h"
49 /* Heavily arch-specific, I'm afraid.. */
57 * Function: execPage()
59 * Set the executable bit on page containing addr.
61 * TODO: Can the code span more than one page? If yes, we need to make two
65 execPage (void* addr, pageMode mode)
67 #if defined(i386_TARGET_ARCH) && defined(_WIN32) && 0
69 DWORD dwOldProtect = 0;
71 /* doesn't return a result, so presumably it can't fail... */
72 GetSystemInfo(&sInfo);
74 if ( VirtualProtect ( (void*)((unsigned long)addr & (sInfo.dwPageSize - 1)),
76 ( mode == pageExecuteReadWrite ? PAGE_EXECUTE_READWRITE : PAGE_EXECUTE_READ),
77 &dwOldProtect) == 0 ) {
79 DWORD rc = GetLastError();
80 prog_belch("execPage: failed to protect 0x%p; error=%lu; old protection: %lu\n", addr, rc, dwOldProtect);
86 (void)addr; (void)mode; /* keep gcc -Wall happy */
91 #if defined(i386_TARGET_ARCH)
92 static unsigned char *obscure_ccall_ret_code;
95 #if defined(alpha_TARGET_ARCH)
96 /* To get the definition of PAL_imb: */
97 # if defined(linux_TARGET_OS)
100 # include <machine/pal.h>
104 #if defined(ia64_TARGET_ARCH)
107 /* Layout of a function descriptor */
108 typedef struct _IA64FunDesc {
114 stgAllocStable(size_t size_in_bytes, StgStablePtr *stable)
117 nat data_size_in_words, total_size_in_words;
119 /* round up to a whole number of words */
120 data_size_in_words = (size_in_bytes + sizeof(W_) + 1) / sizeof(W_);
121 total_size_in_words = sizeofW(StgArrWords) + data_size_in_words;
123 /* allocate and fill it in */
124 arr = (StgArrWords *)allocate(total_size_in_words);
125 SET_ARR_HDR(arr, &stg_ARR_WORDS_info, CCCS, data_size_in_words);
127 /* obtain a stable ptr */
128 *stable = getStablePtr((StgPtr)arr);
130 /* and return a ptr to the goods inside the array */
131 return(BYTE_ARR_CTS(arr));
136 createAdjustor(int cconv, StgStablePtr hptr, StgFunPtr wptr)
138 void *adjustor = NULL;
142 case 0: /* _stdcall */
143 #if defined(i386_TARGET_ARCH)
144 /* Magic constant computed by inspecting the code length of
145 the following assembly language snippet
146 (offset and machine code prefixed):
148 <0>: 58 popl %eax # temp. remove ret addr..
149 <1>: 68 fd fc fe fa pushl 0xfafefcfd # constant is large enough to
150 # hold a StgStablePtr
151 <6>: 50 pushl %eax # put back ret. addr
152 <7>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
153 <c>: ff e0 jmp %eax # and jump to it.
154 # the callee cleans up the stack
156 adjustor = stgMallocBytes(14, "createAdjustor");
157 unsigned char *const adj_code = (unsigned char *)adjustor;
158 adj_code[0x00] = (unsigned char)0x58; /* popl %eax */
160 adj_code[0x01] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
161 *((StgStablePtr*)(adj_code + 0x02)) = (StgStablePtr)hptr;
163 adj_code[0x06] = (unsigned char)0x50; /* pushl %eax */
165 adj_code[0x07] = (unsigned char)0xb8; /* movl $wptr, %eax */
166 *((StgFunPtr*)(adj_code + 0x08)) = (StgFunPtr)wptr;
168 adj_code[0x0c] = (unsigned char)0xff; /* jmp %eax */
169 adj_code[0x0d] = (unsigned char)0xe0;
171 execPage(adjustor, pageExecuteReadWrite);
176 #if defined(i386_TARGET_ARCH)
177 /* Magic constant computed by inspecting the code length of
178 the following assembly language snippet
179 (offset and machine code prefixed):
181 <00>: 68 ef be ad de pushl $0xdeadbeef # constant is large enough to
182 # hold a StgStablePtr
183 <05>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
184 <0a>: 68 ef be ad de pushl $obscure_ccall_ret_code # push the return address
185 <0f>: ff e0 jmp *%eax # jump to wptr
187 The ccall'ing version is a tad different, passing in the return
188 address of the caller to the auto-generated C stub (which enters
189 via the stable pointer.) (The auto-generated C stub is in on this
190 game, don't worry :-)
192 See the comment next to obscure_ccall_ret_code why we need to
193 perform a tail jump instead of a call, followed by some C stack
196 Note: The adjustor makes the assumption that any return value
197 coming back from the C stub is not stored on the stack.
198 That's (thankfully) the case here with the restricted set of
199 return types that we support.
201 adjustor = stgMallocBytes(17, "createAdjustor");
202 unsigned char *const adj_code = (unsigned char *)adjustor;
204 adj_code[0x00] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
205 *((StgStablePtr*)(adj_code+0x01)) = (StgStablePtr)hptr;
207 adj_code[0x05] = (unsigned char)0xb8; /* movl $wptr, %eax */
208 *((StgFunPtr*)(adj_code + 0x06)) = (StgFunPtr)wptr;
210 adj_code[0x0a] = (unsigned char)0x68; /* pushl obscure_ccall_ret_code */
211 *((StgFunPtr*)(adj_code + 0x0b)) = (StgFunPtr)obscure_ccall_ret_code;
213 adj_code[0x0f] = (unsigned char)0xff; /* jmp *%eax */
214 adj_code[0x10] = (unsigned char)0xe0;
216 execPage(adjustor, pageExecuteReadWrite);
217 #elif defined(sparc_TARGET_ARCH)
218 /* Magic constant computed by inspecting the code length of the following
219 assembly language snippet (offset and machine code prefixed):
221 <00>: 9C23A008 sub %sp, 8, %sp ! make room for %o4/%o5 in caller's frame
222 <04>: DA23A060 st %o5, [%sp + 96] ! shift registers by 2 positions
223 <08>: D823A05C st %o4, [%sp + 92]
224 <0C>: 9A10000B mov %o3, %o5
225 <10>: 9810000A mov %o2, %o4
226 <14>: 96100009 mov %o1, %o3
227 <18>: 94100008 mov %o0, %o2
228 <1C>: 13000000 sethi %hi(wptr), %o1 ! load up wptr (1 of 2)
229 <20>: 11000000 sethi %hi(hptr), %o0 ! load up hptr (1 of 2)
230 <24>: 81C26000 jmp %o1 + %lo(wptr) ! jump to wptr (load 2 of 2)
231 <28>: 90122000 or %o0, %lo(hptr), %o0 ! load up hptr (2 of 2, delay slot)
232 <2C> 00000000 ! place for getting hptr back easily
234 ccall'ing on SPARC is easy, because we are quite lucky to push a
235 multiple of 8 bytes (1 word hptr + 1 word dummy arg) in front of the
236 existing arguments (note that %sp must stay double-word aligned at
237 all times, see ABI spec at http://www.sparc.org/standards/psABI3rd.pdf).
238 To do this, we extend the *caller's* stack frame by 2 words and shift
239 the output registers used for argument passing (%o0 - %o5, we are a *leaf*
240 procedure because of the tail-jump) by 2 positions. This makes room in
241 %o0 and %o1 for the additinal arguments, namely hptr and a dummy (used
242 for destination addr of jump on SPARC, return address on x86, ...). This
243 shouldn't cause any problems for a C-like caller: alloca is implemented
244 similarly, and local variables should be accessed via %fp, not %sp. In a
245 nutshell: This should work! (Famous last words! :-)
247 adjustor = stgMallocBytes(4*(11+1), "createAdjustor");
248 unsigned long *const adj_code = (unsigned long *)adjustor;
250 adj_code[ 0] = 0x9C23A008UL; /* sub %sp, 8, %sp */
251 adj_code[ 1] = 0xDA23A060UL; /* st %o5, [%sp + 96] */
252 adj_code[ 2] = 0xD823A05CUL; /* st %o4, [%sp + 92] */
253 adj_code[ 3] = 0x9A10000BUL; /* mov %o3, %o5 */
254 adj_code[ 4] = 0x9810000AUL; /* mov %o2, %o4 */
255 adj_code[ 5] = 0x96100009UL; /* mov %o1, %o3 */
256 adj_code[ 6] = 0x94100008UL; /* mov %o0, %o2 */
257 adj_code[ 7] = 0x13000000UL; /* sethi %hi(wptr), %o1 */
258 adj_code[ 7] |= ((unsigned long)wptr) >> 10;
259 adj_code[ 8] = 0x11000000UL; /* sethi %hi(hptr), %o0 */
260 adj_code[ 8] |= ((unsigned long)hptr) >> 10;
261 adj_code[ 9] = 0x81C26000UL; /* jmp %o1 + %lo(wptr) */
262 adj_code[ 9] |= ((unsigned long)wptr) & 0x000003FFUL;
263 adj_code[10] = 0x90122000UL; /* or %o0, %lo(hptr), %o0 */
264 adj_code[10] |= ((unsigned long)hptr) & 0x000003FFUL;
266 adj_code[11] = (unsigned long)hptr;
269 asm("flush %0" : : "r" (adj_code ));
270 asm("flush %0" : : "r" (adj_code + 2));
271 asm("flush %0" : : "r" (adj_code + 4));
272 asm("flush %0" : : "r" (adj_code + 6));
273 asm("flush %0" : : "r" (adj_code + 10));
275 /* max. 5 instructions latency, and we need at >= 1 for returning */
280 #elif defined(alpha_TARGET_ARCH)
281 /* Magic constant computed by inspecting the code length of
282 the following assembly language snippet
283 (offset and machine code prefixed; note that the machine code
284 shown is longwords stored in little-endian order):
286 <00>: 46520414 mov a2, a4
287 <04>: 46100412 mov a0, a2
288 <08>: a61b0020 ldq a0, 0x20(pv) # load up hptr
289 <0c>: 46730415 mov a3, a5
290 <10>: a77b0028 ldq pv, 0x28(pv) # load up wptr
291 <14>: 46310413 mov a1, a3
292 <18>: 6bfb---- jmp (pv), <hint> # jump to wptr (with hint)
293 <1c>: 00000000 # padding for alignment
294 <20>: [8 bytes for hptr quadword]
295 <28>: [8 bytes for wptr quadword]
297 The "computed" jump at <08> above is really a jump to a fixed
298 location. Accordingly, we place an always-correct hint in the
299 jump instruction, namely the address offset from <0c> to wptr,
300 divided by 4, taking the lowest 14 bits.
302 We only support passing 4 or fewer argument words, for the same
303 reason described under sparc_TARGET_ARCH above by JRS, 21 Aug 01.
304 On the Alpha the first 6 integer arguments are in a0 through a5,
305 and the rest on the stack. Hence we want to shuffle the original
306 caller's arguments by two.
308 On the Alpha the calling convention is so complex and dependent
309 on the callee's signature -- for example, the stack pointer has
310 to be a multiple of 16 -- that it seems impossible to me [ccshan]
311 to handle the general case correctly without changing how the
312 adjustor is called from C. For now, our solution of shuffling
313 registers only and ignoring the stack only works if the original
314 caller passed 4 or fewer argument words.
316 TODO: Depending on how much allocation overhead stgMallocBytes uses for
317 header information (more precisely, if the overhead is no more than
318 4 bytes), we should move the first three instructions above down by
319 4 bytes (getting rid of the nop), hence saving memory. [ccshan]
321 ASSERT(((StgWord64)wptr & 3) == 0);
322 adjustor = stgMallocBytes(48, "createAdjustor");
323 StgWord64 *const code = (StgWord64 *)adjustor;
325 code[0] = 0x4610041246520414L;
326 code[1] = 0x46730415a61b0020L;
327 code[2] = 0x46310413a77b0028L;
328 code[3] = 0x000000006bfb0000L
329 | (((StgWord32*)(wptr) - (StgWord32*)(code) - 3) & 0x3fff);
331 code[4] = (StgWord64)hptr;
332 code[5] = (StgWord64)wptr;
334 /* Ensure that instruction cache is consistent with our new code */
335 __asm__ volatile("call_pal %0" : : "i" (PAL_imb));
336 #elif defined(powerpc_TARGET_ARCH)
338 For PowerPC, the following code is used:
346 lis r0,0xDEAD ;hi(wptr)
347 lis r3,0xDEAF ;hi(hptr)
348 ori r0,r0,0xBEEF ; lo(wptr)
349 ori r3,r3,0xFACE ; lo(hptr)
353 The arguments (passed in registers r3 - r10) are shuffled along by two to
354 make room for hptr and a dummy argument. As r9 and r10 are overwritten by
355 this code, it only works for up to 6 arguments (when floating point arguments
356 are involved, this may be more or less, depending on the exact situation).
358 adjustor = stgMallocBytes(4*13, "createAdjustor");
359 unsigned long *const adj_code = (unsigned long *)adjustor;
361 // make room for extra arguments
362 adj_code[0] = 0x7d0a4378; //mr r10,r8
363 adj_code[1] = 0x7ce93b78; //mr r9,r7
364 adj_code[2] = 0x7cc83378; //mr r8,r6
365 adj_code[3] = 0x7ca72b78; //mr r7,r5
366 adj_code[4] = 0x7c862378; //mr r6,r4
367 adj_code[5] = 0x7c651b78; //mr r5,r3
369 adj_code[6] = 0x3c000000; //lis r0,hi(wptr)
370 adj_code[6] |= ((unsigned long)wptr) >> 16;
372 adj_code[7] = 0x3c600000; //lis r3,hi(hptr)
373 adj_code[7] |= ((unsigned long)hptr) >> 16;
375 adj_code[8] = 0x60000000; //ori r0,r0,lo(wptr)
376 adj_code[8] |= ((unsigned long)wptr) & 0xFFFF;
378 adj_code[9] = 0x60630000; //ori r3,r3,lo(hptr)
379 adj_code[9] |= ((unsigned long)hptr) & 0xFFFF;
381 adj_code[10] = 0x7c0903a6; //mtctr r0
382 adj_code[11] = 0x4e800420; //bctr
383 adj_code[12] = (unsigned long)hptr;
385 /* Flush the Instruction cache: */
386 /* MakeDataExecutable(adjustor,4*13); */
387 /* This would require us to link with CoreServices.framework */
388 { /* this should do the same: */
390 unsigned long *p = adj_code;
392 __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0" : : "r" (p));
395 __asm__ volatile ("sync\n\tisync");
397 #elif defined(ia64_TARGET_ARCH)
399 Up to 8 inputs are passed in registers. We flush the last two inputs to
400 the stack, initially into the 16-byte scratch region left by the caller.
401 We then shuffle the others along by 4 (taking 2 registers for ourselves
402 to save return address and previous function state - we need to come back
403 here on the way out to restore the stack, so this is a real function
404 rather than just a trampoline).
406 The function descriptor we create contains the gp of the target function
407 so gp is already loaded correctly.
409 [MLX] alloc r16=ar.pfs,10,2,0
411 [MII] st8.spill [r12]=r38,8 // spill in6 (out4)
412 mov r41=r37 // out7 = in5 (out3)
413 mov r40=r36;; // out6 = in4 (out2)
414 [MII] st8.spill [r12]=r39 // spill in7 (out5)
416 mov r38=r34;; // out4 = in2 (out0)
417 [MII] mov r39=r35 // out5 = in3 (out1)
418 mov r37=r33 // out3 = in1 (loc1)
419 mov r36=r32 // out2 = in0 (loc0)
420 [MLX] adds r12=-24,r12 // update sp
421 movl r34=hptr;; // out0 = hptr
422 [MIB] mov r33=r16 // loc1 = ar.pfs
423 mov r32=b0 // loc0 = retaddr
424 br.call.sptk.many b0=b6;;
426 [MII] adds r12=-16,r12
431 br.ret.sptk.many b0;;
434 /* These macros distribute a long constant into the two words of an MLX bundle */
435 #define BITS(val,start,count) (((val) >> (start)) & ((1 << (count))-1))
436 #define MOVL_LOWORD(val) (BITS(val,22,18) << 46)
437 #define MOVL_HIWORD(val) (BITS(val,40,23) | (BITS(val,0,7) << 36) | (BITS(val,7,9) << 50) \
438 | (BITS(val,16,5) << 55) | (BITS(val,21,1) << 44) | BITS(val,63,1) << 59)
442 IA64FunDesc *wdesc = (IA64FunDesc *)wptr;
443 StgWord64 wcode = wdesc->ip;
447 /* we allocate on the Haskell heap since malloc'd memory isn't executable - argh */
448 adjustor = stgAllocStable(sizeof(IA64FunDesc)+18*8, &stable);
450 fdesc = (IA64FunDesc *)adjustor;
451 code = (StgWord64 *)(fdesc + 1);
452 fdesc->ip = (StgWord64)code;
453 fdesc->gp = wdesc->gp;
455 code[0] = 0x0000058004288004 | MOVL_LOWORD(wcode);
456 code[1] = 0x6000000220000000 | MOVL_HIWORD(wcode);
457 code[2] = 0x029015d818984001;
458 code[3] = 0x8401200500420094;
459 code[4] = 0x886011d8189c0001;
460 code[5] = 0x84011004c00380c0;
461 code[6] = 0x0250210046013800;
462 code[7] = 0x8401000480420084;
463 code[8] = 0x0000233f19a06005 | MOVL_LOWORD((StgWord64)hptr);
464 code[9] = 0x6000000440000000 | MOVL_HIWORD((StgWord64)hptr);
465 code[10] = 0x0200210020010811;
466 code[11] = 0x1080006800006200;
467 code[12] = 0x0000210018406000;
468 code[13] = 0x00aa021000038005;
469 code[14] = 0x000000010000001d;
470 code[15] = 0x0084000880000200;
472 /* save stable pointers in convenient form */
473 code[16] = (StgWord64)hptr;
474 code[17] = (StgWord64)stable;
477 barf("adjustor creation not supported on this platform");
492 freeHaskellFunctionPtr(void* ptr)
494 #if defined(i386_TARGET_ARCH)
495 if ( *(unsigned char*)ptr != 0x68 &&
496 *(unsigned char*)ptr != 0x58 ) {
497 prog_belch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
501 /* Free the stable pointer first..*/
502 if (*(unsigned char*)ptr == 0x68) { /* Aha, a ccall adjustor! */
503 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x01)));
505 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x02)));
507 #elif defined(sparc_TARGET_ARCH)
508 if ( *(unsigned long*)ptr != 0x9C23A008UL ) {
509 prog_belch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
513 /* Free the stable pointer first..*/
514 freeStablePtr(*((StgStablePtr*)((unsigned long*)ptr + 11)));
515 #elif defined(alpha_TARGET_ARCH)
516 if ( *(StgWord64*)ptr != 0xa77b0018a61b0010L ) {
517 prog_belch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
521 /* Free the stable pointer first..*/
522 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x10)));
523 #elif defined(powerpc_TARGET_ARCH)
524 if ( *(StgWord*)ptr != 0x7d0a4378 ) {
525 prog_belch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
528 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 4*12)));
529 #elif defined(ia64_TARGET_ARCH)
530 IA64FunDesc *fdesc = (IA64FunDesc *)ptr;
531 StgWord64 *code = (StgWord64 *)(fdesc+1);
533 if (fdesc->ip != (StgWord64)code) {
534 prog_belch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
537 freeStablePtr((StgStablePtr)code[16]);
538 freeStablePtr((StgStablePtr)code[17]);
543 *((unsigned char*)ptr) = '\0';
550 * Function: initAdjustor()
552 * Perform initialisation of adjustor thunk layer (if needed.)
557 #if defined(i386_TARGET_ARCH)
558 /* Now here's something obscure for you:
560 When generating an adjustor thunk that uses the C calling
561 convention, we have to make sure that the thunk kicks off
562 the process of jumping into Haskell with a tail jump. Why?
563 Because as a result of jumping in into Haskell we may end
564 up freeing the very adjustor thunk we came from using
565 freeHaskellFunctionPtr(). Hence, we better not return to
566 the adjustor code on our way out, since it could by then
569 The fix is readily at hand, just include the opcodes
570 for the C stack fixup code that we need to perform when
571 returning in some static piece of memory and arrange
572 to return to it before tail jumping from the adjustor thunk.
575 obscure_ccall_ret_code = stgMallocBytes(4, "initAdjustor");
577 obscure_ccall_ret_code[0x00] = (unsigned char)0x83; /* addl $0x4, %esp */
578 obscure_ccall_ret_code[0x01] = (unsigned char)0xc4;
579 obscure_ccall_ret_code[0x02] = (unsigned char)0x04;
581 obscure_ccall_ret_code[0x03] = (unsigned char)0xc3; /* ret */
583 execPage(obscure_ccall_ret_code, pageExecuteRead);