1 /* -----------------------------------------------------------------------------
2 * Foreign export adjustor thunks
6 * ---------------------------------------------------------------------------*/
8 /* A little bit of background...
10 An adjustor thunk is a dynamically allocated code snippet that allows
11 Haskell closures to be viewed as C function pointers.
13 Stable pointers provide a way for the outside world to get access to,
14 and evaluate, Haskell heap objects, with the RTS providing a small
15 range of ops for doing so. So, assuming we've got a stable pointer in
16 our hand in C, we can jump into the Haskell world and evaluate a callback
17 procedure, say. This works OK in some cases where callbacks are used, but
18 does require the external code to know about stable pointers and how to deal
19 with them. We'd like to hide the Haskell-nature of a callback and have it
20 be invoked just like any other C function pointer.
22 Enter adjustor thunks. An adjustor thunk is a little piece of code
23 that's generated on-the-fly (one per Haskell closure being exported)
24 that, when entered using some 'universal' calling convention (e.g., the
25 C calling convention on platform X), pushes an implicit stable pointer
26 (to the Haskell callback) before calling another (static) C function stub
27 which takes care of entering the Haskell code via its stable pointer.
29 An adjustor thunk is allocated on the C heap, and is called from within
30 Haskell just before handing out the function pointer to the Haskell (IO)
31 action. User code should never have to invoke it explicitly.
33 An adjustor thunk differs from a C function pointer in one respect: when
34 the code is through with it, it has to be freed in order to release Haskell
35 and C resources. Failure to do so results in memory leaks on both the C and
39 #include "PosixSource.h"
41 #include "RtsExternal.h"
49 #if defined(openbsd_TARGET_OS)
51 #include <sys/types.h>
54 /* no C99 header stdint.h on OpenBSD? */
55 typedef unsigned long my_uintptr_t;
58 /* Heavily arch-specific, I'm afraid.. */
61 * Allocate len bytes which are readable, writable, and executable.
63 * ToDo: If this turns out to be a performance bottleneck, one could
64 * e.g. cache the last VirtualProtect/mprotect-ed region and do
65 * nothing in case of a cache hit.
68 mallocBytesRWX(int len)
70 void *addr = stgMallocBytes(len, "mallocBytesRWX");
71 #if defined(i386_TARGET_ARCH) && defined(_WIN32)
72 /* This could be necessary for processors which distinguish between READ and
73 EXECUTE memory accesses, e.g. Itaniums. */
74 DWORD dwOldProtect = 0;
75 if (VirtualProtect (addr, len, PAGE_EXECUTE_READWRITE, &dwOldProtect) == 0) {
76 barf("mallocBytesRWX: failed to protect 0x%p; error=%lu; old protection: %lu\n",
77 addr, (unsigned long)GetLastError(), (unsigned long)dwOldProtect);
79 #elif defined(openbsd_TARGET_OS)
80 /* malloced memory isn't executable by default on OpenBSD */
81 my_uintptr_t pageSize = sysconf(_SC_PAGESIZE);
82 my_uintptr_t mask = ~(pageSize - 1);
83 my_uintptr_t startOfFirstPage = ((my_uintptr_t)addr ) & mask;
84 my_uintptr_t startOfLastPage = ((my_uintptr_t)addr + len - 1) & mask;
85 my_uintptr_t size = startOfLastPage - startOfFirstPage + pageSize;
86 if (mprotect((void*)startOfFirstPage, (size_t)size, PROT_EXEC | PROT_READ | PROT_WRITE) != 0) {
87 barf("mallocBytesRWX: failed to protect 0x%p\n", addr);
93 #if defined(i386_TARGET_ARCH)
94 static unsigned char *obscure_ccall_ret_code;
97 #if defined(alpha_TARGET_ARCH)
98 /* To get the definition of PAL_imb: */
99 # if defined(linux_TARGET_OS)
100 # include <asm/pal.h>
102 # include <machine/pal.h>
106 #if defined(ia64_TARGET_ARCH)
109 /* Layout of a function descriptor */
110 typedef struct _IA64FunDesc {
116 stgAllocStable(size_t size_in_bytes, StgStablePtr *stable)
119 nat data_size_in_words, total_size_in_words;
121 /* round up to a whole number of words */
122 data_size_in_words = (size_in_bytes + sizeof(W_) + 1) / sizeof(W_);
123 total_size_in_words = sizeofW(StgArrWords) + data_size_in_words;
125 /* allocate and fill it in */
126 arr = (StgArrWords *)allocate(total_size_in_words);
127 SET_ARR_HDR(arr, &stg_ARR_WORDS_info, CCCS, data_size_in_words);
129 /* obtain a stable ptr */
130 *stable = getStablePtr((StgPtr)arr);
132 /* and return a ptr to the goods inside the array */
133 return(BYTE_ARR_CTS(arr));
138 createAdjustor(int cconv, StgStablePtr hptr, StgFunPtr wptr)
140 void *adjustor = NULL;
144 case 0: /* _stdcall */
145 #if defined(i386_TARGET_ARCH)
146 /* Magic constant computed by inspecting the code length of
147 the following assembly language snippet
148 (offset and machine code prefixed):
150 <0>: 58 popl %eax # temp. remove ret addr..
151 <1>: 68 fd fc fe fa pushl 0xfafefcfd # constant is large enough to
152 # hold a StgStablePtr
153 <6>: 50 pushl %eax # put back ret. addr
154 <7>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
155 <c>: ff e0 jmp %eax # and jump to it.
156 # the callee cleans up the stack
158 adjustor = mallocBytesRWX(14);
160 unsigned char *const adj_code = (unsigned char *)adjustor;
161 adj_code[0x00] = (unsigned char)0x58; /* popl %eax */
163 adj_code[0x01] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
164 *((StgStablePtr*)(adj_code + 0x02)) = (StgStablePtr)hptr;
166 adj_code[0x06] = (unsigned char)0x50; /* pushl %eax */
168 adj_code[0x07] = (unsigned char)0xb8; /* movl $wptr, %eax */
169 *((StgFunPtr*)(adj_code + 0x08)) = (StgFunPtr)wptr;
171 adj_code[0x0c] = (unsigned char)0xff; /* jmp %eax */
172 adj_code[0x0d] = (unsigned char)0xe0;
178 #if defined(i386_TARGET_ARCH)
179 /* Magic constant computed by inspecting the code length of
180 the following assembly language snippet
181 (offset and machine code prefixed):
183 <00>: 68 ef be ad de pushl $0xdeadbeef # constant is large enough to
184 # hold a StgStablePtr
185 <05>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
186 <0a>: 68 ef be ad de pushl $obscure_ccall_ret_code # push the return address
187 <0f>: ff e0 jmp *%eax # jump to wptr
189 The ccall'ing version is a tad different, passing in the return
190 address of the caller to the auto-generated C stub (which enters
191 via the stable pointer.) (The auto-generated C stub is in on this
192 game, don't worry :-)
194 See the comment next to obscure_ccall_ret_code why we need to
195 perform a tail jump instead of a call, followed by some C stack
198 Note: The adjustor makes the assumption that any return value
199 coming back from the C stub is not stored on the stack.
200 That's (thankfully) the case here with the restricted set of
201 return types that we support.
203 adjustor = mallocBytesRWX(17);
205 unsigned char *const adj_code = (unsigned char *)adjustor;
207 adj_code[0x00] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
208 *((StgStablePtr*)(adj_code+0x01)) = (StgStablePtr)hptr;
210 adj_code[0x05] = (unsigned char)0xb8; /* movl $wptr, %eax */
211 *((StgFunPtr*)(adj_code + 0x06)) = (StgFunPtr)wptr;
213 adj_code[0x0a] = (unsigned char)0x68; /* pushl obscure_ccall_ret_code */
214 *((StgFunPtr*)(adj_code + 0x0b)) = (StgFunPtr)obscure_ccall_ret_code;
216 adj_code[0x0f] = (unsigned char)0xff; /* jmp *%eax */
217 adj_code[0x10] = (unsigned char)0xe0;
219 #elif defined(sparc_TARGET_ARCH)
220 /* Magic constant computed by inspecting the code length of the following
221 assembly language snippet (offset and machine code prefixed):
223 <00>: 9C23A008 sub %sp, 8, %sp ! make room for %o4/%o5 in caller's frame
224 <04>: DA23A060 st %o5, [%sp + 96] ! shift registers by 2 positions
225 <08>: D823A05C st %o4, [%sp + 92]
226 <0C>: 9A10000B mov %o3, %o5
227 <10>: 9810000A mov %o2, %o4
228 <14>: 96100009 mov %o1, %o3
229 <18>: 94100008 mov %o0, %o2
230 <1C>: 13000000 sethi %hi(wptr), %o1 ! load up wptr (1 of 2)
231 <20>: 11000000 sethi %hi(hptr), %o0 ! load up hptr (1 of 2)
232 <24>: 81C26000 jmp %o1 + %lo(wptr) ! jump to wptr (load 2 of 2)
233 <28>: 90122000 or %o0, %lo(hptr), %o0 ! load up hptr (2 of 2, delay slot)
234 <2C> 00000000 ! place for getting hptr back easily
236 ccall'ing on SPARC is easy, because we are quite lucky to push a
237 multiple of 8 bytes (1 word hptr + 1 word dummy arg) in front of the
238 existing arguments (note that %sp must stay double-word aligned at
239 all times, see ABI spec at http://www.sparc.org/standards/psABI3rd.pdf).
240 To do this, we extend the *caller's* stack frame by 2 words and shift
241 the output registers used for argument passing (%o0 - %o5, we are a *leaf*
242 procedure because of the tail-jump) by 2 positions. This makes room in
243 %o0 and %o1 for the additinal arguments, namely hptr and a dummy (used
244 for destination addr of jump on SPARC, return address on x86, ...). This
245 shouldn't cause any problems for a C-like caller: alloca is implemented
246 similarly, and local variables should be accessed via %fp, not %sp. In a
247 nutshell: This should work! (Famous last words! :-)
249 adjustor = mallocBytesRWX(4*(11+1));
251 unsigned long *const adj_code = (unsigned long *)adjustor;
253 adj_code[ 0] = 0x9C23A008UL; /* sub %sp, 8, %sp */
254 adj_code[ 1] = 0xDA23A060UL; /* st %o5, [%sp + 96] */
255 adj_code[ 2] = 0xD823A05CUL; /* st %o4, [%sp + 92] */
256 adj_code[ 3] = 0x9A10000BUL; /* mov %o3, %o5 */
257 adj_code[ 4] = 0x9810000AUL; /* mov %o2, %o4 */
258 adj_code[ 5] = 0x96100009UL; /* mov %o1, %o3 */
259 adj_code[ 6] = 0x94100008UL; /* mov %o0, %o2 */
260 adj_code[ 7] = 0x13000000UL; /* sethi %hi(wptr), %o1 */
261 adj_code[ 7] |= ((unsigned long)wptr) >> 10;
262 adj_code[ 8] = 0x11000000UL; /* sethi %hi(hptr), %o0 */
263 adj_code[ 8] |= ((unsigned long)hptr) >> 10;
264 adj_code[ 9] = 0x81C26000UL; /* jmp %o1 + %lo(wptr) */
265 adj_code[ 9] |= ((unsigned long)wptr) & 0x000003FFUL;
266 adj_code[10] = 0x90122000UL; /* or %o0, %lo(hptr), %o0 */
267 adj_code[10] |= ((unsigned long)hptr) & 0x000003FFUL;
269 adj_code[11] = (unsigned long)hptr;
272 asm("flush %0" : : "r" (adj_code ));
273 asm("flush %0" : : "r" (adj_code + 2));
274 asm("flush %0" : : "r" (adj_code + 4));
275 asm("flush %0" : : "r" (adj_code + 6));
276 asm("flush %0" : : "r" (adj_code + 10));
278 /* max. 5 instructions latency, and we need at >= 1 for returning */
284 #elif defined(alpha_TARGET_ARCH)
285 /* Magic constant computed by inspecting the code length of
286 the following assembly language snippet
287 (offset and machine code prefixed; note that the machine code
288 shown is longwords stored in little-endian order):
290 <00>: 46520414 mov a2, a4
291 <04>: 46100412 mov a0, a2
292 <08>: a61b0020 ldq a0, 0x20(pv) # load up hptr
293 <0c>: 46730415 mov a3, a5
294 <10>: a77b0028 ldq pv, 0x28(pv) # load up wptr
295 <14>: 46310413 mov a1, a3
296 <18>: 6bfb---- jmp (pv), <hint> # jump to wptr (with hint)
297 <1c>: 00000000 # padding for alignment
298 <20>: [8 bytes for hptr quadword]
299 <28>: [8 bytes for wptr quadword]
301 The "computed" jump at <08> above is really a jump to a fixed
302 location. Accordingly, we place an always-correct hint in the
303 jump instruction, namely the address offset from <0c> to wptr,
304 divided by 4, taking the lowest 14 bits.
306 We only support passing 4 or fewer argument words, for the same
307 reason described under sparc_TARGET_ARCH above by JRS, 21 Aug 01.
308 On the Alpha the first 6 integer arguments are in a0 through a5,
309 and the rest on the stack. Hence we want to shuffle the original
310 caller's arguments by two.
312 On the Alpha the calling convention is so complex and dependent
313 on the callee's signature -- for example, the stack pointer has
314 to be a multiple of 16 -- that it seems impossible to me [ccshan]
315 to handle the general case correctly without changing how the
316 adjustor is called from C. For now, our solution of shuffling
317 registers only and ignoring the stack only works if the original
318 caller passed 4 or fewer argument words.
320 TODO: Depending on how much allocation overhead stgMallocBytes uses for
321 header information (more precisely, if the overhead is no more than
322 4 bytes), we should move the first three instructions above down by
323 4 bytes (getting rid of the nop), hence saving memory. [ccshan]
325 ASSERT(((StgWord64)wptr & 3) == 0);
326 adjustor = mallocBytesRWX(48);
328 StgWord64 *const code = (StgWord64 *)adjustor;
330 code[0] = 0x4610041246520414L;
331 code[1] = 0x46730415a61b0020L;
332 code[2] = 0x46310413a77b0028L;
333 code[3] = 0x000000006bfb0000L
334 | (((StgWord32*)(wptr) - (StgWord32*)(code) - 3) & 0x3fff);
336 code[4] = (StgWord64)hptr;
337 code[5] = (StgWord64)wptr;
339 /* Ensure that instruction cache is consistent with our new code */
340 __asm__ volatile("call_pal %0" : : "i" (PAL_imb));
342 #elif defined(powerpc_TARGET_ARCH)
344 For PowerPC, the following code is used:
352 lis r0,0xDEAD ;hi(wptr)
353 lis r3,0xDEAF ;hi(hptr)
354 ori r0,r0,0xBEEF ; lo(wptr)
355 ori r3,r3,0xFACE ; lo(hptr)
359 The arguments (passed in registers r3 - r10) are shuffled along by two to
360 make room for hptr and a dummy argument. As r9 and r10 are overwritten by
361 this code, it only works for up to 6 arguments (when floating point arguments
362 are involved, this may be more or less, depending on the exact situation).
364 adjustor = mallocBytesRWX(4*13);
366 unsigned long *const adj_code = (unsigned long *)adjustor;
368 // make room for extra arguments
369 adj_code[0] = 0x7d0a4378; //mr r10,r8
370 adj_code[1] = 0x7ce93b78; //mr r9,r7
371 adj_code[2] = 0x7cc83378; //mr r8,r6
372 adj_code[3] = 0x7ca72b78; //mr r7,r5
373 adj_code[4] = 0x7c862378; //mr r6,r4
374 adj_code[5] = 0x7c651b78; //mr r5,r3
376 adj_code[6] = 0x3c000000; //lis r0,hi(wptr)
377 adj_code[6] |= ((unsigned long)wptr) >> 16;
379 adj_code[7] = 0x3c600000; //lis r3,hi(hptr)
380 adj_code[7] |= ((unsigned long)hptr) >> 16;
382 adj_code[8] = 0x60000000; //ori r0,r0,lo(wptr)
383 adj_code[8] |= ((unsigned long)wptr) & 0xFFFF;
385 adj_code[9] = 0x60630000; //ori r3,r3,lo(hptr)
386 adj_code[9] |= ((unsigned long)hptr) & 0xFFFF;
388 adj_code[10] = 0x7c0903a6; //mtctr r0
389 adj_code[11] = 0x4e800420; //bctr
390 adj_code[12] = (unsigned long)hptr;
392 // Flush the Instruction cache:
393 // MakeDataExecutable(adjustor,4*13);
394 /* This would require us to link with CoreServices.framework */
395 { /* this should do the same: */
397 unsigned long *p = adj_code;
400 __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
404 __asm__ volatile ("sync\n\tisync");
407 #elif defined(ia64_TARGET_ARCH)
409 Up to 8 inputs are passed in registers. We flush the last two inputs to
410 the stack, initially into the 16-byte scratch region left by the caller.
411 We then shuffle the others along by 4 (taking 2 registers for ourselves
412 to save return address and previous function state - we need to come back
413 here on the way out to restore the stack, so this is a real function
414 rather than just a trampoline).
416 The function descriptor we create contains the gp of the target function
417 so gp is already loaded correctly.
419 [MLX] alloc r16=ar.pfs,10,2,0
421 [MII] st8.spill [r12]=r38,8 // spill in6 (out4)
422 mov r41=r37 // out7 = in5 (out3)
423 mov r40=r36;; // out6 = in4 (out2)
424 [MII] st8.spill [r12]=r39 // spill in7 (out5)
426 mov r38=r34;; // out4 = in2 (out0)
427 [MII] mov r39=r35 // out5 = in3 (out1)
428 mov r37=r33 // out3 = in1 (loc1)
429 mov r36=r32 // out2 = in0 (loc0)
430 [MLX] adds r12=-24,r12 // update sp
431 movl r34=hptr;; // out0 = hptr
432 [MIB] mov r33=r16 // loc1 = ar.pfs
433 mov r32=b0 // loc0 = retaddr
434 br.call.sptk.many b0=b6;;
436 [MII] adds r12=-16,r12
441 br.ret.sptk.many b0;;
444 /* These macros distribute a long constant into the two words of an MLX bundle */
445 #define BITS(val,start,count) (((val) >> (start)) & ((1 << (count))-1))
446 #define MOVL_LOWORD(val) (BITS(val,22,18) << 46)
447 #define MOVL_HIWORD(val) (BITS(val,40,23) | (BITS(val,0,7) << 36) | (BITS(val,7,9) << 50) \
448 | (BITS(val,16,5) << 55) | (BITS(val,21,1) << 44) | BITS(val,63,1) << 59)
452 IA64FunDesc *wdesc = (IA64FunDesc *)wptr;
453 StgWord64 wcode = wdesc->ip;
457 /* we allocate on the Haskell heap since malloc'd memory isn't executable - argh */
458 adjustor = stgAllocStable(sizeof(IA64FunDesc)+18*8, &stable);
460 fdesc = (IA64FunDesc *)adjustor;
461 code = (StgWord64 *)(fdesc + 1);
462 fdesc->ip = (StgWord64)code;
463 fdesc->gp = wdesc->gp;
465 code[0] = 0x0000058004288004 | MOVL_LOWORD(wcode);
466 code[1] = 0x6000000220000000 | MOVL_HIWORD(wcode);
467 code[2] = 0x029015d818984001;
468 code[3] = 0x8401200500420094;
469 code[4] = 0x886011d8189c0001;
470 code[5] = 0x84011004c00380c0;
471 code[6] = 0x0250210046013800;
472 code[7] = 0x8401000480420084;
473 code[8] = 0x0000233f19a06005 | MOVL_LOWORD((StgWord64)hptr);
474 code[9] = 0x6000000440000000 | MOVL_HIWORD((StgWord64)hptr);
475 code[10] = 0x0200210020010811;
476 code[11] = 0x1080006800006200;
477 code[12] = 0x0000210018406000;
478 code[13] = 0x00aa021000038005;
479 code[14] = 0x000000010000001d;
480 code[15] = 0x0084000880000200;
482 /* save stable pointers in convenient form */
483 code[16] = (StgWord64)hptr;
484 code[17] = (StgWord64)stable;
487 barf("adjustor creation not supported on this platform");
502 freeHaskellFunctionPtr(void* ptr)
504 #if defined(i386_TARGET_ARCH)
505 if ( *(unsigned char*)ptr != 0x68 &&
506 *(unsigned char*)ptr != 0x58 ) {
507 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
511 /* Free the stable pointer first..*/
512 if (*(unsigned char*)ptr == 0x68) { /* Aha, a ccall adjustor! */
513 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x01)));
515 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x02)));
517 #elif defined(sparc_TARGET_ARCH)
518 if ( *(unsigned long*)ptr != 0x9C23A008UL ) {
519 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
523 /* Free the stable pointer first..*/
524 freeStablePtr(*((StgStablePtr*)((unsigned long*)ptr + 11)));
525 #elif defined(alpha_TARGET_ARCH)
526 if ( *(StgWord64*)ptr != 0xa77b0018a61b0010L ) {
527 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
531 /* Free the stable pointer first..*/
532 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x10)));
533 #elif defined(powerpc_TARGET_ARCH)
534 if ( *(StgWord*)ptr != 0x7d0a4378 ) {
535 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
538 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 4*12)));
539 #elif defined(ia64_TARGET_ARCH)
540 IA64FunDesc *fdesc = (IA64FunDesc *)ptr;
541 StgWord64 *code = (StgWord64 *)(fdesc+1);
543 if (fdesc->ip != (StgWord64)code) {
544 errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
547 freeStablePtr((StgStablePtr)code[16]);
548 freeStablePtr((StgStablePtr)code[17]);
553 *((unsigned char*)ptr) = '\0';
560 * Function: initAdjustor()
562 * Perform initialisation of adjustor thunk layer (if needed.)
567 #if defined(i386_TARGET_ARCH)
568 /* Now here's something obscure for you:
570 When generating an adjustor thunk that uses the C calling
571 convention, we have to make sure that the thunk kicks off
572 the process of jumping into Haskell with a tail jump. Why?
573 Because as a result of jumping in into Haskell we may end
574 up freeing the very adjustor thunk we came from using
575 freeHaskellFunctionPtr(). Hence, we better not return to
576 the adjustor code on our way out, since it could by then
579 The fix is readily at hand, just include the opcodes
580 for the C stack fixup code that we need to perform when
581 returning in some static piece of memory and arrange
582 to return to it before tail jumping from the adjustor thunk.
585 obscure_ccall_ret_code = mallocBytesRWX(4);
587 obscure_ccall_ret_code[0x00] = (unsigned char)0x83; /* addl $0x4, %esp */
588 obscure_ccall_ret_code[0x01] = (unsigned char)0xc4;
589 obscure_ccall_ret_code[0x02] = (unsigned char)0x04;
591 obscure_ccall_ret_code[0x03] = (unsigned char)0xc3; /* ret */