1 /* -----------------------------------------------------------------------------
2 * Foreign export adjustor thunks
6 * ---------------------------------------------------------------------------*/
8 /* A little bit of background...
10 An adjustor thunk is a dynamically allocated code snippet that allows
11 Haskell closures to be viewed as C function pointers.
13 Stable pointers provide a way for the outside world to get access to,
14 and evaluate, Haskell heap objects, with the RTS providing a small
15 range of ops for doing so. So, assuming we've got a stable pointer in
16 our hand in C, we can jump into the Haskell world and evaluate a callback
17 procedure, say. This works OK in some cases where callbacks are used, but
18 does require the external code to know about stable pointers and how to deal
19 with them. We'd like to hide the Haskell-nature of a callback and have it
20 be invoked just like any other C function pointer.
22 Enter adjustor thunks. An adjustor thunk is a little piece of code
23 that's generated on-the-fly (one per Haskell closure being exported)
24 that, when entered using some 'universal' calling convention (e.g., the
25 C calling convention on platform X), pushes an implicit stable pointer
26 (to the Haskell callback) before calling another (static) C function stub
27 which takes care of entering the Haskell code via its stable pointer.
29 An adjustor thunk is allocated on the C heap, and is called from within
30 Haskell just before handing out the function pointer to the Haskell (IO)
31 action. User code should never have to invoke it explicitly.
33 An adjustor thunk differs from a C function pointer in one respect: when
34 the code is through with it, it has to be freed in order to release Haskell
35 and C resources. Failure to do so results in memory leaks on both the C and
39 #include "PosixSource.h"
46 /* Heavily arch-specific, I'm afraid.. */
48 #if defined(i386_TARGET_ARCH)
49 /* Now here's something obscure for you:
51 When generating an adjustor thunk that uses the C calling
52 convention, we have to make sure that the thunk kicks off
53 the process of jumping into Haskell with a tail jump. Why?
54 Because as a result of jumping into Haskell we may end
55 up freeing the very adjustor thunk we came from using
56 freeHaskellFunctionPtr(). Hence, we better not return to
57 the adjustor code on our way out, since it could by then
60 The fix is readily at hand, just include the opcodes
61 for the C stack fixup code that we need to perform when
62 returning in some static piece of memory and arrange
63 to return to it before tail jumping from the adjustor thunk.
65 For this to work we make the assumption that bytes in .data
66 are considered executable.
68 static unsigned char __obscure_ccall_ret_code [] =
69 { 0x83, 0xc4, 0x04 /* addl $0x4, %esp */
74 #if defined(alpha_TARGET_ARCH)
75 /* To get the definition of PAL_imb: */
76 #include <machine/pal.h>
79 #if defined(ia64_TARGET_ARCH)
82 /* Layout of a function descriptor */
83 typedef struct _IA64FunDesc {
89 stgAllocStable(size_t size_in_bytes, StgStablePtr *stable)
92 nat data_size_in_words, total_size_in_words;
94 /* round up to a whole number of words */
95 data_size_in_words = (size_in_bytes + sizeof(W_) + 1) / sizeof(W_);
96 total_size_in_words = sizeofW(StgArrWords) + data_size_in_words;
98 /* allocate and fill it in */
99 arr = (StgArrWords *)allocate(total_size_in_words);
100 SET_ARR_HDR(arr, &stg_ARR_WORDS_info, CCCS, data_size_in_words);
102 /* obtain a stable ptr */
103 *stable = getStablePtr((StgPtr)arr);
105 /* and return a ptr to the goods inside the array */
106 return(BYTE_ARR_CTS(arr));
111 createAdjustor(int cconv, StgStablePtr hptr, StgFunPtr wptr)
113 void *adjustor = NULL;
117 case 0: /* _stdcall */
118 #if defined(i386_TARGET_ARCH)
119 /* Magic constant computed by inspecting the code length of
120 the following assembly language snippet
121 (offset and machine code prefixed):
123 <0>: 58 popl %eax # temp. remove ret addr..
124 <1>: 68 fd fc fe fa pushl 0xfafefcfd # constant is large enough to
125 # hold a StgStablePtr
126 <6>: 50 pushl %eax # put back ret. addr
127 <7>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
128 <c>: ff e0 jmp %eax # and jump to it.
129 # the callee cleans up the stack
131 if ((adjustor = stgMallocBytes(14, "createAdjustor")) != NULL) {
132 unsigned char *const adj_code = (unsigned char *)adjustor;
133 adj_code[0x00] = (unsigned char)0x58; /* popl %eax */
135 adj_code[0x01] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
136 *((StgStablePtr*)(adj_code + 0x02)) = (StgStablePtr)hptr;
138 adj_code[0x06] = (unsigned char)0x50; /* pushl %eax */
140 adj_code[0x07] = (unsigned char)0xb8; /* movl $wptr, %eax */
141 *((StgFunPtr*)(adj_code + 0x08)) = (StgFunPtr)wptr;
143 adj_code[0x0c] = (unsigned char)0xff; /* jmp %eax */
144 adj_code[0x0d] = (unsigned char)0xe0;
150 #if defined(i386_TARGET_ARCH)
151 /* Magic constant computed by inspecting the code length of
152 the following assembly language snippet
153 (offset and machine code prefixed):
155 <00>: 68 ef be ad de pushl $0xdeadbeef # constant is large enough to
156 # hold a StgStablePtr
157 <05>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
158 <0a>: 68 ef be ad de pushl $__obscure_ccall_ret_code # push the return address
159 <0f>: ff e0 jmp *%eax # jump to wptr
161 The ccall'ing version is a tad different, passing in the return
162 address of the caller to the auto-generated C stub (which enters
163 via the stable pointer.) (The auto-generated C stub is in on this
164 game, don't worry :-)
166 See the comment next to __obscure_ccall_ret_code why we need to
167 perform a tail jump instead of a call, followed by some C stack
170 Note: The adjustor makes the assumption that any return value
171 coming back from the C stub is not stored on the stack.
172 That's (thankfully) the case here with the restricted set of
173 return types that we support.
175 if ((adjustor = stgMallocBytes(17, "createAdjustor")) != NULL) {
176 unsigned char *const adj_code = (unsigned char *)adjustor;
178 adj_code[0x00] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
179 *((StgStablePtr*)(adj_code+0x01)) = (StgStablePtr)hptr;
181 adj_code[0x05] = (unsigned char)0xb8; /* movl $wptr, %eax */
182 *((StgFunPtr*)(adj_code + 0x06)) = (StgFunPtr)wptr;
184 adj_code[0x0a] = (unsigned char)0x68; /* pushl __obscure_ccall_ret_code */
185 *((StgFunPtr*)(adj_code + 0x0b)) = (StgFunPtr)__obscure_ccall_ret_code;
187 adj_code[0x0f] = (unsigned char)0xff; /* jmp *%eax */
188 adj_code[0x10] = (unsigned char)0xe0;
190 #elif defined(sparc_TARGET_ARCH)
191 /* Magic constant computed by inspecting the code length of the following
192 assembly language snippet (offset and machine code prefixed):
194 <00>: 9C23A008 sub %sp, 8, %sp ! make room for %o4/%o5 in caller's frame
195 <04>: DA23A060 st %o5, [%sp + 96] ! shift registers by 2 positions
196 <08>: D823A05C st %o4, [%sp + 92]
197 <0C>: 9A10000B mov %o3, %o5
198 <10>: 9810000A mov %o2, %o4
199 <14>: 96100009 mov %o1, %o3
200 <18>: 94100008 mov %o0, %o2
201 <1C>: 13000000 sethi %hi(wptr), %o1 ! load up wptr (1 of 2)
202 <20>: 11000000 sethi %hi(hptr), %o0 ! load up hptr (1 of 2)
203 <24>: 81C26000 jmp %o1 + %lo(wptr) ! jump to wptr (load 2 of 2)
204 <28>: 90122000 or %o0, %lo(hptr), %o0 ! load up hptr (2 of 2, delay slot)
205 <2C> 00000000 ! place for getting hptr back easily
207 ccall'ing on SPARC is easy, because we are quite lucky to push a
208 multiple of 8 bytes (1 word hptr + 1 word dummy arg) in front of the
209 existing arguments (note that %sp must stay double-word aligned at
210 all times, see ABI spec at http://www.sparc.org/standards/psABI3rd.pdf).
211 To do this, we extend the *caller's* stack frame by 2 words and shift
212 the output registers used for argument passing (%o0 - %o5, we are a *leaf*
213 procedure because of the tail-jump) by 2 positions. This makes room in
214 %o0 and %o1 for the additinal arguments, namely hptr and a dummy (used
215 for destination addr of jump on SPARC, return address on x86, ...). This
216 shouldn't cause any problems for a C-like caller: alloca is implemented
217 similarly, and local variables should be accessed via %fp, not %sp. In a
218 nutshell: This should work! (Famous last words! :-)
220 if ((adjustor = stgMallocBytes(4*(11+1), "createAdjustor")) != NULL) {
221 unsigned long *const adj_code = (unsigned long *)adjustor;
223 adj_code[ 0] = 0x9C23A008UL; /* sub %sp, 8, %sp */
224 adj_code[ 1] = 0xDA23A060UL; /* st %o5, [%sp + 96] */
225 adj_code[ 2] = 0xD823A05CUL; /* st %o4, [%sp + 92] */
226 adj_code[ 3] = 0x9A10000BUL; /* mov %o3, %o5 */
227 adj_code[ 4] = 0x9810000AUL; /* mov %o2, %o4 */
228 adj_code[ 5] = 0x96100009UL; /* mov %o1, %o3 */
229 adj_code[ 6] = 0x94100008UL; /* mov %o0, %o2 */
230 adj_code[ 7] = 0x13000000UL; /* sethi %hi(wptr), %o1 */
231 adj_code[ 7] |= ((unsigned long)wptr) >> 10;
232 adj_code[ 8] = 0x11000000UL; /* sethi %hi(hptr), %o0 */
233 adj_code[ 8] |= ((unsigned long)hptr) >> 10;
234 adj_code[ 9] = 0x81C26000UL; /* jmp %o1 + %lo(wptr) */
235 adj_code[ 9] |= ((unsigned long)wptr) & 0x000003FFUL;
236 adj_code[10] = 0x90122000UL; /* or %o0, %lo(hptr), %o0 */
237 adj_code[10] |= ((unsigned long)hptr) & 0x000003FFUL;
239 adj_code[11] = (unsigned long)hptr;
242 asm("flush %0" : : "r" (adj_code ));
243 asm("flush %0" : : "r" (adj_code + 2));
244 asm("flush %0" : : "r" (adj_code + 4));
245 asm("flush %0" : : "r" (adj_code + 6));
246 asm("flush %0" : : "r" (adj_code + 10));
248 /* max. 5 instructions latency, and we need at >= 1 for returning */
254 #elif defined(alpha_TARGET_ARCH)
255 /* Magic constant computed by inspecting the code length of
256 the following assembly language snippet
257 (offset and machine code prefixed; note that the machine code
258 shown is longwords stored in little-endian order):
260 <00>: 46520414 mov a2, a4
261 <04>: 46100412 mov a0, a2
262 <08>: a61b0020 ldq a0, 0x20(pv) # load up hptr
263 <0c>: 46730415 mov a3, a5
264 <10>: a77b0028 ldq pv, 0x28(pv) # load up wptr
265 <14>: 46310413 mov a1, a3
266 <18>: 6bfb---- jmp (pv), <hint> # jump to wptr (with hint)
267 <1c>: 00000000 # padding for alignment
268 <20>: [8 bytes for hptr quadword]
269 <28>: [8 bytes for wptr quadword]
271 The "computed" jump at <08> above is really a jump to a fixed
272 location. Accordingly, we place an always-correct hint in the
273 jump instruction, namely the address offset from <0c> to wptr,
274 divided by 4, taking the lowest 14 bits.
276 We only support passing 4 or fewer argument words, for the same
277 reason described under sparc_TARGET_ARCH above by JRS, 21 Aug 01.
278 On the Alpha the first 6 integer arguments are in a0 through a5,
279 and the rest on the stack. Hence we want to shuffle the original
280 caller's arguments by two.
282 On the Alpha the calling convention is so complex and dependent
283 on the callee's signature -- for example, the stack pointer has
284 to be a multiple of 16 -- that it seems impossible to me [ccshan]
285 to handle the general case correctly without changing how the
286 adjustor is called from C. For now, our solution of shuffling
287 registers only and ignoring the stack only works if the original
288 caller passed 4 or fewer argument words.
290 TODO: Depending on how much allocation overhead stgMallocBytes uses for
291 header information (more precisely, if the overhead is no more than
292 4 bytes), we should move the first three instructions above down by
293 4 bytes (getting rid of the nop), hence saving memory. [ccshan]
295 ASSERT(((StgWord64)wptr & 3) == 0);
296 if ((adjustor = stgMallocBytes(48, "createAdjustor")) != NULL) {
297 StgWord64 *const code = (StgWord64 *)adjustor;
299 code[0] = 0x4610041246520414L;
300 code[1] = 0x46730415a61b0020L;
301 code[2] = 0x46310413a77b0028L;
302 code[3] = 0x000000006bfb0000L
303 | (((StgWord32*)(wptr) - (StgWord32*)(code) - 3) & 0x3fff);
305 code[4] = (StgWord64)hptr;
306 code[5] = (StgWord64)wptr;
308 /* Ensure that instruction cache is consistent with our new code */
309 __asm__ volatile("call_pal %0" : : "i" (PAL_imb));
311 #elif defined(powerpc_TARGET_ARCH)
313 For PowerPC, the following code is used:
321 lis r0,0xDEAD ;hi(wptr)
322 lis r3,0xDEAF ;hi(hptr)
323 ori r0,r0,0xBEEF ; lo(wptr)
324 ori r3,r3,0xFACE ; lo(hptr)
328 The arguments (passed in registers r3 - r10) are shuffled along by two to
329 make room for hptr and a dummy argument. As r9 and r10 are overwritten by
330 this code, it only works for up to 6 arguments (when floating point arguments
331 are involved, this may be more or less, depending on the exact situation).
333 if ((adjustor = stgMallocBytes(4*13, "createAdjustor")) != NULL) {
334 unsigned long *const adj_code = (unsigned long *)adjustor;
336 // make room for extra arguments
337 adj_code[0] = 0x7d0a4378; //mr r10,r8
338 adj_code[1] = 0x7ce93b78; //mr r9,r7
339 adj_code[2] = 0x7cc83378; //mr r8,r6
340 adj_code[3] = 0x7ca72b78; //mr r7,r5
341 adj_code[4] = 0x7c862378; //mr r6,r4
342 adj_code[5] = 0x7c651b78; //mr r5,r3
344 adj_code[6] = 0x3c000000; //lis r0,hi(wptr)
345 adj_code[6] |= ((unsigned long)wptr) >> 16;
347 adj_code[7] = 0x3c600000; //lis r3,hi(hptr)
348 adj_code[7] |= ((unsigned long)hptr) >> 16;
350 adj_code[8] = 0x60000000; //ori r0,r0,lo(wptr)
351 adj_code[8] |= ((unsigned long)wptr) & 0xFFFF;
353 adj_code[9] = 0x60630000; //ori r3,r3,lo(hptr)
354 adj_code[9] |= ((unsigned long)hptr) & 0xFFFF;
356 adj_code[10] = 0x7c0903a6; //mtctr r0
357 adj_code[11] = 0x4e800420; //bctr
358 adj_code[12] = (unsigned long)hptr;
360 // Flush the Instruction cache:
361 // MakeDataExecutable(adjustor,4*13);
362 /* This would require us to link with CoreServices.framework */
363 { /* this should do the same: */
365 unsigned long *p = adj_code;
368 __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
372 __asm__ volatile ("sync\n\tisync");
375 #elif defined(ia64_TARGET_ARCH)
377 Up to 8 inputs are passed in registers. We flush the last two inputs to
378 the stack, initially into the 16-byte scratch region left by the caller.
379 We then shuffle the others along by 4 (taking 2 registers for ourselves
380 to save return address and previous function state - we need to come back
381 here on the way out to restore the stack, so this is a real function
382 rather than just a trampoline).
384 The function descriptor we create contains the gp of the target function
385 so gp is already loaded correctly.
387 [MLX] alloc r16=ar.pfs,10,2,0
389 [MII] st8.spill [r12]=r38,8 // spill in6 (out4)
390 mov r41=r37 // out7 = in5 (out3)
391 mov r40=r36;; // out6 = in4 (out2)
392 [MII] st8.spill [r12]=r39 // spill in7 (out5)
394 mov r38=r34;; // out4 = in2 (out0)
395 [MII] mov r39=r35 // out5 = in3 (out1)
396 mov r37=r33 // out3 = in1 (loc1)
397 mov r36=r32 // out2 = in0 (loc0)
398 [MLX] adds r12=-24,r12 // update sp
399 movl r34=hptr;; // out0 = hptr
400 [MIB] mov r33=r16 // loc1 = ar.pfs
401 mov r32=b0 // loc0 = retaddr
402 br.call.sptk.many b0=b6;;
404 [MII] adds r12=-16,r12
409 br.ret.sptk.many b0;;
412 /* These macros distribute a long constant into the two words of an MLX bundle */
413 #define BITS(val,start,count) (((val) >> (start)) & ((1 << (count))-1))
414 #define MOVL_LOWORD(val) (BITS(val,22,18) << 46)
415 #define MOVL_HIWORD(val) (BITS(val,40,23) | (BITS(val,0,7) << 36) | (BITS(val,7,9) << 50) \
416 | (BITS(val,16,5) << 55) | (BITS(val,21,1) << 44) | BITS(val,63,1) << 59)
420 IA64FunDesc *wdesc = (IA64FunDesc *)wptr;
421 StgWord64 wcode = wdesc->ip;
425 /* we allocate on the Haskell heap since malloc'd memory isn't executable - argh */
426 adjustor = stgAllocStable(sizeof(IA64FunDesc)+18*8, &stable);
428 fdesc = (IA64FunDesc *)adjustor;
429 code = (StgWord64 *)(fdesc + 1);
430 fdesc->ip = (StgWord64)code;
431 fdesc->gp = wdesc->gp;
433 code[0] = 0x0000058004288004 | MOVL_LOWORD(wcode);
434 code[1] = 0x6000000220000000 | MOVL_HIWORD(wcode);
435 code[2] = 0x029015d818984001;
436 code[3] = 0x8401200500420094;
437 code[4] = 0x886011d8189c0001;
438 code[5] = 0x84011004c00380c0;
439 code[6] = 0x0250210046013800;
440 code[7] = 0x8401000480420084;
441 code[8] = 0x0000233f19a06005 | MOVL_LOWORD((StgWord64)hptr);
442 code[9] = 0x6000000440000000 | MOVL_HIWORD((StgWord64)hptr);
443 code[10] = 0x0200210020010811;
444 code[11] = 0x1080006800006200;
445 code[12] = 0x0000210018406000;
446 code[13] = 0x00aa021000038005;
447 code[14] = 0x000000010000001d;
448 code[15] = 0x0084000880000200;
450 /* save stable pointers in convenient form */
451 code[16] = (StgWord64)hptr;
452 code[17] = (StgWord64)stable;
455 barf("adjustor creation not supported on this platform");
470 freeHaskellFunctionPtr(void* ptr)
472 #if defined(i386_TARGET_ARCH)
473 if ( *(unsigned char*)ptr != 0x68 &&
474 *(unsigned char*)ptr != 0x58 ) {
475 fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
479 /* Free the stable pointer first..*/
480 if (*(unsigned char*)ptr == 0x68) { /* Aha, a ccall adjustor! */
481 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x01)));
483 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x02)));
485 #elif defined(sparc_TARGET_ARCH)
486 if ( *(unsigned long*)ptr != 0x9C23A008UL ) {
487 fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
491 /* Free the stable pointer first..*/
492 freeStablePtr(*((StgStablePtr*)((unsigned long*)ptr + 11)));
493 #elif defined(alpha_TARGET_ARCH)
494 if ( *(StgWord64*)ptr != 0xa77b0018a61b0010L ) {
495 fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
499 /* Free the stable pointer first..*/
500 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x10)));
501 #elif defined(powerpc_TARGET_ARCH)
502 if ( *(StgWord*)ptr != 0x7d0a4378 ) {
503 fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
506 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 4*12)));
507 #elif defined(ia64_TARGET_ARCH)
508 IA64FunDesc *fdesc = (IA64FunDesc *)ptr;
509 StgWord64 *code = (StgWord64 *)(fdesc+1);
511 if (fdesc->ip != (StgWord64)code) {
512 fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
515 freeStablePtr((StgStablePtr)code[16]);
516 freeStablePtr((StgStablePtr)code[17]);
521 *((unsigned char*)ptr) = '\0';