1 /* -----------------------------------------------------------------------------
2 * Foreign export adjustor thunks
6 * ---------------------------------------------------------------------------*/
8 /* A little bit of background...
10 An adjustor thunk is a dynamically allocated code snippet that allows
11 Haskell closures to be viewed as C function pointers.
13 Stable pointers provide a way for the outside world to get access to,
14 and evaluate, Haskell heap objects, with the RTS providing a small
15 range of ops for doing so. So, assuming we've got a stable pointer in
16 our hand in C, we can jump into the Haskell world and evaluate a callback
17 procedure, say. This works OK in some cases where callbacks are used, but
18 does require the external code to know about stable pointers and how to deal
19 with them. We'd like to hide the Haskell-nature of a callback and have it
20 be invoked just like any other C function pointer.
22 Enter adjustor thunks. An adjustor thunk is a little piece of code
23 that's generated on-the-fly (one per Haskell closure being exported)
24 that, when entered using some 'universal' calling convention (e.g., the
25 C calling convention on platform X), pushes an implicit stable pointer
26 (to the Haskell callback) before calling another (static) C function stub
27 which takes care of entering the Haskell code via its stable pointer.
29 An adjustor thunk is allocated on the C heap, and is called from within
30 Haskell just before handing out the function pointer to the Haskell (IO)
31 action. User code should never have to invoke it explicitly.
33 An adjustor thunk differs from a C function pointer in one respect: when
34 the code is through with it, it has to be freed in order to release Haskell
35 and C resources. Failure to do so results in memory leaks on both the C and
39 #include "PosixSource.h"
46 /* Heavily arch-specific, I'm afraid.. */
48 #if defined(i386_TARGET_ARCH)
49 /* Now here's something obscure for you:
51 When generating an adjustor thunk that uses the C calling
52 convention, we have to make sure that the thunk kicks off
53 the process of jumping into Haskell with a tail jump. Why?
54 Because as a result of jumping into Haskell we may end
55 up freeing the very adjustor thunk we came from using
56 freeHaskellFunctionPtr(). Hence, we better not return to
57 the adjustor code on our way out, since it could by then
60 The fix is readily at hand, just include the opcodes
61 for the C stack fixup code that we need to perform when
62 returning in some static piece of memory and arrange
63 to return to it before tail jumping from the adjustor thunk.
65 For this to work we make the assumption that bytes in .data
66 are considered executable.
68 static unsigned char __obscure_ccall_ret_code [] =
69 { 0x83, 0xc4, 0x04 /* addl $0x4, %esp */
74 #if defined(alpha_TARGET_ARCH)
75 /* To get the definition of PAL_imb: */
76 # if defined(linux_TARGET_OS)
79 # include <machine/pal.h>
83 #if defined(ia64_TARGET_ARCH)
86 /* Layout of a function descriptor */
87 typedef struct _IA64FunDesc {
93 stgAllocStable(size_t size_in_bytes, StgStablePtr *stable)
96 nat data_size_in_words, total_size_in_words;
98 /* round up to a whole number of words */
99 data_size_in_words = (size_in_bytes + sizeof(W_) + 1) / sizeof(W_);
100 total_size_in_words = sizeofW(StgArrWords) + data_size_in_words;
102 /* allocate and fill it in */
103 arr = (StgArrWords *)allocate(total_size_in_words);
104 SET_ARR_HDR(arr, &stg_ARR_WORDS_info, CCCS, data_size_in_words);
106 /* obtain a stable ptr */
107 *stable = getStablePtr((StgPtr)arr);
109 /* and return a ptr to the goods inside the array */
110 return(BYTE_ARR_CTS(arr));
115 createAdjustor(int cconv, StgStablePtr hptr, StgFunPtr wptr)
117 void *adjustor = NULL;
121 case 0: /* _stdcall */
122 #if defined(i386_TARGET_ARCH)
123 /* Magic constant computed by inspecting the code length of
124 the following assembly language snippet
125 (offset and machine code prefixed):
127 <0>: 58 popl %eax # temp. remove ret addr..
128 <1>: 68 fd fc fe fa pushl 0xfafefcfd # constant is large enough to
129 # hold a StgStablePtr
130 <6>: 50 pushl %eax # put back ret. addr
131 <7>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
132 <c>: ff e0 jmp %eax # and jump to it.
133 # the callee cleans up the stack
135 if ((adjustor = stgMallocBytes(14, "createAdjustor")) != NULL) {
136 unsigned char *const adj_code = (unsigned char *)adjustor;
137 adj_code[0x00] = (unsigned char)0x58; /* popl %eax */
139 adj_code[0x01] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
140 *((StgStablePtr*)(adj_code + 0x02)) = (StgStablePtr)hptr;
142 adj_code[0x06] = (unsigned char)0x50; /* pushl %eax */
144 adj_code[0x07] = (unsigned char)0xb8; /* movl $wptr, %eax */
145 *((StgFunPtr*)(adj_code + 0x08)) = (StgFunPtr)wptr;
147 adj_code[0x0c] = (unsigned char)0xff; /* jmp %eax */
148 adj_code[0x0d] = (unsigned char)0xe0;
154 #if defined(i386_TARGET_ARCH)
155 /* Magic constant computed by inspecting the code length of
156 the following assembly language snippet
157 (offset and machine code prefixed):
159 <00>: 68 ef be ad de pushl $0xdeadbeef # constant is large enough to
160 # hold a StgStablePtr
161 <05>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
162 <0a>: 68 ef be ad de pushl $__obscure_ccall_ret_code # push the return address
163 <0f>: ff e0 jmp *%eax # jump to wptr
165 The ccall'ing version is a tad different, passing in the return
166 address of the caller to the auto-generated C stub (which enters
167 via the stable pointer.) (The auto-generated C stub is in on this
168 game, don't worry :-)
170 See the comment next to __obscure_ccall_ret_code why we need to
171 perform a tail jump instead of a call, followed by some C stack
174 Note: The adjustor makes the assumption that any return value
175 coming back from the C stub is not stored on the stack.
176 That's (thankfully) the case here with the restricted set of
177 return types that we support.
179 if ((adjustor = stgMallocBytes(17, "createAdjustor")) != NULL) {
180 unsigned char *const adj_code = (unsigned char *)adjustor;
182 adj_code[0x00] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
183 *((StgStablePtr*)(adj_code+0x01)) = (StgStablePtr)hptr;
185 adj_code[0x05] = (unsigned char)0xb8; /* movl $wptr, %eax */
186 *((StgFunPtr*)(adj_code + 0x06)) = (StgFunPtr)wptr;
188 adj_code[0x0a] = (unsigned char)0x68; /* pushl __obscure_ccall_ret_code */
189 *((StgFunPtr*)(adj_code + 0x0b)) = (StgFunPtr)__obscure_ccall_ret_code;
191 adj_code[0x0f] = (unsigned char)0xff; /* jmp *%eax */
192 adj_code[0x10] = (unsigned char)0xe0;
194 #elif defined(sparc_TARGET_ARCH)
195 /* Magic constant computed by inspecting the code length of the following
196 assembly language snippet (offset and machine code prefixed):
198 <00>: 9C23A008 sub %sp, 8, %sp ! make room for %o4/%o5 in caller's frame
199 <04>: DA23A060 st %o5, [%sp + 96] ! shift registers by 2 positions
200 <08>: D823A05C st %o4, [%sp + 92]
201 <0C>: 9A10000B mov %o3, %o5
202 <10>: 9810000A mov %o2, %o4
203 <14>: 96100009 mov %o1, %o3
204 <18>: 94100008 mov %o0, %o2
205 <1C>: 13000000 sethi %hi(wptr), %o1 ! load up wptr (1 of 2)
206 <20>: 11000000 sethi %hi(hptr), %o0 ! load up hptr (1 of 2)
207 <24>: 81C26000 jmp %o1 + %lo(wptr) ! jump to wptr (load 2 of 2)
208 <28>: 90122000 or %o0, %lo(hptr), %o0 ! load up hptr (2 of 2, delay slot)
209 <2C> 00000000 ! place for getting hptr back easily
211 ccall'ing on SPARC is easy, because we are quite lucky to push a
212 multiple of 8 bytes (1 word hptr + 1 word dummy arg) in front of the
213 existing arguments (note that %sp must stay double-word aligned at
214 all times, see ABI spec at http://www.sparc.org/standards/psABI3rd.pdf).
215 To do this, we extend the *caller's* stack frame by 2 words and shift
216 the output registers used for argument passing (%o0 - %o5, we are a *leaf*
217 procedure because of the tail-jump) by 2 positions. This makes room in
218 %o0 and %o1 for the additinal arguments, namely hptr and a dummy (used
219 for destination addr of jump on SPARC, return address on x86, ...). This
220 shouldn't cause any problems for a C-like caller: alloca is implemented
221 similarly, and local variables should be accessed via %fp, not %sp. In a
222 nutshell: This should work! (Famous last words! :-)
224 if ((adjustor = stgMallocBytes(4*(11+1), "createAdjustor")) != NULL) {
225 unsigned long *const adj_code = (unsigned long *)adjustor;
227 adj_code[ 0] = 0x9C23A008UL; /* sub %sp, 8, %sp */
228 adj_code[ 1] = 0xDA23A060UL; /* st %o5, [%sp + 96] */
229 adj_code[ 2] = 0xD823A05CUL; /* st %o4, [%sp + 92] */
230 adj_code[ 3] = 0x9A10000BUL; /* mov %o3, %o5 */
231 adj_code[ 4] = 0x9810000AUL; /* mov %o2, %o4 */
232 adj_code[ 5] = 0x96100009UL; /* mov %o1, %o3 */
233 adj_code[ 6] = 0x94100008UL; /* mov %o0, %o2 */
234 adj_code[ 7] = 0x13000000UL; /* sethi %hi(wptr), %o1 */
235 adj_code[ 7] |= ((unsigned long)wptr) >> 10;
236 adj_code[ 8] = 0x11000000UL; /* sethi %hi(hptr), %o0 */
237 adj_code[ 8] |= ((unsigned long)hptr) >> 10;
238 adj_code[ 9] = 0x81C26000UL; /* jmp %o1 + %lo(wptr) */
239 adj_code[ 9] |= ((unsigned long)wptr) & 0x000003FFUL;
240 adj_code[10] = 0x90122000UL; /* or %o0, %lo(hptr), %o0 */
241 adj_code[10] |= ((unsigned long)hptr) & 0x000003FFUL;
243 adj_code[11] = (unsigned long)hptr;
246 asm("flush %0" : : "r" (adj_code ));
247 asm("flush %0" : : "r" (adj_code + 2));
248 asm("flush %0" : : "r" (adj_code + 4));
249 asm("flush %0" : : "r" (adj_code + 6));
250 asm("flush %0" : : "r" (adj_code + 10));
252 /* max. 5 instructions latency, and we need at >= 1 for returning */
258 #elif defined(alpha_TARGET_ARCH)
259 /* Magic constant computed by inspecting the code length of
260 the following assembly language snippet
261 (offset and machine code prefixed; note that the machine code
262 shown is longwords stored in little-endian order):
264 <00>: 46520414 mov a2, a4
265 <04>: 46100412 mov a0, a2
266 <08>: a61b0020 ldq a0, 0x20(pv) # load up hptr
267 <0c>: 46730415 mov a3, a5
268 <10>: a77b0028 ldq pv, 0x28(pv) # load up wptr
269 <14>: 46310413 mov a1, a3
270 <18>: 6bfb---- jmp (pv), <hint> # jump to wptr (with hint)
271 <1c>: 00000000 # padding for alignment
272 <20>: [8 bytes for hptr quadword]
273 <28>: [8 bytes for wptr quadword]
275 The "computed" jump at <08> above is really a jump to a fixed
276 location. Accordingly, we place an always-correct hint in the
277 jump instruction, namely the address offset from <0c> to wptr,
278 divided by 4, taking the lowest 14 bits.
280 We only support passing 4 or fewer argument words, for the same
281 reason described under sparc_TARGET_ARCH above by JRS, 21 Aug 01.
282 On the Alpha the first 6 integer arguments are in a0 through a5,
283 and the rest on the stack. Hence we want to shuffle the original
284 caller's arguments by two.
286 On the Alpha the calling convention is so complex and dependent
287 on the callee's signature -- for example, the stack pointer has
288 to be a multiple of 16 -- that it seems impossible to me [ccshan]
289 to handle the general case correctly without changing how the
290 adjustor is called from C. For now, our solution of shuffling
291 registers only and ignoring the stack only works if the original
292 caller passed 4 or fewer argument words.
294 TODO: Depending on how much allocation overhead stgMallocBytes uses for
295 header information (more precisely, if the overhead is no more than
296 4 bytes), we should move the first three instructions above down by
297 4 bytes (getting rid of the nop), hence saving memory. [ccshan]
299 ASSERT(((StgWord64)wptr & 3) == 0);
300 if ((adjustor = stgMallocBytes(48, "createAdjustor")) != NULL) {
301 StgWord64 *const code = (StgWord64 *)adjustor;
303 code[0] = 0x4610041246520414L;
304 code[1] = 0x46730415a61b0020L;
305 code[2] = 0x46310413a77b0028L;
306 code[3] = 0x000000006bfb0000L
307 | (((StgWord32*)(wptr) - (StgWord32*)(code) - 3) & 0x3fff);
309 code[4] = (StgWord64)hptr;
310 code[5] = (StgWord64)wptr;
312 /* Ensure that instruction cache is consistent with our new code */
313 __asm__ volatile("call_pal %0" : : "i" (PAL_imb));
315 #elif defined(powerpc_TARGET_ARCH)
317 For PowerPC, the following code is used:
325 lis r0,0xDEAD ;hi(wptr)
326 lis r3,0xDEAF ;hi(hptr)
327 ori r0,r0,0xBEEF ; lo(wptr)
328 ori r3,r3,0xFACE ; lo(hptr)
332 The arguments (passed in registers r3 - r10) are shuffled along by two to
333 make room for hptr and a dummy argument. As r9 and r10 are overwritten by
334 this code, it only works for up to 6 arguments (when floating point arguments
335 are involved, this may be more or less, depending on the exact situation).
337 if ((adjustor = stgMallocBytes(4*13, "createAdjustor")) != NULL) {
338 unsigned long *const adj_code = (unsigned long *)adjustor;
340 // make room for extra arguments
341 adj_code[0] = 0x7d0a4378; //mr r10,r8
342 adj_code[1] = 0x7ce93b78; //mr r9,r7
343 adj_code[2] = 0x7cc83378; //mr r8,r6
344 adj_code[3] = 0x7ca72b78; //mr r7,r5
345 adj_code[4] = 0x7c862378; //mr r6,r4
346 adj_code[5] = 0x7c651b78; //mr r5,r3
348 adj_code[6] = 0x3c000000; //lis r0,hi(wptr)
349 adj_code[6] |= ((unsigned long)wptr) >> 16;
351 adj_code[7] = 0x3c600000; //lis r3,hi(hptr)
352 adj_code[7] |= ((unsigned long)hptr) >> 16;
354 adj_code[8] = 0x60000000; //ori r0,r0,lo(wptr)
355 adj_code[8] |= ((unsigned long)wptr) & 0xFFFF;
357 adj_code[9] = 0x60630000; //ori r3,r3,lo(hptr)
358 adj_code[9] |= ((unsigned long)hptr) & 0xFFFF;
360 adj_code[10] = 0x7c0903a6; //mtctr r0
361 adj_code[11] = 0x4e800420; //bctr
362 adj_code[12] = (unsigned long)hptr;
364 // Flush the Instruction cache:
365 // MakeDataExecutable(adjustor,4*13);
366 /* This would require us to link with CoreServices.framework */
367 { /* this should do the same: */
369 unsigned long *p = adj_code;
372 __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
376 __asm__ volatile ("sync\n\tisync");
379 #elif defined(ia64_TARGET_ARCH)
381 Up to 8 inputs are passed in registers. We flush the last two inputs to
382 the stack, initially into the 16-byte scratch region left by the caller.
383 We then shuffle the others along by 4 (taking 2 registers for ourselves
384 to save return address and previous function state - we need to come back
385 here on the way out to restore the stack, so this is a real function
386 rather than just a trampoline).
388 The function descriptor we create contains the gp of the target function
389 so gp is already loaded correctly.
391 [MLX] alloc r16=ar.pfs,10,2,0
393 [MII] st8.spill [r12]=r38,8 // spill in6 (out4)
394 mov r41=r37 // out7 = in5 (out3)
395 mov r40=r36;; // out6 = in4 (out2)
396 [MII] st8.spill [r12]=r39 // spill in7 (out5)
398 mov r38=r34;; // out4 = in2 (out0)
399 [MII] mov r39=r35 // out5 = in3 (out1)
400 mov r37=r33 // out3 = in1 (loc1)
401 mov r36=r32 // out2 = in0 (loc0)
402 [MLX] adds r12=-24,r12 // update sp
403 movl r34=hptr;; // out0 = hptr
404 [MIB] mov r33=r16 // loc1 = ar.pfs
405 mov r32=b0 // loc0 = retaddr
406 br.call.sptk.many b0=b6;;
408 [MII] adds r12=-16,r12
413 br.ret.sptk.many b0;;
416 /* These macros distribute a long constant into the two words of an MLX bundle */
417 #define BITS(val,start,count) (((val) >> (start)) & ((1 << (count))-1))
418 #define MOVL_LOWORD(val) (BITS(val,22,18) << 46)
419 #define MOVL_HIWORD(val) (BITS(val,40,23) | (BITS(val,0,7) << 36) | (BITS(val,7,9) << 50) \
420 | (BITS(val,16,5) << 55) | (BITS(val,21,1) << 44) | BITS(val,63,1) << 59)
424 IA64FunDesc *wdesc = (IA64FunDesc *)wptr;
425 StgWord64 wcode = wdesc->ip;
429 /* we allocate on the Haskell heap since malloc'd memory isn't executable - argh */
430 adjustor = stgAllocStable(sizeof(IA64FunDesc)+18*8, &stable);
432 fdesc = (IA64FunDesc *)adjustor;
433 code = (StgWord64 *)(fdesc + 1);
434 fdesc->ip = (StgWord64)code;
435 fdesc->gp = wdesc->gp;
437 code[0] = 0x0000058004288004 | MOVL_LOWORD(wcode);
438 code[1] = 0x6000000220000000 | MOVL_HIWORD(wcode);
439 code[2] = 0x029015d818984001;
440 code[3] = 0x8401200500420094;
441 code[4] = 0x886011d8189c0001;
442 code[5] = 0x84011004c00380c0;
443 code[6] = 0x0250210046013800;
444 code[7] = 0x8401000480420084;
445 code[8] = 0x0000233f19a06005 | MOVL_LOWORD((StgWord64)hptr);
446 code[9] = 0x6000000440000000 | MOVL_HIWORD((StgWord64)hptr);
447 code[10] = 0x0200210020010811;
448 code[11] = 0x1080006800006200;
449 code[12] = 0x0000210018406000;
450 code[13] = 0x00aa021000038005;
451 code[14] = 0x000000010000001d;
452 code[15] = 0x0084000880000200;
454 /* save stable pointers in convenient form */
455 code[16] = (StgWord64)hptr;
456 code[17] = (StgWord64)stable;
459 barf("adjustor creation not supported on this platform");
474 freeHaskellFunctionPtr(void* ptr)
476 #if defined(i386_TARGET_ARCH)
477 if ( *(unsigned char*)ptr != 0x68 &&
478 *(unsigned char*)ptr != 0x58 ) {
479 fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
483 /* Free the stable pointer first..*/
484 if (*(unsigned char*)ptr == 0x68) { /* Aha, a ccall adjustor! */
485 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x01)));
487 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x02)));
489 #elif defined(sparc_TARGET_ARCH)
490 if ( *(unsigned long*)ptr != 0x9C23A008UL ) {
491 fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
495 /* Free the stable pointer first..*/
496 freeStablePtr(*((StgStablePtr*)((unsigned long*)ptr + 11)));
497 #elif defined(alpha_TARGET_ARCH)
498 if ( *(StgWord64*)ptr != 0xa77b0018a61b0010L ) {
499 fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
503 /* Free the stable pointer first..*/
504 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x10)));
505 #elif defined(powerpc_TARGET_ARCH)
506 if ( *(StgWord*)ptr != 0x7d0a4378 ) {
507 fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
510 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 4*12)));
511 #elif defined(ia64_TARGET_ARCH)
512 IA64FunDesc *fdesc = (IA64FunDesc *)ptr;
513 StgWord64 *code = (StgWord64 *)(fdesc+1);
515 if (fdesc->ip != (StgWord64)code) {
516 fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
519 freeStablePtr((StgStablePtr)code[16]);
520 freeStablePtr((StgStablePtr)code[17]);
525 *((unsigned char*)ptr) = '\0';