1 /* -----------------------------------------------------------------------------
2 * Foreign export adjustor thunks
6 * ---------------------------------------------------------------------------*/
8 /* A little bit of background...
10 An adjustor thunk is a dynamically allocated code snippet that allows
11 Haskell closures to be viewed as C function pointers.
13 Stable pointers provide a way for the outside world to get access to,
14 and evaluate, Haskell heap objects, with the RTS providing a small
15 range of ops for doing so. So, assuming we've got a stable pointer in
16 our hand in C, we can jump into the Haskell world and evaluate a callback
17 procedure, say. This works OK in some cases where callbacks are used, but
18 does require the external code to know about stable pointers and how to deal
19 with them. We'd like to hide the Haskell-nature of a callback and have it
20 be invoked just like any other C function pointer.
22 Enter adjustor thunks. An adjustor thunk is a little piece of code
23 that's generated on-the-fly (one per Haskell closure being exported)
24 that, when entered using some 'universal' calling convention (e.g., the
25 C calling convention on platform X), pushes an implicit stable pointer
26 (to the Haskell callback) before calling another (static) C function stub
27 which takes care of entering the Haskell code via its stable pointer.
29 An adjustor thunk is allocated on the C heap, and is called from within
30 Haskell just before handing out the function pointer to the Haskell (IO)
31 action. User code should never have to invoke it explicitly.
33 An adjustor thunk differs from a C function pointer in one respect: when
34 the code is through with it, it has to be freed in order to release Haskell
35 and C resources. Failure to do so results in memory leaks on both the C and
39 #include "PosixSource.h"
/* __obscure_ccall_ret_code (below): a small, statically allocated code
   fragment that the i386 ccall adjustor pushes as its return address.
   The post-call stack fixup (the addl $4,%esp that drops the pushed hptr)
   therefore runs from this static memory, so control never returns into
   the adjustor thunk itself -- which may already have been freed by the
   Haskell callback via freeHaskellFunctionPtr(). */
46 /* Heavily arch-specific, I'm afraid.. */
48 #if defined(i386_TARGET_ARCH)
49 /* Now here's something obscure for you:
51 When generating an adjustor thunk that uses the C calling
52 convention, we have to make sure that the thunk kicks off
53 the process of jumping into Haskell with a tail jump. Why?
54 Because as a result of jumping into Haskell we may end
55 up freeing the very adjustor thunk we came from using
56 freeHaskellFunctionPtr(). Hence, we better not return to
57 the adjustor code on our way out, since it could by then
60 The fix is readily at hand, just include the opcodes
61 for the C stack fixup code that we need to perform when
62 returning in some static piece of memory and arrange
63 to return to it before tail jumping from the adjustor thunk.
65 For this to work we make the assumption that bytes in .data
66 are considered executable.
68 static unsigned char __obscure_ccall_ret_code [] =
69 { 0x83, 0xc4, 0x04 /* addl $0x4, %esp */
74 #if defined(alpha_TARGET_ARCH)
75 /* To get the definition of PAL_imb: */
76 #include <machine/pal.h>
/*
 * createAdjustor
 * --------------
 * Dynamically build a small machine-code thunk ("adjustor") that makes a
 * Haskell closure, reachable through the stable pointer 'hptr', callable
 * as an ordinary C function pointer.  The emitted code arranges for
 * 'hptr' to be passed as a hidden extra argument and then tail-jumps to
 * 'wptr', the statically compiled C stub that enters the Haskell world.
 *
 * cconv selects the calling convention (case 0 below is _stdcall on
 * i386; the ccall variant follows).  Returns the freshly allocated code
 * block, or NULL if stgMallocBytes failed.  The result must eventually
 * be released with freeHaskellFunctionPtr().
 *
 * NOTE(review): the emitted buffer is obtained from the C heap; on the
 * i386/sparc/ppc paths this relies on heap/data memory being executable
 * (see the .data remark next to __obscure_ccall_ret_code).
 */
80 createAdjustor(int cconv, StgStablePtr hptr, StgFunPtr wptr)
82 void *adjustor = NULL;
86 case 0: /* _stdcall */
87 #if defined(i386_TARGET_ARCH)
88 /* Magic constant computed by inspecting the code length of
89 the following assembly language snippet
90 (offset and machine code prefixed):
92 <0>: 58 popl %eax # temp. remove ret addr..
93 <1>: 68 fd fc fe fa pushl 0xfafefcfd # constant is large enough to
95 <6>: 50 pushl %eax # put back ret. addr
96 <7>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
97 <c>: ff e0 jmp %eax # and jump to it.
98 # the callee cleans up the stack
100 if ((adjustor = stgMallocBytes(14, "createAdjustor")) != NULL) {
101 unsigned char *const adj_code = (unsigned char *)adjustor;
102 adj_code[0x00] = (unsigned char)0x58; /* popl %eax */
104 adj_code[0x01] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
105 *((StgStablePtr*)(adj_code + 0x02)) = (StgStablePtr)hptr;
107 adj_code[0x06] = (unsigned char)0x50; /* pushl %eax */
109 adj_code[0x07] = (unsigned char)0xb8; /* movl $wptr, %eax */
110 *((StgFunPtr*)(adj_code + 0x08)) = (StgFunPtr)wptr;
112 adj_code[0x0c] = (unsigned char)0xff; /* jmp %eax */
113 adj_code[0x0d] = (unsigned char)0xe0;
119 #if defined(i386_TARGET_ARCH)
120 /* Magic constant computed by inspecting the code length of
121 the following assembly language snippet
122 (offset and machine code prefixed):
124 <00>: 68 ef be ad de pushl $0xdeadbeef # constant is large enough to
125 # hold a StgStablePtr
126 <05>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
127 <0a>: 68 ef be ad de pushl $__obscure_ccall_ret_code # push the return address
128 <0f>: ff e0 jmp *%eax # jump to wptr
130 The ccall'ing version is a tad different, passing in the return
131 address of the caller to the auto-generated C stub (which enters
132 via the stable pointer.) (The auto-generated C stub is in on this
133 game, don't worry :-)
135 See the comment next to __obscure_ccall_ret_code why we need to
136 perform a tail jump instead of a call, followed by some C stack
139 Note: The adjustor makes the assumption that any return value
140 coming back from the C stub is not stored on the stack.
141 That's (thankfully) the case here with the restricted set of
142 return types that we support.
144 if ((adjustor = stgMallocBytes(17, "createAdjustor")) != NULL) {
145 unsigned char *const adj_code = (unsigned char *)adjustor;
147 adj_code[0x00] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
148 *((StgStablePtr*)(adj_code+0x01)) = (StgStablePtr)hptr;
150 adj_code[0x05] = (unsigned char)0xb8; /* movl $wptr, %eax */
151 *((StgFunPtr*)(adj_code + 0x06)) = (StgFunPtr)wptr;
153 adj_code[0x0a] = (unsigned char)0x68; /* pushl __obscure_ccall_ret_code */
154 *((StgFunPtr*)(adj_code + 0x0b)) = (StgFunPtr)__obscure_ccall_ret_code;
156 adj_code[0x0f] = (unsigned char)0xff; /* jmp *%eax */
157 adj_code[0x10] = (unsigned char)0xe0;
159 #elif defined(sparc_TARGET_ARCH)
160 /* Magic constant computed by inspecting the code length of the following
161 assembly language snippet (offset and machine code prefixed):
163 <00>: 9C23A008 sub %sp, 8, %sp ! make room for %o4/%o5 in caller's frame
164 <04>: DA23A060 st %o5, [%sp + 96] ! shift registers by 2 positions
165 <08>: D823A05C st %o4, [%sp + 92]
166 <0C>: 9A10000B mov %o3, %o5
167 <10>: 9810000A mov %o2, %o4
168 <14>: 96100009 mov %o1, %o3
169 <18>: 94100008 mov %o0, %o2
170 <1C>: 13000000 sethi %hi(wptr), %o1 ! load up wptr (1 of 2)
171 <20>: 11000000 sethi %hi(hptr), %o0 ! load up hptr (1 of 2)
172 <24>: 81C26000 jmp %o1 + %lo(wptr) ! jump to wptr (load 2 of 2)
173 <28>: 90122000 or %o0, %lo(hptr), %o0 ! load up hptr (2 of 2, delay slot)
174 <2C> 00000000 ! place for getting hptr back easily
176 ccall'ing on SPARC is easy, because we are quite lucky to push a
177 multiple of 8 bytes (1 word hptr + 1 word dummy arg) in front of the
178 existing arguments (note that %sp must stay double-word aligned at
179 all times, see ABI spec at http://www.sparc.org/standards/psABI3rd.pdf).
180 To do this, we extend the *caller's* stack frame by 2 words and shift
181 the output registers used for argument passing (%o0 - %o5, we are a *leaf*
182 procedure because of the tail-jump) by 2 positions. This makes room in
183 %o0 and %o1 for the additional arguments, namely hptr and a dummy (used
184 for destination addr of jump on SPARC, return address on x86, ...). This
185 shouldn't cause any problems for a C-like caller: alloca is implemented
186 similarly, and local variables should be accessed via %fp, not %sp. In a
187 nutshell: This should work! (Famous last words! :-)
189 if ((adjustor = stgMallocBytes(4*(11+1), "createAdjustor")) != NULL) {
190 unsigned long *const adj_code = (unsigned long *)adjustor;
192 adj_code[ 0] = 0x9C23A008UL; /* sub %sp, 8, %sp */
193 adj_code[ 1] = 0xDA23A060UL; /* st %o5, [%sp + 96] */
194 adj_code[ 2] = 0xD823A05CUL; /* st %o4, [%sp + 92] */
195 adj_code[ 3] = 0x9A10000BUL; /* mov %o3, %o5 */
196 adj_code[ 4] = 0x9810000AUL; /* mov %o2, %o4 */
197 adj_code[ 5] = 0x96100009UL; /* mov %o1, %o3 */
198 adj_code[ 6] = 0x94100008UL; /* mov %o0, %o2 */
199 adj_code[ 7] = 0x13000000UL; /* sethi %hi(wptr), %o1 */
200 adj_code[ 7] |= ((unsigned long)wptr) >> 10;
201 adj_code[ 8] = 0x11000000UL; /* sethi %hi(hptr), %o0 */
202 adj_code[ 8] |= ((unsigned long)hptr) >> 10;
203 adj_code[ 9] = 0x81C26000UL; /* jmp %o1 + %lo(wptr) */
204 adj_code[ 9] |= ((unsigned long)wptr) & 0x000003FFUL;
205 adj_code[10] = 0x90122000UL; /* or %o0, %lo(hptr), %o0 */
206 adj_code[10] |= ((unsigned long)hptr) & 0x000003FFUL;
208 adj_code[11] = (unsigned long)hptr;
/* Push the freshly written instructions out to where the instruction
   fetch unit will see them; presumably each 'flush' covers a doubleword,
   hence the stride of 2 words -- TODO confirm against the SPARC manual. */
211 asm("flush %0" : : "r" (adj_code ));
212 asm("flush %0" : : "r" (adj_code + 2));
213 asm("flush %0" : : "r" (adj_code + 4));
214 asm("flush %0" : : "r" (adj_code + 6));
215 asm("flush %0" : : "r" (adj_code + 10));
217 /* max. 5 instructions latency, and we need at least 1 for returning */
223 #elif defined(alpha_TARGET_ARCH)
224 /* Magic constant computed by inspecting the code length of
225 the following assembly language snippet
226 (offset and machine code prefixed; note that the machine code
227 shown is longwords stored in little-endian order):
229 <00>: 46520414 mov a2, a4
230 <04>: 46100412 mov a0, a2
231 <08>: a61b0020 ldq a0, 0x20(pv) # load up hptr
232 <0c>: 46730415 mov a3, a5
233 <10>: a77b0028 ldq pv, 0x28(pv) # load up wptr
234 <14>: 46310413 mov a1, a3
235 <18>: 6bfb---- jmp (pv), <hint> # jump to wptr (with hint)
236 <1c>: 00000000 # padding for alignment
237 <20>: [8 bytes for hptr quadword]
238 <28>: [8 bytes for wptr quadword]
240 The "computed" jump at <08> above is really a jump to a fixed
241 location. Accordingly, we place an always-correct hint in the
242 jump instruction, namely the address offset from <0c> to wptr,
243 divided by 4, taking the lowest 14 bits.
245 We only support passing 4 or fewer argument words, for the same
246 reason described under sparc_TARGET_ARCH above by JRS, 21 Aug 01.
247 On the Alpha the first 6 integer arguments are in a0 through a5,
248 and the rest on the stack. Hence we want to shuffle the original
249 caller's arguments by two.
251 On the Alpha the calling convention is so complex and dependent
252 on the callee's signature -- for example, the stack pointer has
253 to be a multiple of 16 -- that it seems impossible to me [ccshan]
254 to handle the general case correctly without changing how the
255 adjustor is called from C. For now, our solution of shuffling
256 registers only and ignoring the stack only works if the original
257 caller passed 4 or fewer argument words.
259 TODO: Depending on how much allocation overhead stgMallocBytes uses for
260 header information (more precisely, if the overhead is no more than
261 4 bytes), we should move the first three instructions above down by
262 4 bytes (getting rid of the nop), hence saving memory. [ccshan]
264 ASSERT(((StgWord64)wptr & 3) == 0);
265 if ((adjustor = stgMallocBytes(48, "createAdjustor")) != NULL) {
266 StgWord64 *const code = (StgWord64 *)adjustor;
268 code[0] = 0x4610041246520414L;
269 code[1] = 0x46730415a61b0020L;
270 code[2] = 0x46310413a77b0028L;
271 code[3] = 0x000000006bfb0000L
272 | (((StgWord32*)(wptr) - (StgWord32*)(code) - 3) & 0x3fff);
274 code[4] = (StgWord64)hptr;
275 code[5] = (StgWord64)wptr;
277 /* Ensure that instruction cache is consistent with our new code */
278 __asm__ volatile("call_pal %0" : : "i" (PAL_imb));
280 #elif defined(powerpc_TARGET_ARCH)
282 For PowerPC, the following code is used:
290 lis r0,0xDEAD ;hi(wptr)
291 lis r3,0xDEAF ;hi(hptr)
292 ori r0,r0,0xBEEF ; lo(wptr)
293 ori r3,r3,0xFACE ; lo(hptr)
297 The arguments (passed in registers r3 - r10) are shuffled along by two to
298 make room for hptr and a dummy argument. As r9 and r10 are overwritten by
299 this code, it only works for up to 6 arguments (when floating point arguments
300 are involved, this may be more or less, depending on the exact situation).
302 if ((adjustor = stgMallocBytes(4*13, "createAdjustor")) != NULL) {
303 unsigned long *const adj_code = (unsigned long *)adjustor;
305 // make room for extra arguments
306 adj_code[0] = 0x7d0a4378; //mr r10,r8
307 adj_code[1] = 0x7ce93b78; //mr r9,r7
308 adj_code[2] = 0x7cc83378; //mr r8,r6
309 adj_code[3] = 0x7ca72b78; //mr r7,r5
310 adj_code[4] = 0x7c862378; //mr r6,r4
311 adj_code[5] = 0x7c651b78; //mr r5,r3
313 adj_code[6] = 0x3c000000; //lis r0,hi(wptr)
314 adj_code[6] |= ((unsigned long)wptr) >> 16;
316 adj_code[7] = 0x3c600000; //lis r3,hi(hptr)
317 adj_code[7] |= ((unsigned long)hptr) >> 16;
319 adj_code[8] = 0x60000000; //ori r0,r0,lo(wptr)
320 adj_code[8] |= ((unsigned long)wptr) & 0xFFFF;
322 adj_code[9] = 0x60630000; //ori r3,r3,lo(hptr)
323 adj_code[9] |= ((unsigned long)hptr) & 0xFFFF;
325 adj_code[10] = 0x7c0903a6; //mtctr r0
326 adj_code[11] = 0x4e800420; //bctr
327 adj_code[12] = (unsigned long)hptr;
329 // Flush the Instruction cache:
330 // MakeDataExecutable(adjustor,4*13);
331 /* This would require us to link with CoreServices.framework */
332 { /* this should do the same: */
334 unsigned long *p = adj_code;
337 __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
341 __asm__ volatile ("sync\n\tisync");
/* Default case: no adjustor support has been written for this target. */
345 barf("adjustor creation not supported on this platform");
/*
 * freeHaskellFunctionPtr
 * ----------------------
 * Release an adjustor previously produced by createAdjustor().  Each arch
 * branch first sanity-checks the thunk's leading opcode / instruction
 * word against what createAdjustor() emits, then frees the embedded
 * StablePtr (whose offset is arch- and convention-specific), and finally
 * clobbers the first byte so the signature check above catches a stale
 * pointer on a subsequent call.
 */
360 freeHaskellFunctionPtr(void* ptr)
362 #if defined(i386_TARGET_ARCH)
363 if ( *(unsigned char*)ptr != 0x68 &&
364 *(unsigned char*)ptr != 0x58 ) {
365 fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
369 /* Free the stable pointer first..*/
370 if (*(unsigned char*)ptr == 0x68) { /* Aha, a ccall adjustor! */
371 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x01)));
/* _stdcall adjustor (leading 0x58 popl): hptr immediate sits at
   offset 0x02, matching what createAdjustor() stores there. */
373 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x02)));
375 #elif defined(sparc_TARGET_ARCH)
376 if ( *(unsigned long*)ptr != 0x9C23A008UL ) {
377 fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
381 /* Free the stable pointer first..*/
382 freeStablePtr(*((StgStablePtr*)((unsigned long*)ptr + 11)));
383 #elif defined(alpha_TARGET_ARCH)
/* NOTE(review): this branch is inconsistent with createAdjustor() on
   alpha -- there code[0] is 0x4610041246520414L and hptr is stored in
   code[4] (byte offset 0x20), whereas here the first quadword is
   compared against 0xa77b0018a61b0010L and hptr is read from offset
   0x10.  Looks like it was not updated when the alpha thunk layout
   changed: the check will always reject valid adjustors, and the
   freeStablePtr offset is wrong.  Verify and fix together. */
384 if ( *(StgWord64*)ptr != 0xa77b0018a61b0010L ) {
385 fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
389 /* Free the stable pointer first..*/
390 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x10)));
391 #elif defined(powerpc_TARGET_ARCH)
392 if ( *(StgWord*)ptr != 0x7d0a4378 ) {
393 fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
/* PowerPC: hptr was stashed in adj_code[12], i.e. byte offset 4*12. */
396 freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 4*12)));
/* Scrub the signature byte so double-free / stale use is caught early. */
400 *((unsigned char*)ptr) = '\0';