#include "RtsFlags.h"
/* Heavily arch-specific, I'm afraid.. */
-#if defined(i386_TARGET_ARCH) || defined(sparc_TARGET_ARCH) || defined(alpha_TARGET_ARCH)
+#if defined(i386_TARGET_ARCH) || defined(sparc_TARGET_ARCH) || defined(alpha_TARGET_ARCH) || defined(powerpc_TARGET_ARCH)
#if defined(i386_TARGET_ARCH)
/* Now here's something obscure for you:
};
#endif
+#if defined(alpha_TARGET_ARCH)
+/* To get the definition of PAL_imb: */
+#include <machine/pal.h>
+#endif
+
void*
createAdjustor(int cconv, StgStablePtr hptr, StgFunPtr wptr)
{
the following assembly language snippet
(offset and machine code prefixed):
- <00>: 13 00 3f fb sethi %hi(0x00ffeffa), %o1 # load up wptr (1 of 2)
- <04>: 11 37 ab 6f sethi %hi(0xdeadbeef), %o0 # load up hptr (1 of 2)
- <08>: 81 c2 63 fa jmp %o1+%lo(0x00ffeffa) # jump to wptr (load 2 of 2)
- <0c>: 90 12 22 ef or %o0, %lo(0xdeadbeef), %o0 # load up hptr (2 of 2)
+ <00>: BA 10 00 1B mov %i3, %i5
+ <04>: B8 10 00 1A mov %i2, %i4
+ <08>: B6 10 00 19 mov %i1, %i3
+ <0c>: B4 10 00 18 mov %i0, %i2
+ <10>: 13 00 3f fb sethi %hi(0x00ffeffa), %o1 # load up wptr (1 of 2)
+ <14>: 11 37 ab 6f sethi %hi(0xdeadbeef), %o0 # load up hptr (1 of 2)
+ <18>: 81 c2 63 fa jmp %o1+%lo(0x00ffeffa) # jump to wptr (load 2 of 2)
+ <1c>: 90 12 22 ef or %o0, %lo(0xdeadbeef), %o0 # load up hptr (2 of 2)
# [in delay slot]
- <10>: de ad be ef # Place the value of the StgStablePtr somewhere readable
+ <20>: de ad be ef # Place the value of the StgStablePtr somewhere readable
ccall'ing on a SPARC leaves little to be performed by the caller.
The callee shifts the window on entry and restores it on exit.
code above contains the input paramter to wptr.) The return address
is stored in %o7/%i7. Since we don't shift the window in this code,
the return address is preserved and wptr will return to our caller.
+
+ JRS, 21 Aug 01: the above para is a fiction. The caller passes
+ args in %i0 .. %i5 and then the rest at [%sp+92]. We want to
+ tailjump to wptr, passing hptr as the new first arg, and a dummy
+ second arg, which would be where the return address is on x86.
+ That means we have to shuffle the original caller's args along by
+ two.
+
+ We do a half-correct solution which works only if the original
+ caller passed 4 or fewer arg words. Move %i0 .. %i3 into %i3
+ .. %i6, so we can park hptr in %i0 and a bogus arg in %i1. The
+ fully correct solution would be to subtract 8 from %sp and then
+ place %i4 and %i5 at [%sp+92] and [%sp+96] respectively. This
+ machinery should then work in all cases. (Or would it? Perhaps
+ it would trash parts of the caller's frame. Dunno).
+
+ SUP, 25 Apr 02: We are quite lucky to push a multiple of 8 bytes in
+ front of the existing arguments, because %sp must stay double-word
+ aligned at all times, see: http://www.sparc.org/standards/psABI3rd.pdf
+ Although we extend the *caller's* stack frame, this shouldn't cause
+ any problems for a C-like caller: alloca is implemented similarly, and
+ local variables should be accessed via %fp, not %sp. In a nutshell:
+ This should work. (Famous last words! :-)
*/
- if ((adjustor = stgMallocBytes(28, "createAdjustor")) != NULL) {
- unsigned char *const adj_code = (unsigned char *)adjustor;
+ if ((adjustor = stgMallocBytes(4*(8+1), "createAdjustor")) != NULL) {
+ unsigned long *const adj_code = (unsigned long *)adjustor;
+
+ /* mov %o3, %o5 */
+ adj_code[0] = (unsigned long)0x9A10000B;
+ /* mov %o2, %o4 */
+ adj_code[1] = (unsigned long)0x9810000A;
+ /* mov %o1, %o3 */
+ adj_code[2] = (unsigned long)0x96100009;
+ /* mov %o0, %o2 */
+ adj_code[3] = (unsigned long)0x94100008;
/* sethi %hi(wptr), %o1 */
- *((unsigned long*)(adj_code+0x00)) = (unsigned long)0x13000000;
- *((unsigned long*)(adj_code+0x00)) |= ((unsigned long)wptr) >> 10;
+ adj_code[4] = (unsigned long)0x13000000;
+ adj_code[4] |= ((unsigned long)wptr) >> 10;
/* sethi %hi(hptr), %o0 */
- *((unsigned long*)(adj_code+0x04)) = (unsigned long)0x11000000;
- *((unsigned long*)(adj_code+0x04)) |= ((unsigned long)hptr) >> 10;
+ adj_code[5] = (unsigned long)0x11000000;
+ adj_code[5] |= ((unsigned long)hptr) >> 10;
/* jmp %o1+%lo(wptr) */
- *((unsigned long*)(adj_code+0x08)) = (unsigned long)0x81c26000;
- *((unsigned long*)(adj_code+0x08)) |= ((unsigned long)wptr) & 0x000003ff;
+ adj_code[6] = (unsigned long)0x81c26000;
+ adj_code[6] |= ((unsigned long)wptr) & 0x000003ff;
/* or %o0, %lo(hptr), %o0 */
- *((unsigned long*)(adj_code+0x0c)) = (unsigned long)0x90122000;
- *((unsigned long*)(adj_code+0x0c)) |= ((unsigned long)hptr) & 0x000003ff;
-
- *((StgStablePtr*)(adj_code+0x10)) = (StgStablePtr)hptr;
+ adj_code[7] = (unsigned long)0x90122000;
+ adj_code[7] |= ((unsigned long)hptr) & 0x000003ff;
+
+ adj_code[8] = (StgStablePtr)hptr;
+
+ /* flush cache */
+ asm("flush %0" : : "r" (adj_code ));
+ asm("flush %0" : : "r" (adj_code + 2));
+ asm("flush %0" : : "r" (adj_code + 4));
+ asm("flush %0" : : "r" (adj_code + 6));
+
+ /* max. 5 instructions latency, and we need at >= 1 for returning */
+ asm("nop");
+ asm("nop");
+ asm("nop");
+ asm("nop");
}
#elif defined(alpha_TARGET_ARCH)
/* Magic constant computed by inspecting the code length of
(offset and machine code prefixed; note that the machine code
shown is longwords stored in little-endian order):
- <00>: a61b0010 ldq a0, 0x10(pv) # load up hptr
- <04>: a77b0018 ldq pv, 0x18(pv) # load up wptr
- <08>: 6bfbabcd jmp (pv), 0xabcd # jump to wptr (with hint)
- <0c>: 47ff041f nop # padding for alignment
- <10>: [8 bytes for hptr quadword]
- <18>: [8 bytes for wptr quadword]
+ <00>: 46520414 mov a2, a4
+ <04>: 46100412 mov a0, a2
+ <08>: a61b0020 ldq a0, 0x20(pv) # load up hptr
+ <0c>: 46730415 mov a3, a5
+ <10>: a77b0028 ldq pv, 0x28(pv) # load up wptr
+ <14>: 46310413 mov a1, a3
+ <18>: 6bfb---- jmp (pv), <hint> # jump to wptr (with hint)
+ <1c>: 00000000 # padding for alignment
+ <20>: [8 bytes for hptr quadword]
+ <28>: [8 bytes for wptr quadword]
The "computed" jump at <08> above is really a jump to a fixed
location. Accordingly, we place an always-correct hint in the
jump instruction, namely the address offset from <0c> to wptr,
divided by 4, taking the lowest 14 bits.
+ We only support passing 4 or fewer argument words, for the same
+ reason described under sparc_TARGET_ARCH above by JRS, 21 Aug 01.
+ On the Alpha the first 6 integer arguments are in a0 through a5,
+ and the rest on the stack. Hence we want to shuffle the original
+ caller's arguments by two.
+
+ On the Alpha the calling convention is so complex and dependent
+ on the callee's signature -- for example, the stack pointer has
+ to be a multiple of 16 -- that it seems impossible to me [ccshan]
+ to handle the general case correctly without changing how the
+ adjustor is called from C. For now, our solution of shuffling
+ registers only and ignoring the stack only works if the original
+ caller passed 4 or fewer argument words.
+
TODO: Depending on how much allocation overhead stgMallocBytes uses for
header information (more precisely, if the overhead is no more than
4 bytes), we should move the first three instructions above down by
4 bytes (getting rid of the nop), hence saving memory. [ccshan]
*/
ASSERT(((StgWord64)wptr & 3) == 0);
- if ((adjustor = stgMallocBytes(32, "createAdjustor")) != NULL) {
+ if ((adjustor = stgMallocBytes(48, "createAdjustor")) != NULL) {
StgWord64 *const code = (StgWord64 *)adjustor;
- code[0] = 0xa77b0018a61b0010L;
- code[1] = 0x47ff041f6bfb0000L
+ code[0] = 0x4610041246520414L;
+ code[1] = 0x46730415a61b0020L;
+ code[2] = 0x46310413a77b0028L;
+ code[3] = 0x000000006bfb0000L
| (((StgWord32*)(wptr) - (StgWord32*)(code) - 3) & 0x3fff);
- code[2] = (StgWord64)hptr;
- code[3] = (StgWord64)wptr;
+ code[4] = (StgWord64)hptr;
+ code[5] = (StgWord64)wptr;
+
+ /* Ensure that instruction cache is consistent with our new code */
+ __asm__ volatile("call_pal %0" : : "i" (PAL_imb));
}
+#elif defined(powerpc_TARGET_ARCH)
+/*
+ For PowerPC, the following code is used:
+
+ mr r10,r8
+ mr r9,r7
+ mr r8,r6
+ mr r7,r5
+ mr r6,r4
+ mr r5,r3
+ lis r0,0xDEAD ;hi(wptr)
+ lis r3,0xDEAF ;hi(hptr)
+ ori r0,r0,0xBEEF ; lo(wptr)
+ ori r3,r3,0xFACE ; lo(hptr)
+ mtctr r0
+ bctr
+
+ The arguments (passed in registers r3 - r10) are shuffled along by two to
+ make room for hptr and a dummy argument. As r9 and r10 are overwritten by
+ this code, it only works for up to 6 arguments (when floating point arguments
+ are involved, this may be more or less, depending on the exact situation).
+*/
+ if ((adjustor = stgMallocBytes(4*13, "createAdjustor")) != NULL) {
+ unsigned long *const adj_code = (unsigned long *)adjustor;
+
+ // make room for extra arguments
+ adj_code[0] = 0x7d0a4378; //mr r10,r8
+ adj_code[1] = 0x7ce93b78; //mr r9,r7
+ adj_code[2] = 0x7cc83378; //mr r8,r6
+ adj_code[3] = 0x7ca72b78; //mr r7,r5
+ adj_code[4] = 0x7c862378; //mr r6,r4
+ adj_code[5] = 0x7c651b78; //mr r5,r3
+
+ adj_code[6] = 0x3c000000; //lis r0,hi(wptr)
+ adj_code[6] |= ((unsigned long)wptr) >> 16;
+
+ adj_code[7] = 0x3c600000; //lis r3,hi(hptr)
+ adj_code[6] |= ((unsigned long)hptr) >> 16;
+
+ adj_code[8] = 0x60000000; //ori r0,r0,lo(wptr)
+ adj_code[8] |= ((unsigned long)wptr) & 0xFFFF;
+
+ adj_code[9] = 0x60630000; //ori r3,r3,lo(hptr)
+ adj_code[9] |= ((unsigned long)hptr) & 0xFFFF;
+
+ adj_code[10] = 0x7c0903a6; //mtctr r0
+ adj_code[11] = 0x4e800420; //bctr
+ adj_code[12] = (unsigned long)hptr;
+
+ // Flush the Instruction cache:
+ // MakeDataExecutable(adjustor,4*13);
+ /* This would require us to link with CoreServices.framework */
+ { /* this should do the same: */
+ int n = 13;
+ unsigned long *p = adj_code;
+ while(n--)
+ {
+ __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
+ : : "g" (p));
+ p++;
+ }
+ __asm__ volatile ("sync\n\tisync");
+ }
+ }
#else
#error Adjustor creation is not supported on this platform.
#endif
freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x02)));
}
#elif defined(sparc_TARGET_ARCH)
- if ( *(unsigned char*)ptr != 0x13 ) {
+ if ( *(unsigned long*)ptr != 0x9A10000B ) {
fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
return;
}
/* Free the stable pointer first..*/
- freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x10)));
-#elif defined(sparc_TARGET_ARCH)
+ freeStablePtr(*((StgStablePtr*)((unsigned long*)ptr + 8)));
+#elif defined(alpha_TARGET_ARCH)
if ( *(StgWord64*)ptr != 0xa77b0018a61b0010L ) {
fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
return;
/* Free the stable pointer first..*/
freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x10)));
+#elif defined(powerpc_TARGET_ARCH)
+ if ( *(StgWord*)ptr != 0x7d0a4378 ) {
+ fprintf(stderr, "freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
+ return;
+ }
+ freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 4*12)));
#else
ASSERT(0);
#endif