+/* ----------------------------------------------------------------------------
+ x86-64 is almost the same as plain x86.
+
+ I've done it using entirely inline assembler, because I couldn't
+ get gcc to generate the correct subtraction from %rsp by using
+ the local array variable trick. It didn't seem to reserve
+ enough space. Oh well, it's not much harder this way.
+
+ ------------------------------------------------------------------------- */
+
+#ifdef x86_64_HOST_ARCH
+
+extern StgRegTable * StgRun(StgFunPtr f, StgRegTable *basereg);
+/*
+ * StgRun has no C body: the inline assembler inside
+ * StgRunIsImplementedInAssembler() below emits the global label
+ * StgRun, and that label is the real entry point.  The enclosing C
+ * function is presumably never called itself -- verify against the
+ * callers.
+ *
+ * What the assembler does, in order:
+ *   1. reserves RESERVED_C_STACK_BYTES+48+8 bytes of stack,
+ *   2. saves %rbx, %rbp and %r12-%r15 in the top 48 bytes of that frame,
+ *   3. copies basereg (%rsi) into %rbx and jumps to f (%rdi),
+ *   4. at the STG_RETURN label: copies %r13 into %rax as the return
+ *      value, reloads the six saved registers, releases the frame
+ *      and returns.
+ *
+ * NOTE(review): taking f from %rdi and basereg from %rsi assumes the
+ * System V AMD64 calling convention -- confirm for any other ABI.
+ */
+void StgRunIsImplementedInAssembler(void);
+void StgRunIsImplementedInAssembler(void)
+{
+ __asm__ volatile (
+ /*
+ * Entry point.  Reserve the stack frame (operand %0 is the frame
+ * size given at the end of this statement), then save the
+ * callee-saved registers on behalf of the STG code.  %rax is set
+ * to %rsp + framesize - 48, so the six 8-byte slots occupy the
+ * top 48 bytes of the new frame.
+ */
+ ".globl StgRun\n"
+ "StgRun:\n\t"
+ "subq %0, %%rsp\n\t" /* allocate the frame */
+ "movq %%rsp, %%rax\n\t"
+ "addq %0-48, %%rax\n\t" /* %rax = base of the register save area */
+ "movq %%rbx,0(%%rax)\n\t"
+ "movq %%rbp,8(%%rax)\n\t"
+ "movq %%r12,16(%%rax)\n\t"
+ "movq %%r13,24(%%rax)\n\t"
+ "movq %%r14,32(%%rax)\n\t"
+ "movq %%r15,40(%%rax)\n\t"
+ /*
+ * Set BaseReg: %rsi carries the second argument (basereg), which
+ * is copied into %rbx.
+ */
+ "movq %%rsi,%%rbx\n\t"
+ /*
+ * The function argument f arrives in %rdi (it is not on the
+ * stack); copy it to %rax and jump to it.  This is a jump, not a
+ * call, so no return address is pushed and %rsp stays where the
+ * subq above left it.
+ */
+ "movq %%rdi,%%rax\n\t"
+ "jmp *%%rax\n\t"
+ /*
+ * Return path.  STG_RETURN expands to a label name defined outside
+ * this block; presumably STG code jumps here when it has finished
+ * -- confirm against the macro's definition.
+ */
+ ".global " STG_RETURN "\n"
+ STG_RETURN ":\n\t"
+
+ "movq %%r13, %%rax\n\t" /* Return value in R1 */
+
+ /*
+ * restore callee-saved registers. (Don't stomp on %%rax!)
+ *
+ * %rdx is the scratch pointer to the save area.  Its address is
+ * recomputed from %rsp, so %rsp must hold the same value here as
+ * it did just after the subq at entry.
+ */
+ "movq %%rsp, %%rdx\n\t"
+ "addq %0-48, %%rdx\n\t" /* %rdx = base of the register save area */
+ "movq 0(%%rdx),%%rbx\n\t" /* restore the registers saved above */
+ "movq 8(%%rdx),%%rbp\n\t"
+ "movq 16(%%rdx),%%r12\n\t"
+ "movq 24(%%rdx),%%r13\n\t"
+ "movq 32(%%rdx),%%r14\n\t"
+ "movq 40(%%rdx),%%r15\n\t"
+ "addq %0, %%rsp\n\t" /* release the frame */
+ "retq" /* return to whoever called StgRun */
+
+ : : "i"(RESERVED_C_STACK_BYTES+48+8 /* stack frame size: reserved C stack + 48 bytes of saved registers + 8 bytes of alignment padding (see HACK alert below) */));
+ /*
+ HACK alert!
+
+ The x86_64 ABI specifies that on a procedure call, %rsp is
+ aligned on a 16-byte boundary + 8. That is, the first
+ argument on the stack after the return address will be
+ 16-byte aligned.
+
+ Which should be fine: RESERVED_C_STACK_BYTES+48 is a multiple
+ of 16 bytes.
+
+ BUT... when we do a C-call from STG land, gcc likes to put the
+ stack alignment adjustment in the prolog. e.g. if we're calling
+ a function with arguments in regs, gcc will insert 'subq $8,%rsp'
+ in the prolog, to keep %rsp aligned (the return address is 8
+ bytes, remember). The mangler throws away the prolog, so we
+ lose the stack alignment.
+
+ The hack is to add this extra 8 bytes to our %rsp adjustment
+ here (the "+8" in the frame size operand above), so that
+ throughout STG code, %rsp is 16-byte aligned, ready for a C-call.
+
+ A quick way to see if this is wrong is to compile this code:
+
+ main = System.Exit.exitWith ExitSuccess
+
+ And run it with +RTS -sstderr. The stats code in the RTS, in
+ particular statsPrintf(), relies on the stack alignment because
+ it saves the %xmm regs on the stack, so it'll fall over if the
+ stack isn't aligned, and calling exitWith from Haskell invokes
+ shutdownHaskellAndExit using a C call.
+
+ Future gcc releases will almost certainly break this hack...
+ */
+}
+
+#endif /* x86-64 */
+