1 -----------------------------------------------------------------------------
3 -- Machine-dependent assembly language
5 -- (c) The University of Glasgow 1993-2004
7 -----------------------------------------------------------------------------
9 #include "nativeGen/NCG.h"
12 -- * Cmm instantiations
13 NatCmm, NatCmmTop, NatBasicBlock,
15 -- * Machine instructions
17 Cond(..), condUnsigned, condToSigned, condToUnsigned,
18 #if powerpc_TARGET_ARCH
21 #if !powerpc_TARGET_ARCH && !i386_TARGET_ARCH && !x86_64_TARGET_ARCH
22 Size(..), machRepSize,
26 #if i386_TARGET_ARCH || x86_64_TARGET_ARCH
33 riZero, fpRelEA, moveSp, fPair,
37 #include "HsVersions.h"
41 import MachOp ( MachRep(..) )
42 import CLabel ( CLabel, pprCLabel )
43 import Panic ( panic )
46 import Constants ( wORD_SIZE )
51 -- -----------------------------------------------------------------------------
52 -- Our flavours of the Cmm types
54 -- Type synonyms for Cmm populated with native code
-- Each synonym below fixes the final type argument of the corresponding
-- generic Cmm container to 'Instr'; NatCmm and NatCmmTop additionally fix
-- the first two arguments to CmmStatic and [CmmStatic].
55 type NatCmm = GenCmm CmmStatic [CmmStatic] Instr
56 type NatCmmTop = GenCmmTop CmmStatic [CmmStatic] Instr
57 type NatBasicBlock = GenBasicBlock Instr
59 -- -----------------------------------------------------------------------------
60 -- Conditions on this architecture
64 = ALWAYS -- For BI (same as BR)
65 | EQQ -- For CMP and BI (NB: "EQ" is a 1.3 Prelude name)
67 | GTT -- For BI only (NB: "GT" is a 1.3 Prelude name)
68 | LE -- For CMP and BI
69 | LTT -- For CMP and BI (NB: "LT" is a 1.3 Prelude name)
71 | NEVER -- For BI (null instruction)
75 #if i386_TARGET_ARCH || x86_64_TARGET_ARCH
76 = ALWAYS -- What's really used? ToDo
95 = ALWAYS -- What's really used? ToDo
112 #if powerpc_TARGET_ARCH
125 deriving Eq -- to make an assertion work
-- True exactly for the unsigned comparison conditions GU, LU, GEU and LEU;
-- every other condition falls through to the final equation and gives False.
127 condUnsigned GU = True
128 condUnsigned LU = True
129 condUnsigned GEU = True
130 condUnsigned LEU = True
131 condUnsigned _ = False
-- Replace an unsigned condition by the signed condition of the same
-- direction: GU -> GTT, LU -> LTT, GEU -> GE, LEU -> LE.
133 condToSigned GU = GTT
134 condToSigned LU = LTT
135 condToSigned GEU = GE
136 condToSigned LEU = LE
-- Replace a signed ordering condition by the unsigned condition of the same
-- direction: GTT -> GU, LTT -> LU, GE -> GEU, LE -> LEU.
139 condToUnsigned GTT = GU
140 condToUnsigned LTT = LU
141 condToUnsigned GE = GEU
142 condToUnsigned LE = LEU
145 #if powerpc_TARGET_ARCH
146 condNegate ALWAYS = panic "condNegate: ALWAYS"
159 -- -----------------------------------------------------------------------------
160 -- Sizes on this architecture
162 -- ToDo: it's not clear to me that we need separate signed-vs-unsigned sizes
163 -- here. I've removed them from the x86 version, we'll see what happens --SDM
165 #if !powerpc_TARGET_ARCH && !i386_TARGET_ARCH && !x86_64_TARGET_ARCH
167 #if alpha_TARGET_ARCH
170 -- | W -- word (2 bytes): UNUSED
172 | L -- longword (4 bytes)
173 | Q -- quadword (8 bytes)
174 -- | FF -- VAX F-style floating pt: UNUSED
175 -- | GF -- VAX G-style floating pt: UNUSED
176 -- | DF -- VAX D-style floating pt: UNUSED
177 -- | SF -- IEEE single-precision floating pt: UNUSED
178 | TF -- IEEE double-precision floating pt
180 #if sparc_TARGET_ARCH || powerpc_TARGET_ARCH
182 | Bu -- byte (unsigned)
183 | H -- halfword (signed, 2 bytes)
184 | Hu -- halfword (unsigned, 2 bytes)
185 | W -- word (4 bytes)
186 | F -- IEEE single-precision floating pt
187 | DF -- IEEE double-precision floating pt
-- Translate a 'MachRep' into the 'Size' used for it on the target.
-- The IF_ARCH_* CPP macros pick the per-architecture answer; only the alpha
-- and sparc alternatives are filled in (the innermost fallback is empty).
-- Note that on alpha both F32 and F64 are given the size TF.
-- I64 and I128 are not supported here and panic on every architecture.
191 machRepSize :: MachRep -> Size
192 machRepSize I8 = IF_ARCH_alpha(Bu, IF_ARCH_sparc(Bu, ))
193 machRepSize I16 = IF_ARCH_alpha(err,IF_ARCH_sparc(Hu, ))
194 machRepSize I32 = IF_ARCH_alpha(L, IF_ARCH_sparc(W, ))
195 machRepSize I64 = panic "machRepSize: I64"
196 machRepSize I128 = panic "machRepSize: I128"
197 machRepSize F32 = IF_ARCH_alpha(TF, IF_ARCH_sparc(F, ))
198 machRepSize F64 = IF_ARCH_alpha(TF, IF_ARCH_sparc(DF,))
201 -- -----------------------------------------------------------------------------
202 -- Register or immediate (a handy type on some platforms)
208 -- -----------------------------------------------------------------------------
209 -- Machine's assembly language
211 -- We have a few common "instructions" (nearly all the pseudo-ops) but
212 -- mostly all of 'Instr' is machine-specific.
215 = COMMENT FastString -- comment pseudo-op
217 | LDATA Section [CmmStatic] -- some static data spat out during code
218 -- generation. Will be extracted before
221 | NEWBLOCK BlockId -- start a new basic block. Useful during
222 -- codegen, removed later. Preceding
223 -- instruction should be a jump, as per the
224 -- invariants for a BasicBlock (see Cmm).
226 | DELTA Int -- specify current stack offset for
227 -- benefit of subsequent passes
229 | SPILL Reg Int -- ^ spill this reg to a stack slot
230 | RELOAD Int Reg -- ^ reload this reg from a stack slot
232 -- -----------------------------------------------------------------------------
233 -- Alpha instructions
235 #if alpha_TARGET_ARCH
237 -- data Instr continues...
240 | LD Size Reg AddrMode -- size, dst, src
241 | LDA Reg AddrMode -- dst, src
242 | LDAH Reg AddrMode -- dst, src
243 | LDGP Reg AddrMode -- dst, src
244 | LDI Size Reg Imm -- size, dst, src
245 | ST Size Reg AddrMode -- size, src, dst
249 | ABS Size RI Reg -- size, src, dst
250 | NEG Size Bool RI Reg -- size, overflow, src, dst
251 | ADD Size Bool Reg RI Reg -- size, overflow, src, src, dst
252 | SADD Size Size Reg RI Reg -- size, scale, src, src, dst
253 | SUB Size Bool Reg RI Reg -- size, overflow, src, src, dst
254 | SSUB Size Size Reg RI Reg -- size, scale, src, src, dst
255 | MUL Size Bool Reg RI Reg -- size, overflow, src, src, dst
256 | DIV Size Bool Reg RI Reg -- size, unsigned, src, src, dst
257 | REM Size Bool Reg RI Reg -- size, unsigned, src, src, dst
259 -- Simple bit-twiddling.
277 | CMP Cond Reg RI Reg
283 | FADD Size Reg Reg Reg
284 | FDIV Size Reg Reg Reg
285 | FMUL Size Reg Reg Reg
286 | FSUB Size Reg Reg Reg
287 | CVTxy Size Size Reg Reg
288 | FCMP Size Cond Reg Reg Reg
295 | JMP Reg AddrMode Int
297 | JSR Reg AddrMode Int
299 -- Alpha-specific pseudo-ops.
307 #endif /* alpha_TARGET_ARCH */
310 -- -----------------------------------------------------------------------------
311 -- Intel x86 instructions
314 Intel, in their infinite wisdom, selected a stack model for floating
315 point registers on x86. That might have made sense back in 1979 --
316 nowadays we can see it for the nonsense it really is. A stack model
317 fits poorly with the existing nativeGen infrastructure, which assumes
318 flat integer and FP register sets. Prior to this commit, nativeGen
319 could not generate correct x86 FP code -- to do so would have meant
320 somehow working the register-stack paradigm into the register
321 allocator and spiller, which sounds very difficult.
323 We have decided to cheat, and go for a simple fix which requires no
324 infrastructure modifications, at the expense of generating ropey but
325 correct FP code. All notions of the x86 FP stack and its insns have
326 been removed. Instead, we pretend (to the instruction selector and
327 register allocator) that x86 has six floating point registers, %fake0
328 .. %fake5, which can be used in the usual flat manner. We further
329 claim that x86 has floating point instructions very similar to SPARC
330 and Alpha, that is, a simple 3-operand register-register arrangement.
331 Code generation and register allocation proceed on this basis.
333 When we come to print out the final assembly, our convenient fiction
334 is converted to dismal reality. Each fake instruction is
335 independently converted to a series of real x86 instructions.
336 %fake0 .. %fake5 are mapped to %st(0) .. %st(5). To do reg-reg
337 arithmetic operations, the two operands are pushed onto the top of the
338 FP stack, the operation done, and the result copied back into the
339 relevant register. There are only six %fake registers because 2 are
340 needed for the translation, and x86 has 8 in total.
342 The translation is inefficient but is simple and it works. A cleverer
343 translation would handle a sequence of insns, simulating the FP stack
344 contents, would not impose a fixed mapping from %fake to %st regs, and
345 hopefully could avoid most of the redundant reg-reg moves of the
348 We might as well make use of whatever unique FP facilities Intel have
349 chosen to bless us with (let's not be churlish, after all).
350 Hence GLDZ and GLD1. Bwahahahahahahaha!
354 MORE FLOATING POINT MUSINGS...
356 Intel's internal floating point registers are by default 80 bit
357 extended precision. This means that all operations done on values in
358 registers are done at 80 bits, and unless the intermediate values are
359 truncated to the appropriate size (32 or 64 bits) by storing in
360 memory, calculations in registers will give different results from
361 calculations which pass intermediate values in memory (eg. via
364 One solution is to set the FPU into 64 bit precision mode. Some OSs
365 do this (eg. FreeBSD) and some don't (eg. Linux). The problem here is
366 that this will only affect 64-bit precision arithmetic; 32-bit
367 calculations will still be done at 64-bit precision in registers. So
368 it doesn't solve the whole problem.
370 There's also the issue of what the C library is expecting in terms of
371 precision. It seems to be the case that glibc on Linux expects the
372 FPU to be set to 80 bit precision, so setting it to 64 bit could have
373 unexpected effects. Changing the default could have undesirable
374 effects on other 3rd-party library code too, so the right thing would
375 be to save/restore the FPU control word across Haskell code if we were
378 gcc's -ffloat-store gives consistent results by always storing the
379 results of floating-point calculations in memory, which works for both
380 32 and 64-bit precision. However, it only affects the values of
381 user-declared floating point variables in C, not intermediate results.
382 GHC in -fvia-C mode uses -ffloat-store (see the -fexcess-precision
385 Another problem is how to spill floating point registers in the
386 register allocator. Should we spill the whole 80 bits, or just 64?
387 On an OS which is set to 64 bit precision, spilling 64 is fine. On
388 Linux, spilling 64 bits will round the results of some operations.
389 This is what gcc does. Spilling at 80 bits requires taking up a full
390 128 bit slot (so we get alignment). We spill at 80-bits and ignore
391 the alignment problems.
393 In the future, we'll use the SSE registers for floating point. This
394 requires a CPU that supports SSE2 (ordinary SSE only supports 32 bit
395 precision float ops), which means P4 or Xeon and above. Using SSE
396 will solve all these problems, because the SSE registers use fixed 32
397 bit or 64 bit precision.
402 #if i386_TARGET_ARCH || x86_64_TARGET_ARCH
404 -- data Instr continues...
407 | MOV MachRep Operand Operand
408 | MOVZxL MachRep Operand Operand -- size is the size of operand 1
409 | MOVSxL MachRep Operand Operand -- size is the size of operand 1
410 -- x86_64 note: plain mov into a 32-bit register always zero-extends
411 -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which
412 -- don't affect the high bits of the register.
414 -- Load effective address (also a very useful three-operand add instruction :-)
415 | LEA MachRep Operand Operand
418 | ADD MachRep Operand Operand
419 | ADC MachRep Operand Operand
420 | SUB MachRep Operand Operand
422 | MUL MachRep Operand Operand
423 | IMUL MachRep Operand Operand -- signed int mul
424 | IMUL2 MachRep Operand -- %edx:%eax = operand * %eax
426 | DIV MachRep Operand -- eax := edx:eax / op, edx := edx:eax % op
427 | IDIV MachRep Operand -- ditto, but signed
429 -- Simple bit-twiddling.
430 | AND MachRep Operand Operand
431 | OR MachRep Operand Operand
432 | XOR MachRep Operand Operand
433 | NOT MachRep Operand
434 | NEGI MachRep Operand -- NEG instruction (name clash with Cond)
436 -- Shifts (amount may be immediate or %cl only)
437 | SHL MachRep Operand{-amount-} Operand
438 | SAR MachRep Operand{-amount-} Operand
439 | SHR MachRep Operand{-amount-} Operand
441 | BT MachRep Imm Operand
447 -- Note that we cheat by treating G{ABS,MOV,NEG} of doubles
448 -- as single instructions right up until we spit them out.
449 -- all the 3-operand fake fp insns are src1 src2 dst
450 -- and furthermore are constrained to be fp regs only.
451 -- IMPORTANT: keep is_G_insn up to date with any changes here
452 | GMOV Reg Reg -- src(fpreg), dst(fpreg)
453 | GLD MachRep AddrMode Reg -- src, dst(fpreg)
454 | GST MachRep Reg AddrMode -- src(fpreg), dst
456 | GLDZ Reg -- dst(fpreg)
457 | GLD1 Reg -- dst(fpreg)
459 | GFTOI Reg Reg -- src(fpreg), dst(intreg)
460 | GDTOI Reg Reg -- src(fpreg), dst(intreg)
462 | GITOF Reg Reg -- src(intreg), dst(fpreg)
463 | GITOD Reg Reg -- src(intreg), dst(fpreg)
465 | GADD MachRep Reg Reg Reg -- src1, src2, dst
466 | GDIV MachRep Reg Reg Reg -- src1, src2, dst
467 | GSUB MachRep Reg Reg Reg -- src1, src2, dst
468 | GMUL MachRep Reg Reg Reg -- src1, src2, dst
470 -- FP compare. Cond must be `elem` [EQQ, NE, LE, LTT, GE, GTT]
471 -- Compare src1 with src2; set the Zero flag iff the numbers are
472 -- comparable and the comparison is True. Subsequent code must
473 -- test the %eflags zero flag regardless of the supplied Cond.
474 | GCMP Cond Reg Reg -- src1, src2
476 | GABS MachRep Reg Reg -- src, dst
477 | GNEG MachRep Reg Reg -- src, dst
478 | GSQRT MachRep Reg Reg -- src, dst
479 | GSIN MachRep Reg Reg -- src, dst
480 | GCOS MachRep Reg Reg -- src, dst
481 | GTAN MachRep Reg Reg -- src, dst
483 | GFREE -- do ffree on all x86 regs; an ugly hack
486 #if x86_64_TARGET_ARCH
487 -- SSE2 floating point: we use a restricted set of the available SSE2
488 -- instructions for floating-point.
490 -- use MOV for moving (either movss or movsd (movlpd better?))
492 | CVTSS2SD Reg Reg -- F32 to F64
493 | CVTSD2SS Reg Reg -- F64 to F32
494 | CVTTSS2SIQ Operand Reg -- F32 to I32/I64 (with truncation)
495 | CVTTSD2SIQ Operand Reg -- F64 to I32/I64 (with truncation)
496 | CVTSI2SS Operand Reg -- I32/I64 to F32
497 | CVTSI2SD Operand Reg -- I32/I64 to F64
499 -- use ADD & SUB for arithmetic. In both cases, operands
502 -- SSE2 floating-point division:
503 | FDIV MachRep Operand Operand -- divisor, dividend(dst)
505 -- use CMP for comparisons. ucomiss and ucomisd instructions
506 -- compare single/double prec floating point respectively.
508 | SQRT MachRep Operand Reg -- src, dst
512 | TEST MachRep Operand Operand
513 | CMP MachRep Operand Operand
517 | PUSH MachRep Operand
518 | POP MachRep Operand
519 -- both unused (SDM):
525 | JXX Cond BlockId -- includes unconditional branches
526 | JXX_GBL Cond Imm -- non-local version of JXX
527 | JMP_TBL Operand [BlockId] -- table jump
528 | CALL (Either Imm Reg) [Reg]
531 | CLTD MachRep -- sign extend %eax into %edx:%eax
533 | FETCHGOT Reg -- pseudo-insn for ELF position-independent code
537 -- addl __GLOBAL_OFFSET_TABLE__+.-1b, %reg
538 | FETCHPC Reg -- pseudo-insn for Darwin position-independent code
545 = OpReg Reg -- register
546 | OpImm Imm -- immediate value
547 | OpAddr AddrMode -- memory reference
549 #endif /* i386 or x86_64 */
552 i386_insert_ffrees :: [Instr] -> [Instr]
553 i386_insert_ffrees insns
554 | any is_G_instr insns
555 = concatMap ffree_before_nonlocal_transfers insns
559 ffree_before_nonlocal_transfers insn
561 CALL _ _ -> [GFREE, insn]
562 JMP _ -> [GFREE, insn]
566 -- if you ever add a new FP insn to the fake x86 FP insn set,
567 -- you must update this too
-- Recognise the fake x86 floating-point ("G") instructions: every G*
-- constructor listed below yields True.
568 is_G_instr :: Instr -> Bool
571 GMOV _ _ -> True; GLD _ _ _ -> True; GST _ _ _ -> True
572 GLDZ _ -> True; GLD1 _ -> True
573 GFTOI _ _ -> True; GDTOI _ _ -> True
574 GITOF _ _ -> True; GITOD _ _ -> True
575 GADD _ _ _ _ -> True; GDIV _ _ _ _ -> True
576 GSUB _ _ _ _ -> True; GMUL _ _ _ _ -> True
577 GCMP _ _ _ -> True; GABS _ _ _ -> True
578 GNEG _ _ _ -> True; GSQRT _ _ _ -> True
579 GSIN _ _ _ -> True; GCOS _ _ _ -> True; GTAN _ _ _ -> True
-- GFREE is what i386_insert_ffrees itself adds in front of calls and jumps;
-- encountering one in the input is treated as a fatal internal error.
580 GFREE -> panic "is_G_instr: GFREE (!)"
582 #endif /* i386_TARGET_ARCH */
585 -- -----------------------------------------------------------------------------
586 -- Sparc instructions
588 #if sparc_TARGET_ARCH
590 -- data Instr continues...
593 | LD MachRep AddrMode Reg -- size, src, dst
594 | ST MachRep Reg AddrMode -- size, src, dst
597 | ADD Bool Bool Reg RI Reg -- x?, cc?, src1, src2, dst
598 | SUB Bool Bool Reg RI Reg -- x?, cc?, src1, src2, dst
599 | UMUL Bool Reg RI Reg -- cc?, src1, src2, dst
600 | SMUL Bool Reg RI Reg -- cc?, src1, src2, dst
601 | RDY Reg -- move contents of Y register to reg
603 -- Simple bit-twiddling.
604 | AND Bool Reg RI Reg -- cc?, src1, src2, dst
605 | ANDN Bool Reg RI Reg -- cc?, src1, src2, dst
606 | OR Bool Reg RI Reg -- cc?, src1, src2, dst
607 | ORN Bool Reg RI Reg -- cc?, src1, src2, dst
608 | XOR Bool Reg RI Reg -- cc?, src1, src2, dst
609 | XNOR Bool Reg RI Reg -- cc?, src1, src2, dst
610 | SLL Reg RI Reg -- src1, src2, dst
611 | SRL Reg RI Reg -- src1, src2, dst
612 | SRA Reg RI Reg -- src1, src2, dst
613 | SETHI Imm Reg -- src, dst
614 | NOP -- Really SETHI 0, %g0, but worth an alias
618 -- Note that we cheat by treating F{ABS,MOV,NEG} of doubles as single
619 -- instructions right up until we spit them out.
620 | FABS MachRep Reg Reg -- src, dst
621 | FADD MachRep Reg Reg Reg -- src1, src2, dst
622 | FCMP Bool MachRep Reg Reg -- exception?, size, src1, src2 (no dst register)
623 | FDIV MachRep Reg Reg Reg -- src1, src2, dst
624 | FMOV MachRep Reg Reg -- src, dst
625 | FMUL MachRep Reg Reg Reg -- src1, src2, dst
626 | FNEG MachRep Reg Reg -- src, dst
627 | FSQRT MachRep Reg Reg -- src, dst
628 | FSUB MachRep Reg Reg Reg -- src1, src2, dst
629 | FxTOy MachRep MachRep Reg Reg -- src, dst
632 | BI Cond Bool Imm -- cond, annul?, target
633 | BF Cond Bool Imm -- cond, annul?, target
635 | JMP AddrMode -- target
636 | CALL (Either Imm Reg) Int Bool -- target, args, terminal
-- Recognise a register-or-immediate operand that is known to be zero:
-- an immediate 0 (either ImmInt or ImmInteger) or real register number 0
-- (presumably %g0, the register NOP's "SETHI 0, %g0" alias refers to --
-- TODO confirm).
640 riZero (RIImm (ImmInt 0)) = True
641 riZero (RIImm (ImmInteger 0)) = True
642 riZero (RIReg (RealReg 0)) = True
645 -- Calculate the effective address which would be used by the
646 -- corresponding fpRel sequence. fpRel itself is in MachRegs.lhs,
647 -- alas -- the two can't live in the same module because of module dependencies.
648 fpRelEA :: Int -> Reg -> Instr
650 = ADD False False fp (RIImm (ImmInt (n * wORD_SIZE))) dst
652 -- Code to shift the stack pointer by n words.
653 moveSp :: Int -> Instr
655 = ADD False False sp (RIImm (ImmInt (n * wORD_SIZE))) sp
657 -- Produce the second-half-of-a-double register given the first half.
-- Only an even-numbered real register with number >= 32 has a partner,
-- namely the next register (n+1).  Any other argument is rejected with
-- pprPanic, which prints the offending register.
659 fPair (RealReg n) | n >= 32 && n `mod` 2 == 0 = RealReg (n+1)
660 fPair other = pprPanic "fPair(sparc NCG)" (ppr other)
661 #endif /* sparc_TARGET_ARCH */
664 -- -----------------------------------------------------------------------------
665 -- PowerPC instructions
667 #ifdef powerpc_TARGET_ARCH
668 -- data Instr continues...
671 | LD MachRep Reg AddrMode -- Load size, dst, src
672 | LA MachRep Reg AddrMode -- Load arithmetic size, dst, src
673 | ST MachRep Reg AddrMode -- Store size, src, dst
674 | STU MachRep Reg AddrMode -- Store with Update size, src, dst
675 | LIS Reg Imm -- Load Immediate Shifted dst, src
676 | LI Reg Imm -- Load Immediate dst, src
677 | MR Reg Reg -- Move Register dst, src -- also for fmr
679 | CMP MachRep Reg RI --- size, src1, src2
680 | CMPL MachRep Reg RI --- size, src1, src2
683 | BCCFAR Cond BlockId
684 | JMP CLabel -- same as branch,
685 -- but with CLabel instead of block ID
687 | BCTR [BlockId] -- with list of local destinations
688 | BL CLabel [Reg] -- with list of argument regs
691 | ADD Reg Reg RI -- dst, src1, src2
692 | ADDC Reg Reg Reg -- (carrying) dst, src1, src2
693 | ADDE Reg Reg Reg -- (extend) dst, src1, src2
694 | ADDIS Reg Reg Imm -- Add Immediate Shifted dst, src1, src2
695 | SUBF Reg Reg Reg -- dst, src1, src2 ; dst = src2 - src1
700 | MULLW_MayOflo Reg Reg Reg
701 -- dst = 1 if src1 * src2 overflows
702 -- pseudo-instruction; pretty-printed as:
703 -- mullwo. dst, src1, src2
705 -- rlwinm dst, dst, 2, 31,31
707 | AND Reg Reg RI -- dst, src1, src2
708 | OR Reg Reg RI -- dst, src1, src2
709 | XOR Reg Reg RI -- dst, src1, src2
710 | XORIS Reg Reg Imm -- XOR Immediate Shifted dst, src1, src2
712 | EXTS MachRep Reg Reg
717 | SLW Reg Reg RI -- shift left word
718 | SRW Reg Reg RI -- shift right word
719 | SRAW Reg Reg RI -- shift right arithmetic word
721 -- Rotate Left Word Immediate then AND with Mask
722 | RLWINM Reg Reg Int Int Int
724 | FADD MachRep Reg Reg Reg
725 | FSUB MachRep Reg Reg Reg
726 | FMUL MachRep Reg Reg Reg
727 | FDIV MachRep Reg Reg Reg
728 | FNEG Reg Reg -- negate is the same for single and double prec.
732 | FCTIWZ Reg Reg -- convert to integer word
733 | FRSP Reg Reg -- reduce to single precision
734 -- (but destination is a FP register)
736 | CRNOR Int Int Int -- condition register nor
737 | MFCR Reg -- move from condition register
739 | MFLR Reg -- move from link register
740 | FETCHPC Reg -- pseudo-instruction:
741 -- bcl to next insn, mflr reg
743 | LWSYNC -- memory barrier
744 #endif /* powerpc_TARGET_ARCH */