1 -----------------------------------------------------------------------------
3 -- Machine-dependent assembly language
5 -- (c) The University of Glasgow 1993-2004
7 -----------------------------------------------------------------------------
9 #include "nativeGen/NCG.h"
12 -- * Cmm instantiations
13 NatCmm, NatCmmTop, NatBasicBlock,
15 -- * Machine instructions
17 Cond(..), condUnsigned, condToSigned, condToUnsigned,
19 #if !powerpc_TARGET_ARCH && !i386_TARGET_ARCH && !x86_64_TARGET_ARCH
20 Size(..), machRepSize,
24 #if i386_TARGET_ARCH || x86_64_TARGET_ARCH
31 riZero, fpRelEA, moveSp, fPair,
35 #include "HsVersions.h"
39 import MachOp ( MachRep(..) )
40 import CLabel ( CLabel, pprCLabel )
41 import Panic ( panic )
48 -- -----------------------------------------------------------------------------
49 -- Our flavours of the Cmm types
51 -- Type synonyms for Cmm populated with native code
-- NatCmm: GenCmm instantiated with CmmStatic and this module's Instr.
52 type NatCmm = GenCmm CmmStatic Instr
-- NatCmmTop: GenCmmTop instantiated with CmmStatic and this module's Instr.
53 type NatCmmTop = GenCmmTop CmmStatic Instr
-- NatBasicBlock: GenBasicBlock whose contents are this module's Instr.
54 type NatBasicBlock = GenBasicBlock Instr
56 -- -----------------------------------------------------------------------------
57 -- Conditions on this architecture
61 = ALWAYS -- For BI (same as BR)
62 | EQQ -- For CMP and BI (NB: "EQ" is a 1.3 Prelude name)
64 | GTT -- For BI only (NB: "GT" is a 1.3 Prelude name)
65 | LE -- For CMP and BI
66 | LTT -- For CMP and BI (NB: "LT" is a 1.3 Prelude name)
68 | NEVER -- For BI (null instruction)
72 #if i386_TARGET_ARCH || x86_64_TARGET_ARCH
73 = ALWAYS -- What's really used? ToDo
90 = ALWAYS -- What's really used? ToDo
107 #if powerpc_TARGET_ARCH
120 deriving Eq -- to make an assertion work
-- Report whether a condition is one of the four conditions treated as
-- unsigned (GU, LU, GEU, LEU).  Every other Cond yields False.
122 condUnsigned GU = True
123 condUnsigned LU = True
124 condUnsigned GEU = True
125 condUnsigned LEU = True
126 condUnsigned _ = False -- catch-all for all remaining conditions
128 condToSigned GU = GTT
129 condToSigned LU = LTT
130 condToSigned GEU = GE
131 condToSigned LEU = LE
134 condToUnsigned GTT = GU
135 condToUnsigned LTT = LU
136 condToUnsigned GE = GEU
137 condToUnsigned LE = LEU
140 -- -----------------------------------------------------------------------------
141 -- Sizes on this architecture
143 -- ToDo: it's not clear to me that we need separate signed-vs-unsigned sizes
144 -- here. I've removed them from the x86 version, we'll see what happens --SDM
146 #if !powerpc_TARGET_ARCH && !i386_TARGET_ARCH && !x86_64_TARGET_ARCH
148 #if alpha_TARGET_ARCH
151 -- | W -- word (2 bytes): UNUSED
153 | L -- longword (4 bytes)
154 | Q -- quadword (8 bytes)
155 -- | FF -- VAX F-style floating pt: UNUSED
156 -- | GF -- VAX G-style floating pt: UNUSED
157 -- | DF -- VAX D-style floating pt: UNUSED
158 -- | SF -- IEEE single-precision floating pt: UNUSED
159 | TF -- IEEE double-precision floating pt
161 #if sparc_TARGET_ARCH || powerpc_TARGET_ARCH
163 | Bu -- byte (unsigned)
164 | H -- halfword (signed, 2 bytes)
165 | Hu -- halfword (unsigned, 2 bytes)
166 | W -- word (4 bytes)
167 | F -- IEEE single-precision floating pt
168 | DF -- IEEE double-precision floating pt
-- Map a machine representation (MachRep) to the Size constructor used for it.
-- Each right-hand side is chosen per architecture through the IF_ARCH_alpha /
-- IF_ARCH_sparc macros (presumably supplied by the included nativeGen/NCG.h
-- -- verify); the final alternative is left empty in every clause.
-- I64 and I128 have no Size here and call panic.
-- NOTE(review): the alpha alternative for I16 is `err`, which is not defined
-- in the visible part of this file -- confirm it is bound elsewhere.
-- NOTE(review): on alpha both F32 and F64 map to TF -- confirm this is intended.
172 machRepSize :: MachRep -> Size
173 machRepSize I8 = IF_ARCH_alpha(Bu, IF_ARCH_sparc(Bu, ))
174 machRepSize I16 = IF_ARCH_alpha(err,IF_ARCH_sparc(Hu, ))
175 machRepSize I32 = IF_ARCH_alpha(L, IF_ARCH_sparc(W, ))
176 machRepSize I64 = panic "machRepSize: I64"
177 machRepSize I128 = panic "machRepSize: I128"
178 machRepSize F32 = IF_ARCH_alpha(TF, IF_ARCH_sparc(F, ))
179 machRepSize F64 = IF_ARCH_alpha(TF, IF_ARCH_sparc(DF,))
182 -- -----------------------------------------------------------------------------
183 -- Register or immediate (a handy type on some platforms)
189 -- -----------------------------------------------------------------------------
190 -- Machine's assembly language
192 -- We have a few common "instructions" (nearly all the pseudo-ops) but
193 -- mostly all of 'Instr' is machine-specific.
196 = COMMENT FastString -- comment pseudo-op
198 | LDATA Section [CmmStatic] -- some static data spat out during code
199 -- generation. Will be extracted before
202 | NEWBLOCK BlockId -- start a new basic block. Useful during
203 -- codegen, removed later. Preceding
204 -- instruction should be a jump, as per the
205 -- invariants for a BasicBlock (see Cmm).
207 | DELTA Int -- specify current stack offset for
208 -- benefit of subsequent passes
210 -- -----------------------------------------------------------------------------
211 -- Alpha instructions
213 #if alpha_TARGET_ARCH
215 -- data Instr continues...
218 | LD Size Reg AddrMode -- size, dst, src
219 | LDA Reg AddrMode -- dst, src
220 | LDAH Reg AddrMode -- dst, src
221 | LDGP Reg AddrMode -- dst, src
222 | LDI Size Reg Imm -- size, dst, src
223 | ST Size Reg AddrMode -- size, src, dst
227 | ABS Size RI Reg -- size, src, dst
228 | NEG Size Bool RI Reg -- size, overflow, src, dst
229 | ADD Size Bool Reg RI Reg -- size, overflow, src, src, dst
230 | SADD Size Size Reg RI Reg -- size, scale, src, src, dst
231 | SUB Size Bool Reg RI Reg -- size, overflow, src, src, dst
232 | SSUB Size Size Reg RI Reg -- size, scale, src, src, dst
233 | MUL Size Bool Reg RI Reg -- size, overflow, src, src, dst
234 | DIV Size Bool Reg RI Reg -- size, unsigned, src, src, dst
235 | REM Size Bool Reg RI Reg -- size, unsigned, src, src, dst
237 -- Simple bit-twiddling.
255 | CMP Cond Reg RI Reg
261 | FADD Size Reg Reg Reg
262 | FDIV Size Reg Reg Reg
263 | FMUL Size Reg Reg Reg
264 | FSUB Size Reg Reg Reg
265 | CVTxy Size Size Reg Reg
266 | FCMP Size Cond Reg Reg Reg
273 | JMP Reg AddrMode Int
275 | JSR Reg AddrMode Int
277 -- Alpha-specific pseudo-ops.
285 #endif /* alpha_TARGET_ARCH */
288 -- -----------------------------------------------------------------------------
289 -- Intel x86 instructions
292 Intel, in their infinite wisdom, selected a stack model for floating
293 point registers on x86. That might have made sense back in 1979 --
294 nowadays we can see it for the nonsense it really is. A stack model
295 fits poorly with the existing nativeGen infrastructure, which assumes
296 flat integer and FP register sets. Prior to this commit, nativeGen
297 could not generate correct x86 FP code -- to do so would have meant
298 somehow working the register-stack paradigm into the register
299 allocator and spiller, which sounds very difficult.
301 We have decided to cheat, and go for a simple fix which requires no
302 infrastructure modifications, at the expense of generating ropey but
303 correct FP code. All notions of the x86 FP stack and its insns have
304 been removed. Instead, we pretend (to the instruction selector and
305 register allocator) that x86 has six floating point registers, %fake0
306 .. %fake5, which can be used in the usual flat manner. We further
307 claim that x86 has floating point instructions very similar to SPARC
308 and Alpha, that is, a simple 3-operand register-register arrangement.
309 Code generation and register allocation proceed on this basis.
311 When we come to print out the final assembly, our convenient fiction
312 is converted to dismal reality. Each fake instruction is
313 independently converted to a series of real x86 instructions.
314 %fake0 .. %fake5 are mapped to %st(0) .. %st(5). To do reg-reg
315 arithmetic operations, the two operands are pushed onto the top of the
316 FP stack, the operation done, and the result copied back into the
317 relevant register. There are only six %fake registers because 2 are
318 needed for the translation, and x86 has 8 in total.
320 The translation is inefficient but is simple and it works. A cleverer
321 translation would handle a sequence of insns, simulating the FP stack
322 contents, would not impose a fixed mapping from %fake to %st regs, and
323 hopefully could avoid most of the redundant reg-reg moves of the
326 We might as well make use of whatever unique FP facilities Intel have
327 chosen to bless us with (let's not be churlish, after all).
328 Hence GLDZ and GLD1. Bwahahahahahahaha!
332 MORE FLOATING POINT MUSINGS...
334 Intel's internal floating point registers are by default 80 bit
335 extended precision. This means that all operations done on values in
336 registers are done at 80 bits, and unless the intermediate values are
337 truncated to the appropriate size (32 or 64 bits) by storing in
338 memory, calculations in registers will give different results from
339 calculations which pass intermediate values in memory (eg. via
342 One solution is to set the FPU into 64 bit precision mode. Some OSs
343 do this (eg. FreeBSD) and some don't (eg. Linux). The problem here is
344 that this will only affect 64-bit precision arithmetic; 32-bit
345 calculations will still be done at 64-bit precision in registers. So
346 it doesn't solve the whole problem.
348 There's also the issue of what the C library is expecting in terms of
349 precision. It seems to be the case that glibc on Linux expects the
350 FPU to be set to 80 bit precision, so setting it to 64 bit could have
351 unexpected effects. Changing the default could have undesirable
352 effects on other 3rd-party library code too, so the right thing would
353 be to save/restore the FPU control word across Haskell code if we were
356 gcc's -ffloat-store gives consistent results by always storing the
357 results of floating-point calculations in memory, which works for both
358 32 and 64-bit precision. However, it only affects the values of
359 user-declared floating point variables in C, not intermediate results.
360 GHC in -fvia-C mode uses -ffloat-store (see the -fexcess-precision
363 Another problem is how to spill floating point registers in the
364 register allocator. Should we spill the whole 80 bits, or just 64?
365 On an OS which is set to 64 bit precision, spilling 64 is fine. On
366 Linux, spilling 64 bits will round the results of some operations.
367 This is what gcc does. Spilling at 80 bits requires taking up a full
368 128 bit slot (so we get alignment). We spill at 80-bits and ignore
369 the alignment problems.
371 In the future, we'll use the SSE registers for floating point. This
372 requires a CPU that supports SSE2 (ordinary SSE only supports 32 bit
373 precision float ops), which means P4 or Xeon and above. Using SSE
374 will solve all these problems, because the SSE registers use fixed 32
375 bit or 64 bit precision.
380 #if i386_TARGET_ARCH || x86_64_TARGET_ARCH
382 -- data Instr continues...
385 | MOV MachRep Operand Operand
386 | MOVZxL MachRep Operand Operand -- size is the size of operand 1
387 | MOVSxL MachRep Operand Operand -- size is the size of operand 1
388 -- x86_64 note: plain mov into a 32-bit register always zero-extends
389 -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which
390 -- don't affect the high bits of the register.
392 -- Load effective address (also a very useful three-operand add instruction :-)
393 | LEA MachRep Operand Operand
396 | ADD MachRep Operand Operand
397 | ADC MachRep Operand Operand
398 | SUB MachRep Operand Operand
400 | MUL MachRep Operand Operand
401 | IMUL MachRep Operand Operand -- signed int mul
403 -- operand1:operand2 := (operand1[31:0] *signed operand2[31:0])
405 | DIV MachRep Operand -- eax := eax:edx/op, edx := eax:edx%op
406 | IDIV MachRep Operand -- ditto, but signed
408 -- Simple bit-twiddling.
409 | AND MachRep Operand Operand
410 | OR MachRep Operand Operand
411 | XOR MachRep Operand Operand
412 | NOT MachRep Operand
413 | NEGI MachRep Operand -- NEG instruction (name clash with Cond)
415 -- Shifts (amount may be immediate or %cl only)
416 | SHL MachRep Operand{-amount-} Operand
417 | SAR MachRep Operand{-amount-} Operand
418 | SHR MachRep Operand{-amount-} Operand
420 | BT MachRep Imm Operand
426 -- Note that we cheat by treating G{ABS,MOV,NEG} of doubles
427 -- as single instructions right up until we spit them out.
428 -- all the 3-operand fake fp insns are src1 src2 dst
429 -- and furthermore are constrained to be fp regs only.
430 -- IMPORTANT: keep is_G_instr up to date with any changes here
431 | GMOV Reg Reg -- src(fpreg), dst(fpreg)
432 | GLD MachRep AddrMode Reg -- src, dst(fpreg)
433 | GST MachRep Reg AddrMode -- src(fpreg), dst
435 | GLDZ Reg -- dst(fpreg)
436 | GLD1 Reg -- dst(fpreg)
438 | GFTOI Reg Reg -- src(fpreg), dst(intreg)
439 | GDTOI Reg Reg -- src(fpreg), dst(intreg)
441 | GITOF Reg Reg -- src(intreg), dst(fpreg)
442 | GITOD Reg Reg -- src(intreg), dst(fpreg)
444 | GADD MachRep Reg Reg Reg -- src1, src2, dst
445 | GDIV MachRep Reg Reg Reg -- src1, src2, dst
446 | GSUB MachRep Reg Reg Reg -- src1, src2, dst
447 | GMUL MachRep Reg Reg Reg -- src1, src2, dst
449 -- FP compare. Cond must be `elem` [EQQ, NE, LE, LTT, GE, GTT]
450 -- Compare src1 with src2; set the Zero flag iff the numbers are
451 -- comparable and the comparison is True. Subsequent code must
452 -- test the %eflags zero flag regardless of the supplied Cond.
453 | GCMP Cond Reg Reg -- src1, src2
455 | GABS MachRep Reg Reg -- src, dst
456 | GNEG MachRep Reg Reg -- src, dst
457 | GSQRT MachRep Reg Reg -- src, dst
458 | GSIN MachRep Reg Reg -- src, dst
459 | GCOS MachRep Reg Reg -- src, dst
460 | GTAN MachRep Reg Reg -- src, dst
462 | GFREE -- do ffree on all x86 regs; an ugly hack
465 #if x86_64_TARGET_ARCH
466 -- SSE2 floating point: we use a restricted set of the available SSE2
467 -- instructions for floating-point.
469 -- use MOV for moving (either movss or movsd (movlpd better?))
471 | CVTSS2SD Reg Reg -- F32 to F64
472 | CVTSD2SS Reg Reg -- F64 to F32
473 | CVTSS2SI Operand Reg -- F32 to I32/I64 (with rounding)
474 | CVTSD2SI Operand Reg -- F64 to I32/I64 (with rounding)
475 | CVTSI2SS Operand Reg -- I32/I64 to F32
476 | CVTSI2SD Operand Reg -- I32/I64 to F64
478 -- use ADD & SUB for arithmetic. In both cases, operands
481 -- SSE2 floating-point division:
482 | FDIV MachRep Operand Operand -- divisor, dividend(dst)
484 -- use CMP for comparisons. ucomiss and ucomisd instructions
485 -- compare single/double prec floating point respectively.
487 | SQRT MachRep Operand Reg -- src, dst
491 | TEST MachRep Operand Operand
492 | CMP MachRep Operand Operand
496 | PUSH MachRep Operand
497 | POP MachRep Operand
498 -- both unused (SDM):
504 | JXX Cond BlockId -- includes unconditional branches
505 | JMP_TBL Operand [BlockId] -- table jump
506 | CALL (Either Imm Reg)
509 | CLTD MachRep -- sign extend %eax into %edx:%eax
511 | FETCHGOT Reg -- pseudo-insn for position-independent code
515 -- addl __GLOBAL_OFFSET_TABLE__+.-1b, %reg
518 = OpReg Reg -- register
519 | OpImm Imm -- immediate value
520 | OpAddr AddrMode -- memory reference
522 #endif /* i386 or x86_64 */
525 i386_insert_ffrees :: [Instr] -> [Instr]
526 i386_insert_ffrees insns
527 | any is_G_instr insns
528 = concatMap ffree_before_nonlocal_transfers insns
532 ffree_before_nonlocal_transfers insn
534 CALL _ -> [GFREE, insn]
535 JMP _ -> [GFREE, insn]
539 -- if you ever add a new FP insn to the fake x86 FP insn set,
540 -- you must update this too
541 is_G_instr :: Instr -> Bool
544 GMOV _ _ -> True; GLD _ _ _ -> True; GST _ _ _ -> True
545 GLDZ _ -> True; GLD1 _ -> True
546 GFTOI _ _ -> True; GDTOI _ _ -> True
547 GITOF _ _ -> True; GITOD _ _ -> True
548 GADD _ _ _ _ -> True; GDIV _ _ _ _ -> True
549 GSUB _ _ _ _ -> True; GMUL _ _ _ _ -> True
550 GCMP _ _ _ -> True; GABS _ _ _ -> True
551 GNEG _ _ _ -> True; GSQRT _ _ _ -> True
552 GSIN _ _ _ -> True; GCOS _ _ _ -> True; GTAN _ _ _ -> True
553 GFREE -> panic "is_G_instr: GFREE (!)"
555 #endif /* i386_TARGET_ARCH */
558 -- -----------------------------------------------------------------------------
559 -- Sparc instructions
561 #if sparc_TARGET_ARCH
563 -- data Instr continues...
566 | LD MachRep AddrMode Reg -- size, src, dst
567 | ST MachRep Reg AddrMode -- size, src, dst
570 | ADD Bool Bool Reg RI Reg -- x?, cc?, src1, src2, dst
571 | SUB Bool Bool Reg RI Reg -- x?, cc?, src1, src2, dst
572 | UMUL Bool Reg RI Reg -- cc?, src1, src2, dst
573 | SMUL Bool Reg RI Reg -- cc?, src1, src2, dst
574 | RDY Reg -- move contents of Y register to reg
576 -- Simple bit-twiddling.
577 | AND Bool Reg RI Reg -- cc?, src1, src2, dst
578 | ANDN Bool Reg RI Reg -- cc?, src1, src2, dst
579 | OR Bool Reg RI Reg -- cc?, src1, src2, dst
580 | ORN Bool Reg RI Reg -- cc?, src1, src2, dst
581 | XOR Bool Reg RI Reg -- cc?, src1, src2, dst
582 | XNOR Bool Reg RI Reg -- cc?, src1, src2, dst
583 | SLL Reg RI Reg -- src1, src2, dst
584 | SRL Reg RI Reg -- src1, src2, dst
585 | SRA Reg RI Reg -- src1, src2, dst
586 | SETHI Imm Reg -- src, dst
587 | NOP -- Really SETHI 0, %g0, but worth an alias
591 -- Note that we cheat by treating F{ABS,MOV,NEG} of doubles as single
592 -- instructions right up until we spit them out.
593 | FABS MachRep Reg Reg -- src dst
594 | FADD MachRep Reg Reg Reg -- src1, src2, dst
595 | FCMP Bool MachRep Reg Reg -- exception?, src1, src2
596 | FDIV MachRep Reg Reg Reg -- src1, src2, dst
597 | FMOV MachRep Reg Reg -- src, dst
598 | FMUL MachRep Reg Reg Reg -- src1, src2, dst
599 | FNEG MachRep Reg Reg -- src, dst
600 | FSQRT MachRep Reg Reg -- src, dst
601 | FSUB MachRep Reg Reg Reg -- src1, src2, dst
602 | FxTOy MachRep MachRep Reg Reg -- src, dst
605 | BI Cond Bool Imm -- cond, annul?, target
606 | BF Cond Bool Imm -- cond, annul?, target
608 | JMP DestInfo AddrMode -- target
609 | CALL (Either Imm Reg) Int Bool -- target, args, terminal
616 riZero (RIImm (ImmInt 0)) = True
617 riZero (RIImm (ImmInteger 0)) = True
618 riZero (RIReg (RealReg 0)) = True
621 -- Calculate the effective address which would be used by the
622 -- corresponding fpRel sequence. fpRel is in MachRegs.lhs,
623 -- alas -- can't have fpRelEA here because of module dependencies.
624 fpRelEA :: Int -> Reg -> Instr
626 = ADD False False fp (RIImm (ImmInt (n * BYTES_PER_WORD))) dst
628 -- Code to shift the stack pointer by n words.
629 moveSp :: Int -> Instr
631 = ADD False False sp (RIImm (ImmInt (n * BYTES_PER_WORD))) sp
633 -- Produce the second-half-of-a-double register given the first half.
635 fPair (RealReg n) | n >= 32 && n `mod` 2 == 0 = RealReg (n+1)
636 fPair other = pprPanic "fPair(sparc NCG)" (ppr other)
637 #endif /* sparc_TARGET_ARCH */
640 -- -----------------------------------------------------------------------------
641 -- PowerPC instructions
643 #ifdef powerpc_TARGET_ARCH
644 -- data Instr continues...
647 | LD MachRep Reg AddrMode -- Load size, dst, src
648 | LA MachRep Reg AddrMode -- Load arithmetic size, dst, src
649 | ST MachRep Reg AddrMode -- Store size, src, dst
650 | STU MachRep Reg AddrMode -- Store with Update size, src, dst
651 | LIS Reg Imm -- Load Immediate Shifted dst, src
652 | LI Reg Imm -- Load Immediate dst, src
653 | MR Reg Reg -- Move Register dst, src -- also for fmr
655 | CMP MachRep Reg RI --- size, src1, src2
656 | CMPL MachRep Reg RI --- size, src1, src2
659 | JMP CLabel -- same as branch,
660 -- but with CLabel instead of block ID
662 | BCTR [BlockId] -- with list of local destinations
663 | BL CLabel [Reg] -- with list of argument regs
666 | ADD Reg Reg RI -- dst, src1, src2
667 | ADDC Reg Reg Reg -- (carrying) dst, src1, src2
668 | ADDE Reg Reg Reg -- (extend) dst, src1, src2
669 | ADDIS Reg Reg Imm -- Add Immediate Shifted dst, src1, src2
670 | SUBF Reg Reg Reg -- dst, src1, src2 ; dst = src2 - src1
675 | MULLW_MayOflo Reg Reg Reg
676 -- dst = 1 if src1 * src2 overflows
677 -- pseudo-instruction; pretty-printed as:
678 -- mullwo. dst, src1, src2
680 -- rlwinm dst, dst, 2, 31,31
682 | AND Reg Reg RI -- dst, src1, src2
683 | OR Reg Reg RI -- dst, src1, src2
684 | XOR Reg Reg RI -- dst, src1, src2
685 | XORIS Reg Reg Imm -- XOR Immediate Shifted dst, src1, src2
687 | EXTS MachRep Reg Reg
692 | SLW Reg Reg RI -- shift left word
693 | SRW Reg Reg RI -- shift right word
694 | SRAW Reg Reg RI -- shift right arithmetic word
696 -- Rotate Left Word Immediate then AND with Mask
697 | RLWINM Reg Reg Int Int Int
699 | FADD MachRep Reg Reg Reg
700 | FSUB MachRep Reg Reg Reg
701 | FMUL MachRep Reg Reg Reg
702 | FDIV MachRep Reg Reg Reg
703 | FNEG Reg Reg -- negate is the same for single and double prec.
707 | FCTIWZ Reg Reg -- convert to integer word
708 | FRSP Reg Reg -- reduce to single precision
709 -- (but destination is a FP register)
711 | CRNOR Int Int Int -- condition register nor
712 | MFCR Reg -- move from condition register
714 | MFLR Reg -- move from link register
715 | FETCHPC Reg -- pseudo-instruction:
716 -- bcl to next insn, mflr reg
718 #endif /* powerpc_TARGET_ARCH */