1 -----------------------------------------------------------------------------
3 -- Machine-dependent assembly language
5 -- (c) The University of Glasgow 1993-2004
7 -----------------------------------------------------------------------------
9 #include "nativeGen/NCG.h"
12 -- * Cmm instantiations
13 NatCmm, NatCmmTop, NatBasicBlock,
15 -- * Machine instructions
17 Cond(..), condUnsigned, condToSigned, condToUnsigned,
19 #if !powerpc_TARGET_ARCH && !i386_TARGET_ARCH && !x86_64_TARGET_ARCH
20 Size(..), machRepSize,
24 #if i386_TARGET_ARCH || x86_64_TARGET_ARCH
31 riZero, fpRelEA, moveSp, fPair,
35 #include "HsVersions.h"
39 import MachOp ( MachRep(..) )
40 import CLabel ( CLabel, pprCLabel )
41 import Panic ( panic )
44 import Constants ( wORD_SIZE )
49 -- -----------------------------------------------------------------------------
50 -- Our flavours of the Cmm types
52 -- Type synonyms for Cmm populated with native code
53 type NatCmm = GenCmm CmmStatic Instr
54 type NatCmmTop = GenCmmTop CmmStatic Instr
55 type NatBasicBlock = GenBasicBlock Instr
57 -- -----------------------------------------------------------------------------
58 -- Conditions on this architecture
62 = ALWAYS -- For BI (same as BR)
63 | EQQ -- For CMP and BI (NB: "EQ" is a 1.3 Prelude name)
65 | GTT -- For BI only (NB: "GT" is a 1.3 Prelude name)
66 | LE -- For CMP and BI
67 | LTT -- For CMP and BI (NB: "LT" is a 1.3 Prelude name)
69 | NEVER -- For BI (null instruction)
73 #if i386_TARGET_ARCH || x86_64_TARGET_ARCH
74 = ALWAYS -- What's really used? ToDo
93 = ALWAYS -- What's really used? ToDo
110 #if powerpc_TARGET_ARCH
123 deriving Eq -- to make an assertion work
125 condUnsigned GU = True
126 condUnsigned LU = True
127 condUnsigned GEU = True
128 condUnsigned LEU = True
129 condUnsigned _ = False
131 condToSigned GU = GTT
132 condToSigned LU = LTT
133 condToSigned GEU = GE
134 condToSigned LEU = LE
137 condToUnsigned GTT = GU
138 condToUnsigned LTT = LU
139 condToUnsigned GE = GEU
140 condToUnsigned LE = LEU
143 -- -----------------------------------------------------------------------------
144 -- Sizes on this architecture
146 -- ToDo: it's not clear to me that we need separate signed-vs-unsigned sizes
147 -- here. I've removed them from the x86 version, we'll see what happens --SDM
149 #if !powerpc_TARGET_ARCH && !i386_TARGET_ARCH && !x86_64_TARGET_ARCH
151 #if alpha_TARGET_ARCH
154 -- | W -- word (2 bytes): UNUSED
156 | L -- longword (4 bytes)
157 | Q -- quadword (8 bytes)
158 -- | FF -- VAX F-style floating pt: UNUSED
159 -- | GF -- VAX G-style floating pt: UNUSED
160 -- | DF -- VAX D-style floating pt: UNUSED
161 -- | SF -- IEEE single-precision floating pt: UNUSED
162 | TF -- IEEE double-precision floating pt
164 #if sparc_TARGET_ARCH || powerpc_TARGET_ARCH
166 | Bu -- byte (unsigned)
167 | H -- halfword (signed, 2 bytes)
168 | Hu -- halfword (unsigned, 2 bytes)
169 | W -- word (4 bytes)
170 | F -- IEEE single-precision floating pt
171 | DF -- IEEE single-precision floating pt
175 machRepSize :: MachRep -> Size
176 machRepSize I8 = IF_ARCH_alpha(Bu, IF_ARCH_sparc(Bu, ))
177 machRepSize I16 = IF_ARCH_alpha(err,IF_ARCH_sparc(Hu, ))
178 machRepSize I32 = IF_ARCH_alpha(L, IF_ARCH_sparc(W, ))
179 machRepSize I64 = panic "machRepSize: I64"
180 machRepSize I128 = panic "machRepSize: I128"
181 machRepSize F32 = IF_ARCH_alpha(TF, IF_ARCH_sparc(F, ))
182 machRepSize F64 = IF_ARCH_alpha(TF, IF_ARCH_sparc(DF,))
185 -- -----------------------------------------------------------------------------
186 -- Register or immediate (a handy type on some platforms)
192 -- -----------------------------------------------------------------------------
193 -- Machine's assembly language
195 -- We have a few common "instructions" (nearly all the pseudo-ops) but
196 -- mostly all of 'Instr' is machine-specific.
199 = COMMENT FastString -- comment pseudo-op
201 | LDATA Section [CmmStatic] -- some static data spat out during code
202 -- generation. Will be extracted before
205 | NEWBLOCK BlockId -- start a new basic block. Useful during
206 -- codegen, removed later. Preceding
207 -- instruction should be a jump, as per the
208 -- invariants for a BasicBlock (see Cmm).
210 | DELTA Int -- specify current stack offset for
211 -- benefit of subsequent passes
213 -- -----------------------------------------------------------------------------
214 -- Alpha instructions
216 #if alpha_TARGET_ARCH
218 -- data Instr continues...
221 | LD Size Reg AddrMode -- size, dst, src
222 | LDA Reg AddrMode -- dst, src
223 | LDAH Reg AddrMode -- dst, src
224 | LDGP Reg AddrMode -- dst, src
225 | LDI Size Reg Imm -- size, dst, src
226 | ST Size Reg AddrMode -- size, src, dst
230 | ABS Size RI Reg -- size, src, dst
231 | NEG Size Bool RI Reg -- size, overflow, src, dst
232 | ADD Size Bool Reg RI Reg -- size, overflow, src, src, dst
233 | SADD Size Size Reg RI Reg -- size, scale, src, src, dst
234 | SUB Size Bool Reg RI Reg -- size, overflow, src, src, dst
235 | SSUB Size Size Reg RI Reg -- size, scale, src, src, dst
236 | MUL Size Bool Reg RI Reg -- size, overflow, src, src, dst
237 | DIV Size Bool Reg RI Reg -- size, unsigned, src, src, dst
238 | REM Size Bool Reg RI Reg -- size, unsigned, src, src, dst
240 -- Simple bit-twiddling.
258 | CMP Cond Reg RI Reg
264 | FADD Size Reg Reg Reg
265 | FDIV Size Reg Reg Reg
266 | FMUL Size Reg Reg Reg
267 | FSUB Size Reg Reg Reg
268 | CVTxy Size Size Reg Reg
269 | FCMP Size Cond Reg Reg Reg
276 | JMP Reg AddrMode Int
278 | JSR Reg AddrMode Int
280 -- Alpha-specific pseudo-ops.
288 #endif /* alpha_TARGET_ARCH */
291 -- -----------------------------------------------------------------------------
292 -- Intel x86 instructions
295 Intel, in their infinite wisdom, selected a stack model for floating
296 point registers on x86. That might have made sense back in 1979 --
297 nowadays we can see it for the nonsense it really is. A stack model
298 fits poorly with the existing nativeGen infrastructure, which assumes
299 flat integer and FP register sets. Prior to this commit, nativeGen
300 could not generate correct x86 FP code -- to do so would have meant
301 somehow working the register-stack paradigm into the register
302 allocator and spiller, which sounds very difficult.
304 We have decided to cheat, and go for a simple fix which requires no
305 infrastructure modifications, at the expense of generating ropey but
306 correct FP code. All notions of the x86 FP stack and its insns have
307 been removed. Instead, we pretend (to the instruction selector and
308 register allocator) that x86 has six floating point registers, %fake0
309 .. %fake5, which can be used in the usual flat manner. We further
310 claim that x86 has floating point instructions very similar to SPARC
311 and Alpha, that is, a simple 3-operand register-register arrangement.
312 Code generation and register allocation proceed on this basis.
314 When we come to print out the final assembly, our convenient fiction
315 is converted to dismal reality. Each fake instruction is
316 independently converted to a series of real x86 instructions.
317 %fake0 .. %fake5 are mapped to %st(0) .. %st(5). To do reg-reg
318 arithmetic operations, the two operands are pushed onto the top of the
319 FP stack, the operation done, and the result copied back into the
320 relevant register. There are only six %fake registers because 2 are
321 needed for the translation, and x86 has 8 in total.
323 The translation is inefficient but is simple and it works. A cleverer
324 translation would handle a sequence of insns, simulating the FP stack
325 contents, would not impose a fixed mapping from %fake to %st regs, and
326 hopefully could avoid most of the redundant reg-reg moves of the
329 We might as well make use of whatever unique FP facilities Intel have
330 chosen to bless us with (let's not be churlish, after all).
331 Hence GLDZ and GLD1. Bwahahahahahahaha!
335 MORE FLOATING POINT MUSINGS...
337 Intel's internal floating point registers are by default 80 bit
338 extended precision. This means that all operations done on values in
339 registers are done at 80 bits, and unless the intermediate values are
340 truncated to the appropriate size (32 or 64 bits) by storing in
341 memory, calculations in registers will give different results from
342 calculations which pass intermediate values in memory (eg. via
345 One solution is to set the FPU into 64 bit precision mode. Some OSs
346 do this (eg. FreeBSD) and some don't (eg. Linux). The problem here is
347 that this will only affect 64-bit precision arithmetic; 32-bit
348 calculations will still be done at 64-bit precision in registers. So
349 it doesn't solve the whole problem.
351 There's also the issue of what the C library is expecting in terms of
352 precision. It seems to be the case that glibc on Linux expects the
353 FPU to be set to 80 bit precision, so setting it to 64 bit could have
354 unexpected effects. Changing the default could have undesirable
355 effects on other 3rd-party library code too, so the right thing would
356 be to save/restore the FPU control word across Haskell code if we were
359 gcc's -ffloat-store gives consistent results by always storing the
360 results of floating-point calculations in memory, which works for both
361 32 and 64-bit precision. However, it only affects the values of
362 user-declared floating point variables in C, not intermediate results.
363 GHC in -fvia-C mode uses -ffloat-store (see the -fexcess-precision
366 Another problem is how to spill floating point registers in the
367 register allocator. Should we spill the whole 80 bits, or just 64?
368 On an OS which is set to 64 bit precision, spilling 64 is fine. On
369 Linux, spilling 64 bits will round the results of some operations.
370 This is what gcc does. Spilling at 80 bits requires taking up a full
371 128 bit slot (so we get alignment). We spill at 80-bits and ignore
372 the alignment problems.
374 In the future, we'll use the SSE registers for floating point. This
375 requires a CPU that supports SSE2 (ordinary SSE only supports 32 bit
376 precision float ops), which means P4 or Xeon and above. Using SSE
377 will solve all these problems, because the SSE registers use fixed 32
378 bit or 64 bit precision.
383 #if i386_TARGET_ARCH || x86_64_TARGET_ARCH
385 -- data Instr continues...
388 | MOV MachRep Operand Operand
389 | MOVZxL MachRep Operand Operand -- size is the size of operand 1
390 | MOVSxL MachRep Operand Operand -- size is the size of operand 1
391 -- x86_64 note: plain mov into a 32-bit register always zero-extends
392 -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which
393 -- don't affect the high bits of the register.
395 -- Load effective address (also a very useful three-operand add instruction :-)
396 | LEA MachRep Operand Operand
399 | ADD MachRep Operand Operand
400 | ADC MachRep Operand Operand
401 | SUB MachRep Operand Operand
403 | MUL MachRep Operand Operand
404 | IMUL MachRep Operand Operand -- signed int mul
405 | IMUL2 MachRep Operand -- %edx:%eax = operand * %eax
407 | DIV MachRep Operand -- eax := eax:edx/op, edx := eax:edx%op
408 | IDIV MachRep Operand -- ditto, but signed
410 -- Simple bit-twiddling.
411 | AND MachRep Operand Operand
412 | OR MachRep Operand Operand
413 | XOR MachRep Operand Operand
414 | NOT MachRep Operand
415 | NEGI MachRep Operand -- NEG instruction (name clash with Cond)
417 -- Shifts (amount may be immediate or %cl only)
418 | SHL MachRep Operand{-amount-} Operand
419 | SAR MachRep Operand{-amount-} Operand
420 | SHR MachRep Operand{-amount-} Operand
422 | BT MachRep Imm Operand
428 -- Note that we cheat by treating G{ABS,MOV,NEG} of doubles
429 -- as single instructions right up until we spit them out.
430 -- all the 3-operand fake fp insns are src1 src2 dst
431 -- and furthermore are constrained to be fp regs only.
432 -- IMPORTANT: keep is_G_insn up to date with any changes here
433 | GMOV Reg Reg -- src(fpreg), dst(fpreg)
434 | GLD MachRep AddrMode Reg -- src, dst(fpreg)
435 | GST MachRep Reg AddrMode -- src(fpreg), dst
437 | GLDZ Reg -- dst(fpreg)
438 | GLD1 Reg -- dst(fpreg)
440 | GFTOI Reg Reg -- src(fpreg), dst(intreg)
441 | GDTOI Reg Reg -- src(fpreg), dst(intreg)
443 | GITOF Reg Reg -- src(intreg), dst(fpreg)
444 | GITOD Reg Reg -- src(intreg), dst(fpreg)
446 | GADD MachRep Reg Reg Reg -- src1, src2, dst
447 | GDIV MachRep Reg Reg Reg -- src1, src2, dst
448 | GSUB MachRep Reg Reg Reg -- src1, src2, dst
449 | GMUL MachRep Reg Reg Reg -- src1, src2, dst
451 -- FP compare. Cond must be `elem` [EQQ, NE, LE, LTT, GE, GTT]
452 -- Compare src1 with src2; set the Zero flag iff the numbers are
453 -- comparable and the comparison is True. Subsequent code must
454 -- test the %eflags zero flag regardless of the supplied Cond.
455 | GCMP Cond Reg Reg -- src1, src2
457 | GABS MachRep Reg Reg -- src, dst
458 | GNEG MachRep Reg Reg -- src, dst
459 | GSQRT MachRep Reg Reg -- src, dst
460 | GSIN MachRep Reg Reg -- src, dst
461 | GCOS MachRep Reg Reg -- src, dst
462 | GTAN MachRep Reg Reg -- src, dst
464 | GFREE -- do ffree on all x86 regs; an ugly hack
467 #if x86_64_TARGET_ARCH
468 -- SSE2 floating point: we use a restricted set of the available SSE2
469 -- instructions for floating-point.
471 -- use MOV for moving (either movss or movsd (movlpd better?))
473 | CVTSS2SD Reg Reg -- F32 to F64
474 | CVTSD2SS Reg Reg -- F64 to F32
475 | CVTSS2SI Operand Reg -- F32 to I32/I64 (with rounding)
476 | CVTSD2SI Operand Reg -- F64 to I32/I64 (with rounding)
477 | CVTSI2SS Operand Reg -- I32/I64 to F32
478 | CVTSI2SD Operand Reg -- I32/I64 to F64
480 -- use ADD & SUB for arithmetic. In both cases, operands
483 -- SSE2 floating-point division:
484 | FDIV MachRep Operand Operand -- divisor, dividend(dst)
486 -- use CMP for comparisons. ucomiss and ucomisd instructions
487 -- compare single/double prec floating point respectively.
489 | SQRT MachRep Operand Reg -- src, dst
493 | TEST MachRep Operand Operand
494 | CMP MachRep Operand Operand
498 | PUSH MachRep Operand
499 | POP MachRep Operand
500 -- both unused (SDM):
506 | JXX Cond BlockId -- includes unconditional branches
507 | JMP_TBL Operand [BlockId] -- table jump
508 | CALL (Either Imm Reg) [Reg]
511 | CLTD MachRep -- sign extend %eax into %edx:%eax
513 | FETCHGOT Reg -- pseudo-insn for ELF position-independent code
517 -- addl __GLOBAL_OFFSET_TABLE__+.-1b, %reg
518 | FETCHPC Reg -- pseudo-insn for Darwin position-independent code
525 = OpReg Reg -- register
526 | OpImm Imm -- immediate value
527 | OpAddr AddrMode -- memory reference
529 #endif /* i386 or x86_64 */
532 i386_insert_ffrees :: [Instr] -> [Instr]
533 i386_insert_ffrees insns
534 | any is_G_instr insns
535 = concatMap ffree_before_nonlocal_transfers insns
539 ffree_before_nonlocal_transfers insn
541 CALL _ _ -> [GFREE, insn]
542 JMP _ -> [GFREE, insn]
546 -- if you ever add a new FP insn to the fake x86 FP insn set,
547 -- you must update this too
548 is_G_instr :: Instr -> Bool
551 GMOV _ _ -> True; GLD _ _ _ -> True; GST _ _ _ -> True
552 GLDZ _ -> True; GLD1 _ -> True
553 GFTOI _ _ -> True; GDTOI _ _ -> True
554 GITOF _ _ -> True; GITOD _ _ -> True
555 GADD _ _ _ _ -> True; GDIV _ _ _ _ -> True
556 GSUB _ _ _ _ -> True; GMUL _ _ _ _ -> True
557 GCMP _ _ _ -> True; GABS _ _ _ -> True
558 GNEG _ _ _ -> True; GSQRT _ _ _ -> True
559 GSIN _ _ _ -> True; GCOS _ _ _ -> True; GTAN _ _ _ -> True
560 GFREE -> panic "is_G_instr: GFREE (!)"
562 #endif /* i386_TARGET_ARCH */
565 -- -----------------------------------------------------------------------------
566 -- Sparc instructions
568 #if sparc_TARGET_ARCH
570 -- data Instr continues...
573 | LD MachRep AddrMode Reg -- size, src, dst
574 | ST MachRep Reg AddrMode -- size, src, dst
577 | ADD Bool Bool Reg RI Reg -- x?, cc?, src1, src2, dst
578 | SUB Bool Bool Reg RI Reg -- x?, cc?, src1, src2, dst
579 | UMUL Bool Reg RI Reg -- cc?, src1, src2, dst
580 | SMUL Bool Reg RI Reg -- cc?, src1, src2, dst
581 | RDY Reg -- move contents of Y register to reg
583 -- Simple bit-twiddling.
584 | AND Bool Reg RI Reg -- cc?, src1, src2, dst
585 | ANDN Bool Reg RI Reg -- cc?, src1, src2, dst
586 | OR Bool Reg RI Reg -- cc?, src1, src2, dst
587 | ORN Bool Reg RI Reg -- cc?, src1, src2, dst
588 | XOR Bool Reg RI Reg -- cc?, src1, src2, dst
589 | XNOR Bool Reg RI Reg -- cc?, src1, src2, dst
590 | SLL Reg RI Reg -- src1, src2, dst
591 | SRL Reg RI Reg -- src1, src2, dst
592 | SRA Reg RI Reg -- src1, src2, dst
593 | SETHI Imm Reg -- src, dst
594 | NOP -- Really SETHI 0, %g0, but worth an alias
598 -- Note that we cheat by treating F{ABS,MOV,NEG} of doubles as single
599 -- instructions right up until we spit them out.
600 | FABS MachRep Reg Reg -- src dst
601 | FADD MachRep Reg Reg Reg -- src1, src2, dst
602 | FCMP Bool MachRep Reg Reg -- exception?, src1, src2, dst
603 | FDIV MachRep Reg Reg Reg -- src1, src2, dst
604 | FMOV MachRep Reg Reg -- src, dst
605 | FMUL MachRep Reg Reg Reg -- src1, src2, dst
606 | FNEG MachRep Reg Reg -- src, dst
607 | FSQRT MachRep Reg Reg -- src, dst
608 | FSUB MachRep Reg Reg Reg -- src1, src2, dst
609 | FxTOy MachRep MachRep Reg Reg -- src, dst
612 | BI Cond Bool Imm -- cond, annul?, target
613 | BF Cond Bool Imm -- cond, annul?, target
615 | JMP AddrMode -- target
616 | CALL (Either Imm Reg) Int Bool -- target, args, terminal
620 riZero (RIImm (ImmInt 0)) = True
621 riZero (RIImm (ImmInteger 0)) = True
622 riZero (RIReg (RealReg 0)) = True
625 -- Calculate the effective address which would be used by the
626 -- corresponding fpRel sequence. fpRel is in MachRegs.lhs,
627 -- alas -- can't have fpRelEA here because of module dependencies.
628 fpRelEA :: Int -> Reg -> Instr
630 = ADD False False fp (RIImm (ImmInt (n * wORD_SIZE))) dst
632 -- Code to shift the stack pointer by n words.
633 moveSp :: Int -> Instr
635 = ADD False False sp (RIImm (ImmInt (n * wORD_SIZE))) sp
637 -- Produce the second-half-of-a-double register given the first half.
639 fPair (RealReg n) | n >= 32 && n `mod` 2 == 0 = RealReg (n+1)
640 fPair other = pprPanic "fPair(sparc NCG)" (ppr other)
641 #endif /* sparc_TARGET_ARCH */
644 -- -----------------------------------------------------------------------------
645 -- PowerPC instructions
647 #ifdef powerpc_TARGET_ARCH
648 -- data Instr continues...
651 | LD MachRep Reg AddrMode -- Load size, dst, src
652 | LA MachRep Reg AddrMode -- Load arithmetic size, dst, src
653 | ST MachRep Reg AddrMode -- Store size, src, dst
654 | STU MachRep Reg AddrMode -- Store with Update size, src, dst
655 | LIS Reg Imm -- Load Immediate Shifted dst, src
656 | LI Reg Imm -- Load Immediate dst, src
657 | MR Reg Reg -- Move Register dst, src -- also for fmr
659 | CMP MachRep Reg RI --- size, src1, src2
660 | CMPL MachRep Reg RI --- size, src1, src2
663 | JMP CLabel -- same as branch,
664 -- but with CLabel instead of block ID
666 | BCTR [BlockId] -- with list of local destinations
667 | BL CLabel [Reg] -- with list of argument regs
670 | ADD Reg Reg RI -- dst, src1, src2
671 | ADDC Reg Reg Reg -- (carrying) dst, src1, src2
672 | ADDE Reg Reg Reg -- (extend) dst, src1, src2
673 | ADDIS Reg Reg Imm -- Add Immediate Shifted dst, src1, src2
674 | SUBF Reg Reg Reg -- dst, src1, src2 ; dst = src2 - src1
679 | MULLW_MayOflo Reg Reg Reg
680 -- dst = 1 if src1 * src2 overflows
681 -- pseudo-instruction; pretty-printed as:
682 -- mullwo. dst, src1, src2
684 -- rlwinm dst, dst, 2, 31,31
686 | AND Reg Reg RI -- dst, src1, src2
687 | OR Reg Reg RI -- dst, src1, src2
688 | XOR Reg Reg RI -- dst, src1, src2
689 | XORIS Reg Reg Imm -- XOR Immediate Shifted dst, src1, src2
691 | EXTS MachRep Reg Reg
696 | SLW Reg Reg RI -- shift left word
697 | SRW Reg Reg RI -- shift right word
698 | SRAW Reg Reg RI -- shift right arithmetic word
700 -- Rotate Left Word Immediate then AND with Mask
701 | RLWINM Reg Reg Int Int Int
703 | FADD MachRep Reg Reg Reg
704 | FSUB MachRep Reg Reg Reg
705 | FMUL MachRep Reg Reg Reg
706 | FDIV MachRep Reg Reg Reg
707 | FNEG Reg Reg -- negate is the same for single and double prec.
711 | FCTIWZ Reg Reg -- convert to integer word
712 | FRSP Reg Reg -- reduce to single precision
713 -- (but destination is a FP register)
715 | CRNOR Int Int Int -- condition register nor
716 | MFCR Reg -- move from condition register
718 | MFLR Reg -- move from link register
719 | FETCHPC Reg -- pseudo-instruction:
720 -- bcl to next insn, mflr reg
722 #endif /* powerpc_TARGET_ARCH */