2 -- The above warning suppression flag is a temporary kludge.
3 -- While working on this module you are encouraged to remove it and fix
4 -- any warnings in the module. See
5 -- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
8 -----------------------------------------------------------------------------
10 -- Machine-dependent assembly language
12 -- (c) The University of Glasgow 1993-2004
14 -----------------------------------------------------------------------------
16 #include "nativeGen/NCG.h"
19 -- * Cmm instantiations
20 NatCmm, NatCmmTop, NatBasicBlock,
22 -- * Machine instructions
24 Cond(..), condUnsigned, condToSigned, condToUnsigned,
25 #if powerpc_TARGET_ARCH
30 #if i386_TARGET_ARCH || x86_64_TARGET_ARCH
37 riZero, fpRelEA, moveSp, fPair,
41 #include "HsVersions.h"
46 import CLabel ( CLabel, pprCLabel )
47 import Panic ( panic )
50 import Constants ( wORD_SIZE )
55 -- -----------------------------------------------------------------------------
56 -- Our flavours of the Cmm types
58 -- Type synonyms for Cmm populated with native code
-- NatCmm: 'GenCmm' applied to CmmStatic, [CmmStatic] and a ListGraph of
-- native 'Instr's.
59 type NatCmm = GenCmm CmmStatic [CmmStatic] (ListGraph Instr)
-- NatCmmTop: 'GenCmmTop' applied to the same three type arguments.
60 type NatCmmTop = GenCmmTop CmmStatic [CmmStatic] (ListGraph Instr)
-- NatBasicBlock: 'GenBasicBlock' specialised to the native 'Instr' type.
61 type NatBasicBlock = GenBasicBlock Instr
63 -- -----------------------------------------------------------------------------
64 -- Conditions on this architecture
68 = ALWAYS -- For BI (same as BR)
69 | EQQ -- For CMP and BI (NB: "EQ" is a 1.3 Prelude name)
71 | GTT -- For BI only (NB: "GT" is a 1.3 Prelude name)
72 | LE -- For CMP and BI
73 | LTT -- For CMP and BI (NB: "LT" is a 1.3 Prelude name)
75 | NEVER -- For BI (null instruction)
79 #if i386_TARGET_ARCH || x86_64_TARGET_ARCH
80 = ALWAYS -- What's really used? ToDo
99 = ALWAYS -- What's really used? ToDo
116 #if powerpc_TARGET_ARCH
129 deriving Eq -- to make an assertion work
-- condUnsigned: True exactly for the conditions GU, LU, GEU and LEU;
-- every other condition yields False.
131 condUnsigned GU = True
132 condUnsigned LU = True
133 condUnsigned GEU = True
134 condUnsigned LEU = True
135 condUnsigned _ = False
-- condToSigned: maps GU, LU, GEU and LEU to GTT, LTT, GE and LE respectively.
-- NOTE(review): the clause covering all remaining conditions is not visible
-- in this excerpt -- confirm how they are handled.
137 condToSigned GU = GTT
138 condToSigned LU = LTT
139 condToSigned GEU = GE
140 condToSigned LEU = LE
-- condToUnsigned: maps GTT, LTT, GE and LE to GU, LU, GEU and LEU
-- respectively (the reverse of the condToSigned table).
-- NOTE(review): the clause covering all remaining conditions is not visible
-- in this excerpt -- confirm how they are handled.
143 condToUnsigned GTT = GU
144 condToUnsigned LTT = LU
145 condToUnsigned GE = GEU
146 condToUnsigned LE = LEU
149 #if powerpc_TARGET_ARCH
150 condNegate ALWAYS = panic "condNegate: ALWAYS"
163 -- -----------------------------------------------------------------------------
164 -- Register or immediate (a handy type on some platforms)
170 -- -----------------------------------------------------------------------------
171 -- Machine's assembly language
173 -- We have a few common "instructions" (nearly all the pseudo-ops) but
174 -- mostly all of 'Instr' is machine-specific.
177 = COMMENT FastString -- comment pseudo-op
179 | LDATA Section [CmmStatic] -- some static data spat out during code
180 -- generation. Will be extracted before
183 | NEWBLOCK BlockId -- start a new basic block. Useful during
184 -- codegen, removed later. Preceding
185 -- instruction should be a jump, as per the
186 -- invariants for a BasicBlock (see Cmm).
188 | DELTA Int -- specify current stack offset for
189 -- benefit of subsequent passes
191 | SPILL Reg Int -- ^ spill this reg to a stack slot
192 | RELOAD Int Reg -- ^ reload this reg from a stack slot
194 -- -----------------------------------------------------------------------------
195 -- Alpha instructions
197 #if alpha_TARGET_ARCH
199 -- data Instr continues...
202 | LD Size Reg AddrMode -- size, dst, src
203 | LDA Reg AddrMode -- dst, src
204 | LDAH Reg AddrMode -- dst, src
205 | LDGP Reg AddrMode -- dst, src
206 | LDI Size Reg Imm -- size, dst, src
207 | ST Size Reg AddrMode -- size, src, dst
211 | ABS Size RI Reg -- size, src, dst
212 | NEG Size Bool RI Reg -- size, overflow, src, dst
213 | ADD Size Bool Reg RI Reg -- size, overflow, src, src, dst
214 | SADD Size Size Reg RI Reg -- size, scale, src, src, dst
215 | SUB Size Bool Reg RI Reg -- size, overflow, src, src, dst
216 | SSUB Size Size Reg RI Reg -- size, scale, src, src, dst
217 | MUL Size Bool Reg RI Reg -- size, overflow, src, src, dst
218 | DIV Size Bool Reg RI Reg -- size, unsigned, src, src, dst
219 | REM Size Bool Reg RI Reg -- size, unsigned, src, src, dst
221 -- Simple bit-twiddling.
239 | CMP Cond Reg RI Reg
245 | FADD Size Reg Reg Reg
246 | FDIV Size Reg Reg Reg
247 | FMUL Size Reg Reg Reg
248 | FSUB Size Reg Reg Reg
249 | CVTxy Size Size Reg Reg
250 | FCMP Size Cond Reg Reg Reg
257 | JMP Reg AddrMode Int
259 | JSR Reg AddrMode Int
261 -- Alpha-specific pseudo-ops.
269 #endif /* alpha_TARGET_ARCH */
272 -- -----------------------------------------------------------------------------
273 -- Intel x86 instructions
276 Intel, in their infinite wisdom, selected a stack model for floating
277 point registers on x86. That might have made sense back in 1979 --
278 nowadays we can see it for the nonsense it really is. A stack model
279 fits poorly with the existing nativeGen infrastructure, which assumes
280 flat integer and FP register sets. Prior to this commit, nativeGen
281 could not generate correct x86 FP code -- to do so would have meant
282 somehow working the register-stack paradigm into the register
283 allocator and spiller, which sounds very difficult.
285 We have decided to cheat, and go for a simple fix which requires no
286 infrastructure modifications, at the expense of generating ropey but
287 correct FP code. All notions of the x86 FP stack and its insns have
288 been removed. Instead, we pretend (to the instruction selector and
289 register allocator) that x86 has six floating point registers, %fake0
290 .. %fake5, which can be used in the usual flat manner. We further
291 claim that x86 has floating point instructions very similar to SPARC
292 and Alpha, that is, a simple 3-operand register-register arrangement.
293 Code generation and register allocation proceed on this basis.
295 When we come to print out the final assembly, our convenient fiction
296 is converted to dismal reality. Each fake instruction is
297 independently converted to a series of real x86 instructions.
298 %fake0 .. %fake5 are mapped to %st(0) .. %st(5). To do reg-reg
299 arithmetic operations, the two operands are pushed onto the top of the
300 FP stack, the operation done, and the result copied back into the
301 relevant register. There are only six %fake registers because 2 are
302 needed for the translation, and x86 has 8 in total.
304 The translation is inefficient but is simple and it works. A cleverer
305 translation would handle a sequence of insns, simulating the FP stack
306 contents, would not impose a fixed mapping from %fake to %st regs, and
307 hopefully could avoid most of the redundant reg-reg moves of the
310 We might as well make use of whatever unique FP facilities Intel have
311 chosen to bless us with (let's not be churlish, after all).
312 Hence GLDZ and GLD1. Bwahahahahahahaha!
316 MORE FLOATING POINT MUSINGS...
318 Intel's internal floating point registers are by default 80 bit
319 extended precision. This means that all operations done on values in
320 registers are done at 80 bits, and unless the intermediate values are
321 truncated to the appropriate size (32 or 64 bits) by storing in
322 memory, calculations in registers will give different results from
323 calculations which pass intermediate values in memory (eg. via
326 One solution is to set the FPU into 64 bit precision mode. Some OSs
327 do this (eg. FreeBSD) and some don't (eg. Linux). The problem here is
328 that this will only affect 64-bit precision arithmetic; 32-bit
329 calculations will still be done at 64-bit precision in registers. So
330 it doesn't solve the whole problem.
332 There's also the issue of what the C library is expecting in terms of
333 precision. It seems to be the case that glibc on Linux expects the
334 FPU to be set to 80 bit precision, so setting it to 64 bit could have
335 unexpected effects. Changing the default could have undesirable
336 effects on other 3rd-party library code too, so the right thing would
337 be to save/restore the FPU control word across Haskell code if we were
340 gcc's -ffloat-store gives consistent results by always storing the
341 results of floating-point calculations in memory, which works for both
342 32 and 64-bit precision. However, it only affects the values of
343 user-declared floating point variables in C, not intermediate results.
344 GHC in -fvia-C mode uses -ffloat-store (see the -fexcess-precision
347 Another problem is how to spill floating point registers in the
348 register allocator. Should we spill the whole 80 bits, or just 64?
349 On an OS which is set to 64 bit precision, spilling 64 is fine. On
350 Linux, spilling 64 bits will round the results of some operations.
351 This is what gcc does. Spilling at 80 bits requires taking up a full
352 128 bit slot (so we get alignment). We spill at 80-bits and ignore
353 the alignment problems.
355 In the future, we'll use the SSE registers for floating point. This
356 requires a CPU that supports SSE2 (ordinary SSE only supports 32 bit
357 precision float ops), which means P4 or Xeon and above. Using SSE
358 will solve all these problems, because the SSE registers use fixed 32
359 bit or 64 bit precision.
364 #if i386_TARGET_ARCH || x86_64_TARGET_ARCH
366 -- data Instr continues...
369 | MOV Size Operand Operand
370 | MOVZxL Size Operand Operand -- size is the size of operand 1
371 | MOVSxL Size Operand Operand -- size is the size of operand 1
372 -- x86_64 note: plain mov into a 32-bit register always zero-extends
373 -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which
374 -- don't affect the high bits of the register.
376 -- Load effective address (also a very useful three-operand add instruction :-)
377 | LEA Size Operand Operand
380 | ADD Size Operand Operand
381 | ADC Size Operand Operand
382 | SUB Size Operand Operand
384 | MUL Size Operand Operand
385 | IMUL Size Operand Operand -- signed int mul
386 | IMUL2 Size Operand -- %edx:%eax = operand * %eax
388 | DIV Size Operand -- eax := edx:eax/op, edx := edx:eax%op (unsigned)
389 | IDIV Size Operand -- ditto, but signed
391 -- Simple bit-twiddling.
392 | AND Size Operand Operand
393 | OR Size Operand Operand
394 | XOR Size Operand Operand
396 | NEGI Size Operand -- NEG instruction (name clash with Cond)
398 -- Shifts (amount may be immediate or %cl only)
399 | SHL Size Operand{-amount-} Operand
400 | SAR Size Operand{-amount-} Operand
401 | SHR Size Operand{-amount-} Operand
403 | BT Size Imm Operand
409 -- Note that we cheat by treating G{ABS,MOV,NEG} of doubles
410 -- as single instructions right up until we spit them out.
411 -- all the 3-operand fake fp insns are src1 src2 dst
412 -- and furthermore are constrained to be fp regs only.
413 -- IMPORTANT: keep is_G_instr up to date with any changes here
414 | GMOV Reg Reg -- src(fpreg), dst(fpreg)
415 | GLD Size AddrMode Reg -- src, dst(fpreg)
416 | GST Size Reg AddrMode -- src(fpreg), dst
418 | GLDZ Reg -- dst(fpreg)
419 | GLD1 Reg -- dst(fpreg)
421 | GFTOI Reg Reg -- src(fpreg), dst(intreg)
422 | GDTOI Reg Reg -- src(fpreg), dst(intreg)
424 | GITOF Reg Reg -- src(intreg), dst(fpreg)
425 | GITOD Reg Reg -- src(intreg), dst(fpreg)
427 | GADD Size Reg Reg Reg -- src1, src2, dst
428 | GDIV Size Reg Reg Reg -- src1, src2, dst
429 | GSUB Size Reg Reg Reg -- src1, src2, dst
430 | GMUL Size Reg Reg Reg -- src1, src2, dst
432 -- FP compare. Cond must be `elem` [EQQ, NE, LE, LTT, GE, GTT]
433 -- Compare src1 with src2; set the Zero flag iff the numbers are
434 -- comparable and the comparison is True. Subsequent code must
435 -- test the %eflags zero flag regardless of the supplied Cond.
436 | GCMP Cond Reg Reg -- src1, src2
438 | GABS Size Reg Reg -- src, dst
439 | GNEG Size Reg Reg -- src, dst
440 | GSQRT Size Reg Reg -- src, dst
441 | GSIN Size CLabel CLabel Reg Reg -- src, dst
442 | GCOS Size CLabel CLabel Reg Reg -- src, dst
443 | GTAN Size CLabel CLabel Reg Reg -- src, dst
445 | GFREE -- do ffree on all x86 regs; an ugly hack
448 #if x86_64_TARGET_ARCH
449 -- SSE2 floating point: we use a restricted set of the available SSE2
450 -- instructions for floating-point.
452 -- use MOV for moving (either movss or movsd (movlpd better?))
454 | CVTSS2SD Reg Reg -- F32 to F64
455 | CVTSD2SS Reg Reg -- F64 to F32
456 | CVTTSS2SIQ Operand Reg -- F32 to I32/I64 (with truncation)
457 | CVTTSD2SIQ Operand Reg -- F64 to I32/I64 (with truncation)
458 | CVTSI2SS Operand Reg -- I32/I64 to F32
459 | CVTSI2SD Operand Reg -- I32/I64 to F64
461 -- use ADD & SUB for arithmetic. In both cases, operands
464 -- SSE2 floating-point division:
465 | FDIV Size Operand Operand -- divisor, dividend(dst)
467 -- use CMP for comparisons. ucomiss and ucomisd instructions
468 -- compare single/double prec floating point respectively.
470 | SQRT Size Operand Reg -- src, dst
474 | TEST Size Operand Operand
475 | CMP Size Operand Operand
481 -- both unused (SDM):
487 | JXX Cond BlockId -- includes unconditional branches
488 | JXX_GBL Cond Imm -- non-local version of JXX
489 | JMP_TBL Operand [BlockId] -- table jump
490 | CALL (Either Imm Reg) [Reg]
493 | CLTD Size -- sign extend %eax into %edx:%eax
495 | FETCHGOT Reg -- pseudo-insn for ELF position-independent code
499 -- addl __GLOBAL_OFFSET_TABLE__+.-1b, %reg
500 | FETCHPC Reg -- pseudo-insn for Darwin position-independent code
507 = OpReg Reg -- register
508 | OpImm Imm -- immediate value
509 | OpAddr AddrMode -- memory reference
511 #endif /* i386 or x86_64 */
-- i386_insert_ffrees: when at least one instruction in any of the given
-- blocks satisfies is_G_instr, every block is rewritten with
-- ffree_before_nonlocal_transfers.
-- NOTE(review): the alternative taken when no such instruction exists is not
-- visible in this excerpt.
-- NOTE(review): the pattern variable `id` used below shadows the Prelude's
-- `id`; presumably one of the warnings the file header asks to be fixed.
514 i386_insert_ffrees :: [GenBasicBlock Instr] -> [GenBasicBlock Instr]
515 i386_insert_ffrees blocks
516 | or (map (any is_G_instr) [ instrs | BasicBlock id instrs <- blocks ])
517 = map ffree_before_nonlocal_transfers blocks
-- ffree_before_nonlocal_transfers: rebuilds a block's instruction list from
-- the right (foldr), placing a GFREE immediately before each CALL and JMP.
-- NOTE(review): the case alternative for all other instructions is not
-- visible in this excerpt.
521 ffree_before_nonlocal_transfers (BasicBlock id insns)
522 = BasicBlock id (foldr p [] insns)
523 where p insn r = case insn of
524 CALL _ _ -> GFREE : insn : r
525 JMP _ -> GFREE : insn : r
528 -- if you ever add a new FP insn to the fake x86 FP insn set,
529 -- you must update this too
-- is_G_instr: True for each of the fake x86 FP constructors listed below.
-- GFREE is deliberately not classified: meeting it here calls panic.
-- NOTE(review): the function head and the alternative for all other
-- instructions are not visible in this excerpt.
530 is_G_instr :: Instr -> Bool
533 GMOV _ _ -> True; GLD _ _ _ -> True; GST _ _ _ -> True
534 GLDZ _ -> True; GLD1 _ -> True
535 GFTOI _ _ -> True; GDTOI _ _ -> True
536 GITOF _ _ -> True; GITOD _ _ -> True
537 GADD _ _ _ _ -> True; GDIV _ _ _ _ -> True
538 GSUB _ _ _ _ -> True; GMUL _ _ _ _ -> True
539 GCMP _ _ _ -> True; GABS _ _ _ -> True
540 GNEG _ _ _ -> True; GSQRT _ _ _ -> True
541 GSIN _ _ _ _ _ -> True; GCOS _ _ _ _ _ -> True; GTAN _ _ _ _ _ -> True
542 GFREE -> panic "is_G_instr: GFREE (!)"
544 #endif /* i386_TARGET_ARCH */
547 -- -----------------------------------------------------------------------------
548 -- Sparc instructions
550 #if sparc_TARGET_ARCH
552 -- data Instr continues...
555 | LD Size AddrMode Reg -- size, src, dst
556 | ST Size Reg AddrMode -- size, src, dst
559 | ADD Bool Bool Reg RI Reg -- x?, cc?, src1, src2, dst
560 | SUB Bool Bool Reg RI Reg -- x?, cc?, src1, src2, dst
561 | UMUL Bool Reg RI Reg -- cc?, src1, src2, dst
562 | SMUL Bool Reg RI Reg -- cc?, src1, src2, dst
563 | RDY Reg -- move contents of Y register to reg
565 -- Simple bit-twiddling.
566 | AND Bool Reg RI Reg -- cc?, src1, src2, dst
567 | ANDN Bool Reg RI Reg -- cc?, src1, src2, dst
568 | OR Bool Reg RI Reg -- cc?, src1, src2, dst
569 | ORN Bool Reg RI Reg -- cc?, src1, src2, dst
570 | XOR Bool Reg RI Reg -- cc?, src1, src2, dst
571 | XNOR Bool Reg RI Reg -- cc?, src1, src2, dst
572 | SLL Reg RI Reg -- src1, src2, dst
573 | SRL Reg RI Reg -- src1, src2, dst
574 | SRA Reg RI Reg -- src1, src2, dst
575 | SETHI Imm Reg -- src, dst
576 | NOP -- Really SETHI 0, %g0, but worth an alias
580 -- Note that we cheat by treating F{ABS,MOV,NEG} of doubles as single
581 -- instructions right up until we spit them out.
582 | FABS Size Reg Reg -- src, dst
583 | FADD Size Reg Reg Reg -- src1, src2, dst
584 | FCMP Bool Size Reg Reg -- exception?, size, src1, src2
585 | FDIV Size Reg Reg Reg -- src1, src2, dst
586 | FMOV Size Reg Reg -- src, dst
587 | FMUL Size Reg Reg Reg -- src1, src2, dst
588 | FNEG Size Reg Reg -- src, dst
589 | FSQRT Size Reg Reg -- src, dst
590 | FSUB Size Reg Reg Reg -- src1, src2, dst
591 | FxTOy Size Size Reg Reg -- src, dst
594 | BI Cond Bool Imm -- cond, annul?, target
595 | BF Cond Bool Imm -- cond, annul?, target
597 | JMP AddrMode -- target
598 | CALL (Either Imm Reg) Int Bool -- target, args, terminal
-- riZero: True for an immediate zero (either ImmInt 0 or ImmInteger 0) and
-- for real register number 0 (presumably %g0 -- confirm).
-- NOTE(review): the clause for all other operands is not visible in this
-- excerpt.
602 riZero (RIImm (ImmInt 0)) = True
603 riZero (RIImm (ImmInteger 0)) = True
604 riZero (RIReg (RealReg 0)) = True
607 -- Calculate the effective address which would be used by the
608 -- corresponding fpRel sequence. fpRel is in MachRegs.lhs,
609 -- alas -- can't have fpRelEA here because of module dependencies.
-- fpRelEA: builds a single SPARC ADD with both flag fields False.  Following
-- the ADD field order (x?, cc?, src1, src2, dst), it adds the immediate
-- n * wORD_SIZE to fp and writes the result to dst.
610 fpRelEA :: Int -> Reg -> Instr
612 = ADD False False fp (RIImm (ImmInt (n * wORD_SIZE))) dst
614 -- Code to shift the stack pointer by n words.
-- moveSp: builds a single SPARC ADD with both flag fields False that adds the
-- immediate n * wORD_SIZE to sp and writes the result back to sp.
615 moveSp :: Int -> Instr
617 = ADD False False sp (RIImm (ImmInt (n * wORD_SIZE))) sp
619 -- Produce the second-half-of-a-double register given the first half.
-- fPair: for a RealReg numbered n with n >= 32 and n even, yields
-- RealReg (n+1).  Any other argument is a fatal error reported through
-- pprPanic.
621 fPair (RealReg n) | n >= 32 && n `mod` 2 == 0 = RealReg (n+1)
622 fPair other = pprPanic "fPair(sparc NCG)" (ppr other)
623 #endif /* sparc_TARGET_ARCH */
626 -- -----------------------------------------------------------------------------
627 -- PowerPC instructions
629 #ifdef powerpc_TARGET_ARCH
630 -- data Instr continues...
633 | LD Size Reg AddrMode -- Load size, dst, src
634 | LA Size Reg AddrMode -- Load arithmetic size, dst, src
635 | ST Size Reg AddrMode -- Store size, src, dst
636 | STU Size Reg AddrMode -- Store with Update size, src, dst
637 | LIS Reg Imm -- Load Immediate Shifted dst, src
638 | LI Reg Imm -- Load Immediate dst, src
639 | MR Reg Reg -- Move Register dst, src -- also for fmr
641 | CMP Size Reg RI --- size, src1, src2
642 | CMPL Size Reg RI --- size, src1, src2
645 | BCCFAR Cond BlockId
646 | JMP CLabel -- same as branch,
647 -- but with CLabel instead of block ID
649 | BCTR [BlockId] -- with list of local destinations
650 | BL CLabel [Reg] -- with list of argument regs
653 | ADD Reg Reg RI -- dst, src1, src2
654 | ADDC Reg Reg Reg -- (carrying) dst, src1, src2
655 | ADDE Reg Reg Reg -- (extend) dst, src1, src2
656 | ADDIS Reg Reg Imm -- Add Immediate Shifted dst, src1, src2
657 | SUBF Reg Reg Reg -- dst, src1, src2 ; dst = src2 - src1
662 | MULLW_MayOflo Reg Reg Reg
663 -- dst = 1 if src1 * src2 overflows
664 -- pseudo-instruction; pretty-printed as:
665 -- mullwo. dst, src1, src2
667 -- rlwinm dst, dst, 2, 31,31
669 | AND Reg Reg RI -- dst, src1, src2
670 | OR Reg Reg RI -- dst, src1, src2
671 | XOR Reg Reg RI -- dst, src1, src2
672 | XORIS Reg Reg Imm -- XOR Immediate Shifted dst, src1, src2
679 | SLW Reg Reg RI -- shift left word
680 | SRW Reg Reg RI -- shift right word
681 | SRAW Reg Reg RI -- shift right arithmetic word
683 -- Rotate Left Word Immediate then AND with Mask
684 | RLWINM Reg Reg Int Int Int
686 | FADD Size Reg Reg Reg
687 | FSUB Size Reg Reg Reg
688 | FMUL Size Reg Reg Reg
689 | FDIV Size Reg Reg Reg
690 | FNEG Reg Reg -- negate is the same for single and double prec.
694 | FCTIWZ Reg Reg -- convert to integer word
695 | FRSP Reg Reg -- reduce to single precision
696 -- (but destination is a FP register)
698 | CRNOR Int Int Int -- condition register nor
699 | MFCR Reg -- move from condition register
701 | MFLR Reg -- move from link register
702 | FETCHPC Reg -- pseudo-instruction:
703 -- bcl to next insn, mflr reg
705 | LWSYNC -- memory barrier
706 #endif /* powerpc_TARGET_ARCH */