1 -----------------------------------------------------------------------------
3 -- Machine-dependent assembly language
5 -- (c) The University of Glasgow 1993-2004
7 -----------------------------------------------------------------------------
9 #include "nativeGen/NCG.h"
12 -- * Cmm instantiations
13 NatCmm, NatCmmTop, NatBasicBlock,
15 -- * Machine instructions
18 #if !powerpc_TARGET_ARCH && !i386_TARGET_ARCH
19 Size(..), machRepSize,
28 riZero, fpRelEA, moveSp, fPair,
30 #if powerpc_TARGET_ARCH
31 condUnsigned, condToSigned,
33 DestInfo(..), hasDestInfo, pprDests,
37 #include "HsVersions.h"
41 import MachOp ( MachRep(..) )
42 import CLabel ( CLabel, pprCLabel )
43 import Panic ( panic )
45 import Config ( cLeadingUnderscore )
51 -- -----------------------------------------------------------------------------
52 -- Our flavours of the Cmm types
54 -- Type synonyms for Cmm populated with native code
-- 'GenCmm' specialised to 'CmmStatic' data and native 'Instr' code.
55 type NatCmm = GenCmm CmmStatic Instr
-- 'GenCmmTop' specialised the same way as 'NatCmm'.
56 type NatCmmTop = GenCmmTop CmmStatic Instr
-- 'GenBasicBlock' whose instructions are native 'Instr's.
57 type NatBasicBlock = GenBasicBlock Instr
59 -- -----------------------------------------------------------------------------
60 -- Conditions on this architecture
64 = ALWAYS -- For BI (same as BR)
65 | EQQ -- For CMP and BI (NB: "EQ" is a 1.3 Prelude name)
67 | GTT -- For BI only (NB: "GT" is a 1.3 Prelude name)
68 | LE -- For CMP and BI
69 | LTT -- For CMP and BI (NB: "LT" is a 1.3 Prelude name)
71 | NEVER -- For BI (null instruction)
76 = ALWAYS -- What's really used? ToDo
93 = ALWAYS -- What's really used? ToDo
110 #if powerpc_TARGET_ARCH
123 deriving Eq -- to make an assertion work
126 -- -----------------------------------------------------------------------------
127 -- Sizes on this architecture
129 -- ToDo: it's not clear to me that we need separate signed-vs-unsigned sizes
130 -- here. I've removed them from the x86 version, we'll see what happens --SDM
132 #if !powerpc_TARGET_ARCH && !i386_TARGET_ARCH
134 #if alpha_TARGET_ARCH
137 -- | W -- word (2 bytes): UNUSED
139 | L -- longword (4 bytes)
140 | Q -- quadword (8 bytes)
141 -- | FF -- VAX F-style floating pt: UNUSED
142 -- | GF -- VAX G-style floating pt: UNUSED
143 -- | DF -- VAX D-style floating pt: UNUSED
144 -- | SF -- IEEE single-precision floating pt: UNUSED
145 | TF -- IEEE double-precision floating pt
147 #if sparc_TARGET_ARCH || powerpc_TARGET_ARCH
149 | Bu -- byte (unsigned)
150 | H -- halfword (signed, 2 bytes)
151 | Hu -- halfword (unsigned, 2 bytes)
152 | W -- word (4 bytes)
153 | F -- IEEE single-precision floating pt
154 | DF -- IEEE double-precision floating pt
-- Translate a 'MachRep' into the 'Size' used by this architecture's
-- instructions.  Each result is selected by the IF_ARCH_alpha /
-- IF_ARCH_sparc preprocessor macros, so the right-hand sides only make
-- sense after CPP has run.
158 machRepSize :: MachRep -> Size
159 machRepSize I8 = IF_ARCH_alpha(Bu, IF_ARCH_sparc(Bu, ))
-- NOTE(review): 'err' is not defined in the visible part of this file;
-- confirm it is in scope for Alpha builds.
160 machRepSize I16 = IF_ARCH_alpha(err,IF_ARCH_sparc(Hu, ))
161 machRepSize I32 = IF_ARCH_alpha(L, IF_ARCH_sparc(W, ))
-- I64 and I128 have no corresponding 'Size' here: asking for one panics.
162 machRepSize I64 = panic "machRepSize: I64"
163 machRepSize I128 = panic "machRepSize: I128"
-- On Alpha both floating-point representations map to TF.
164 machRepSize F32 = IF_ARCH_alpha(TF, IF_ARCH_sparc(F, ))
165 machRepSize F64 = IF_ARCH_alpha(TF, IF_ARCH_sparc(DF,))
168 -- -----------------------------------------------------------------------------
169 -- Register or immediate (a handy type on some platforms)
175 -- -----------------------------------------------------------------------------
176 -- Machine's assembly language
178 -- We have a few common "instructions" (nearly all the pseudo-ops) but
179 -- mostly all of 'Instr' is machine-specific.
182 = COMMENT FastString -- comment pseudo-op
184 | LDATA Section [CmmStatic] -- some static data spat out during code
185 -- generation. Will be extracted before
188 | NEWBLOCK BlockId -- start a new basic block. Useful during
189 -- codegen, removed later. Preceding
190 -- instruction should be a jump, as per the
191 -- invariants for a BasicBlock (see Cmm).
193 | DELTA Int -- specify current stack offset for
194 -- benefit of subsequent passes
196 -- -----------------------------------------------------------------------------
197 -- Alpha instructions
199 #if alpha_TARGET_ARCH
201 -- data Instr continues...
204 | LD Size Reg AddrMode -- size, dst, src
205 | LDA Reg AddrMode -- dst, src
206 | LDAH Reg AddrMode -- dst, src
207 | LDGP Reg AddrMode -- dst, src
208 | LDI Size Reg Imm -- size, dst, src
209 | ST Size Reg AddrMode -- size, src, dst
213 | ABS Size RI Reg -- size, src, dst
214 | NEG Size Bool RI Reg -- size, overflow, src, dst
215 | ADD Size Bool Reg RI Reg -- size, overflow, src, src, dst
216 | SADD Size Size Reg RI Reg -- size, scale, src, src, dst
217 | SUB Size Bool Reg RI Reg -- size, overflow, src, src, dst
218 | SSUB Size Size Reg RI Reg -- size, scale, src, src, dst
219 | MUL Size Bool Reg RI Reg -- size, overflow, src, src, dst
220 | DIV Size Bool Reg RI Reg -- size, unsigned, src, src, dst
221 | REM Size Bool Reg RI Reg -- size, unsigned, src, src, dst
223 -- Simple bit-twiddling.
241 | CMP Cond Reg RI Reg
247 | FADD Size Reg Reg Reg
248 | FDIV Size Reg Reg Reg
249 | FMUL Size Reg Reg Reg
250 | FSUB Size Reg Reg Reg
251 | CVTxy Size Size Reg Reg
252 | FCMP Size Cond Reg Reg Reg
259 | JMP Reg AddrMode Int
261 | JSR Reg AddrMode Int
263 -- Alpha-specific pseudo-ops.
271 #endif /* alpha_TARGET_ARCH */
274 -- -----------------------------------------------------------------------------
275 -- Intel x86 instructions
278 Intel, in their infinite wisdom, selected a stack model for floating
279 point registers on x86. That might have made sense back in 1979 --
280 nowadays we can see it for the nonsense it really is. A stack model
281 fits poorly with the existing nativeGen infrastructure, which assumes
282 flat integer and FP register sets. Prior to this commit, nativeGen
283 could not generate correct x86 FP code -- to do so would have meant
284 somehow working the register-stack paradigm into the register
285 allocator and spiller, which sounds very difficult.
287 We have decided to cheat, and go for a simple fix which requires no
288 infrastructure modifications, at the expense of generating ropey but
289 correct FP code. All notions of the x86 FP stack and its insns have
290 been removed. Instead, we pretend (to the instruction selector and
291 register allocator) that x86 has six floating point registers, %fake0
292 .. %fake5, which can be used in the usual flat manner. We further
293 claim that x86 has floating point instructions very similar to SPARC
294 and Alpha, that is, a simple 3-operand register-register arrangement.
295 Code generation and register allocation proceed on this basis.
297 When we come to print out the final assembly, our convenient fiction
298 is converted to dismal reality. Each fake instruction is
299 independently converted to a series of real x86 instructions.
300 %fake0 .. %fake5 are mapped to %st(0) .. %st(5). To do reg-reg
301 arithmetic operations, the two operands are pushed onto the top of the
302 FP stack, the operation done, and the result copied back into the
303 relevant register. There are only six %fake registers because 2 are
304 needed for the translation, and x86 has 8 in total.
306 The translation is inefficient but is simple and it works. A cleverer
307 translation would handle a sequence of insns, simulating the FP stack
308 contents, would not impose a fixed mapping from %fake to %st regs, and
309 hopefully could avoid most of the redundant reg-reg moves of the
312 We might as well make use of whatever unique FP facilities Intel have
313 chosen to bless us with (let's not be churlish, after all).
314 Hence GLDZ and GLD1. Bwahahahahahahaha!
318 MORE FLOATING POINT MUSINGS...
320 Intel's internal floating point registers are by default 80 bit
321 extended precision. This means that all operations done on values in
322 registers are done at 80 bits, and unless the intermediate values are
323 truncated to the appropriate size (32 or 64 bits) by storing in
324 memory, calculations in registers will give different results from
325 calculations which pass intermediate values in memory (eg. via
328 One solution is to set the FPU into 64 bit precision mode. Some OSs
329 do this (eg. FreeBSD) and some don't (eg. Linux). The problem here is
330 that this will only affect 64-bit precision arithmetic; 32-bit
331 calculations will still be done at 64-bit precision in registers. So
332 it doesn't solve the whole problem.
334 There's also the issue of what the C library is expecting in terms of
335 precision. It seems to be the case that glibc on Linux expects the
336 FPU to be set to 80 bit precision, so setting it to 64 bit could have
337 unexpected effects. Changing the default could have undesirable
338 effects on other 3rd-party library code too, so the right thing would
339 be to save/restore the FPU control word across Haskell code if we were
342 gcc's -ffloat-store gives consistent results by always storing the
343 results of floating-point calculations in memory, which works for both
344 32 and 64-bit precision. However, it only affects the values of
345 user-declared floating point variables in C, not intermediate results.
346 GHC in -fvia-C mode uses -ffloat-store (see the -fexcess-precision
349 Another problem is how to spill floating point registers in the
350 register allocator. Should we spill the whole 80 bits, or just 64?
351 On an OS which is set to 64 bit precision, spilling 64 is fine. On
352 Linux, spilling 64 bits will round the results of some operations.
353 This is what gcc does. Spilling at 80 bits requires taking up a full
354 128 bit slot (so we get alignment). We spill at 80-bits and ignore
355 the alignment problems.
357 In the future, we'll use the SSE registers for floating point. This
358 requires a CPU that supports SSE2 (ordinary SSE only supports 32 bit
359 precision float ops), which means P4 or Xeon and above. Using SSE
360 will solve all these problems, because the SSE registers use fixed 32
361 bit or 64 bit precision.
368 -- data Instr continues...
371 | MOV MachRep Operand Operand
372 | MOVZxL MachRep Operand Operand -- size is the size of operand 1
373 | MOVSxL MachRep Operand Operand -- size is the size of operand 1
375 -- Load effective address (also a very useful three-operand add instruction :-)
376 | LEA MachRep Operand Operand
379 | ADD MachRep Operand Operand
380 | ADC MachRep Operand Operand
381 | SUB MachRep Operand Operand
382 | IMUL MachRep Operand Operand -- signed int mul
383 | MUL MachRep Operand Operand -- unsigned int mul
386 -- operand1:operand2 := (operand1[31:0] *signed operand2[31:0])
388 | DIV MachRep Operand -- eax := eax:edx/op, edx := eax:edx%op
389 | IDIV MachRep Operand -- ditto, but signed
391 -- Simple bit-twiddling.
392 | AND MachRep Operand Operand
393 | OR MachRep Operand Operand
394 | XOR MachRep Operand Operand
395 | NOT MachRep Operand
396 | NEGI MachRep Operand -- NEG instruction (name clash with Cond)
398 -- Shifts (amount may be immediate or %cl only)
399 | SHL MachRep Operand{-amount-} Operand
400 | SAR MachRep Operand{-amount-} Operand
401 | SHR MachRep Operand{-amount-} Operand
403 | BT MachRep Imm Operand
408 -- Note that we cheat by treating G{ABS,MOV,NEG} of doubles
409 -- as single instructions right up until we spit them out.
410 -- all the 3-operand fake fp insns are src1 src2 dst
411 -- and furthermore are constrained to be fp regs only.
412 -- IMPORTANT: keep is_G_instr up to date with any changes here
413 | GMOV Reg Reg -- src(fpreg), dst(fpreg)
414 | GLD MachRep AddrMode Reg -- src, dst(fpreg)
415 | GST MachRep Reg AddrMode -- src(fpreg), dst
417 | GLDZ Reg -- dst(fpreg)
418 | GLD1 Reg -- dst(fpreg)
420 | GFTOI Reg Reg -- src(fpreg), dst(intreg)
421 | GDTOI Reg Reg -- src(fpreg), dst(intreg)
423 | GITOF Reg Reg -- src(intreg), dst(fpreg)
424 | GITOD Reg Reg -- src(intreg), dst(fpreg)
426 | GADD MachRep Reg Reg Reg -- src1, src2, dst
427 | GDIV MachRep Reg Reg Reg -- src1, src2, dst
428 | GSUB MachRep Reg Reg Reg -- src1, src2, dst
429 | GMUL MachRep Reg Reg Reg -- src1, src2, dst
431 -- FP compare. Cond must be `elem` [EQQ, NE, LE, LTT, GE, GTT]
432 -- Compare src1 with src2; set the Zero flag iff the numbers are
433 -- comparable and the comparison is True. Subsequent code must
434 -- test the %eflags zero flag regardless of the supplied Cond.
435 | GCMP Cond Reg Reg -- src1, src2
437 | GABS MachRep Reg Reg -- src, dst
438 | GNEG MachRep Reg Reg -- src, dst
439 | GSQRT MachRep Reg Reg -- src, dst
440 | GSIN MachRep Reg Reg -- src, dst
441 | GCOS MachRep Reg Reg -- src, dst
442 | GTAN MachRep Reg Reg -- src, dst
444 | GFREE -- do ffree on all x86 regs; an ugly hack
447 | TEST MachRep Operand Operand
448 | CMP MachRep Operand Operand
452 | PUSH MachRep Operand
453 | POP MachRep Operand
454 -- both unused (SDM):
460 | JXX Cond BlockId -- includes unconditional branches
461 | JMP_TBL Operand [BlockId] -- table jump
462 | CALL (Either Imm Reg)
465 | CLTD -- sign extend %eax into %edx:%eax
467 | FETCHGOT Reg -- pseudo-insn for position-independent code
471 -- addl __GLOBAL_OFFSET_TABLE__+.-1b, %reg
474 = OpReg Reg -- register
475 | OpImm Imm -- immediate value
476 | OpAddr AddrMode -- memory reference
479 i386_insert_ffrees :: [Instr] -> [Instr]
480 i386_insert_ffrees insns
481 | any is_G_instr insns
482 = concatMap ffree_before_nonlocal_transfers insns
486 ffree_before_nonlocal_transfers insn
488 CALL _ -> [GFREE, insn]
489 JMP _ -> [GFREE, insn]
493 -- if you ever add a new FP insn to the fake x86 FP insn set,
494 -- you must update this too
495 is_G_instr :: Instr -> Bool
498 GMOV _ _ -> True; GLD _ _ _ -> True; GST _ _ _ -> True;
499 GLDZ _ -> True; GLD1 _ -> True;
500 GFTOI _ _ -> True; GDTOI _ _ -> True;
501 GITOF _ _ -> True; GITOD _ _ -> True;
502 GADD _ _ _ _ -> True; GDIV _ _ _ _ -> True
503 GSUB _ _ _ _ -> True; GMUL _ _ _ _ -> True
504 GCMP _ _ _ -> True; GABS _ _ _ -> True
505 GNEG _ _ _ -> True; GSQRT _ _ _ -> True
506 GSIN _ _ _ -> True; GCOS _ _ _ -> True; GTAN _ _ _ -> True;
507 GFREE -> panic "is_G_instr: GFREE (!)"
510 #endif /* i386_TARGET_ARCH */
513 -- -----------------------------------------------------------------------------
514 -- Sparc instructions
516 #if sparc_TARGET_ARCH
518 -- data Instr continues...
521 | LD MachRep AddrMode Reg -- size, src, dst
522 | ST MachRep Reg AddrMode -- size, src, dst
525 | ADD Bool Bool Reg RI Reg -- x?, cc?, src1, src2, dst
526 | SUB Bool Bool Reg RI Reg -- x?, cc?, src1, src2, dst
527 | UMUL Bool Reg RI Reg -- cc?, src1, src2, dst
528 | SMUL Bool Reg RI Reg -- cc?, src1, src2, dst
529 | RDY Reg -- move contents of Y register to reg
531 -- Simple bit-twiddling.
532 | AND Bool Reg RI Reg -- cc?, src1, src2, dst
533 | ANDN Bool Reg RI Reg -- cc?, src1, src2, dst
534 | OR Bool Reg RI Reg -- cc?, src1, src2, dst
535 | ORN Bool Reg RI Reg -- cc?, src1, src2, dst
536 | XOR Bool Reg RI Reg -- cc?, src1, src2, dst
537 | XNOR Bool Reg RI Reg -- cc?, src1, src2, dst
538 | SLL Reg RI Reg -- src1, src2, dst
539 | SRL Reg RI Reg -- src1, src2, dst
540 | SRA Reg RI Reg -- src1, src2, dst
541 | SETHI Imm Reg -- src, dst
542 | NOP -- Really SETHI 0, %g0, but worth an alias
546 -- Note that we cheat by treating F{ABS,MOV,NEG} of doubles as single
547 -- instructions right up until we spit them out.
548 | FABS MachRep Reg Reg -- src dst
549 | FADD MachRep Reg Reg Reg -- src1, src2, dst
550 | FCMP Bool MachRep Reg Reg -- exception?, size, src1, src2
551 | FDIV MachRep Reg Reg Reg -- src1, src2, dst
552 | FMOV MachRep Reg Reg -- src, dst
553 | FMUL MachRep Reg Reg Reg -- src1, src2, dst
554 | FNEG MachRep Reg Reg -- src, dst
555 | FSQRT MachRep Reg Reg -- src, dst
556 | FSUB MachRep Reg Reg Reg -- src1, src2, dst
557 | FxTOy MachRep MachRep Reg Reg -- src, dst
560 | BI Cond Bool Imm -- cond, annul?, target
561 | BF Cond Bool Imm -- cond, annul?, target
563 | JMP DestInfo AddrMode -- target
564 | CALL (Either Imm Reg) Int Bool -- target, args, terminal
571 riZero (RIImm (ImmInt 0)) = True
572 riZero (RIImm (ImmInteger 0)) = True
573 riZero (RIReg (RealReg 0)) = True
576 -- Calculate the effective address which would be used by the
577 -- corresponding fpRel sequence. fpRel is in MachRegs.lhs,
578 -- alas -- can't have fpRelEA here because of module dependencies.
579 fpRelEA :: Int -> Reg -> Instr
581 = ADD False False fp (RIImm (ImmInt (n * BYTES_PER_WORD))) dst
583 -- Code to shift the stack pointer by n words.
584 moveSp :: Int -> Instr
586 = ADD False False sp (RIImm (ImmInt (n * BYTES_PER_WORD))) sp
588 -- Produce the second-half-of-a-double register given the first half.
-- Only an even-numbered real register numbered 32 or above has a
-- partner, namely the next register up.  Anything else is a panic that
-- prints the offending register.
fPair reg = case reg of
    RealReg n | n >= 32 && even n -> RealReg (n + 1)
    _                             -> pprPanic "fPair(sparc NCG)" (ppr reg)
592 #endif /* sparc_TARGET_ARCH */
595 -- -----------------------------------------------------------------------------
596 -- PowerPC instructions
598 #ifdef powerpc_TARGET_ARCH
599 -- data Instr continues...
602 | LD MachRep Reg AddrMode -- Load size, dst, src
603 | LA MachRep Reg AddrMode -- Load arithmetic size, dst, src
604 | ST MachRep Reg AddrMode -- Store size, src, dst
605 | STU MachRep Reg AddrMode -- Store with Update size, src, dst
606 | LIS Reg Imm -- Load Immediate Shifted dst, src
607 | LI Reg Imm -- Load Immediate dst, src
608 | MR Reg Reg -- Move Register dst, src -- also for fmr
610 | CMP MachRep Reg RI --- size, src1, src2
611 | CMPL MachRep Reg RI --- size, src1, src2
614 | JMP CLabel -- same as branch,
615 -- but with CLabel instead of block ID
617 | BCTR [BlockId] -- with list of local destinations
618 | BL CLabel [Reg] -- with list of argument regs
621 | ADD Reg Reg RI -- dst, src1, src2
622 | ADDC Reg Reg Reg -- (carrying) dst, src1, src2
623 | ADDE Reg Reg Reg -- (extend) dst, src1, src2
624 | ADDIS Reg Reg Imm -- Add Immediate Shifted dst, src1, src2
625 | SUBF Reg Reg Reg -- dst, src1, src2 ; dst = src2 - src1
630 | MULLW_MayOflo Reg Reg Reg
631 -- dst = 1 if src1 * src2 overflows
632 -- pseudo-instruction; pretty-printed as:
633 -- mullwo. dst, src1, src2
635 -- rlwinm dst, dst, 2, 31,31
637 | AND Reg Reg RI -- dst, src1, src2
638 | OR Reg Reg RI -- dst, src1, src2
639 | XOR Reg Reg RI -- dst, src1, src2
640 | XORIS Reg Reg Imm -- XOR Immediate Shifted dst, src1, src2
642 | EXTS MachRep Reg Reg
647 | SLW Reg Reg RI -- shift left word
648 | SRW Reg Reg RI -- shift right word
649 | SRAW Reg Reg RI -- shift right arithmetic word
651 -- Rotate Left Word Immediate then AND with Mask
652 | RLWINM Reg Reg Int Int Int
654 | FADD MachRep Reg Reg Reg
655 | FSUB MachRep Reg Reg Reg
656 | FMUL MachRep Reg Reg Reg
657 | FDIV MachRep Reg Reg Reg
658 | FNEG Reg Reg -- negate is the same for single and double prec.
662 | FCTIWZ Reg Reg -- convert to integer word
663 | FRSP Reg Reg -- reduce to single precision
664 -- (but destination is a FP register)
666 | CRNOR Int Int Int -- condition register nor
667 | MFCR Reg -- move from condition register
669 | MFLR Reg -- move from link register
670 | FETCHPC Reg -- pseudo-instruction:
671 -- bcl to next insn, mflr reg
-- Is this condition one of the unsigned comparisons?
-- True exactly for GU, LU, GEU and LEU; False for every other condition.
condUnsigned cond = case cond of
    GU  -> True
    LU  -> True
    GEU -> True
    LEU -> True
    _   -> False
679 condToSigned GU = GTT
680 condToSigned LU = LTT
681 condToSigned GEU = GE
682 condToSigned LEU = LE
684 #endif /* powerpc_TARGET_ARCH */
687 -- -----------------------------------------------------------------------------
690 -- ToDo: might not be needed anymore --SDM
692 -- used by insnFuture in RegAllocInfo.lhs
694 = NoDestInfo -- no supplied dests; infer from context
695 | DestInfo [CLabel] -- precisely these dests and no others
-- Does this value carry an explicit list of destinations?
hasDestInfo info = case info of
    DestInfo _ -> True
    NoDestInfo -> False
-- Pretty-print a DestInfo.  With no supplied destinations the output is
-- the literal text "NoDestInfo"; otherwise each label is printed with
-- pprCLabel, the results are joined with hsep and wrapped in brackets.
pprDests :: DestInfo -> SDoc
pprDests info = case info of
    NoDestInfo      -> text "NoDestInfo"
    DestInfo labels -> brackets (hsep labelDocs)
      where labelDocs = map pprCLabel labels