1 -----------------------------------------------------------------------------
3 -- Machine-dependent assembly language
5 -- (c) The University of Glasgow 1993-2004
7 -----------------------------------------------------------------------------
9 #include "nativeGen/NCG.h"
12 -- * Cmm instantiations
13 NatCmm, NatCmmTop, NatBasicBlock,
15 -- * Machine instructions
18 #if !powerpc_TARGET_ARCH && !i386_TARGET_ARCH
19 Size(..), machRepSize,
28 riZero, fpRelEA, moveSp, fPair,
30 #if powerpc_TARGET_ARCH
31 condUnsigned, condToSigned,
33 DestInfo(..), hasDestInfo, pprDests,
37 #include "HsVersions.h"
38 #include "../includes/ghcconfig.h"
42 import MachOp ( MachRep(..) )
43 import CLabel ( CLabel, pprCLabel )
44 import Panic ( panic )
46 import Config ( cLeadingUnderscore )
52 -- -----------------------------------------------------------------------------
53 -- Our flavours of the Cmm types
55 -- Type synonyms for Cmm populated with native code
56 type NatCmm = GenCmm CmmStatic Instr
57 type NatCmmTop = GenCmmTop CmmStatic Instr
58 type NatBasicBlock = GenBasicBlock Instr
60 -- -----------------------------------------------------------------------------
61 -- Conditions on this architecture
65 = ALWAYS -- For BI (same as BR)
66 | EQQ -- For CMP and BI (NB: "EQ" is a 1.3 Prelude name)
68 | GTT -- For BI only (NB: "GT" is a 1.3 Prelude name)
69 | LE -- For CMP and BI
70 | LTT -- For CMP and BI (NB: "LT" is a 1.3 Prelude name)
72 | NEVER -- For BI (null instruction)
77 = ALWAYS -- What's really used? ToDo
94 = ALWAYS -- What's really used? ToDo
111 #if powerpc_TARGET_ARCH
124 deriving Eq -- to make an assertion work
127 -- -----------------------------------------------------------------------------
128 -- Sizes on this architecture
130 -- ToDo: it's not clear to me that we need separate signed-vs-unsigned sizes
131 -- here. I've removed them from the x86 version, we'll see what happens --SDM
133 #if !powerpc_TARGET_ARCH && !i386_TARGET_ARCH
135 #if alpha_TARGET_ARCH
138 -- | W -- word (2 bytes): UNUSED
140 | L -- longword (4 bytes)
141 | Q -- quadword (8 bytes)
142 -- | FF -- VAX F-style floating pt: UNUSED
143 -- | GF -- VAX G-style floating pt: UNUSED
144 -- | DF -- VAX D-style floating pt: UNUSED
145 -- | SF -- IEEE single-precision floating pt: UNUSED
146 | TF -- IEEE double-precision floating pt
148 #if sparc_TARGET_ARCH || powerpc_TARGET_ARCH
150 | Bu -- byte (unsigned)
151 | H -- halfword (signed, 2 bytes)
152 | Hu -- halfword (unsigned, 2 bytes)
153 | W -- word (4 bytes)
154 | F -- IEEE single-precision floating pt
155 | DF -- IEEE double-precision floating pt
-- Choose the Size used for a given MachRep on this target.  Each
-- alternative is selected at preprocessing time by the IF_ARCH_* macros
-- (presumably supplied by the NCG headers included at the top of the
-- file -- confirm).  The 64- and 128-bit integer reps have no Size
-- here and panic.
machRepSize :: MachRep -> Size
machRepSize rep = case rep of
    I8   -> IF_ARCH_alpha(Bu, IF_ARCH_sparc(Bu, ))
    I16  -> IF_ARCH_alpha(err,IF_ARCH_sparc(Hu, ))
    I32  -> IF_ARCH_alpha(L, IF_ARCH_sparc(W, ))
    I64  -> panic "machRepSize: I64"
    I128 -> panic "machRepSize: I128"
    F32  -> IF_ARCH_alpha(TF, IF_ARCH_sparc(F, ))
    F64  -> IF_ARCH_alpha(TF, IF_ARCH_sparc(DF,))
169 -- -----------------------------------------------------------------------------
170 -- Register or immediate (a handy type on some platforms)
176 -- -----------------------------------------------------------------------------
177 -- Machine's assembly language
179 -- We have a few common "instructions" (nearly all the pseudo-ops) but
180 -- mostly all of 'Instr' is machine-specific.
183 = COMMENT FastString -- comment pseudo-op
185 | LDATA Section [CmmStatic] -- some static data spat out during code
186 -- generation. Will be extracted before
189 | NEWBLOCK BlockId -- start a new basic block. Useful during
190 -- codegen, removed later. Preceding
191 -- instruction should be a jump, as per the
192 -- invariants for a BasicBlock (see Cmm).
194 | DELTA Int -- specify current stack offset for
195 -- benefit of subsequent passes
197 -- -----------------------------------------------------------------------------
198 -- Alpha instructions
200 #if alpha_TARGET_ARCH
202 -- data Instr continues...
205 | LD Size Reg AddrMode -- size, dst, src
206 | LDA Reg AddrMode -- dst, src
207 | LDAH Reg AddrMode -- dst, src
208 | LDGP Reg AddrMode -- dst, src
209 | LDI Size Reg Imm -- size, dst, src
210 | ST Size Reg AddrMode -- size, src, dst
214 | ABS Size RI Reg -- size, src, dst
215 | NEG Size Bool RI Reg -- size, overflow, src, dst
216 | ADD Size Bool Reg RI Reg -- size, overflow, src, src, dst
217 | SADD Size Size Reg RI Reg -- size, scale, src, src, dst
218 | SUB Size Bool Reg RI Reg -- size, overflow, src, src, dst
219 | SSUB Size Size Reg RI Reg -- size, scale, src, src, dst
220 | MUL Size Bool Reg RI Reg -- size, overflow, src, src, dst
221 | DIV Size Bool Reg RI Reg -- size, unsigned, src, src, dst
222 | REM Size Bool Reg RI Reg -- size, unsigned, src, src, dst
224 -- Simple bit-twiddling.
242 | CMP Cond Reg RI Reg
248 | FADD Size Reg Reg Reg
249 | FDIV Size Reg Reg Reg
250 | FMUL Size Reg Reg Reg
251 | FSUB Size Reg Reg Reg
252 | CVTxy Size Size Reg Reg
253 | FCMP Size Cond Reg Reg Reg
260 | JMP Reg AddrMode Int
262 | JSR Reg AddrMode Int
264 -- Alpha-specific pseudo-ops.
272 #endif /* alpha_TARGET_ARCH */
275 -- -----------------------------------------------------------------------------
276 -- Intel x86 instructions
279 Intel, in their infinite wisdom, selected a stack model for floating
280 point registers on x86. That might have made sense back in 1979 --
281 nowadays we can see it for the nonsense it really is. A stack model
282 fits poorly with the existing nativeGen infrastructure, which assumes
283 flat integer and FP register sets. Prior to this commit, nativeGen
284 could not generate correct x86 FP code -- to do so would have meant
285 somehow working the register-stack paradigm into the register
286 allocator and spiller, which sounds very difficult.
288 We have decided to cheat, and go for a simple fix which requires no
289 infrastructure modifications, at the expense of generating ropey but
290 correct FP code. All notions of the x86 FP stack and its insns have
291 been removed. Instead, we pretend (to the instruction selector and
292 register allocator) that x86 has six floating point registers, %fake0
293 .. %fake5, which can be used in the usual flat manner. We further
294 claim that x86 has floating point instructions very similar to SPARC
295 and Alpha, that is, a simple 3-operand register-register arrangement.
296 Code generation and register allocation proceed on this basis.
298 When we come to print out the final assembly, our convenient fiction
299 is converted to dismal reality. Each fake instruction is
300 independently converted to a series of real x86 instructions.
301 %fake0 .. %fake5 are mapped to %st(0) .. %st(5). To do reg-reg
302 arithmetic operations, the two operands are pushed onto the top of the
303 FP stack, the operation done, and the result copied back into the
304 relevant register. There are only six %fake registers because 2 are
305 needed for the translation, and x86 has 8 in total.
307 The translation is inefficient but is simple and it works. A cleverer
308 translation would handle a sequence of insns, simulating the FP stack
309 contents, would not impose a fixed mapping from %fake to %st regs, and
310 hopefully could avoid most of the redundant reg-reg moves of the
313 We might as well make use of whatever unique FP facilities Intel have
314 chosen to bless us with (let's not be churlish, after all).
315 Hence GLDZ and GLD1. Bwahahahahahahaha!
319 MORE FLOATING POINT MUSINGS...
321 Intel's internal floating point registers are by default 80 bit
322 extended precision. This means that all operations done on values in
323 registers are done at 80 bits, and unless the intermediate values are
324 truncated to the appropriate size (32 or 64 bits) by storing in
325 memory, calculations in registers will give different results from
326 calculations which pass intermediate values in memory (eg. via
329 One solution is to set the FPU into 64 bit precision mode. Some OSs
330 do this (eg. FreeBSD) and some don't (eg. Linux). The problem here is
331 that this will only affect 64-bit precision arithmetic; 32-bit
332 calculations will still be done at 64-bit precision in registers. So
333 it doesn't solve the whole problem.
335 There's also the issue of what the C library is expecting in terms of
336 precision. It seems to be the case that glibc on Linux expects the
337 FPU to be set to 80 bit precision, so setting it to 64 bit could have
338 unexpected effects. Changing the default could have undesirable
339 effects on other 3rd-party library code too, so the right thing would
340 be to save/restore the FPU control word across Haskell code if we were
343 gcc's -ffloat-store gives consistent results by always storing the
344 results of floating-point calculations in memory, which works for both
345 32 and 64-bit precision. However, it only affects the values of
346 user-declared floating point variables in C, not intermediate results.
347 GHC in -fvia-C mode uses -ffloat-store (see the -fexcess-precision
350 Another problem is how to spill floating point registers in the
351 register allocator. Should we spill the whole 80 bits, or just 64?
352 On an OS which is set to 64 bit precision, spilling 64 is fine. On
353 Linux, spilling 64 bits will round the results of some operations.
354 This is what gcc does. Spilling at 80 bits requires taking up a full
355 128 bit slot (so we get alignment). We spill at 80-bits and ignore
356 the alignment problems.
358 In the future, we'll use the SSE registers for floating point. This
359 requires a CPU that supports SSE2 (ordinary SSE only supports 32 bit
360 precision float ops), which means P4 or Xeon and above. Using SSE
361 will solve all these problems, because the SSE registers use fixed 32
362 bit or 64 bit precision.
369 -- data Instr continues...
372 | MOV MachRep Operand Operand
373 | MOVZxL MachRep Operand Operand -- size is the size of operand 1
374 | MOVSxL MachRep Operand Operand -- size is the size of operand 1
376 -- Load effective address (also a very useful three-operand add instruction :-)
377 | LEA MachRep Operand Operand
380 | ADD MachRep Operand Operand
381 | ADC MachRep Operand Operand
382 | SUB MachRep Operand Operand
383 | IMUL MachRep Operand Operand -- signed int mul
384 | MUL MachRep Operand Operand -- unsigned int mul
387 -- operand1:operand2 := (operand1[31:0] *signed operand2[31:0])
389 | DIV MachRep Operand -- eax := eax:edx/op, edx := eax:edx%op
390 | IDIV MachRep Operand -- ditto, but signed
392 -- Simple bit-twiddling.
393 | AND MachRep Operand Operand
394 | OR MachRep Operand Operand
395 | XOR MachRep Operand Operand
396 | NOT MachRep Operand
397 | NEGI MachRep Operand -- NEG instruction (name clash with Cond)
399 -- Shifts (amount may be immediate or %cl only)
400 | SHL MachRep Operand{-amount-} Operand
401 | SAR MachRep Operand{-amount-} Operand
402 | SHR MachRep Operand{-amount-} Operand
404 | BT MachRep Imm Operand
409 -- Note that we cheat by treating G{ABS,MOV,NEG} of doubles
410 -- as single instructions right up until we spit them out.
411 -- all the 3-operand fake fp insns are src1 src2 dst
412 -- and furthermore are constrained to be fp regs only.
413 -- IMPORTANT: keep is_G_instr up to date with any changes here
414 | GMOV Reg Reg -- src(fpreg), dst(fpreg)
415 | GLD MachRep AddrMode Reg -- src, dst(fpreg)
416 | GST MachRep Reg AddrMode -- src(fpreg), dst
418 | GLDZ Reg -- dst(fpreg)
419 | GLD1 Reg -- dst(fpreg)
421 | GFTOI Reg Reg -- src(fpreg), dst(intreg)
422 | GDTOI Reg Reg -- src(fpreg), dst(intreg)
424 | GITOF Reg Reg -- src(intreg), dst(fpreg)
425 | GITOD Reg Reg -- src(intreg), dst(fpreg)
427 | GADD MachRep Reg Reg Reg -- src1, src2, dst
428 | GDIV MachRep Reg Reg Reg -- src1, src2, dst
429 | GSUB MachRep Reg Reg Reg -- src1, src2, dst
430 | GMUL MachRep Reg Reg Reg -- src1, src2, dst
432 -- FP compare. Cond must be `elem` [EQQ, NE, LE, LTT, GE, GTT]
433 -- Compare src1 with src2; set the Zero flag iff the numbers are
434 -- comparable and the comparison is True. Subsequent code must
435 -- test the %eflags zero flag regardless of the supplied Cond.
436 | GCMP Cond Reg Reg -- src1, src2
438 | GABS MachRep Reg Reg -- src, dst
439 | GNEG MachRep Reg Reg -- src, dst
440 | GSQRT MachRep Reg Reg -- src, dst
441 | GSIN MachRep Reg Reg -- src, dst
442 | GCOS MachRep Reg Reg -- src, dst
443 | GTAN MachRep Reg Reg -- src, dst
445 | GFREE -- do ffree on all x86 regs; an ugly hack
448 | TEST MachRep Operand Operand
449 | CMP MachRep Operand Operand
453 | PUSH MachRep Operand
454 | POP MachRep Operand
455 -- both unused (SDM):
461 | JXX Cond BlockId -- includes unconditional branches
462 | JMP_TBL Operand [BlockId] -- table jump
463 | CALL (Either Imm Reg)
466 | CLTD -- sign extend %eax into %edx:%eax
469 = OpReg Reg -- register
470 | OpImm Imm -- immediate value
471 | OpAddr AddrMode -- memory reference
-- Rewrite an instruction list through ffree_before_nonlocal_transfers,
-- but only when at least one instruction satisfies is_G_instr.  A list
-- with no such instruction is returned unchanged instead of falling off
-- the end of the guards.
i386_insert_ffrees :: [Instr] -> [Instr]
i386_insert_ffrees insns
   | any is_G_instr insns
   = concatMap ffree_before_nonlocal_transfers insns
   | otherwise
   = insns
-- Expand a single instruction into the instructions that replace it:
-- CALL and JMP get a GFREE placed in front of them; every other
-- instruction is kept exactly as it is, so nothing is dropped when the
-- results are concatenated.
ffree_before_nonlocal_transfers insn
   = case insn of
        CALL _ -> [GFREE, insn]
        JMP _  -> [GFREE, insn]
        _      -> [insn]
-- if you ever add a new FP insn to the fake x86 FP insn set,
-- you must update this too
--
-- Recognise the fake x86 FP instructions (the G* constructors of Instr).
-- Every other instruction answers False.  GFREE is deliberately a panic:
-- presumably it is never expected in a list being tested with this
-- predicate -- confirm against the callers.
is_G_instr :: Instr -> Bool
is_G_instr instr
   = case instr of
        GMOV _ _     -> True
        GLD _ _ _    -> True
        GST _ _ _    -> True
        GLDZ _       -> True
        GLD1 _       -> True
        GFTOI _ _    -> True
        GDTOI _ _    -> True
        GITOF _ _    -> True
        GITOD _ _    -> True
        GADD _ _ _ _ -> True
        GDIV _ _ _ _ -> True
        GSUB _ _ _ _ -> True
        GMUL _ _ _ _ -> True
        GCMP _ _ _   -> True
        GABS _ _ _   -> True
        GNEG _ _ _   -> True
        GSQRT _ _ _  -> True
        GSIN _ _ _   -> True
        GCOS _ _ _   -> True
        GTAN _ _ _   -> True
        GFREE        -> panic "is_G_instr: GFREE (!)"
        _            -> False
505 #endif /* i386_TARGET_ARCH */
508 -- -----------------------------------------------------------------------------
509 -- Sparc instructions
511 #if sparc_TARGET_ARCH
513 -- data Instr continues...
516 | LD MachRep AddrMode Reg -- size, src, dst
517 | ST MachRep Reg AddrMode -- size, src, dst
520 | ADD Bool Bool Reg RI Reg -- x?, cc?, src1, src2, dst
521 | SUB Bool Bool Reg RI Reg -- x?, cc?, src1, src2, dst
522 | UMUL Bool Reg RI Reg -- cc?, src1, src2, dst
523 | SMUL Bool Reg RI Reg -- cc?, src1, src2, dst
524 | RDY Reg -- move contents of Y register to reg
526 -- Simple bit-twiddling.
527 | AND Bool Reg RI Reg -- cc?, src1, src2, dst
528 | ANDN Bool Reg RI Reg -- cc?, src1, src2, dst
529 | OR Bool Reg RI Reg -- cc?, src1, src2, dst
530 | ORN Bool Reg RI Reg -- cc?, src1, src2, dst
531 | XOR Bool Reg RI Reg -- cc?, src1, src2, dst
532 | XNOR Bool Reg RI Reg -- cc?, src1, src2, dst
533 | SLL Reg RI Reg -- src1, src2, dst
534 | SRL Reg RI Reg -- src1, src2, dst
535 | SRA Reg RI Reg -- src1, src2, dst
536 | SETHI Imm Reg -- src, dst
537 | NOP -- Really SETHI 0, %g0, but worth an alias
541 -- Note that we cheat by treating F{ABS,MOV,NEG} of doubles as single
542 -- instructions right up until we spit them out.
543 | FABS MachRep Reg Reg -- src dst
544 | FADD MachRep Reg Reg Reg -- src1, src2, dst
545 | FCMP Bool MachRep Reg Reg -- exception?, size, src1, src2
546 | FDIV MachRep Reg Reg Reg -- src1, src2, dst
547 | FMOV MachRep Reg Reg -- src, dst
548 | FMUL MachRep Reg Reg Reg -- src1, src2, dst
549 | FNEG MachRep Reg Reg -- src, dst
550 | FSQRT MachRep Reg Reg -- src, dst
551 | FSUB MachRep Reg Reg Reg -- src1, src2, dst
552 | FxTOy MachRep MachRep Reg Reg -- src, dst
555 | BI Cond Bool Imm -- cond, annul?, target
556 | BF Cond Bool Imm -- cond, annul?, target
558 | JMP DestInfo AddrMode -- target
559 | CALL (Either Imm Reg) Int Bool -- target, args, terminal
-- Is this register-or-immediate operand one of the recognised zeros?
-- True for an immediate 0 (ImmInt or ImmInteger) and for RealReg 0;
-- every other operand answers False rather than failing the pattern
-- match.
riZero (RIImm (ImmInt 0))     = True
riZero (RIImm (ImmInteger 0)) = True
riZero (RIReg (RealReg 0))    = True
riZero _                      = False
-- Calculate the effective address which would be used by the
-- corresponding fpRel sequence.  fpRel itself is in MachRegs.lhs;
-- module dependencies keep the two definitions apart.
--
-- The word offset n is scaled by BYTES_PER_WORD and added to fp, with
-- the result going to dst.
fpRelEA :: Int -> Reg -> Instr
fpRelEA n dst
   = ADD False False fp (RIImm (ImmInt (n * BYTES_PER_WORD))) dst
-- Code to shift the stack pointer by n words: one ADD that adds
-- n * BYTES_PER_WORD to sp and writes the result back to sp.
moveSp :: Int -> Instr
moveSp n
   = ADD False False sp (RIImm (ImmInt (n * BYTES_PER_WORD))) sp
-- Given the register holding the first half of a double, produce the
-- register holding the second half.  Only an even-numbered RealReg
-- numbered 32 or above is accepted; any other register is a panic that
-- prints the offending register.
fPair reg = case reg of
    RealReg n
        | n >= 32 && n `mod` 2 == 0 -> RealReg (n+1)
    _ -> pprPanic "fPair(sparc NCG)" (ppr reg)
587 #endif /* sparc_TARGET_ARCH */
590 -- -----------------------------------------------------------------------------
591 -- PowerPC instructions
593 #ifdef powerpc_TARGET_ARCH
594 -- data Instr continues...
597 | LD MachRep Reg AddrMode -- Load size, dst, src
598 | LA MachRep Reg AddrMode -- Load arithmetic size, dst, src
599 | ST MachRep Reg AddrMode -- Store size, src, dst
600 | STU MachRep Reg AddrMode -- Store with Update size, src, dst
601 | LIS Reg Imm -- Load Immediate Shifted dst, src
602 | LI Reg Imm -- Load Immediate dst, src
603 | MR Reg Reg -- Move Register dst, src -- also for fmr
605 | CMP MachRep Reg RI --- size, src1, src2
606 | CMPL MachRep Reg RI --- size, src1, src2
609 | JMP CLabel -- same as branch,
610 -- but with CLabel instead of block ID
612 | BCTR [BlockId] -- with list of local destinations
613 | BL CLabel [Reg] -- with list of argument regs
616 | ADD Reg Reg RI -- dst, src1, src2
617 | ADDC Reg Reg Reg -- (carrying) dst, src1, src2
618 | ADDE Reg Reg Reg -- (extend) dst, src1, src2
619 | ADDIS Reg Reg Imm -- Add Immediate Shifted dst, src1, src2
620 | SUBF Reg Reg Reg -- dst, src1, src2 ; dst = src2 - src1
625 | MULLW_MayOflo Reg Reg Reg
626 -- dst = 1 if src1 * src2 overflows
627 -- pseudo-instruction; pretty-printed as:
628 -- mullwo. dst, src1, src2
630 -- rlwinm dst, dst, 2, 31,31
632 | AND Reg Reg RI -- dst, src1, src2
633 | OR Reg Reg RI -- dst, src1, src2
634 | XOR Reg Reg RI -- dst, src1, src2
635 | XORIS Reg Reg Imm -- XOR Immediate Shifted dst, src1, src2
637 | EXTS MachRep Reg Reg
642 | SLW Reg Reg RI -- shift left word
643 | SRW Reg Reg RI -- shift right word
644 | SRAW Reg Reg RI -- shift right arithmetic word
646 -- Rotate Left Word Immediate then AND with Mask
647 | RLWINM Reg Reg Int Int Int
649 | FADD MachRep Reg Reg Reg
650 | FSUB MachRep Reg Reg Reg
651 | FMUL MachRep Reg Reg Reg
652 | FDIV MachRep Reg Reg Reg
653 | FNEG Reg Reg -- negate is the same for single and double prec.
657 | FCTIWZ Reg Reg -- convert to integer word
658 | FRSP Reg Reg -- reduce to single precision
659 -- (but destination is a FP register)
661 | CRNOR Int Int Int -- condition register nor
662 | MFCR Reg -- move from condition register
664 | MFLR Reg -- move from link register
665 | FETCHPC Reg -- pseudo-instruction:
666 -- bcl to next insn, mflr reg
-- True exactly for the unsigned comparison conditions GU, LU, GEU and
-- LEU; False for every other condition.
condUnsigned cond = case cond of
    GU  -> True
    LU  -> True
    GEU -> True
    LEU -> True
    _   -> False
-- Replace an unsigned comparison condition by its signed counterpart
-- (GU -> GTT, LU -> LTT, GEU -> GE, LEU -> LE).  Any other condition is
-- returned unchanged instead of failing the pattern match.
condToSigned GU    = GTT
condToSigned LU    = LTT
condToSigned GEU   = GE
condToSigned LEU   = LE
condToSigned other = other
679 #endif /* powerpc_TARGET_ARCH */
682 -- -----------------------------------------------------------------------------
685 -- ToDo: might not be needed anymore --SDM
687 -- used by insnFuture in RegAllocInfo.lhs
689 = NoDestInfo -- no supplied dests; infer from context
690 | DestInfo [CLabel] -- precisely these dests and no others
-- Does this DestInfo carry an explicit destination list?  False for
-- NoDestInfo, True for any DestInfo regardless of its contents.
hasDestInfo info = case info of
    NoDestInfo -> False
    DestInfo _ -> True
-- Pretty-print a DestInfo: the literal text "NoDestInfo" when no
-- destinations were supplied, otherwise the labels printed with
-- pprCLabel, joined by hsep and wrapped in brackets.
pprDests :: DestInfo -> SDoc
pprDests info = case info of
    NoDestInfo      -> text "NoDestInfo"
    DestInfo labels -> brackets (hsep (map pprCLabel labels))