1 -----------------------------------------------------------------------------
3 -- Machine-dependent assembly language
5 -- (c) The University of Glasgow 1993-2004
7 -----------------------------------------------------------------------------
9 #include "nativeGen/NCG.h"
12 -- * Cmm instantiations
13 NatCmm, NatCmmTop, NatBasicBlock,
15 -- * Machine instructions
17 Cond(..), condUnsigned, condToSigned, condToUnsigned,
19 #if !powerpc_TARGET_ARCH && !i386_TARGET_ARCH && !x86_64_TARGET_ARCH
20 Size(..), machRepSize,
24 #if i386_TARGET_ARCH || x86_64_TARGET_ARCH
31 riZero, fpRelEA, moveSp, fPair,
35 #include "HsVersions.h"
39 import MachOp ( MachRep(..) )
40 import CLabel ( CLabel, pprCLabel )
41 import Panic ( panic )
48 -- -----------------------------------------------------------------------------
49 -- Our flavours of the Cmm types
51 -- Type synonyms for Cmm populated with native code
-- Each synonym fixes the type parameters of the corresponding generic Cmm
-- type: CmmStatic and Instr for GenCmm and GenCmmTop, Instr alone for
-- GenBasicBlock.
52 type NatCmm = GenCmm CmmStatic Instr
53 type NatCmmTop = GenCmmTop CmmStatic Instr
54 type NatBasicBlock = GenBasicBlock Instr
56 -- -----------------------------------------------------------------------------
57 -- Conditions on this architecture
61 = ALWAYS -- For BI (same as BR)
62 | EQQ -- For CMP and BI (NB: "EQ" is a 1.3 Prelude name)
64 | GTT -- For BI only (NB: "GT" is a 1.3 Prelude name)
65 | LE -- For CMP and BI
66 | LTT -- For CMP and BI (NB: "LT" is a 1.3 Prelude name)
68 | NEVER -- For BI (null instruction)
72 #if i386_TARGET_ARCH || x86_64_TARGET_ARCH
73 = ALWAYS -- What's really used? ToDo
92 = ALWAYS -- What's really used? ToDo
109 #if powerpc_TARGET_ARCH
122 deriving Eq -- to make an assertion work
-- Is this condition one of the unsigned comparisons?  True exactly for
-- GU, LU, GEU and LEU; the final catch-all clause answers False for every
-- other condition.
124 condUnsigned GU = True
125 condUnsigned LU = True
126 condUnsigned GEU = True
127 condUnsigned LEU = True
128 condUnsigned _ = False
-- Replace an unsigned condition by a signed one:
-- GU -> GTT, LU -> LTT, GEU -> GE, LEU -> LE.
130 condToSigned GU = GTT
131 condToSigned LU = LTT
132 condToSigned GEU = GE
133 condToSigned LEU = LE
-- Replace a signed condition by an unsigned one (the reverse of the
-- condToSigned clauses): GTT -> GU, LTT -> LU, GE -> GEU, LE -> LEU.
136 condToUnsigned GTT = GU
137 condToUnsigned LTT = LU
138 condToUnsigned GE = GEU
139 condToUnsigned LE = LEU
142 -- -----------------------------------------------------------------------------
143 -- Sizes on this architecture
145 -- ToDo: it's not clear to me that we need separate signed-vs-unsigned sizes
146 -- here. I've removed them from the x86 version, we'll see what happens --SDM
148 #if !powerpc_TARGET_ARCH && !i386_TARGET_ARCH && !x86_64_TARGET_ARCH
150 #if alpha_TARGET_ARCH
153 -- | W -- word (2 bytes): UNUSED
155 | L -- longword (4 bytes)
156 | Q -- quadword (8 bytes)
157 -- | FF -- VAX F-style floating pt: UNUSED
158 -- | GF -- VAX G-style floating pt: UNUSED
159 -- | DF -- VAX D-style floating pt: UNUSED
160 -- | SF -- IEEE single-precision floating pt: UNUSED
161 | TF -- IEEE double-precision floating pt
163 #if sparc_TARGET_ARCH || powerpc_TARGET_ARCH
165 | Bu -- byte (unsigned)
166 | H -- halfword (signed, 2 bytes)
167 | Hu -- halfword (unsigned, 2 bytes)
168 | W -- word (4 bytes)
169 | F -- IEEE single-precision floating pt
170 | DF -- IEEE double-precision floating pt
-- Choose the Size used for a given MachRep.  The right-hand sides are built
-- from the CPP macros IF_ARCH_alpha / IF_ARCH_sparc (presumably supplied by
-- nativeGen/NCG.h -- confirm), which select the per-architecture result.
--   * I64 and I128 are not supported here: both clauses panic.
--   * On alpha, F32 and F64 both map to TF, and I16 maps to `err`, a name
--     not defined in the visible part of this file.
174 machRepSize :: MachRep -> Size
175 machRepSize I8 = IF_ARCH_alpha(Bu, IF_ARCH_sparc(Bu, ))
176 machRepSize I16 = IF_ARCH_alpha(err,IF_ARCH_sparc(Hu, ))
177 machRepSize I32 = IF_ARCH_alpha(L, IF_ARCH_sparc(W, ))
178 machRepSize I64 = panic "machRepSize: I64"
179 machRepSize I128 = panic "machRepSize: I128"
180 machRepSize F32 = IF_ARCH_alpha(TF, IF_ARCH_sparc(F, ))
181 machRepSize F64 = IF_ARCH_alpha(TF, IF_ARCH_sparc(DF,))
184 -- -----------------------------------------------------------------------------
185 -- Register or immediate (a handy type on some platforms)
191 -- -----------------------------------------------------------------------------
192 -- Machine's assembly language
194 -- We have a few common "instructions" (nearly all the pseudo-ops) but
195 -- mostly all of 'Instr' is machine-specific.
198 = COMMENT FastString -- comment pseudo-op
200 | LDATA Section [CmmStatic] -- some static data spat out during code
201 -- generation. Will be extracted before
204 | NEWBLOCK BlockId -- start a new basic block. Useful during
205 -- codegen, removed later. Preceding
206 -- instruction should be a jump, as per the
207 -- invariants for a BasicBlock (see Cmm).
209 | DELTA Int -- specify current stack offset for
210 -- benefit of subsequent passes
212 -- -----------------------------------------------------------------------------
213 -- Alpha instructions
215 #if alpha_TARGET_ARCH
217 -- data Instr continues...
220 | LD Size Reg AddrMode -- size, dst, src
221 | LDA Reg AddrMode -- dst, src
222 | LDAH Reg AddrMode -- dst, src
223 | LDGP Reg AddrMode -- dst, src
224 | LDI Size Reg Imm -- size, dst, src
225 | ST Size Reg AddrMode -- size, src, dst
229 | ABS Size RI Reg -- size, src, dst
230 | NEG Size Bool RI Reg -- size, overflow, src, dst
231 | ADD Size Bool Reg RI Reg -- size, overflow, src, src, dst
232 | SADD Size Size Reg RI Reg -- size, scale, src, src, dst
233 | SUB Size Bool Reg RI Reg -- size, overflow, src, src, dst
234 | SSUB Size Size Reg RI Reg -- size, scale, src, src, dst
235 | MUL Size Bool Reg RI Reg -- size, overflow, src, src, dst
236 | DIV Size Bool Reg RI Reg -- size, unsigned, src, src, dst
237 | REM Size Bool Reg RI Reg -- size, unsigned, src, src, dst
239 -- Simple bit-twiddling.
257 | CMP Cond Reg RI Reg
263 | FADD Size Reg Reg Reg
264 | FDIV Size Reg Reg Reg
265 | FMUL Size Reg Reg Reg
266 | FSUB Size Reg Reg Reg
267 | CVTxy Size Size Reg Reg
268 | FCMP Size Cond Reg Reg Reg
275 | JMP Reg AddrMode Int
277 | JSR Reg AddrMode Int
279 -- Alpha-specific pseudo-ops.
287 #endif /* alpha_TARGET_ARCH */
290 -- -----------------------------------------------------------------------------
291 -- Intel x86 instructions
294 Intel, in their infinite wisdom, selected a stack model for floating
295 point registers on x86. That might have made sense back in 1979 --
296 nowadays we can see it for the nonsense it really is. A stack model
297 fits poorly with the existing nativeGen infrastructure, which assumes
298 flat integer and FP register sets. Prior to this commit, nativeGen
299 could not generate correct x86 FP code -- to do so would have meant
300 somehow working the register-stack paradigm into the register
301 allocator and spiller, which sounds very difficult.
303 We have decided to cheat, and go for a simple fix which requires no
304 infrastructure modifications, at the expense of generating ropey but
305 correct FP code. All notions of the x86 FP stack and its insns have
306 been removed. Instead, we pretend (to the instruction selector and
307 register allocator) that x86 has six floating point registers, %fake0
308 .. %fake5, which can be used in the usual flat manner. We further
309 claim that x86 has floating point instructions very similar to SPARC
310 and Alpha, that is, a simple 3-operand register-register arrangement.
311 Code generation and register allocation proceed on this basis.
313 When we come to print out the final assembly, our convenient fiction
314 is converted to dismal reality. Each fake instruction is
315 independently converted to a series of real x86 instructions.
316 %fake0 .. %fake5 are mapped to %st(0) .. %st(5). To do reg-reg
317 arithmetic operations, the two operands are pushed onto the top of the
318 FP stack, the operation done, and the result copied back into the
319 relevant register. There are only six %fake registers because 2 are
320 needed for the translation, and x86 has 8 in total.
322 The translation is inefficient but is simple and it works. A cleverer
323 translation would handle a sequence of insns, simulating the FP stack
324 contents, would not impose a fixed mapping from %fake to %st regs, and
325 hopefully could avoid most of the redundant reg-reg moves of the
328 We might as well make use of whatever unique FP facilities Intel have
329 chosen to bless us with (let's not be churlish, after all).
330 Hence GLDZ and GLD1. Bwahahahahahahaha!
334 MORE FLOATING POINT MUSINGS...
336 Intel's internal floating point registers are by default 80 bit
337 extended precision. This means that all operations done on values in
338 registers are done at 80 bits, and unless the intermediate values are
339 truncated to the appropriate size (32 or 64 bits) by storing in
340 memory, calculations in registers will give different results from
341 calculations which pass intermediate values in memory (eg. via
344 One solution is to set the FPU into 64 bit precision mode. Some OSs
345 do this (eg. FreeBSD) and some don't (eg. Linux). The problem here is
346 that this will only affect 64-bit precision arithmetic; 32-bit
347 calculations will still be done at 64-bit precision in registers. So
348 it doesn't solve the whole problem.
350 There's also the issue of what the C library is expecting in terms of
351 precision. It seems to be the case that glibc on Linux expects the
352 FPU to be set to 80 bit precision, so setting it to 64 bit could have
353 unexpected effects. Changing the default could have undesirable
354 effects on other 3rd-party library code too, so the right thing would
355 be to save/restore the FPU control word across Haskell code if we were
358 gcc's -ffloat-store gives consistent results by always storing the
359 results of floating-point calculations in memory, which works for both
360 32 and 64-bit precision. However, it only affects the values of
361 user-declared floating point variables in C, not intermediate results.
362 GHC in -fvia-C mode uses -ffloat-store (see the -fexcess-precision
365 Another problem is how to spill floating point registers in the
366 register allocator. Should we spill the whole 80 bits, or just 64?
367 On an OS which is set to 64 bit precision, spilling 64 is fine. On
368 Linux, spilling 64 bits will round the results of some operations.
369 This is what gcc does. Spilling at 80 bits requires taking up a full
370 128 bit slot (so we get alignment). We spill at 80-bits and ignore
371 the alignment problems.
373 In the future, we'll use the SSE registers for floating point. This
374 requires a CPU that supports SSE2 (ordinary SSE only supports 32 bit
375 precision float ops), which means P4 or Xeon and above. Using SSE
376 will solve all these problems, because the SSE registers use fixed 32
377 bit or 64 bit precision.
382 #if i386_TARGET_ARCH || x86_64_TARGET_ARCH
384 -- data Instr continues...
387 | MOV MachRep Operand Operand
388 | MOVZxL MachRep Operand Operand -- size is the size of operand 1
389 | MOVSxL MachRep Operand Operand -- size is the size of operand 1
390 -- x86_64 note: plain mov into a 32-bit register always zero-extends
391 -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which
392 -- don't affect the high bits of the register.
394 -- Load effective address (also a very useful three-operand add instruction :-)
395 | LEA MachRep Operand Operand
398 | ADD MachRep Operand Operand
399 | ADC MachRep Operand Operand
400 | SUB MachRep Operand Operand
402 | MUL MachRep Operand Operand
403 | IMUL MachRep Operand Operand -- signed int mul
404 | IMUL2 MachRep Operand -- %edx:%eax = operand * %eax
406 | DIV MachRep Operand -- eax := eax:edx/op, edx := eax:edx%op
407 | IDIV MachRep Operand -- ditto, but signed
409 -- Simple bit-twiddling.
410 | AND MachRep Operand Operand
411 | OR MachRep Operand Operand
412 | XOR MachRep Operand Operand
413 | NOT MachRep Operand
414 | NEGI MachRep Operand -- NEG instruction (name clash with Cond)
416 -- Shifts (amount may be immediate or %cl only)
417 | SHL MachRep Operand{-amount-} Operand
418 | SAR MachRep Operand{-amount-} Operand
419 | SHR MachRep Operand{-amount-} Operand
421 | BT MachRep Imm Operand
427 -- Note that we cheat by treating G{ABS,MOV,NEG} of doubles
428 -- as single instructions right up until we spit them out.
429 -- all the 3-operand fake fp insns are src1 src2 dst
430 -- and furthermore are constrained to be fp regs only.
431 -- IMPORTANT: keep is_G_insn up to date with any changes here
432 | GMOV Reg Reg -- src(fpreg), dst(fpreg)
433 | GLD MachRep AddrMode Reg -- src, dst(fpreg)
434 | GST MachRep Reg AddrMode -- src(fpreg), dst
436 | GLDZ Reg -- dst(fpreg)
437 | GLD1 Reg -- dst(fpreg)
439 | GFTOI Reg Reg -- src(fpreg), dst(intreg)
440 | GDTOI Reg Reg -- src(fpreg), dst(intreg)
442 | GITOF Reg Reg -- src(intreg), dst(fpreg)
443 | GITOD Reg Reg -- src(intreg), dst(fpreg)
445 | GADD MachRep Reg Reg Reg -- src1, src2, dst
446 | GDIV MachRep Reg Reg Reg -- src1, src2, dst
447 | GSUB MachRep Reg Reg Reg -- src1, src2, dst
448 | GMUL MachRep Reg Reg Reg -- src1, src2, dst
450 -- FP compare. Cond must be `elem` [EQQ, NE, LE, LTT, GE, GTT]
451 -- Compare src1 with src2; set the Zero flag iff the numbers are
452 -- comparable and the comparison is True. Subsequent code must
453 -- test the %eflags zero flag regardless of the supplied Cond.
454 | GCMP Cond Reg Reg -- src1, src2
456 | GABS MachRep Reg Reg -- src, dst
457 | GNEG MachRep Reg Reg -- src, dst
458 | GSQRT MachRep Reg Reg -- src, dst
459 | GSIN MachRep Reg Reg -- src, dst
460 | GCOS MachRep Reg Reg -- src, dst
461 | GTAN MachRep Reg Reg -- src, dst
463 | GFREE -- do ffree on all x86 regs; an ugly hack
466 #if x86_64_TARGET_ARCH
467 -- SSE2 floating point: we use a restricted set of the available SSE2
468 -- instructions for floating-point.
470 -- use MOV for moving (either movss or movsd (movlpd better?))
472 | CVTSS2SD Reg Reg -- F32 to F64
473 | CVTSD2SS Reg Reg -- F64 to F32
474 | CVTSS2SI Operand Reg -- F32 to I32/I64 (with rounding)
475 | CVTSD2SI Operand Reg -- F64 to I32/I64 (with rounding)
476 | CVTSI2SS Operand Reg -- I32/I64 to F32
477 | CVTSI2SD Operand Reg -- I32/I64 to F64
479 -- use ADD & SUB for arithmetic. In both cases, operands
482 -- SSE2 floating-point division:
483 | FDIV MachRep Operand Operand -- divisor, dividend(dst)
485 -- use CMP for comparisons. ucomiss and ucomisd instructions
486 -- compare single/double prec floating point respectively.
488 | SQRT MachRep Operand Reg -- src, dst
492 | TEST MachRep Operand Operand
493 | CMP MachRep Operand Operand
497 | PUSH MachRep Operand
498 | POP MachRep Operand
499 -- both unused (SDM):
505 | JXX Cond BlockId -- includes unconditional branches
506 | JMP_TBL Operand [BlockId] -- table jump
507 | CALL (Either Imm Reg)
510 | CLTD MachRep -- sign extend %eax into %edx:%eax
512 | FETCHGOT Reg -- pseudo-insn for position-independent code
516 -- addl __GLOBAL_OFFSET_TABLE__+.-1b, %reg
519 = OpReg Reg -- register
520 | OpImm Imm -- immediate value
521 | OpAddr AddrMode -- memory reference
523 #endif /* i386 or x86_64 */
-- When the instruction list contains at least one fake-FP instruction (as
-- judged by is_G_instr), rewrite it so that every CALL and every JMP is
-- preceded by a GFREE pseudo-instruction.
526 i386_insert_ffrees :: [Instr] -> [Instr]
527 i386_insert_ffrees insns
528 | any is_G_instr insns
529 = concatMap ffree_before_nonlocal_transfers insns
533 ffree_before_nonlocal_transfers insn
535 CALL _ -> [GFREE, insn]
536 JMP _ -> [GFREE, insn]
540 -- if you ever add a new FP insn to the fake x86 FP insn set,
541 -- you must update this too
-- Recognise the fake x86 FP ("G") instructions: each constructor listed
-- below answers True.  GFREE is not classified -- reaching the GFREE case
-- panics.
542 is_G_instr :: Instr -> Bool
545 GMOV _ _ -> True; GLD _ _ _ -> True; GST _ _ _ -> True
546 GLDZ _ -> True; GLD1 _ -> True
547 GFTOI _ _ -> True; GDTOI _ _ -> True
548 GITOF _ _ -> True; GITOD _ _ -> True
549 GADD _ _ _ _ -> True; GDIV _ _ _ _ -> True
550 GSUB _ _ _ _ -> True; GMUL _ _ _ _ -> True
551 GCMP _ _ _ -> True; GABS _ _ _ -> True
552 GNEG _ _ _ -> True; GSQRT _ _ _ -> True
553 GSIN _ _ _ -> True; GCOS _ _ _ -> True; GTAN _ _ _ -> True
554 GFREE -> panic "is_G_instr: GFREE (!)"
556 #endif /* i386_TARGET_ARCH */
559 -- -----------------------------------------------------------------------------
560 -- Sparc instructions
562 #if sparc_TARGET_ARCH
564 -- data Instr continues...
567 | LD MachRep AddrMode Reg -- size, src, dst
568 | ST MachRep Reg AddrMode -- size, src, dst
571 | ADD Bool Bool Reg RI Reg -- x?, cc?, src1, src2, dst
572 | SUB Bool Bool Reg RI Reg -- x?, cc?, src1, src2, dst
573 | UMUL Bool Reg RI Reg -- cc?, src1, src2, dst
574 | SMUL Bool Reg RI Reg -- cc?, src1, src2, dst
575 | RDY Reg -- move contents of Y register to reg
577 -- Simple bit-twiddling.
578 | AND Bool Reg RI Reg -- cc?, src1, src2, dst
579 | ANDN Bool Reg RI Reg -- cc?, src1, src2, dst
580 | OR Bool Reg RI Reg -- cc?, src1, src2, dst
581 | ORN Bool Reg RI Reg -- cc?, src1, src2, dst
582 | XOR Bool Reg RI Reg -- cc?, src1, src2, dst
583 | XNOR Bool Reg RI Reg -- cc?, src1, src2, dst
584 | SLL Reg RI Reg -- src1, src2, dst
585 | SRL Reg RI Reg -- src1, src2, dst
586 | SRA Reg RI Reg -- src1, src2, dst
587 | SETHI Imm Reg -- src, dst
588 | NOP -- Really SETHI 0, %g0, but worth an alias
592 -- Note that we cheat by treating F{ABS,MOV,NEG} of doubles as single
593 -- instructions right up until we spit them out.
594 | FABS MachRep Reg Reg -- src dst
595 | FADD MachRep Reg Reg Reg -- src1, src2, dst
596 | FCMP Bool MachRep Reg Reg -- exception?, size, src1, src2
597 | FDIV MachRep Reg Reg Reg -- src1, src2, dst
598 | FMOV MachRep Reg Reg -- src, dst
599 | FMUL MachRep Reg Reg Reg -- src1, src2, dst
600 | FNEG MachRep Reg Reg -- src, dst
601 | FSQRT MachRep Reg Reg -- src, dst
602 | FSUB MachRep Reg Reg Reg -- src1, src2, dst
603 | FxTOy MachRep MachRep Reg Reg -- src, dst
606 | BI Cond Bool Imm -- cond, annul?, target
607 | BF Cond Bool Imm -- cond, annul?, target
609 | JMP DestInfo AddrMode -- target
610 | CALL (Either Imm Reg) Int Bool -- target, args, terminal
-- Is this register-or-immediate operand a zero?  True for the immediate 0
-- (given either as ImmInt or as ImmInteger) and for RealReg 0.
617 riZero (RIImm (ImmInt 0)) = True
618 riZero (RIImm (ImmInteger 0)) = True
619 riZero (RIReg (RealReg 0)) = True
622 -- Calculate the effective address which would be used by the
623 -- corresponding fpRel sequence. fpRel is in MachRegs.lhs,
624 -- alas -- can't have fpRelEA here because of module dependencies.
-- The result is a single ADD (both Bool flags False) of fp and the
-- immediate n * BYTES_PER_WORD, written to dst; n is therefore an offset
-- counted in words, not bytes.
625 fpRelEA :: Int -> Reg -> Instr
627 = ADD False False fp (RIImm (ImmInt (n * BYTES_PER_WORD))) dst
629 -- Code to shift the stack pointer by n words.
-- The result is a single ADD (both Bool flags False) that adds
-- n * BYTES_PER_WORD to sp and stores the sum back in sp.
630 moveSp :: Int -> Instr
632 = ADD False False sp (RIImm (ImmInt (n * BYTES_PER_WORD))) sp
634 -- Produce the second-half-of-a-double register given the first half.
-- For RealReg n with n >= 32 and n even, the partner register is
-- RealReg (n+1).  Any other argument is rejected with pprPanic.
636 fPair (RealReg n) | n >= 32 && n `mod` 2 == 0 = RealReg (n+1)
637 fPair other = pprPanic "fPair(sparc NCG)" (ppr other)
638 #endif /* sparc_TARGET_ARCH */
641 -- -----------------------------------------------------------------------------
642 -- PowerPC instructions
644 #ifdef powerpc_TARGET_ARCH
645 -- data Instr continues...
648 | LD MachRep Reg AddrMode -- Load size, dst, src
649 | LA MachRep Reg AddrMode -- Load arithmetic size, dst, src
650 | ST MachRep Reg AddrMode -- Store size, src, dst
651 | STU MachRep Reg AddrMode -- Store with Update size, src, dst
652 | LIS Reg Imm -- Load Immediate Shifted dst, src
653 | LI Reg Imm -- Load Immediate dst, src
654 | MR Reg Reg -- Move Register dst, src -- also for fmr
656 | CMP MachRep Reg RI --- size, src1, src2
657 | CMPL MachRep Reg RI --- size, src1, src2
660 | JMP CLabel -- same as branch,
661 -- but with CLabel instead of block ID
663 | BCTR [BlockId] -- with list of local destinations
664 | BL CLabel [Reg] -- with list of argument regs
667 | ADD Reg Reg RI -- dst, src1, src2
668 | ADDC Reg Reg Reg -- (carrying) dst, src1, src2
669 | ADDE Reg Reg Reg -- (extend) dst, src1, src2
670 | ADDIS Reg Reg Imm -- Add Immediate Shifted dst, src1, src2
671 | SUBF Reg Reg Reg -- dst, src1, src2 ; dst = src2 - src1
676 | MULLW_MayOflo Reg Reg Reg
677 -- dst = 1 if src1 * src2 overflows
678 -- pseudo-instruction; pretty-printed as:
679 -- mullwo. dst, src1, src2
681 -- rlwinm dst, dst, 2, 31,31
683 | AND Reg Reg RI -- dst, src1, src2
684 | OR Reg Reg RI -- dst, src1, src2
685 | XOR Reg Reg RI -- dst, src1, src2
686 | XORIS Reg Reg Imm -- XOR Immediate Shifted dst, src1, src2
688 | EXTS MachRep Reg Reg
693 | SLW Reg Reg RI -- shift left word
694 | SRW Reg Reg RI -- shift right word
695 | SRAW Reg Reg RI -- shift right arithmetic word
697 -- Rotate Left Word Immediate then AND with Mask
698 | RLWINM Reg Reg Int Int Int
700 | FADD MachRep Reg Reg Reg
701 | FSUB MachRep Reg Reg Reg
702 | FMUL MachRep Reg Reg Reg
703 | FDIV MachRep Reg Reg Reg
704 | FNEG Reg Reg -- negate is the same for single and double prec.
708 | FCTIWZ Reg Reg -- convert to integer word
709 | FRSP Reg Reg -- reduce to single precision
710 -- (but destination is a FP register)
712 | CRNOR Int Int Int -- condition register nor
713 | MFCR Reg -- move from condition register
715 | MFLR Reg -- move from link register
716 | FETCHPC Reg -- pseudo-instruction:
717 -- bcl to next insn, mflr reg
719 #endif /* powerpc_TARGET_ARCH */