compiler/nativeGen/X86/Instr.hs

   1 -----------------------------------------------------------------------------
   2 --
   3 -- Machine-dependent assembly language
   4 --
   5 -- (c) The University of Glasgow 1993-2004
   6 --
   7 -----------------------------------------------------------------------------
   8
   9 #include "HsVersions.h"
  10 #include "nativeGen/NCG.h"
  11
  12 module X86.Instr
  13 where
  14
  15 import BlockId
  16 import MachRegs
  17 import Cmm
  18 import FastString
  19
  20 data Cond
  21         = ALWAYS        -- What's really used? ToDo
  22         | EQQ
  23         | GE
  24         | GEU
  25         | GTT
  26         | GU
  27         | LE
  28         | LEU
  29         | LTT
  30         | LU
  31         | NE
  32         | NEG
  33         | POS
  34         | CARRY
  35         | OFLO
  36         | PARITY
  37         | NOTPARITY
  38
  39
  40
  41 -- -----------------------------------------------------------------------------
  42 -- Intel x86 instructions
  43
  44 {-
  45 Intel, in their infinite wisdom, selected a stack model for floating
  46 point registers on x86.  That might have made sense back in 1979 --
  47 nowadays we can see it for the nonsense it really is.  A stack model
  48 fits poorly with the existing nativeGen infrastructure, which assumes
  49 flat integer and FP register sets.  Prior to this commit, nativeGen
  50 could not generate correct x86 FP code -- to do so would have meant
  51 somehow working the register-stack paradigm into the register
  52 allocator and spiller, which sounds very difficult.
  53
  54 We have decided to cheat, and go for a simple fix which requires no
  55 infrastructure modifications, at the expense of generating ropey but
  56 correct FP code.  All notions of the x86 FP stack and its insns have
  57 been removed.  Instead, we pretend (to the instruction selector and
  58 register allocator) that x86 has six floating point registers, %fake0
  59 .. %fake5, which can be used in the usual flat manner.  We further
  60 claim that x86 has floating point instructions very similar to SPARC
  61 and Alpha, that is, a simple 3-operand register-register arrangement.
  62 Code generation and register allocation proceed on this basis.
  63
  64 When we come to print out the final assembly, our convenient fiction
  65 is converted to dismal reality.  Each fake instruction is
  66 independently converted to a series of real x86 instructions.
  67 %fake0 .. %fake5 are mapped to %st(0) .. %st(5).  To do reg-reg
  68 arithmetic operations, the two operands are pushed onto the top of the
  69 FP stack, the operation done, and the result copied back into the
  70 relevant register.  There are only six %fake registers because 2 are
  71 needed for the translation, and x86 has 8 in total.
  72
  73 The translation is inefficient but is simple and it works.  A cleverer
  74 translation would handle a sequence of insns, simulating the FP stack
  75 contents, would not impose a fixed mapping from %fake to %st regs, and
  76 hopefully could avoid most of the redundant reg-reg moves of the
  77 current translation.
  78
  79 We might as well make use of whatever unique FP facilities Intel have
  80 chosen to bless us with (let's not be churlish, after all).
  81 Hence GLDZ and GLD1.  Bwahahahahahahaha!
  82 -}
  83
  84 {-
  85 MORE FLOATING POINT MUSINGS...
  86
  87 Intel's internal floating point registers are by default 80 bit
  88 extended precision.  This means that all operations done on values in
  89 registers are done at 80 bits, and unless the intermediate values are
  90 truncated to the appropriate size (32 or 64 bits) by storing in
  91 memory, calculations in registers will give different results from
  92 calculations which pass intermediate values in memory (eg. via
  93 function calls).
  94
  95 One solution is to set the FPU into 64 bit precision mode.  Some OSs
  96 do this (eg. FreeBSD) and some don't (eg. Linux).  The problem here is
  97 that this will only affect 64-bit precision arithmetic; 32-bit
  98 calculations will still be done at 64-bit precision in registers.  So
  99 it doesn't solve the whole problem.
 100
 101 There's also the issue of what the C library is expecting in terms of
 102 precision.  It seems to be the case that glibc on Linux expects the
 103 FPU to be set to 80 bit precision, so setting it to 64 bit could have
 104 unexpected effects.  Changing the default could have undesirable
 105 effects on other 3rd-party library code too, so the right thing would
 106 be to save/restore the FPU control word across Haskell code if we were
 107 to do this.
 108
 109 gcc's -ffloat-store gives consistent results by always storing the
 110 results of floating-point calculations in memory, which works for both
 111 32 and 64-bit precision.  However, it only affects the values of
 112 user-declared floating point variables in C, not intermediate results.
 113 GHC in -fvia-C mode uses -ffloat-store (see the -fexcess-precision
 114 flag).
 115
 116 Another problem is how to spill floating point registers in the
 117 register allocator.  Should we spill the whole 80 bits, or just 64?
 118 On an OS which is set to 64 bit precision, spilling 64 is fine.  On
 119 Linux, spilling 64 bits will round the results of some operations.
 120 This is what gcc does.  Spilling at 80 bits requires taking up a full
 121 128 bit slot (so we get alignment).  We spill at 80-bits and ignore
 122 the alignment problems.
 123
 124 In the future, we'll use the SSE registers for floating point.  This
 125 requires a CPU that supports SSE2 (ordinary SSE only supports 32 bit
 126 precision float ops), which means P4 or Xeon and above.  Using SSE
 127 will solve all these problems, because the SSE registers use fixed 32
 128 bit or 64 bit precision.
 129
 130 --SDM 1/2003
 131 -}
 132
 133
 134 data Instr
 135         -- comment pseudo-op
 136         = COMMENT FastString
 137
 138         -- some static data spat out during code
 139         -- generation.  Will be extracted before
 140         -- pretty-printing.
 141         | LDATA   Section [CmmStatic]
 142
 143         -- start a new basic block.  Useful during
 144         -- codegen, removed later.  Preceding
 145         -- instruction should be a jump, as per the
 146         -- invariants for a BasicBlock (see Cmm).
 147         | NEWBLOCK BlockId
 148
 149         -- specify current stack offset for
 150         -- benefit of subsequent passes
 151         | DELTA   Int
 152
 153         -- | spill this reg to a stack slot
 154         | SPILL   Reg Int
 155
 156         -- | reload this reg from a stack slot
 157         | RELOAD  Int Reg
 158
 159
 160         -- Moves.
 161         | MOV         Size Operand Operand
 162         | MOVZxL      Size Operand Operand -- size is the size of operand 1
 163         | MOVSxL      Size Operand Operand -- size is the size of operand 1
 164         -- x86_64 note: plain mov into a 32-bit register always zero-extends
 165         -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which
 166         -- don't affect the high bits of the register.
 167
 168         -- Load effective address (also a very useful three-operand add instruction :-)
 169         | LEA         Size Operand Operand
 170
 171         -- Int Arithmetic.
 172         | ADD         Size Operand Operand
 173         | ADC         Size Operand Operand
 174         | SUB         Size Operand Operand
 175
 176         | MUL         Size Operand Operand
 177         | IMUL        Size Operand Operand      -- signed int mul
 178         | IMUL2       Size Operand              -- %edx:%eax = operand * %eax
 179
 180         | DIV         Size Operand              -- eax := eax:edx/op, edx := eax:edx%op
 181         | IDIV        Size Operand              -- ditto, but signed
 182
 183         -- Simple bit-twiddling.
 184         | AND         Size Operand Operand
 185         | OR          Size Operand Operand
 186         | XOR         Size Operand Operand
 187         | NOT         Size Operand
 188         | NEGI        Size Operand              -- NEG instruction (name clash with Cond)
 189
 190         -- Shifts (amount may be immediate or %cl only)
 191         | SHL         Size Operand{-amount-} Operand
 192         | SAR         Size Operand{-amount-} Operand
 193         | SHR         Size Operand{-amount-} Operand
 194
 195         | BT          Size Imm Operand
 196         | NOP
 197
 198 #if i386_TARGET_ARCH
 199         -- Float Arithmetic.
 200
 201         -- Note that we cheat by treating G{ABS,MOV,NEG} of doubles
 202         -- as single instructions right up until we spit them out.
 203         -- all the 3-operand fake fp insns are src1 src2 dst
 204         -- and furthermore are constrained to be fp regs only.
 205         -- IMPORTANT: keep is_G_insn up to date with any changes here
 206         | GMOV        Reg Reg -- src(fpreg), dst(fpreg)
 207         | GLD         Size AddrMode Reg -- src, dst(fpreg)
 208         | GST         Size Reg AddrMode -- src(fpreg), dst
 209
 210         | GLDZ        Reg -- dst(fpreg)
 211         | GLD1        Reg -- dst(fpreg)
 212
 213         | GFTOI       Reg Reg -- src(fpreg), dst(intreg)
 214         | GDTOI       Reg Reg -- src(fpreg), dst(intreg)
 215
 216         | GITOF       Reg Reg -- src(intreg), dst(fpreg)
 217         | GITOD       Reg Reg -- src(intreg), dst(fpreg)
 218
 219         | GADD        Size Reg Reg Reg -- src1, src2, dst
 220         | GDIV        Size Reg Reg Reg -- src1, src2, dst
 221         | GSUB        Size Reg Reg Reg -- src1, src2, dst
 222         | GMUL        Size Reg Reg Reg -- src1, src2, dst
 223
 224                 -- FP compare.  Cond must be `elem` [EQQ, NE, LE, LTT, GE, GTT]
 225                 -- Compare src1 with src2; set the Zero flag iff the numbers are
 226                 -- comparable and the comparison is True.  Subsequent code must
 227                 -- test the %eflags zero flag regardless of the supplied Cond.
 228         | GCMP        Cond Reg Reg -- src1, src2
 229
 230         | GABS        Size Reg Reg -- src, dst
 231         | GNEG        Size Reg Reg -- src, dst
 232         | GSQRT       Size Reg Reg -- src, dst
 233         | GSIN        Size CLabel CLabel Reg Reg -- src, dst
 234         | GCOS        Size CLabel CLabel Reg Reg -- src, dst
 235         | GTAN        Size CLabel CLabel Reg Reg -- src, dst
 236
 237         | GFREE         -- do ffree on all x86 regs; an ugly hack
 238 #endif
 239
 240 #if x86_64_TARGET_ARCH
 241 -- SSE2 floating point: we use a restricted set of the available SSE2
 242 -- instructions for floating-point.
 243
 244         -- use MOV for moving (either movss or movsd (movlpd better?))
 245
 246         | CVTSS2SD      Reg Reg         -- F32 to F64
 247         | CVTSD2SS      Reg Reg         -- F64 to F32
 248         | CVTTSS2SIQ    Operand Reg     -- F32 to I32/I64 (with truncation)
 249         | CVTTSD2SIQ    Operand Reg     -- F64 to I32/I64 (with truncation)
 250         | CVTSI2SS      Operand Reg     -- I32/I64 to F32
 251         | CVTSI2SD      Operand Reg     -- I32/I64 to F64
 252
 253         -- use ADD & SUB for arithmetic.  In both cases, operands
 254         -- are  Operand Reg.
 255
 256         -- SSE2 floating-point division:
 257         | FDIV          Size Operand Operand   -- divisor, dividend(dst)
 258
 259         -- use CMP for comparisons.  ucomiss and ucomisd instructions
 260         -- compare single/double prec floating point respectively.
 261
 262         | SQRT          Size Operand Reg        -- src, dst
 263 #endif
 264
 265         -- Comparison
 266         | TEST          Size Operand Operand
 267         | CMP           Size Operand Operand
 268         | SETCC         Cond Operand
 269
 270         -- Stack Operations.
 271         | PUSH          Size Operand
 272         | POP           Size Operand
 273         -- both unused (SDM):
 274         --  | PUSHA
 275         --  | POPA
 276
 277         -- Jumping around.
 278         | JMP         Operand
 279         | JXX         Cond BlockId  -- includes unconditional branches
 280         | JXX_GBL     Cond Imm      -- non-local version of JXX
 281         | JMP_TBL     Operand [BlockId]  -- table jump
 282         | CALL        (Either Imm Reg) [Reg]
 283
 284         -- Other things.
 285         | CLTD Size              -- sign extend %eax into %edx:%eax
 286
 287         | FETCHGOT    Reg        -- pseudo-insn for ELF position-independent code
 288                                  -- pretty-prints as
 289                                  --       call 1f
 290                                  -- 1:    popl %reg
 291                                  --       addl __GLOBAL_OFFSET_TABLE__+.-1b, %reg
 292         | FETCHPC     Reg        -- pseudo-insn for Darwin position-independent code
 293                                  -- pretty-prints as
 294                                  --       call 1f
 295                                  -- 1:    popl %reg
 296
 297
 298 data Operand
 299         = OpReg  Reg            -- register
 300         | OpImm  Imm            -- immediate value
 301         | OpAddr AddrMode       -- memory reference
 302
 303
 304 #if i386_TARGET_ARCH
 305 i386_insert_ffrees :: [GenBasicBlock Instr] -> [GenBasicBlock Instr]
 306 i386_insert_ffrees blocks
 307    | or (map (any is_G_instr) [ instrs | BasicBlock id instrs <- blocks ])
 308    = map ffree_before_nonlocal_transfers blocks
 309    | otherwise
 310    = blocks
 311   where
 312    ffree_before_nonlocal_transfers (BasicBlock id insns)
 313      = BasicBlock id (foldr p [] insns)
 314      where p insn r = case insn of
 315                         CALL _ _ -> GFREE : insn : r
 316                         JMP _    -> GFREE : insn : r
 317                         other    -> insn : r
 318
 319 -- if you ever add a new FP insn to the fake x86 FP insn set,
 320 -- you must update this too
 321 is_G_instr :: Instr -> Bool
 322 is_G_instr instr
 323    = case instr of
 324         GMOV _ _ -> True; GLD _ _ _ -> True; GST _ _ _ -> True
 325         GLDZ _ -> True; GLD1 _ -> True
 326         GFTOI _ _ -> True; GDTOI _ _ -> True
 327         GITOF _ _ -> True; GITOD _ _ -> True
 328         GADD _ _ _ _ -> True; GDIV _ _ _ _ -> True
 329         GSUB _ _ _ _ -> True; GMUL _ _ _ _ -> True
 330         GCMP _ _ _ -> True; GABS _ _ _ -> True
 331         GNEG _ _ _ -> True; GSQRT _ _ _ -> True
 332         GSIN _ _ _ _ _ -> True; GCOS _ _ _ _ _ -> True; GTAN _ _ _ _ _ -> True
 333         GFREE -> panic "is_G_instr: GFREE (!)"
 334         other -> False
 335 #endif /* i386_TARGET_ARCH */