compiler/nativeGen/X86/Instr.hs

   1 -----------------------------------------------------------------------------
   2 --
   3 -- Machine-dependent assembly language
   4 --
   5 -- (c) The University of Glasgow 1993-2004
   6 --
   7 -----------------------------------------------------------------------------
   8
   9 #include "HsVersions.h"
  10 #include "nativeGen/NCG.h"
  11
  12 module X86.Instr
  13 where
  14
  15 import BlockId
  16 import X86.Regs
  17 import RegsBase
  18 import Cmm
  19 import FastString
  20
  21 import CLabel
  22 import Panic
  23
  24 data Cond
  25         = ALWAYS        -- What's really used? ToDo
  26         | EQQ
  27         | GE
  28         | GEU
  29         | GTT
  30         | GU
  31         | LE
  32         | LEU
  33         | LTT
  34         | LU
  35         | NE
  36         | NEG
  37         | POS
  38         | CARRY
  39         | OFLO
  40         | PARITY
  41         | NOTPARITY
  42         deriving (Eq)
  43
  44
  45 -- -----------------------------------------------------------------------------
  46 -- Intel x86 instructions
  47
  48 {-
  49 Intel, in their infinite wisdom, selected a stack model for floating
  50 point registers on x86.  That might have made sense back in 1979 --
  51 nowadays we can see it for the nonsense it really is.  A stack model
  52 fits poorly with the existing nativeGen infrastructure, which assumes
  53 flat integer and FP register sets.  Prior to this commit, nativeGen
  54 could not generate correct x86 FP code -- to do so would have meant
  55 somehow working the register-stack paradigm into the register
  56 allocator and spiller, which sounds very difficult.
  57
  58 We have decided to cheat, and go for a simple fix which requires no
  59 infrastructure modifications, at the expense of generating ropey but
  60 correct FP code.  All notions of the x86 FP stack and its insns have
  61 been removed.  Instead, we pretend (to the instruction selector and
  62 register allocator) that x86 has six floating point registers, %fake0
  63 .. %fake5, which can be used in the usual flat manner.  We further
  64 claim that x86 has floating point instructions very similar to SPARC
  65 and Alpha, that is, a simple 3-operand register-register arrangement.
  66 Code generation and register allocation proceed on this basis.
  67
  68 When we come to print out the final assembly, our convenient fiction
  69 is converted to dismal reality.  Each fake instruction is
  70 independently converted to a series of real x86 instructions.
  71 %fake0 .. %fake5 are mapped to %st(0) .. %st(5).  To do reg-reg
  72 arithmetic operations, the two operands are pushed onto the top of the
  73 FP stack, the operation done, and the result copied back into the
  74 relevant register.  There are only six %fake registers because 2 are
  75 needed for the translation, and x86 has 8 in total.
  76
  77 The translation is inefficient but is simple and it works.  A cleverer
  78 translation would handle a sequence of insns, simulating the FP stack
  79 contents, would not impose a fixed mapping from %fake to %st regs, and
  80 hopefully could avoid most of the redundant reg-reg moves of the
  81 current translation.
  82
  83 We might as well make use of whatever unique FP facilities Intel have
  84 chosen to bless us with (let's not be churlish, after all).
  85 Hence GLDZ and GLD1.  Bwahahahahahahaha!
  86 -}
  87
  88 {-
  89 MORE FLOATING POINT MUSINGS...
  90
  91 Intel's internal floating point registers are by default 80 bit
  92 extended precision.  This means that all operations done on values in
  93 registers are done at 80 bits, and unless the intermediate values are
  94 truncated to the appropriate size (32 or 64 bits) by storing in
  95 memory, calculations in registers will give different results from
  96 calculations which pass intermediate values in memory (eg. via
  97 function calls).
  98
  99 One solution is to set the FPU into 64 bit precision mode.  Some OSs
 100 do this (eg. FreeBSD) and some don't (eg. Linux).  The problem here is
 101 that this will only affect 64-bit precision arithmetic; 32-bit
 102 calculations will still be done at 64-bit precision in registers.  So
 103 it doesn't solve the whole problem.
 104
 105 There's also the issue of what the C library is expecting in terms of
 106 precision.  It seems to be the case that glibc on Linux expects the
 107 FPU to be set to 80 bit precision, so setting it to 64 bit could have
 108 unexpected effects.  Changing the default could have undesirable
 109 effects on other 3rd-party library code too, so the right thing would
 110 be to save/restore the FPU control word across Haskell code if we were
 111 to do this.
 112
 113 gcc's -ffloat-store gives consistent results by always storing the
 114 results of floating-point calculations in memory, which works for both
 115 32 and 64-bit precision.  However, it only affects the values of
 116 user-declared floating point variables in C, not intermediate results.
 117 GHC in -fvia-C mode uses -ffloat-store (see the -fexcess-precision
 118 flag).
 119
 120 Another problem is how to spill floating point registers in the
 121 register allocator.  Should we spill the whole 80 bits, or just 64?
 122 On an OS which is set to 64 bit precision, spilling 64 is fine.  On
 123 Linux, spilling 64 bits will round the results of some operations.
 124 This is what gcc does.  Spilling at 80 bits requires taking up a full
 125 128 bit slot (so we get alignment).  We spill at 80-bits and ignore
 126 the alignment problems.
 127
 128 In the future, we'll use the SSE registers for floating point.  This
 129 requires a CPU that supports SSE2 (ordinary SSE only supports 32 bit
 130 precision float ops), which means P4 or Xeon and above.  Using SSE
 131 will solve all these problems, because the SSE registers use fixed 32
 132 bit or 64 bit precision.
 133
 134 --SDM 1/2003
 135 -}
 136
 137
 138 data Instr
 139         -- comment pseudo-op
 140         = COMMENT FastString
 141
 142         -- some static data spat out during code
 143         -- generation.  Will be extracted before
 144         -- pretty-printing.
 145         | LDATA   Section [CmmStatic]
 146
 147         -- start a new basic block.  Useful during
 148         -- codegen, removed later.  Preceding
 149         -- instruction should be a jump, as per the
 150         -- invariants for a BasicBlock (see Cmm).
 151         | NEWBLOCK BlockId
 152
 153         -- specify current stack offset for
 154         -- benefit of subsequent passes
 155         | DELTA   Int
 156
 157         -- | spill this reg to a stack slot
 158         | SPILL   Reg Int
 159
 160         -- | reload this reg from a stack slot
 161         | RELOAD  Int Reg
 162
 163
 164         -- Moves.
 165         | MOV         Size Operand Operand
 166         | MOVZxL      Size Operand Operand -- size is the size of operand 1
 167         | MOVSxL      Size Operand Operand -- size is the size of operand 1
 168         -- x86_64 note: plain mov into a 32-bit register always zero-extends
 169         -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which
 170         -- don't affect the high bits of the register.
 171
 172         -- Load effective address (also a very useful three-operand add instruction :-)
 173         | LEA         Size Operand Operand
 174
 175         -- Int Arithmetic.
 176         | ADD         Size Operand Operand
 177         | ADC         Size Operand Operand
 178         | SUB         Size Operand Operand
 179
 180         | MUL         Size Operand Operand
 181         | IMUL        Size Operand Operand      -- signed int mul
 182         | IMUL2       Size Operand              -- %edx:%eax = operand * %eax
 183
 184         | DIV         Size Operand              -- eax := eax:edx/op, edx := eax:edx%op
 185         | IDIV        Size Operand              -- ditto, but signed
 186
 187         -- Simple bit-twiddling.
 188         | AND         Size Operand Operand
 189         | OR          Size Operand Operand
 190         | XOR         Size Operand Operand
 191         | NOT         Size Operand
 192         | NEGI        Size Operand              -- NEG instruction (name clash with Cond)
 193
 194         -- Shifts (amount may be immediate or %cl only)
 195         | SHL         Size Operand{-amount-} Operand
 196         | SAR         Size Operand{-amount-} Operand
 197         | SHR         Size Operand{-amount-} Operand
 198
 199         | BT          Size Imm Operand
 200         | NOP
 201
 202         -- x86 Float Arithmetic.
 203         -- Note that we cheat by treating G{ABS,MOV,NEG} of doubles
 204         -- as single instructions right up until we spit them out.
 205         -- all the 3-operand fake fp insns are src1 src2 dst
 206         -- and furthermore are constrained to be fp regs only.
 207         -- IMPORTANT: keep is_G_insn up to date with any changes here
 208         | GMOV        Reg Reg -- src(fpreg), dst(fpreg)
 209         | GLD         Size AddrMode Reg -- src, dst(fpreg)
 210         | GST         Size Reg AddrMode -- src(fpreg), dst
 211
 212         | GLDZ        Reg -- dst(fpreg)
 213         | GLD1        Reg -- dst(fpreg)
 214
 215         | GFTOI       Reg Reg -- src(fpreg), dst(intreg)
 216         | GDTOI       Reg Reg -- src(fpreg), dst(intreg)
 217
 218         | GITOF       Reg Reg -- src(intreg), dst(fpreg)
 219         | GITOD       Reg Reg -- src(intreg), dst(fpreg)
 220
 221         | GADD        Size Reg Reg Reg -- src1, src2, dst
 222         | GDIV        Size Reg Reg Reg -- src1, src2, dst
 223         | GSUB        Size Reg Reg Reg -- src1, src2, dst
 224         | GMUL        Size Reg Reg Reg -- src1, src2, dst
 225
 226                 -- FP compare.  Cond must be `elem` [EQQ, NE, LE, LTT, GE, GTT]
 227                 -- Compare src1 with src2; set the Zero flag iff the numbers are
 228                 -- comparable and the comparison is True.  Subsequent code must
 229                 -- test the %eflags zero flag regardless of the supplied Cond.
 230         | GCMP        Cond Reg Reg -- src1, src2
 231
 232         | GABS        Size Reg Reg -- src, dst
 233         | GNEG        Size Reg Reg -- src, dst
 234         | GSQRT       Size Reg Reg -- src, dst
 235         | GSIN        Size CLabel CLabel Reg Reg -- src, dst
 236         | GCOS        Size CLabel CLabel Reg Reg -- src, dst
 237         | GTAN        Size CLabel CLabel Reg Reg -- src, dst
 238
 239         | GFREE         -- do ffree on all x86 regs; an ugly hack
 240
 241
 242         -- SSE2 floating point: we use a restricted set of the available SSE2
 243         -- instructions for floating-point.
 244         -- use MOV for moving (either movss or movsd (movlpd better?))
 245         | CVTSS2SD      Reg Reg         -- F32 to F64
 246         | CVTSD2SS      Reg Reg         -- F64 to F32
 247         | CVTTSS2SIQ    Operand Reg     -- F32 to I32/I64 (with truncation)
 248         | CVTTSD2SIQ    Operand Reg     -- F64 to I32/I64 (with truncation)
 249         | CVTSI2SS      Operand Reg     -- I32/I64 to F32
 250         | CVTSI2SD      Operand Reg     -- I32/I64 to F64
 251
 252         -- use ADD & SUB for arithmetic.  In both cases, operands
 253         -- are  Operand Reg.
 254
 255         -- SSE2 floating-point division:
 256         | FDIV          Size Operand Operand   -- divisor, dividend(dst)
 257
 258         -- use CMP for comparisons.  ucomiss and ucomisd instructions
 259         -- compare single/double prec floating point respectively.
 260
 261         | SQRT          Size Operand Reg        -- src, dst
 262
 263
 264         -- Comparison
 265         | TEST          Size Operand Operand
 266         | CMP           Size Operand Operand
 267         | SETCC         Cond Operand
 268
 269         -- Stack Operations.
 270         | PUSH          Size Operand
 271         | POP           Size Operand
 272         -- both unused (SDM):
 273         --  | PUSHA
 274         --  | POPA
 275
 276         -- Jumping around.
 277         | JMP         Operand
 278         | JXX         Cond BlockId  -- includes unconditional branches
 279         | JXX_GBL     Cond Imm      -- non-local version of JXX
 280         | JMP_TBL     Operand [BlockId]  -- table jump
 281         | CALL        (Either Imm Reg) [Reg]
 282
 283         -- Other things.
 284         | CLTD Size              -- sign extend %eax into %edx:%eax
 285
 286         | FETCHGOT    Reg        -- pseudo-insn for ELF position-independent code
 287                                  -- pretty-prints as
 288                                  --       call 1f
 289                                  -- 1:    popl %reg
 290                                  --       addl __GLOBAL_OFFSET_TABLE__+.-1b, %reg
 291         | FETCHPC     Reg        -- pseudo-insn for Darwin position-independent code
 292                                  -- pretty-prints as
 293                                  --       call 1f
 294                                  -- 1:    popl %reg
 295
 296
 297 data Operand
 298         = OpReg  Reg            -- register
 299         | OpImm  Imm            -- immediate value
 300         | OpAddr AddrMode       -- memory reference
 301
 302
 303
 304 i386_insert_ffrees :: [GenBasicBlock Instr] -> [GenBasicBlock Instr]
 305 i386_insert_ffrees blocks
 306    | or (map (any is_G_instr) [ instrs | BasicBlock _ instrs <- blocks ])
 307    = map ffree_before_nonlocal_transfers blocks
 308
 309    | otherwise
 310    = blocks
 311   where
 312    ffree_before_nonlocal_transfers (BasicBlock id insns)
 313      = BasicBlock id (foldr p [] insns)
 314      where p insn r = case insn of
 315                         CALL _ _ -> GFREE : insn : r
 316                         JMP _    -> GFREE : insn : r
 317                         _        -> insn : r
 318
 319 -- if you ever add a new FP insn to the fake x86 FP insn set,
 320 -- you must update this too
 321 is_G_instr :: Instr -> Bool
 322 is_G_instr instr
 323    = case instr of
 324         GMOV{}          -> True
 325         GLD{}           -> True
 326         GST{}           -> True
 327         GLDZ{}          -> True
 328         GLD1{}          -> True
 329         GFTOI{}         -> True
 330         GDTOI{}         -> True
 331         GITOF{}         -> True
 332         GITOD{}         -> True
 333         GADD{}          -> True
 334         GDIV{}          -> True
 335         GSUB{}          -> True
 336         GMUL{}          -> True
 337         GCMP{}          -> True
 338         GABS{}          -> True
 339         GNEG{}          -> True
 340         GSQRT{}         -> True
 341         GSIN{}          -> True
 342         GCOS{}          -> True
 343         GTAN{}          -> True
 344         GFREE           -> panic "is_G_instr: GFREE (!)"
 345         _               -> False