compiler/nativeGen/X86/Instr.hs

   1 -----------------------------------------------------------------------------
   2 --
   3 -- Machine-dependent assembly language
   4 --
   5 -- (c) The University of Glasgow 1993-2004
   6 --
   7 -----------------------------------------------------------------------------
   8
   9 #include "HsVersions.h"
  10 #include "nativeGen/NCG.h"
  11
  12 module X86.Instr
  13 where
  14
  15 import BlockId
  16 import X86.Regs
  17 import RegsBase
  18 import Cmm
  19 import FastString
  20
  21 #if i386_TARGET_ARCH
  22 import CLabel
  23 import Panic
  24 #endif
  25
  26 data Cond
  27         = ALWAYS        -- What's really used? ToDo
  28         | EQQ
  29         | GE
  30         | GEU
  31         | GTT
  32         | GU
  33         | LE
  34         | LEU
  35         | LTT
  36         | LU
  37         | NE
  38         | NEG
  39         | POS
  40         | CARRY
  41         | OFLO
  42         | PARITY
  43         | NOTPARITY
  44         deriving (Eq)
  45
  46
  47 -- -----------------------------------------------------------------------------
  48 -- Intel x86 instructions
  49
  50 {-
  51 Intel, in their infinite wisdom, selected a stack model for floating
  52 point registers on x86.  That might have made sense back in 1979 --
  53 nowadays we can see it for the nonsense it really is.  A stack model
  54 fits poorly with the existing nativeGen infrastructure, which assumes
  55 flat integer and FP register sets.  Prior to this commit, nativeGen
  56 could not generate correct x86 FP code -- to do so would have meant
  57 somehow working the register-stack paradigm into the register
  58 allocator and spiller, which sounds very difficult.
  59
  60 We have decided to cheat, and go for a simple fix which requires no
  61 infrastructure modifications, at the expense of generating ropey but
  62 correct FP code.  All notions of the x86 FP stack and its insns have
  63 been removed.  Instead, we pretend (to the instruction selector and
  64 register allocator) that x86 has six floating point registers, %fake0
  65 .. %fake5, which can be used in the usual flat manner.  We further
  66 claim that x86 has floating point instructions very similar to SPARC
  67 and Alpha, that is, a simple 3-operand register-register arrangement.
  68 Code generation and register allocation proceed on this basis.
  69
  70 When we come to print out the final assembly, our convenient fiction
  71 is converted to dismal reality.  Each fake instruction is
  72 independently converted to a series of real x86 instructions.
  73 %fake0 .. %fake5 are mapped to %st(0) .. %st(5).  To do reg-reg
  74 arithmetic operations, the two operands are pushed onto the top of the
  75 FP stack, the operation done, and the result copied back into the
  76 relevant register.  There are only six %fake registers because 2 are
  77 needed for the translation, and x86 has 8 in total.
  78
  79 The translation is inefficient but is simple and it works.  A cleverer
  80 translation would handle a sequence of insns, simulating the FP stack
  81 contents, would not impose a fixed mapping from %fake to %st regs, and
  82 hopefully could avoid most of the redundant reg-reg moves of the
  83 current translation.
  84
  85 We might as well make use of whatever unique FP facilities Intel have
  86 chosen to bless us with (let's not be churlish, after all).
  87 Hence GLDZ and GLD1.  Bwahahahahahahaha!
  88 -}
  89
  90 {-
  91 MORE FLOATING POINT MUSINGS...
  92
  93 Intel's internal floating point registers are by default 80 bit
  94 extended precision.  This means that all operations done on values in
  95 registers are done at 80 bits, and unless the intermediate values are
  96 truncated to the appropriate size (32 or 64 bits) by storing in
  97 memory, calculations in registers will give different results from
  98 calculations which pass intermediate values in memory (eg. via
  99 function calls).
 100
 101 One solution is to set the FPU into 64 bit precision mode.  Some OSs
 102 do this (eg. FreeBSD) and some don't (eg. Linux).  The problem here is
 103 that this will only affect 64-bit precision arithmetic; 32-bit
 104 calculations will still be done at 64-bit precision in registers.  So
 105 it doesn't solve the whole problem.
 106
 107 There's also the issue of what the C library is expecting in terms of
 108 precision.  It seems to be the case that glibc on Linux expects the
 109 FPU to be set to 80 bit precision, so setting it to 64 bit could have
 110 unexpected effects.  Changing the default could have undesirable
 111 effects on other 3rd-party library code too, so the right thing would
 112 be to save/restore the FPU control word across Haskell code if we were
 113 to do this.
 114
 115 gcc's -ffloat-store gives consistent results by always storing the
 116 results of floating-point calculations in memory, which works for both
 117 32 and 64-bit precision.  However, it only affects the values of
 118 user-declared floating point variables in C, not intermediate results.
 119 GHC in -fvia-C mode uses -ffloat-store (see the -fexcess-precision
 120 flag).
 121
 122 Another problem is how to spill floating point registers in the
 123 register allocator.  Should we spill the whole 80 bits, or just 64?
 124 On an OS which is set to 64 bit precision, spilling 64 is fine.  On
 125 Linux, spilling 64 bits will round the results of some operations.
 126 This is what gcc does.  Spilling at 80 bits requires taking up a full
 127 128 bit slot (so we get alignment).  We spill at 80-bits and ignore
 128 the alignment problems.
 129
 130 In the future, we'll use the SSE registers for floating point.  This
 131 requires a CPU that supports SSE2 (ordinary SSE only supports 32 bit
 132 precision float ops), which means P4 or Xeon and above.  Using SSE
 133 will solve all these problems, because the SSE registers use fixed 32
 134 bit or 64 bit precision.
 135
 136 --SDM 1/2003
 137 -}
 138
 139
 140 data Instr
 141         -- comment pseudo-op
 142         = COMMENT FastString
 143
 144         -- some static data spat out during code
 145         -- generation.  Will be extracted before
 146         -- pretty-printing.
 147         | LDATA   Section [CmmStatic]
 148
 149         -- start a new basic block.  Useful during
 150         -- codegen, removed later.  Preceding
 151         -- instruction should be a jump, as per the
 152         -- invariants for a BasicBlock (see Cmm).
 153         | NEWBLOCK BlockId
 154
 155         -- specify current stack offset for
 156         -- benefit of subsequent passes
 157         | DELTA   Int
 158
 159         -- | spill this reg to a stack slot
 160         | SPILL   Reg Int
 161
 162         -- | reload this reg from a stack slot
 163         | RELOAD  Int Reg
 164
 165
 166         -- Moves.
 167         | MOV         Size Operand Operand
 168         | MOVZxL      Size Operand Operand -- size is the size of operand 1
 169         | MOVSxL      Size Operand Operand -- size is the size of operand 1
 170         -- x86_64 note: plain mov into a 32-bit register always zero-extends
 171         -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which
 172         -- don't affect the high bits of the register.
 173
 174         -- Load effective address (also a very useful three-operand add instruction :-)
 175         | LEA         Size Operand Operand
 176
 177         -- Int Arithmetic.
 178         | ADD         Size Operand Operand
 179         | ADC         Size Operand Operand
 180         | SUB         Size Operand Operand
 181
 182         | MUL         Size Operand Operand
 183         | IMUL        Size Operand Operand      -- signed int mul
 184         | IMUL2       Size Operand              -- %edx:%eax = operand * %eax
 185
 186         | DIV         Size Operand              -- eax := eax:edx/op, edx := eax:edx%op
 187         | IDIV        Size Operand              -- ditto, but signed
 188
 189         -- Simple bit-twiddling.
 190         | AND         Size Operand Operand
 191         | OR          Size Operand Operand
 192         | XOR         Size Operand Operand
 193         | NOT         Size Operand
 194         | NEGI        Size Operand              -- NEG instruction (name clash with Cond)
 195
 196         -- Shifts (amount may be immediate or %cl only)
 197         | SHL         Size Operand{-amount-} Operand
 198         | SAR         Size Operand{-amount-} Operand
 199         | SHR         Size Operand{-amount-} Operand
 200
 201         | BT          Size Imm Operand
 202         | NOP
 203
 204 #if i386_TARGET_ARCH
 205         -- Float Arithmetic.
 206
 207         -- Note that we cheat by treating G{ABS,MOV,NEG} of doubles
 208         -- as single instructions right up until we spit them out.
 209         -- all the 3-operand fake fp insns are src1 src2 dst
 210         -- and furthermore are constrained to be fp regs only.
 211         -- IMPORTANT: keep is_G_insn up to date with any changes here
 212         | GMOV        Reg Reg -- src(fpreg), dst(fpreg)
 213         | GLD         Size AddrMode Reg -- src, dst(fpreg)
 214         | GST         Size Reg AddrMode -- src(fpreg), dst
 215
 216         | GLDZ        Reg -- dst(fpreg)
 217         | GLD1        Reg -- dst(fpreg)
 218
 219         | GFTOI       Reg Reg -- src(fpreg), dst(intreg)
 220         | GDTOI       Reg Reg -- src(fpreg), dst(intreg)
 221
 222         | GITOF       Reg Reg -- src(intreg), dst(fpreg)
 223         | GITOD       Reg Reg -- src(intreg), dst(fpreg)
 224
 225         | GADD        Size Reg Reg Reg -- src1, src2, dst
 226         | GDIV        Size Reg Reg Reg -- src1, src2, dst
 227         | GSUB        Size Reg Reg Reg -- src1, src2, dst
 228         | GMUL        Size Reg Reg Reg -- src1, src2, dst
 229
 230                 -- FP compare.  Cond must be `elem` [EQQ, NE, LE, LTT, GE, GTT]
 231                 -- Compare src1 with src2; set the Zero flag iff the numbers are
 232                 -- comparable and the comparison is True.  Subsequent code must
 233                 -- test the %eflags zero flag regardless of the supplied Cond.
 234         | GCMP        Cond Reg Reg -- src1, src2
 235
 236         | GABS        Size Reg Reg -- src, dst
 237         | GNEG        Size Reg Reg -- src, dst
 238         | GSQRT       Size Reg Reg -- src, dst
 239         | GSIN        Size CLabel CLabel Reg Reg -- src, dst
 240         | GCOS        Size CLabel CLabel Reg Reg -- src, dst
 241         | GTAN        Size CLabel CLabel Reg Reg -- src, dst
 242
 243         | GFREE         -- do ffree on all x86 regs; an ugly hack
 244 #endif
 245
 246 #if x86_64_TARGET_ARCH
 247 -- SSE2 floating point: we use a restricted set of the available SSE2
 248 -- instructions for floating-point.
 249
 250         -- use MOV for moving (either movss or movsd (movlpd better?))
 251
 252         | CVTSS2SD      Reg Reg         -- F32 to F64
 253         | CVTSD2SS      Reg Reg         -- F64 to F32
 254         | CVTTSS2SIQ    Operand Reg     -- F32 to I32/I64 (with truncation)
 255         | CVTTSD2SIQ    Operand Reg     -- F64 to I32/I64 (with truncation)
 256         | CVTSI2SS      Operand Reg     -- I32/I64 to F32
 257         | CVTSI2SD      Operand Reg     -- I32/I64 to F64
 258
 259         -- use ADD & SUB for arithmetic.  In both cases, operands
 260         -- are  Operand Reg.
 261
 262         -- SSE2 floating-point division:
 263         | FDIV          Size Operand Operand   -- divisor, dividend(dst)
 264
 265         -- use CMP for comparisons.  ucomiss and ucomisd instructions
 266         -- compare single/double prec floating point respectively.
 267
 268         | SQRT          Size Operand Reg        -- src, dst
 269 #endif
 270
 271         -- Comparison
 272         | TEST          Size Operand Operand
 273         | CMP           Size Operand Operand
 274         | SETCC         Cond Operand
 275
 276         -- Stack Operations.
 277         | PUSH          Size Operand
 278         | POP           Size Operand
 279         -- both unused (SDM):
 280         --  | PUSHA
 281         --  | POPA
 282
 283         -- Jumping around.
 284         | JMP         Operand
 285         | JXX         Cond BlockId  -- includes unconditional branches
 286         | JXX_GBL     Cond Imm      -- non-local version of JXX
 287         | JMP_TBL     Operand [BlockId]  -- table jump
 288         | CALL        (Either Imm Reg) [Reg]
 289
 290         -- Other things.
 291         | CLTD Size              -- sign extend %eax into %edx:%eax
 292
 293         | FETCHGOT    Reg        -- pseudo-insn for ELF position-independent code
 294                                  -- pretty-prints as
 295                                  --       call 1f
 296                                  -- 1:    popl %reg
 297                                  --       addl __GLOBAL_OFFSET_TABLE__+.-1b, %reg
 298         | FETCHPC     Reg        -- pseudo-insn for Darwin position-independent code
 299                                  -- pretty-prints as
 300                                  --       call 1f
 301                                  -- 1:    popl %reg
 302
 303
 304 data Operand
 305         = OpReg  Reg            -- register
 306         | OpImm  Imm            -- immediate value
 307         | OpAddr AddrMode       -- memory reference
 308
 309
 310 #if i386_TARGET_ARCH
 311 i386_insert_ffrees :: [GenBasicBlock Instr] -> [GenBasicBlock Instr]
 312 i386_insert_ffrees blocks
 313    | or (map (any is_G_instr) [ instrs | BasicBlock _ instrs <- blocks ])
 314    = map ffree_before_nonlocal_transfers blocks
 315
 316    | otherwise
 317    = blocks
 318   where
 319    ffree_before_nonlocal_transfers (BasicBlock id insns)
 320      = BasicBlock id (foldr p [] insns)
 321      where p insn r = case insn of
 322                         CALL _ _ -> GFREE : insn : r
 323                         JMP _    -> GFREE : insn : r
 324                         _        -> insn : r
 325
 326 -- if you ever add a new FP insn to the fake x86 FP insn set,
 327 -- you must update this too
 328 is_G_instr :: Instr -> Bool
 329 is_G_instr instr
 330    = case instr of
 331         GMOV{}          -> True
 332         GLD{}           -> True
 333         GST{}           -> True
 334         GLDZ{}          -> True
 335         GLD1{}          -> True
 336         GFTOI{}         -> True
 337         GDTOI{}         -> True
 338         GITOF{}         -> True
 339         GITOD{}         -> True
 340         GADD{}          -> True
 341         GDIV{}          -> True
 342         GSUB{}          -> True
 343         GMUL{}          -> True
 344         GCMP{}          -> True
 345         GABS{}          -> True
 346         GNEG{}          -> True
 347         GSQRT{}         -> True
 348         GSIN{}          -> True
 349         GCOS{}          -> True
 350         GTAN{}          -> True
 351         GFREE           -> panic "is_G_instr: GFREE (!)"
 352         _               -> False
 353 #endif /* i386_TARGET_ARCH */