compiler/nativeGen/X86/Instr.hs

   1 -----------------------------------------------------------------------------
   2 --
   3 -- Machine-dependent assembly language
   4 --
   5 -- (c) The University of Glasgow 1993-2004
   6 --
   7 -----------------------------------------------------------------------------
   8
   9 #include "HsVersions.h"
  10 #include "nativeGen/NCG.h"
  11
  12 module X86.Instr
  13 where
  14
  15 import BlockId
  16 import Regs
  17 import Cmm
  18 import FastString
  19
  20 #if i386_TARGET_ARCH
  21 import CLabel
  22 import Panic
  23 #endif
  24
  25 data Cond
  26         = ALWAYS        -- What's really used? ToDo
  27         | EQQ
  28         | GE
  29         | GEU
  30         | GTT
  31         | GU
  32         | LE
  33         | LEU
  34         | LTT
  35         | LU
  36         | NE
  37         | NEG
  38         | POS
  39         | CARRY
  40         | OFLO
  41         | PARITY
  42         | NOTPARITY
  43
  44
  45
  46 -- -----------------------------------------------------------------------------
  47 -- Intel x86 instructions
  48
  49 {-
  50 Intel, in their infinite wisdom, selected a stack model for floating
  51 point registers on x86.  That might have made sense back in 1979 --
  52 nowadays we can see it for the nonsense it really is.  A stack model
  53 fits poorly with the existing nativeGen infrastructure, which assumes
  54 flat integer and FP register sets.  Prior to this commit, nativeGen
  55 could not generate correct x86 FP code -- to do so would have meant
  56 somehow working the register-stack paradigm into the register
  57 allocator and spiller, which sounds very difficult.
  58
  59 We have decided to cheat, and go for a simple fix which requires no
  60 infrastructure modifications, at the expense of generating ropey but
  61 correct FP code.  All notions of the x86 FP stack and its insns have
  62 been removed.  Instead, we pretend (to the instruction selector and
  63 register allocator) that x86 has six floating point registers, %fake0
  64 .. %fake5, which can be used in the usual flat manner.  We further
  65 claim that x86 has floating point instructions very similar to SPARC
  66 and Alpha, that is, a simple 3-operand register-register arrangement.
  67 Code generation and register allocation proceed on this basis.
  68
  69 When we come to print out the final assembly, our convenient fiction
  70 is converted to dismal reality.  Each fake instruction is
  71 independently converted to a series of real x86 instructions.
  72 %fake0 .. %fake5 are mapped to %st(0) .. %st(5).  To do reg-reg
  73 arithmetic operations, the two operands are pushed onto the top of the
  74 FP stack, the operation done, and the result copied back into the
  75 relevant register.  There are only six %fake registers because 2 are
  76 needed for the translation, and x86 has 8 in total.
  77
  78 The translation is inefficient but is simple and it works.  A cleverer
  79 translation would handle a sequence of insns, simulating the FP stack
  80 contents, would not impose a fixed mapping from %fake to %st regs, and
  81 hopefully could avoid most of the redundant reg-reg moves of the
  82 current translation.
  83
  84 We might as well make use of whatever unique FP facilities Intel have
  85 chosen to bless us with (let's not be churlish, after all).
  86 Hence GLDZ and GLD1.  Bwahahahahahahaha!
  87 -}
  88
  89 {-
  90 MORE FLOATING POINT MUSINGS...
  91
  92 Intel's internal floating point registers are by default 80 bit
  93 extended precision.  This means that all operations done on values in
  94 registers are done at 80 bits, and unless the intermediate values are
  95 truncated to the appropriate size (32 or 64 bits) by storing in
  96 memory, calculations in registers will give different results from
  97 calculations which pass intermediate values in memory (eg. via
  98 function calls).
  99
 100 One solution is to set the FPU into 64 bit precision mode.  Some OSs
 101 do this (eg. FreeBSD) and some don't (eg. Linux).  The problem here is
 102 that this will only affect 64-bit precision arithmetic; 32-bit
 103 calculations will still be done at 64-bit precision in registers.  So
 104 it doesn't solve the whole problem.
 105
 106 There's also the issue of what the C library is expecting in terms of
 107 precision.  It seems to be the case that glibc on Linux expects the
 108 FPU to be set to 80 bit precision, so setting it to 64 bit could have
 109 unexpected effects.  Changing the default could have undesirable
 110 effects on other 3rd-party library code too, so the right thing would
 111 be to save/restore the FPU control word across Haskell code if we were
 112 to do this.
 113
 114 gcc's -ffloat-store gives consistent results by always storing the
 115 results of floating-point calculations in memory, which works for both
 116 32 and 64-bit precision.  However, it only affects the values of
 117 user-declared floating point variables in C, not intermediate results.
 118 GHC in -fvia-C mode uses -ffloat-store (see the -fexcess-precision
 119 flag).
 120
 121 Another problem is how to spill floating point registers in the
 122 register allocator.  Should we spill the whole 80 bits, or just 64?
 123 On an OS which is set to 64 bit precision, spilling 64 is fine.  On
 124 Linux, spilling 64 bits will round the results of some operations.
 125 This is what gcc does.  Spilling at 80 bits requires taking up a full
 126 128 bit slot (so we get alignment).  We spill at 80-bits and ignore
 127 the alignment problems.
 128
 129 In the future, we'll use the SSE registers for floating point.  This
 130 requires a CPU that supports SSE2 (ordinary SSE only supports 32 bit
 131 precision float ops), which means P4 or Xeon and above.  Using SSE
 132 will solve all these problems, because the SSE registers use fixed 32
 133 bit or 64 bit precision.
 134
 135 --SDM 1/2003
 136 -}
 137
 138
 139 data Instr
 140         -- comment pseudo-op
 141         = COMMENT FastString
 142
 143         -- some static data spat out during code
 144         -- generation.  Will be extracted before
 145         -- pretty-printing.
 146         | LDATA   Section [CmmStatic]
 147
 148         -- start a new basic block.  Useful during
 149         -- codegen, removed later.  Preceding
 150         -- instruction should be a jump, as per the
 151         -- invariants for a BasicBlock (see Cmm).
 152         | NEWBLOCK BlockId
 153
 154         -- specify current stack offset for
 155         -- benefit of subsequent passes
 156         | DELTA   Int
 157
 158         -- | spill this reg to a stack slot
 159         | SPILL   Reg Int
 160
 161         -- | reload this reg from a stack slot
 162         | RELOAD  Int Reg
 163
 164
 165         -- Moves.
 166         | MOV         Size Operand Operand
 167         | MOVZxL      Size Operand Operand -- size is the size of operand 1
 168         | MOVSxL      Size Operand Operand -- size is the size of operand 1
 169         -- x86_64 note: plain mov into a 32-bit register always zero-extends
 170         -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which
 171         -- don't affect the high bits of the register.
 172
 173         -- Load effective address (also a very useful three-operand add instruction :-)
 174         | LEA         Size Operand Operand
 175
 176         -- Int Arithmetic.
 177         | ADD         Size Operand Operand
 178         | ADC         Size Operand Operand
 179         | SUB         Size Operand Operand
 180
 181         | MUL         Size Operand Operand
 182         | IMUL        Size Operand Operand      -- signed int mul
 183         | IMUL2       Size Operand              -- %edx:%eax = operand * %eax
 184
 185         | DIV         Size Operand              -- eax := eax:edx/op, edx := eax:edx%op
 186         | IDIV        Size Operand              -- ditto, but signed
 187
 188         -- Simple bit-twiddling.
 189         | AND         Size Operand Operand
 190         | OR          Size Operand Operand
 191         | XOR         Size Operand Operand
 192         | NOT         Size Operand
 193         | NEGI        Size Operand              -- NEG instruction (name clash with Cond)
 194
 195         -- Shifts (amount may be immediate or %cl only)
 196         | SHL         Size Operand{-amount-} Operand
 197         | SAR         Size Operand{-amount-} Operand
 198         | SHR         Size Operand{-amount-} Operand
 199
 200         | BT          Size Imm Operand
 201         | NOP
 202
 203 #if i386_TARGET_ARCH
 204         -- Float Arithmetic.
 205
 206         -- Note that we cheat by treating G{ABS,MOV,NEG} of doubles
 207         -- as single instructions right up until we spit them out.
 208         -- all the 3-operand fake fp insns are src1 src2 dst
 209         -- and furthermore are constrained to be fp regs only.
 210         -- IMPORTANT: keep is_G_insn up to date with any changes here
 211         | GMOV        Reg Reg -- src(fpreg), dst(fpreg)
 212         | GLD         Size AddrMode Reg -- src, dst(fpreg)
 213         | GST         Size Reg AddrMode -- src(fpreg), dst
 214
 215         | GLDZ        Reg -- dst(fpreg)
 216         | GLD1        Reg -- dst(fpreg)
 217
 218         | GFTOI       Reg Reg -- src(fpreg), dst(intreg)
 219         | GDTOI       Reg Reg -- src(fpreg), dst(intreg)
 220
 221         | GITOF       Reg Reg -- src(intreg), dst(fpreg)
 222         | GITOD       Reg Reg -- src(intreg), dst(fpreg)
 223
 224         | GADD        Size Reg Reg Reg -- src1, src2, dst
 225         | GDIV        Size Reg Reg Reg -- src1, src2, dst
 226         | GSUB        Size Reg Reg Reg -- src1, src2, dst
 227         | GMUL        Size Reg Reg Reg -- src1, src2, dst
 228
 229                 -- FP compare.  Cond must be `elem` [EQQ, NE, LE, LTT, GE, GTT]
 230                 -- Compare src1 with src2; set the Zero flag iff the numbers are
 231                 -- comparable and the comparison is True.  Subsequent code must
 232                 -- test the %eflags zero flag regardless of the supplied Cond.
 233         | GCMP        Cond Reg Reg -- src1, src2
 234
 235         | GABS        Size Reg Reg -- src, dst
 236         | GNEG        Size Reg Reg -- src, dst
 237         | GSQRT       Size Reg Reg -- src, dst
 238         | GSIN        Size CLabel CLabel Reg Reg -- src, dst
 239         | GCOS        Size CLabel CLabel Reg Reg -- src, dst
 240         | GTAN        Size CLabel CLabel Reg Reg -- src, dst
 241
 242         | GFREE         -- do ffree on all x86 regs; an ugly hack
 243 #endif
 244
 245 #if x86_64_TARGET_ARCH
 246 -- SSE2 floating point: we use a restricted set of the available SSE2
 247 -- instructions for floating-point.
 248
 249         -- use MOV for moving (either movss or movsd (movlpd better?))
 250
 251         | CVTSS2SD      Reg Reg         -- F32 to F64
 252         | CVTSD2SS      Reg Reg         -- F64 to F32
 253         | CVTTSS2SIQ    Operand Reg     -- F32 to I32/I64 (with truncation)
 254         | CVTTSD2SIQ    Operand Reg     -- F64 to I32/I64 (with truncation)
 255         | CVTSI2SS      Operand Reg     -- I32/I64 to F32
 256         | CVTSI2SD      Operand Reg     -- I32/I64 to F64
 257
 258         -- use ADD & SUB for arithmetic.  In both cases, operands
 259         -- are  Operand Reg.
 260
 261         -- SSE2 floating-point division:
 262         | FDIV          Size Operand Operand   -- divisor, dividend(dst)
 263
 264         -- use CMP for comparisons.  ucomiss and ucomisd instructions
 265         -- compare single/double prec floating point respectively.
 266
 267         | SQRT          Size Operand Reg        -- src, dst
 268 #endif
 269
 270         -- Comparison
 271         | TEST          Size Operand Operand
 272         | CMP           Size Operand Operand
 273         | SETCC         Cond Operand
 274
 275         -- Stack Operations.
 276         | PUSH          Size Operand
 277         | POP           Size Operand
 278         -- both unused (SDM):
 279         --  | PUSHA
 280         --  | POPA
 281
 282         -- Jumping around.
 283         | JMP         Operand
 284         | JXX         Cond BlockId  -- includes unconditional branches
 285         | JXX_GBL     Cond Imm      -- non-local version of JXX
 286         | JMP_TBL     Operand [BlockId]  -- table jump
 287         | CALL        (Either Imm Reg) [Reg]
 288
 289         -- Other things.
 290         | CLTD Size              -- sign extend %eax into %edx:%eax
 291
 292         | FETCHGOT    Reg        -- pseudo-insn for ELF position-independent code
 293                                  -- pretty-prints as
 294                                  --       call 1f
 295                                  -- 1:    popl %reg
 296                                  --       addl __GLOBAL_OFFSET_TABLE__+.-1b, %reg
 297         | FETCHPC     Reg        -- pseudo-insn for Darwin position-independent code
 298                                  -- pretty-prints as
 299                                  --       call 1f
 300                                  -- 1:    popl %reg
 301
 302
 303 data Operand
 304         = OpReg  Reg            -- register
 305         | OpImm  Imm            -- immediate value
 306         | OpAddr AddrMode       -- memory reference
 307
 308
 309 #if i386_TARGET_ARCH
 310 i386_insert_ffrees :: [GenBasicBlock Instr] -> [GenBasicBlock Instr]
 311 i386_insert_ffrees blocks
 312    | or (map (any is_G_instr) [ instrs | BasicBlock _ instrs <- blocks ])
 313    = map ffree_before_nonlocal_transfers blocks
 314
 315    | otherwise
 316    = blocks
 317   where
 318    ffree_before_nonlocal_transfers (BasicBlock id insns)
 319      = BasicBlock id (foldr p [] insns)
 320      where p insn r = case insn of
 321                         CALL _ _ -> GFREE : insn : r
 322                         JMP _    -> GFREE : insn : r
 323                         _        -> insn : r
 324
 325 -- if you ever add a new FP insn to the fake x86 FP insn set,
 326 -- you must update this too
 327 is_G_instr :: Instr -> Bool
 328 is_G_instr instr
 329    = case instr of
 330         GMOV{}          -> True
 331         GLD{}           -> True
 332         GST{}           -> True
 333         GLDZ{}          -> True
 334         GLD1{}          -> True
 335         GFTOI{}         -> True
 336         GDTOI{}         -> True
 337         GITOF{}         -> True
 338         GITOD{}         -> True
 339         GADD{}          -> True
 340         GDIV{}          -> True
 341         GSUB{}          -> True
 342         GMUL{}          -> True
 343         GCMP{}          -> True
 344         GABS{}          -> True
 345         GNEG{}          -> True
 346         GSQRT{}         -> True
 347         GSIN{}          -> True
 348         GCOS{}          -> True
 349         GTAN{}          -> True
 350         GFREE           -> panic "is_G_instr: GFREE (!)"
 351         _               -> False
 352 #endif /* i386_TARGET_ARCH */