compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), lexer, pragState, mkPState, PState(..),
  26    P(..), ParseResult(..), getSrcLoc,
  27    failLocMsgP, failSpanMsgP, srcParseFail,
  28    getMessages,
  29    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  30    getLexState, popLexState, pushLexState,
  31    extension, standaloneDerivingEnabled, bangPatEnabled
  32   ) where
  33
  34 #include "HsVersions.h"
  35
  36 import Bag
  37 import ErrUtils
  38 import Outputable
  39 import StringBuffer
  40 import FastString
  41 import FastTypes
  42 import SrcLoc
  43 import UniqFM
  44 import DynFlags
  45 import Ctype
  46 import Util             ( maybePrefixMatch, readRational )
  47
  48 import Control.Monad
  49 import Data.Bits
  50 import Data.Char        ( chr, isSpace )
  51 import Data.Ratio
  52 import Debug.Trace
  53
  54 #if __GLASGOW_HASKELL__ >= 605
  55 import Data.Char        ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  56 #else
  57 import Compat.Unicode   ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  58 #endif
  59 }
  60
  61 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  62 $whitechar   = [\ \n\r\f\v\xa0 $unispace]
  63 $white_no_nl = $whitechar # \n
  64 $tab         = \t
  65
  66 $ascdigit  = 0-9
  67 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  68 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  69 $digit     = [$ascdigit $unidigit]
  70
  71 $special   = [\(\)\,\;\[\]\`\{\}]
  72 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7]
  73 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  74 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  75
  76 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
  77 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  78 $large     = [$asclarge $unilarge]
  79
  80 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
  81 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  82 $small     = [$ascsmall $unismall \_]
  83
  84 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
  85 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  86
  87 $octit     = 0-7
  88 $hexit     = [$decdigit A-F a-f]
  89 $symchar   = [$symbol \:]
  90 $nl        = [\n\r]
  91 $idchar    = [$small $large $digit \']
  92
  93 $docsym    = [\| \^ \* \$]
  94
  95 @varid     = $small $idchar*
  96 @conid     = $large $idchar*
  97
  98 @varsym    = $symbol $symchar*
  99 @consym    = \: $symchar*
 100
 101 @decimal     = $decdigit+
 102 @octal       = $octit+
 103 @hexadecimal = $hexit+
 104 @exponent    = [eE] [\-\+]? @decimal
 105
 106 -- we support the hierarchical module name extension:
 107 @qual = (@conid \.)+
 108
 109 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 110
 111 -- normal signed numerical literals can only be explicitly negative,
 112 -- not explicitly positive (contrast @exponent)
 113 @negative = \-
 114 @signed = @negative ?
 115
 116 haskell :-
 117
 118 -- everywhere: skip whitespace and comments
 119 $white_no_nl+                           ;
 120 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 121
 122 -- Everywhere: deal with nested comments.  We explicitly rule out
 123 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 124 -- (this can happen even though pragmas will normally take precedence due to
 125 -- longest-match, because pragmas aren't valid in every state, but comments
 126 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 127 -- set.
 128
 129 "{-" / { isNormalComment } { nested_comment lexToken }
 130
 131 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 132 -- more dashes followed by a symbol should be parsed as a varsym, so we
 133 -- have to exclude those.
 134
 135 -- Since Haddock comments aren't valid in every state, we need to rule them
 136 -- out here.
 137
 138 -- The following two rules match comments that begin with two dashes, but
 139 -- continue with a different character. The rules test that this character
 140 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 141 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 142 -- have a Haddock comment). The rules then munch the rest of the line.
 143
 144 "-- " ~$docsym .* ;
 145 "--" [^$symbol : \ ] .* ;
 146
 147 -- Next, match Haddock comments if no -haddock flag
 148
 149 "-- " $docsym .* / { ifExtension (not . haddockEnabled) } ;
 150
 151 -- Now, when we've matched comments that begin with 2 dashes and continue
 152 -- with a different character, we need to match comments that begin with three
 153 -- or more dashes (which clearly can't be Haddock comments). We only need to
 154 -- make sure that the first non-dash character isn't a symbol, and munch the
 155 -- rest of the line.
 156
 157 "---"\-* [^$symbol :] .* ;
 158
 159 -- Since the previous rules all match dashes followed by at least one
 160 -- character, we also need to match a whole line filled with just dashes.
 161
 162 "--"\-* / { atEOL } ;
 163
 164 -- We need this rule since none of the other single line comment rules
 165 -- actually match this case.
 166
 167 "-- " / { atEOL } ;
 168
 169 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 170 -- blank lines) until we find a non-whitespace character, then do layout
 171 -- processing.
 172 --
 173 -- One slight wibble here: what if the line begins with {-#? In
 174 -- theory, we have to lex the pragma to see if it's one we recognise,
 175 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 176 -- as a nested comment.  We don't bother with this: if the line begins
 177 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 178 <bol> {
 179   \n                                    ;
 180   ^\# (line)?                           { begin line_prag1 }
 181   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 182   ^\# \! .* \n                          ; -- #!, for scripts
 183   ()                                    { do_bol }
 184 }
 185
 186 -- after a layout keyword (let, where, do, of), we begin a new layout
 187 -- context if the curly brace is missing.
 188 -- Careful! This stuff is quite delicate.
 189 <layout, layout_do> {
 190   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 191         -- we might encounter {-# here, but {- has been handled already
 192   \n                                    ;
 193   ^\# (line)?                           { begin line_prag1 }
 194 }
 195
 196 -- do is treated in a subtly different way, see new_layout_context
 197 <layout>    ()                          { new_layout_context True }
 198 <layout_do> ()                          { new_layout_context False }
 199
 200 -- after a new layout context which was found to be to the left of the
 201 -- previous context, we have generated a '{' token, and we now need to
 202 -- generate a matching '}' token.
 203 <layout_left>  ()                       { do_layout_left }
 204
 205 <0,option_prags> \n                             { begin bol }
 206
 207 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 208
 209 -- single-line line pragmas, of the form
 210 --    # <line> "<file>" <extra-stuff> \n
 211 <line_prag1> $decdigit+                 { setLine line_prag1a }
 212 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 213 <line_prag1b> .*                        { pop }
 214
 215 -- Haskell-style line pragmas, of the form
 216 --    {-# LINE <line> "<file>" #-}
 217 <line_prag2> $decdigit+                 { setLine line_prag2a }
 218 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 219 <line_prag2b> "#-}"|"-}"                { pop }
 220    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 221    -- with older versions of GHC which generated these.
 222
 223 -- We only want RULES pragmas to be picked up when explicit forall
 224 -- syntax is enabled is on, because the contents of the pragma always
 225 -- uses it. If it's not on then we're sure to get a parse error.
 226 -- (ToDo: we should really emit a warning when ignoring pragmas)
 227 -- XXX Now that we can enable this without the -fglasgow-exts hammer,
 228 -- is it better just to let the parse error happen?
 229 <0>
 230   "{-#" $whitechar* (RULES|rules) / { ifExtension explicitForallEnabled } { token ITrules_prag }
 231
 232 <0,option_prags> {
 233   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 234   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 235                                         { token (ITinline_prag False) }
 236   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 237                                         { token ITspec_prag }
 238   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 239         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 240   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 241         $whitechar* (NO(T?)INLINE|no(t?)inline)
 242                                         { token (ITspec_inline_prag False) }
 243   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 244   "{-#" $whitechar* (DEPRECATED|deprecated)
 245                                         { token ITdeprecated_prag }
 246   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 247   "{-#" $whitechar* (GENERATED|generated)
 248                                         { token ITgenerated_prag }
 249   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 250   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 251
 252   "{-#" $whitechar* (DOCOPTIONS|docoptions)
 253   / { ifExtension haddockEnabled }     { lex_string_prag ITdocOptions }
 254
 255  "{-#"                                 { nested_comment lexToken }
 256
 257   -- ToDo: should only be valid inside a pragma:
 258   "#-}"                                 { token ITclose_prag}
 259 }
 260
 261 <option_prags> {
 262   "{-#" $whitechar* (OPTIONS|options)   { lex_string_prag IToptions_prag }
 263   "{-#" $whitechar* (OPTIONS_GHC|options_ghc)
 264                                         { lex_string_prag IToptions_prag }
 265   "{-#" $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
 266   "{-#" $whitechar* (INCLUDE|include)   { lex_string_prag ITinclude_prag }
 267 }
 268
 269 <0,option_prags> {
 270         -- This is to catch things like {-# OPTIONS OPTIONS_HUGS ...
 271   "{-#" $whitechar* $idchar+            { nested_comment lexToken }
 272 }
 273
 274 -- '0' state: ordinary lexemes
 275
 276 -- Haddock comments
 277
 278 <0> {
 279   "-- " $docsym    / { ifExtension haddockEnabled } { multiline_doc_comment }
 280   "{-" \ ? $docsym / { ifExtension haddockEnabled } { nested_doc_comment }
 281 }
 282
 283 -- "special" symbols
 284
 285 <0> {
 286   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 287   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 288 }
 289
 290 <0> {
 291   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 292   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 293   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 294   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 295   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 296   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 297   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 298   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 299 }
 300
 301 <0> {
 302   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 303                                         { special IToparenbar }
 304   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 305 }
 306
 307 <0> {
 308   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 309 }
 310
 311 <0> {
 312   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 313          { token IToubxparen }
 314   "#)" / { ifExtension unboxedTuplesEnabled }
 315          { token ITcubxparen }
 316 }
 317
 318 <0> {
 319   "{|" / { ifExtension genericsEnabled } { token ITocurlybar }
 320   "|}" / { ifExtension genericsEnabled } { token ITccurlybar }
 321 }
 322
 323 <0,option_prags> {
 324   \(                                    { special IToparen }
 325   \)                                    { special ITcparen }
 326   \[                                    { special ITobrack }
 327   \]                                    { special ITcbrack }
 328   \,                                    { special ITcomma }
 329   \;                                    { special ITsemi }
 330   \`                                    { special ITbackquote }
 331
 332   \{                                    { open_brace }
 333   \}                                    { close_brace }
 334 }
 335
 336 <0,option_prags> {
 337   @qual @varid                  { idtoken qvarid }
 338   @qual @conid                  { idtoken qconid }
 339   @varid                        { varid }
 340   @conid                        { idtoken conid }
 341 }
 342
 343 <0> {
 344   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 345   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 346   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 347   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 348 }
 349
 350 -- ToDo: M.(,,,)
 351
 352 <0> {
 353   @qual @varsym                 { idtoken qvarsym }
 354   @qual @consym                 { idtoken qconsym }
 355   @varsym                       { varsym }
 356   @consym                       { consym }
 357 }
 358
 359 -- For the normal boxed literals we need to be careful
 360 -- when trying to be close to Haskell98
 361 <0> {
 362   -- Normal integral literals (:: Num a => a, from Integer)
 363   @decimal                      { tok_num positive 0 0 decimal }
 364   0[oO] @octal                  { tok_num positive 2 2 octal }
 365   0[xX] @hexadecimal            { tok_num positive 2 2 hexadecimal }
 366
 367   -- Normal rational literals (:: Fractional a => a, from Rational)
 368   @floating_point               { strtoken tok_float }
 369 }
 370
 371 <0> {
 372   -- Unboxed ints (:: Int#)
 373   -- It's simpler (and faster?) to give separate cases to the negatives,
 374   -- especially considering octal/hexadecimal prefixes.
 375   @decimal \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 376   0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 377   0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 378   @negative @decimal \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 379   @negative 0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 380   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 381
 382   -- Unboxed floats and doubles (:: Float#, :: Double#)
 383   -- prim_{float,double} work with signed literals
 384   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 385   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 386 }
 387
 388 -- Strings and chars are lexed by hand-written code.  The reason is
 389 -- that even if we recognise the string or char here in the regex
 390 -- lexer, we would still have to parse the string afterward in order
 391 -- to convert it to a String.
 392 <0> {
 393   \'                            { lex_char_tok }
 394   \"                            { lex_string_tok }
 395 }
 396
 397 {
 398 -- -----------------------------------------------------------------------------
 399 -- The token type
 400
 401 data Token
 402   = ITas                        -- Haskell keywords
 403   | ITcase
 404   | ITclass
 405   | ITdata
 406   | ITdefault
 407   | ITderiving
 408   | ITderive
 409   | ITdo
 410   | ITelse
 411   | IThiding
 412   | ITif
 413   | ITimport
 414   | ITin
 415   | ITinfix
 416   | ITinfixl
 417   | ITinfixr
 418   | ITinstance
 419   | ITlet
 420   | ITmodule
 421   | ITnewtype
 422   | ITof
 423   | ITqualified
 424   | ITthen
 425   | ITtype
 426   | ITwhere
 427   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 428
 429   | ITforall                    -- GHC extension keywords
 430   | ITforeign
 431   | ITexport
 432   | ITlabel
 433   | ITdynamic
 434   | ITsafe
 435   | ITthreadsafe
 436   | ITunsafe
 437   | ITstdcallconv
 438   | ITccallconv
 439   | ITdotnet
 440   | ITmdo
 441   | ITfamily
 442
 443         -- Pragmas
 444   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 445   | ITspec_prag                 -- SPECIALISE
 446   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 447   | ITsource_prag
 448   | ITrules_prag
 449   | ITdeprecated_prag
 450   | ITline_prag
 451   | ITscc_prag
 452   | ITgenerated_prag
 453   | ITcore_prag                 -- hdaume: core annotations
 454   | ITunpack_prag
 455   | ITclose_prag
 456   | IToptions_prag String
 457   | ITinclude_prag String
 458   | ITlanguage_prag
 459
 460   | ITdotdot                    -- reserved symbols
 461   | ITcolon
 462   | ITdcolon
 463   | ITequal
 464   | ITlam
 465   | ITvbar
 466   | ITlarrow
 467   | ITrarrow
 468   | ITat
 469   | ITtilde
 470   | ITdarrow
 471   | ITminus
 472   | ITbang
 473   | ITstar
 474   | ITdot
 475
 476   | ITbiglam                    -- GHC-extension symbols
 477
 478   | ITocurly                    -- special symbols
 479   | ITccurly
 480   | ITocurlybar                 -- {|, for type applications
 481   | ITccurlybar                 -- |}, for type applications
 482   | ITvocurly
 483   | ITvccurly
 484   | ITobrack
 485   | ITopabrack                  -- [:, for parallel arrays with -fparr
 486   | ITcpabrack                  -- :], for parallel arrays with -fparr
 487   | ITcbrack
 488   | IToparen
 489   | ITcparen
 490   | IToubxparen
 491   | ITcubxparen
 492   | ITsemi
 493   | ITcomma
 494   | ITunderscore
 495   | ITbackquote
 496
 497   | ITvarid   FastString        -- identifiers
 498   | ITconid   FastString
 499   | ITvarsym  FastString
 500   | ITconsym  FastString
 501   | ITqvarid  (FastString,FastString)
 502   | ITqconid  (FastString,FastString)
 503   | ITqvarsym (FastString,FastString)
 504   | ITqconsym (FastString,FastString)
 505
 506   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 507
 508   | ITpragma StringBuffer
 509
 510   | ITchar       Char
 511   | ITstring     FastString
 512   | ITinteger    Integer
 513   | ITrational   Rational
 514
 515   | ITprimchar   Char
 516   | ITprimstring FastString
 517   | ITprimint    Integer
 518   | ITprimfloat  Rational
 519   | ITprimdouble Rational
 520
 521   -- MetaHaskell extension tokens
 522   | ITopenExpQuote              --  [| or [e|
 523   | ITopenPatQuote              --  [p|
 524   | ITopenDecQuote              --  [d|
 525   | ITopenTypQuote              --  [t|
 526   | ITcloseQuote                --  |]
 527   | ITidEscape   FastString     --  $x
 528   | ITparenEscape               --  $(
 529   | ITvarQuote                  --  '
 530   | ITtyQuote                   --  ''
 531
 532   -- Arrow notation extension
 533   | ITproc
 534   | ITrec
 535   | IToparenbar                 --  (|
 536   | ITcparenbar                 --  |)
 537   | ITlarrowtail                --  -<
 538   | ITrarrowtail                --  >-
 539   | ITLarrowtail                --  -<<
 540   | ITRarrowtail                --  >>-
 541
 542   | ITunknown String            -- Used when the lexer can't make sense of it
 543   | ITeof                       -- end of file token
 544
 545   -- Documentation annotations
 546   | ITdocCommentNext  String     -- something beginning '-- |'
 547   | ITdocCommentPrev  String     -- something beginning '-- ^'
 548   | ITdocCommentNamed String     -- something beginning '-- $'
 549   | ITdocSection      Int String -- a section heading
 550   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 551
 552 #ifdef DEBUG
 553   deriving Show -- debugging
 554 #endif
 555
 556 isSpecial :: Token -> Bool
 557 -- If we see M.x, where x is a keyword, but
 558 -- is special, we treat is as just plain M.x,
 559 -- not as a keyword.
 560 isSpecial ITas          = True
 561 isSpecial IThiding      = True
 562 isSpecial ITderive      = True
 563 isSpecial ITqualified   = True
 564 isSpecial ITforall      = True
 565 isSpecial ITexport      = True
 566 isSpecial ITlabel       = True
 567 isSpecial ITdynamic     = True
 568 isSpecial ITsafe        = True
 569 isSpecial ITthreadsafe  = True
 570 isSpecial ITunsafe      = True
 571 isSpecial ITccallconv   = True
 572 isSpecial ITstdcallconv = True
 573 isSpecial ITmdo         = True
 574 isSpecial ITfamily      = True
 575 isSpecial _             = False
 576
 577 -- the bitmap provided as the third component indicates whether the
 578 -- corresponding extension keyword is valid under the extension options
 579 -- provided to the compiler; if the extension corresponding to *any* of the
 580 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 581 -- facilitates using a keyword in two different extensions that can be
 582 -- activated independently)
 583 --
 584 reservedWordsFM = listToUFM $
 585         map (\(x, y, z) -> (mkFastString x, (y, z)))
 586        [( "_",          ITunderscore,   0 ),
 587         ( "as",         ITas,           0 ),
 588         ( "case",       ITcase,         0 ),
 589         ( "class",      ITclass,        0 ),
 590         ( "data",       ITdata,         0 ),
 591         ( "default",    ITdefault,      0 ),
 592         ( "deriving",   ITderiving,     0 ),
 593         ( "derive",     ITderive,       0 ),
 594         ( "do",         ITdo,           0 ),
 595         ( "else",       ITelse,         0 ),
 596         ( "hiding",     IThiding,       0 ),
 597         ( "if",         ITif,           0 ),
 598         ( "import",     ITimport,       0 ),
 599         ( "in",         ITin,           0 ),
 600         ( "infix",      ITinfix,        0 ),
 601         ( "infixl",     ITinfixl,       0 ),
 602         ( "infixr",     ITinfixr,       0 ),
 603         ( "instance",   ITinstance,     0 ),
 604         ( "let",        ITlet,          0 ),
 605         ( "module",     ITmodule,       0 ),
 606         ( "newtype",    ITnewtype,      0 ),
 607         ( "of",         ITof,           0 ),
 608         ( "qualified",  ITqualified,    0 ),
 609         ( "then",       ITthen,         0 ),
 610         ( "type",       ITtype,         0 ),
 611         ( "where",      ITwhere,        0 ),
 612         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 613
 614         ( "forall",     ITforall,        bit explicitForallBit),
 615         ( "mdo",        ITmdo,           bit recursiveDoBit),
 616         ( "family",     ITfamily,        bit tyFamBit),
 617
 618         ( "foreign",    ITforeign,       bit ffiBit),
 619         ( "export",     ITexport,        bit ffiBit),
 620         ( "label",      ITlabel,         bit ffiBit),
 621         ( "dynamic",    ITdynamic,       bit ffiBit),
 622         ( "safe",       ITsafe,          bit ffiBit),
 623         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 624         ( "unsafe",     ITunsafe,        bit ffiBit),
 625         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 626         ( "ccall",      ITccallconv,     bit ffiBit),
 627         ( "dotnet",     ITdotnet,        bit ffiBit),
 628
 629         ( "rec",        ITrec,           bit arrowsBit),
 630         ( "proc",       ITproc,          bit arrowsBit)
 631      ]
 632
 633 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 634 reservedSymsFM = listToUFM $
 635     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 636       [ ("..",  ITdotdot,   always)
 637         -- (:) is a reserved op, meaning only list cons
 638        ,(":",   ITcolon,    always)
 639        ,("::",  ITdcolon,   always)
 640        ,("=",   ITequal,    always)
 641        ,("\\",  ITlam,      always)
 642        ,("|",   ITvbar,     always)
 643        ,("<-",  ITlarrow,   always)
 644        ,("->",  ITrarrow,   always)
 645        ,("@",   ITat,       always)
 646        ,("~",   ITtilde,    always)
 647        ,("=>",  ITdarrow,   always)
 648        ,("-",   ITminus,    always)
 649        ,("!",   ITbang,     always)
 650
 651         -- For data T (a::*) = MkT
 652        ,("*", ITstar, \i -> kindSigsEnabled i || tyFamEnabled i)
 653         -- For 'forall a . t'
 654        ,(".", ITdot, explicitForallEnabled)
 655
 656        ,("-<",  ITlarrowtail, arrowsEnabled)
 657        ,(">-",  ITrarrowtail, arrowsEnabled)
 658        ,("-<<", ITLarrowtail, arrowsEnabled)
 659        ,(">>-", ITRarrowtail, arrowsEnabled)
 660
 661 #if __GLASGOW_HASKELL__ >= 605
 662        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 663        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 664        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 665                                 explicitForallEnabled i)
 666        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 667        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 668        ,("⋯",   ITdotdot, unicodeSyntaxEnabled)
 669         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 670         -- form part of a large operator.  This would let us have a better
 671         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 672 #endif
 673        ]
 674
 675 -- -----------------------------------------------------------------------------
 676 -- Lexer actions
 677
 678 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 679
 680 special :: Token -> Action
 681 special tok span _buf len = return (L span tok)
 682
 683 token, layout_token :: Token -> Action
 684 token t span buf len = return (L span t)
 685 layout_token t span buf len = pushLexState layout >> return (L span t)
 686
 687 idtoken :: (StringBuffer -> Int -> Token) -> Action
 688 idtoken f span buf len = return (L span $! (f buf len))
 689
 690 skip_one_varid :: (FastString -> Token) -> Action
 691 skip_one_varid f span buf len
 692   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 693
 694 strtoken :: (String -> Token) -> Action
 695 strtoken f span buf len =
 696   return (L span $! (f $! lexemeToString buf len))
 697
 698 init_strtoken :: Int -> (String -> Token) -> Action
 699 -- like strtoken, but drops the last N character(s)
 700 init_strtoken drop f span buf len =
 701   return (L span $! (f $! lexemeToString buf (len-drop)))
 702
 703 begin :: Int -> Action
 704 begin code _span _str _len = do pushLexState code; lexToken
 705
 706 pop :: Action
 707 pop _span _buf _len = do popLexState; lexToken
 708
 709 pop_and :: Action -> Action
 710 pop_and act span buf len = do popLexState; act span buf len
 711
 712 {-# INLINE nextCharIs #-}
 713 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 714
 715 notFollowedBy char _ _ _ (AI _ _ buf)
 716   = nextCharIs buf (/=char)
 717
 718 notFollowedBySymbol _ _ _ (AI _ _ buf)
 719   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 720
 721 -- We must reject doc comments as being ordinary comments everywhere.
 722 -- In some cases the doc comment will be selected as the lexeme due to
 723 -- maximal munch, but not always, because the nested comment rule is
 724 -- valid in all states, but the doc-comment rules are only valid in
 725 -- the non-layout states.
 726 isNormalComment bits _ _ (AI _ _ buf)
 727   | haddockEnabled bits = notFollowedByDocOrPragma
 728   | otherwise           = nextCharIs buf (/='#')
 729   where
 730     notFollowedByDocOrPragma
 731        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 732
 733 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 734
 735 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 736   = if haddockEnabled bits then False else (p buf)
 737
 738 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 739
 740 ifExtension pred bits _ _ _ = pred bits
 741
 742 multiline_doc_comment :: Action
 743 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 744   where
 745     worker commentAcc input docType oneLine = case alexGetChar input of
 746       Just ('\n', input')
 747         | oneLine -> docCommentEnd input commentAcc docType buf span
 748         | otherwise -> case checkIfCommentLine input' of
 749           Just input -> worker ('\n':commentAcc) input docType False
 750           Nothing -> docCommentEnd input commentAcc docType buf span
 751       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 752       Nothing -> docCommentEnd input commentAcc docType buf span
 753
 754     checkIfCommentLine input = check (dropNonNewlineSpace input)
 755       where
 756         check input = case alexGetChar input of
 757           Just ('-', input) -> case alexGetChar input of
 758             Just ('-', input) -> case alexGetChar input of
 759               Just (c, _) | c /= '-' -> Just input
 760               _ -> Nothing
 761             _ -> Nothing
 762           _ -> Nothing
 763
 764         dropNonNewlineSpace input = case alexGetChar input of
 765           Just (c, input')
 766             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 767             | otherwise -> input
 768           Nothing -> input
 769
 770 {-
 771   nested comments require traversing by hand, they can't be parsed
 772   using regular expressions.
 773 -}
 774 nested_comment :: P (Located Token) -> Action
 775 nested_comment cont span _str _len = do
 776   input <- getInput
 777   go 1 input
 778   where
 779     go 0 input = do setInput input; cont
 780     go n input = case alexGetChar input of
 781       Nothing -> errBrace input span
 782       Just ('-',input) -> case alexGetChar input of
 783         Nothing  -> errBrace input span
 784         Just ('\125',input) -> go (n-1) input
 785         Just (c,_)          -> go n input
 786       Just ('\123',input) -> case alexGetChar input of
 787         Nothing  -> errBrace input span
 788         Just ('-',input) -> go (n+1) input
 789         Just (c,_)       -> go n input
 790       Just (c,input) -> go n input
 791
 792 nested_doc_comment :: Action
 793 nested_doc_comment span buf _len = withLexedDocType (go "")
 794   where
 795     go commentAcc input docType _ = case alexGetChar input of
 796       Nothing -> errBrace input span
 797       Just ('-',input) -> case alexGetChar input of
 798         Nothing -> errBrace input span
 799         Just ('\125',input@(AI end _ buf2)) ->
 800           docCommentEnd input commentAcc docType buf span
 801         Just (c,_) -> go ('-':commentAcc) input docType False
 802       Just ('\123', input) -> case alexGetChar input of
 803         Nothing  -> errBrace input span
 804         Just ('-',input) -> do
 805           setInput input
 806           let cont = do input <- getInput; go commentAcc input docType False
 807           nested_comment cont span buf _len
 808         Just (c,_) -> go ('\123':commentAcc) input docType False
 809       Just (c,input) -> go (c:commentAcc) input docType False
 810
 811 withLexedDocType lexDocComment = do
 812   input@(AI _ _ buf) <- getInput
 813   case prevChar buf ' ' of
 814     '|' -> lexDocComment input ITdocCommentNext False
 815     '^' -> lexDocComment input ITdocCommentPrev False
 816     '$' -> lexDocComment input ITdocCommentNamed False
 817     '*' -> lexDocSection 1 input
 818  where
 819     lexDocSection n input = case alexGetChar input of
 820       Just ('*', input) -> lexDocSection (n+1) input
 821       Just (c, _) -> lexDocComment input (ITdocSection n) True
 822       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 823
 824 -- docCommentEnd
 825 -------------------------------------------------------------------------------
 826 -- This function is quite tricky. We can't just return a new token, we also
 827 -- need to update the state of the parser. Why? Because the token is longer
 828 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 829 -- it writes the wrong token length to the parser state. This function is
 830 -- called afterwards, so it can just update the state.
 831
 832 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 833 -- which is something that the original lexer didn't account for.
 834 -- I have added last_line_len in the parser state which represents the length
 835 -- of the part of the token that is on the last line. It is now used for layout
 836 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 837 -- was before, the full length of the token, and it is now only used for error
 838 -- messages. /Waern
 839
 840 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 841                  SrcSpan -> P (Located Token)
 842 docCommentEnd input commentAcc docType buf span = do
 843   setInput input
 844   let (AI loc last_offs nextBuf) = input
 845       comment = reverse commentAcc
 846       span' = mkSrcSpan (srcSpanStart span) loc
 847       last_len = byteDiff buf nextBuf
 848
 849       last_line_len = if (last_offs - last_len < 0)
 850         then last_offs
 851         else last_len
 852
 853   span `seq` setLastToken span' last_len last_line_len
 854   return (L span' (docType comment))
 855
 856 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 857
 858 open_brace, close_brace :: Action
 859 open_brace span _str _len = do
 860   ctx <- getContext
 861   setContext (NoLayout:ctx)
 862   return (L span ITocurly)
 863 close_brace span _str _len = do
 864   popContext
 865   return (L span ITccurly)
 866
 867 qvarid buf len = ITqvarid $! splitQualName buf len
 868 qconid buf len = ITqconid $! splitQualName buf len
 869
 870 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 871 -- takes a StringBuffer and a length, and returns the module name
 872 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 873 -- because of hierarchical module names.
 874 splitQualName orig_buf len = split orig_buf orig_buf
 875   where
 876     split buf dot_buf
 877         | orig_buf `byteDiff` buf >= len  = done dot_buf
 878         | c == '.'                        = found_dot buf'
 879         | otherwise                       = split buf' dot_buf
 880       where
 881        (c,buf') = nextChar buf
 882
 883     -- careful, we might get names like M....
 884     -- so, if the character after the dot is not upper-case, this is
 885     -- the end of the qualifier part.
 886     found_dot buf -- buf points after the '.'
 887         | isUpper c    = split buf' buf
 888         | otherwise    = done buf
 889       where
 890        (c,buf') = nextChar buf
 891
 892     done dot_buf =
 893         (lexemeToFastString orig_buf (qual_size - 1),
 894          lexemeToFastString dot_buf (len - qual_size))
 895       where
 896         qual_size = orig_buf `byteDiff` dot_buf
 897
 898 varid span buf len =
 899   case lookupUFM reservedWordsFM fs of
 900         Just (keyword,0)    -> do
 901                 maybe_layout keyword
 902                 return (L span keyword)
 903         Just (keyword,exts) -> do
 904                 b <- extension (\i -> exts .&. i /= 0)
 905                 if b then do maybe_layout keyword
 906                              return (L span keyword)
 907                      else return (L span (ITvarid fs))
 908         _other -> return (L span (ITvarid fs))
 909   where
 910         fs = lexemeToFastString buf len
 911
 912 conid buf len = ITconid fs
 913   where fs = lexemeToFastString buf len
 914
 915 qvarsym buf len = ITqvarsym $! splitQualName buf len
 916 qconsym buf len = ITqconsym $! splitQualName buf len
 917
 918 varsym = sym ITvarsym
 919 consym = sym ITconsym
 920
 921 sym con span buf len =
 922   case lookupUFM reservedSymsFM fs of
 923         Just (keyword,exts) -> do
 924                 b <- extension exts
 925                 if b then return (L span keyword)
 926                      else return (L span $! con fs)
 927         _other -> return (L span $! con fs)
 928   where
 929         fs = lexemeToFastString buf len
 930
 931 -- Variations on the integral numeric literal.
 932 tok_integral :: (Integer -> Token)
 933      -> (Integer -> Integer)
 934  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
 935      -> Int -> Int
 936      -> (Integer, (Char->Int)) -> Action
 937 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
 938   return $ L span $ itint $! transint $ parseUnsignedInteger
 939      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
 940
 941 -- some conveniences for use with tok_integral
 942 tok_num = tok_integral ITinteger
 943 tok_primint = tok_integral ITprimint
 944 positive = id
 945 negative = negate
 946 decimal = (10,octDecDigit)
 947 octal = (8,octDecDigit)
 948 hexadecimal = (16,hexDigit)
 949
 950 -- readRational can understand negative rationals, exponents, everything.
 951 tok_float        str = ITrational   $! readRational str
 952 tok_primfloat    str = ITprimfloat  $! readRational str
 953 tok_primdouble   str = ITprimdouble $! readRational str
 954
 955 -- -----------------------------------------------------------------------------
 956 -- Layout processing
 957
 958 -- we're at the first token on a line, insert layout tokens if necessary
 959 do_bol :: Action
 960 do_bol span _str _len = do
 961         pos <- getOffside
 962         case pos of
 963             LT -> do
 964                 --trace "layout: inserting '}'" $ do
 965                 popContext
 966                 -- do NOT pop the lex state, we might have a ';' to insert
 967                 return (L span ITvccurly)
 968             EQ -> do
 969                 --trace "layout: inserting ';'" $ do
 970                 popLexState
 971                 return (L span ITsemi)
 972             GT -> do
 973                 popLexState
 974                 lexToken
 975
 976 -- certain keywords put us in the "layout" state, where we might
 977 -- add an opening curly brace.
 978 maybe_layout ITdo       = pushLexState layout_do
 979 maybe_layout ITmdo      = pushLexState layout_do
 980 maybe_layout ITof       = pushLexState layout
 981 maybe_layout ITlet      = pushLexState layout
 982 maybe_layout ITwhere    = pushLexState layout
 983 maybe_layout ITrec      = pushLexState layout
 984 maybe_layout _          = return ()
 985
 986 -- Pushing a new implicit layout context.  If the indentation of the
 987 -- next token is not greater than the previous layout context, then
 988 -- Haskell 98 says that the new layout context should be empty; that is
 989 -- the lexer must generate {}.
 990 --
 991 -- We are slightly more lenient than this: when the new context is started
 992 -- by a 'do', then we allow the new context to be at the same indentation as
 993 -- the previous context.  This is what the 'strict' argument is for.
 994 --
 995 new_layout_context strict span _buf _len = do
 996     popLexState
 997     (AI _ offset _) <- getInput
 998     ctx <- getContext
 999     case ctx of
1000         Layout prev_off : _  |
1001            (strict     && prev_off >= offset  ||
1002             not strict && prev_off > offset) -> do
1003                 -- token is indented to the left of the previous context.
1004                 -- we must generate a {} sequence now.
1005                 pushLexState layout_left
1006                 return (L span ITvocurly)
1007         other -> do
1008                 setContext (Layout offset : ctx)
1009                 return (L span ITvocurly)
1010
1011 do_layout_left span _buf _len = do
1012     popLexState
1013     pushLexState bol  -- we must be at the start of a line
1014     return (L span ITvccurly)
1015
1016 -- -----------------------------------------------------------------------------
1017 -- LINE pragmas
1018
1019 setLine :: Int -> Action
1020 setLine code span buf len = do
1021   let line = parseUnsignedInteger buf len 10 octDecDigit
1022   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
1023         -- subtract one: the line number refers to the *following* line
1024   popLexState
1025   pushLexState code
1026   lexToken
1027
1028 setFile :: Int -> Action
1029 setFile code span buf len = do
1030   let file = lexemeToFastString (stepOn buf) (len-2)
1031   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1032   popLexState
1033   pushLexState code
1034   lexToken
1035
1036
1037 -- -----------------------------------------------------------------------------
1038 -- Options, includes and language pragmas.
1039
1040 lex_string_prag :: (String -> Token) -> Action
1041 lex_string_prag mkTok span buf len
1042     = do input <- getInput
1043          start <- getSrcLoc
1044          tok <- go [] input
1045          end <- getSrcLoc
1046          return (L (mkSrcSpan start end) tok)
1047     where go acc input
1048               = if isString input "#-}"
1049                    then do setInput input
1050                            return (mkTok (reverse acc))
1051                    else case alexGetChar input of
1052                           Just (c,i) -> go (c:acc) i
1053                           Nothing -> err input
1054           isString i [] = True
1055           isString i (x:xs)
1056               = case alexGetChar i of
1057                   Just (c,i') | c == x    -> isString i' xs
1058                   _other -> False
1059           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1060
1061
1062 -- -----------------------------------------------------------------------------
1063 -- Strings & Chars
1064
1065 -- This stuff is horrible.  I hates it.
1066
1067 lex_string_tok :: Action
1068 lex_string_tok span buf len = do
1069   tok <- lex_string ""
1070   end <- getSrcLoc
1071   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1072
1073 lex_string :: String -> P Token
1074 lex_string s = do
1075   i <- getInput
1076   case alexGetChar' i of
1077     Nothing -> lit_error
1078
1079     Just ('"',i)  -> do
1080         setInput i
1081         magicHash <- extension magicHashEnabled
1082         if magicHash
1083           then do
1084             i <- getInput
1085             case alexGetChar' i of
1086               Just ('#',i) -> do
1087                    setInput i
1088                    if any (> '\xFF') s
1089                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1090                     else let s' = mkZFastString (reverse s) in
1091                          return (ITprimstring s')
1092                         -- mkZFastString is a hack to avoid encoding the
1093                         -- string in UTF-8.  We just want the exact bytes.
1094               _other ->
1095                 return (ITstring (mkFastString (reverse s)))
1096           else
1097                 return (ITstring (mkFastString (reverse s)))
1098
1099     Just ('\\',i)
1100         | Just ('&',i) <- next -> do
1101                 setInput i; lex_string s
1102         | Just (c,i) <- next, is_space c -> do
1103                 setInput i; lex_stringgap s
1104         where next = alexGetChar' i
1105
1106     Just (c, i) -> do
1107         c' <- lex_char c i
1108         lex_string (c':s)
1109
1110 lex_stringgap s = do
1111   c <- getCharOrFail
1112   case c of
1113     '\\' -> lex_string s
1114     c | is_space c -> lex_stringgap s
1115     _other -> lit_error
1116
1117
1118 lex_char_tok :: Action
1119 -- Here we are basically parsing character literals, such as 'x' or '\n'
1120 -- but, when Template Haskell is on, we additionally spot
1121 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1122 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
1123 -- So we have to do two characters of lookahead: when we see 'x we need to
1124 -- see if there's a trailing quote
1125 lex_char_tok span buf len = do  -- We've seen '
1126    i1 <- getInput       -- Look ahead to first character
1127    let loc = srcSpanStart span
1128    case alexGetChar' i1 of
1129         Nothing -> lit_error
1130
1131         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
1132                   th_exts <- extension thEnabled
1133                   if th_exts then do
1134                         setInput i2
1135                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1136                    else lit_error
1137
1138         Just ('\\', i2@(AI end2 _ _)) -> do     -- We've seen 'backslash
1139                   setInput i2
1140                   lit_ch <- lex_escape
1141                   mc <- getCharOrFail   -- Trailing quote
1142                   if mc == '\'' then finish_char_tok loc lit_ch
1143                                 else do setInput i2; lit_error
1144
1145         Just (c, i2@(AI end2 _ _))
1146                 | not (isAny c) -> lit_error
1147                 | otherwise ->
1148
1149                 -- We've seen 'x, where x is a valid character
1150                 --  (i.e. not newline etc) but not a quote or backslash
1151            case alexGetChar' i2 of      -- Look ahead one more character
1152                 Nothing -> lit_error
1153                 Just ('\'', i3) -> do   -- We've seen 'x'
1154                         setInput i3
1155                         finish_char_tok loc c
1156                 _other -> do            -- We've seen 'x not followed by quote
1157                                         -- If TH is on, just parse the quote only
1158                         th_exts <- extension thEnabled
1159                         let (AI end _ _) = i1
1160                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1161                                    else do setInput i2; lit_error
1162
1163 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1164 finish_char_tok loc ch  -- We've already seen the closing quote
1165                         -- Just need to check for trailing #
1166   = do  magicHash <- extension magicHashEnabled
1167         i@(AI end _ _) <- getInput
1168         if magicHash then do
1169                 case alexGetChar' i of
1170                         Just ('#',i@(AI end _ _)) -> do
1171                                 setInput i
1172                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1173                         _other ->
1174                                 return (L (mkSrcSpan loc end) (ITchar ch))
1175                 else do
1176                    return (L (mkSrcSpan loc end) (ITchar ch))
1177
1178 lex_char :: Char -> AlexInput -> P Char
1179 lex_char c inp = do
1180   case c of
1181       '\\' -> do setInput inp; lex_escape
1182       c | isAny c -> do setInput inp; return c
1183       _other -> lit_error
1184
1185 isAny c | c > '\xff' = isPrint c
1186         | otherwise  = is_any c
1187
1188 lex_escape :: P Char
1189 lex_escape = do
1190   c <- getCharOrFail
1191   case c of
1192         'a'   -> return '\a'
1193         'b'   -> return '\b'
1194         'f'   -> return '\f'
1195         'n'   -> return '\n'
1196         'r'   -> return '\r'
1197         't'   -> return '\t'
1198         'v'   -> return '\v'
1199         '\\'  -> return '\\'
1200         '"'   -> return '\"'
1201         '\''  -> return '\''
1202         '^'   -> do c <- getCharOrFail
1203                     if c >= '@' && c <= '_'
1204                         then return (chr (ord c - ord '@'))
1205                         else lit_error
1206
1207         'x'   -> readNum is_hexdigit 16 hexDigit
1208         'o'   -> readNum is_octdigit  8 octDecDigit
1209         x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
1210
1211         c1 ->  do
1212            i <- getInput
1213            case alexGetChar' i of
1214             Nothing -> lit_error
1215             Just (c2,i2) ->
1216               case alexGetChar' i2 of
1217                 Nothing -> do setInput i2; lit_error
1218                 Just (c3,i3) ->
1219                    let str = [c1,c2,c3] in
1220                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1221                                      Just rest <- [maybePrefixMatch p str] ] of
1222                           (escape_char,[]):_ -> do
1223                                 setInput i3
1224                                 return escape_char
1225                           (escape_char,_:_):_ -> do
1226                                 setInput i2
1227                                 return escape_char
1228                           [] -> lit_error
1229
1230 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1231 readNum is_digit base conv = do
1232   i <- getInput
1233   c <- getCharOrFail
1234   if is_digit c
1235         then readNum2 is_digit base conv (conv c)
1236         else do setInput i; lit_error
1237
1238 readNum2 is_digit base conv i = do
1239   input <- getInput
1240   read i input
1241   where read i input = do
1242           case alexGetChar' input of
1243             Just (c,input') | is_digit c -> do
1244                 read (i*base + conv c) input'
1245             _other -> do
1246                 if i >= 0 && i <= 0x10FFFF
1247                    then do setInput input; return (chr i)
1248                    else lit_error
1249
1250 silly_escape_chars = [
1251         ("NUL", '\NUL'),
1252         ("SOH", '\SOH'),
1253         ("STX", '\STX'),
1254         ("ETX", '\ETX'),
1255         ("EOT", '\EOT'),
1256         ("ENQ", '\ENQ'),
1257         ("ACK", '\ACK'),
1258         ("BEL", '\BEL'),
1259         ("BS", '\BS'),
1260         ("HT", '\HT'),
1261         ("LF", '\LF'),
1262         ("VT", '\VT'),
1263         ("FF", '\FF'),
1264         ("CR", '\CR'),
1265         ("SO", '\SO'),
1266         ("SI", '\SI'),
1267         ("DLE", '\DLE'),
1268         ("DC1", '\DC1'),
1269         ("DC2", '\DC2'),
1270         ("DC3", '\DC3'),
1271         ("DC4", '\DC4'),
1272         ("NAK", '\NAK'),
1273         ("SYN", '\SYN'),
1274         ("ETB", '\ETB'),
1275         ("CAN", '\CAN'),
1276         ("EM", '\EM'),
1277         ("SUB", '\SUB'),
1278         ("ESC", '\ESC'),
1279         ("FS", '\FS'),
1280         ("GS", '\GS'),
1281         ("RS", '\RS'),
1282         ("US", '\US'),
1283         ("SP", '\SP'),
1284         ("DEL", '\DEL')
1285         ]
1286
1287 -- before calling lit_error, ensure that the current input is pointing to
1288 -- the position of the error in the buffer.  This is so that we can report
1289 -- a correct location to the user, but also so we can detect UTF-8 decoding
1290 -- errors if they occur.
1291 lit_error = lexError "lexical error in string/character literal"
1292
1293 getCharOrFail :: P Char
1294 getCharOrFail =  do
1295   i <- getInput
1296   case alexGetChar' i of
1297         Nothing -> lexError "unexpected end-of-file in string/character literal"
1298         Just (c,i)  -> do setInput i; return c
1299
1300 -- -----------------------------------------------------------------------------
1301 -- Warnings
1302
1303 warn :: DynFlag -> SDoc -> Action
1304 warn option warning span _buf _len = do
1305     addWarning option (mkWarnMsg span alwaysQualify warning)
1306     lexToken
1307
1308 -- -----------------------------------------------------------------------------
1309 -- The Parse Monad
1310
1311 data LayoutContext
1312   = NoLayout
1313   | Layout !Int
1314   deriving Show
1315
1316 data ParseResult a
1317   = POk PState a
1318   | PFailed
1319         SrcSpan         -- The start and end of the text span related to
1320                         -- the error.  Might be used in environments which can
1321                         -- show this span, e.g. by highlighting it.
1322         Message         -- The error message
1323
1324 data PState = PState {
1325         buffer     :: StringBuffer,
1326     dflags     :: DynFlags,
1327     messages   :: Messages,
1328         last_loc   :: SrcSpan,  -- pos of previous token
1329         last_offs  :: !Int,     -- offset of the previous token from the
1330                                 -- beginning of  the current line.
1331                                 -- \t is equal to 8 spaces.
1332         last_len   :: !Int,     -- len of previous token
1333   last_line_len :: !Int,
1334         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1335         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1336         context    :: [LayoutContext],
1337         lex_state  :: [Int]
1338      }
1339         -- last_loc and last_len are used when generating error messages,
1340         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1341         -- current token to happyError, we could at least get rid of last_len.
1342         -- Getting rid of last_loc would require finding another way to
1343         -- implement pushCurrentContext (which is only called from one place).
1344
1345 newtype P a = P { unP :: PState -> ParseResult a }
1346
1347 instance Monad P where
1348   return = returnP
1349   (>>=) = thenP
1350   fail = failP
1351
1352 returnP :: a -> P a
1353 returnP a = P $ \s -> POk s a
1354
1355 thenP :: P a -> (a -> P b) -> P b
1356 (P m) `thenP` k = P $ \ s ->
1357         case m s of
1358                 POk s1 a         -> (unP (k a)) s1
1359                 PFailed span err -> PFailed span err
1360
1361 failP :: String -> P a
1362 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1363
1364 failMsgP :: String -> P a
1365 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1366
1367 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1368 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1369
1370 failSpanMsgP :: SrcSpan -> String -> P a
1371 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1372
1373 extension :: (Int -> Bool) -> P Bool
1374 extension p = P $ \s -> POk s (p $! extsBitmap s)
1375
1376 getExts :: P Int
1377 getExts = P $ \s -> POk s (extsBitmap s)
1378
1379 setSrcLoc :: SrcLoc -> P ()
1380 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1381
1382 getSrcLoc :: P SrcLoc
1383 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1384
1385 setLastToken :: SrcSpan -> Int -> Int -> P ()
1386 setLastToken loc len line_len = P $ \s -> POk s {
1387   last_loc=loc,
1388   last_len=len,
1389   last_line_len=line_len
1390 } ()
1391
1392 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1393
1394 alexInputPrevChar :: AlexInput -> Char
1395 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1396
1397 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1398 alexGetChar (AI loc ofs s)
1399   | atEnd s   = Nothing
1400   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1401                 --trace (show (ord c)) $
1402                 Just (adj_c, (AI loc' ofs' s'))
1403   where (c,s') = nextChar s
1404         loc'   = advanceSrcLoc loc c
1405         ofs'   = advanceOffs c ofs
1406
1407         non_graphic     = '\x0'
1408         upper           = '\x1'
1409         lower           = '\x2'
1410         digit           = '\x3'
1411         symbol          = '\x4'
1412         space           = '\x5'
1413         other_graphic   = '\x6'
1414
1415         adj_c
1416           | c <= '\x06' = non_graphic
1417           | c <= '\xff' = c
1418           -- Alex doesn't handle Unicode, so when Unicode
1419           -- character is encoutered we output these values
1420           -- with the actual character value hidden in the state.
1421           | otherwise =
1422                 case generalCategory c of
1423                   UppercaseLetter       -> upper
1424                   LowercaseLetter       -> lower
1425                   TitlecaseLetter       -> upper
1426                   ModifierLetter        -> other_graphic
1427                   OtherLetter           -> other_graphic
1428                   NonSpacingMark        -> other_graphic
1429                   SpacingCombiningMark  -> other_graphic
1430                   EnclosingMark         -> other_graphic
1431                   DecimalNumber         -> digit
1432                   LetterNumber          -> other_graphic
1433                   OtherNumber           -> other_graphic
1434                   ConnectorPunctuation  -> other_graphic
1435                   DashPunctuation       -> other_graphic
1436                   OpenPunctuation       -> other_graphic
1437                   ClosePunctuation      -> other_graphic
1438                   InitialQuote          -> other_graphic
1439                   FinalQuote            -> other_graphic
1440                   OtherPunctuation      -> other_graphic
1441                   MathSymbol            -> symbol
1442                   CurrencySymbol        -> symbol
1443                   ModifierSymbol        -> symbol
1444                   OtherSymbol           -> symbol
1445                   Space                 -> space
1446                   _other                -> non_graphic
1447
1448 -- This version does not squash unicode characters, it is used when
1449 -- lexing strings.
1450 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1451 alexGetChar' (AI loc ofs s)
1452   | atEnd s   = Nothing
1453   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1454                 --trace (show (ord c)) $
1455                 Just (c, (AI loc' ofs' s'))
1456   where (c,s') = nextChar s
1457         loc'   = advanceSrcLoc loc c
1458         ofs'   = advanceOffs c ofs
1459
1460 advanceOffs :: Char -> Int -> Int
1461 advanceOffs '\n' offs = 0
1462 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1463 advanceOffs _    offs = offs + 1
1464
1465 getInput :: P AlexInput
1466 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1467
1468 setInput :: AlexInput -> P ()
1469 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1470
1471 pushLexState :: Int -> P ()
1472 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1473
1474 popLexState :: P Int
1475 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1476
1477 getLexState :: P Int
1478 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1479
1480 -- for reasons of efficiency, flags indicating language extensions (eg,
1481 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1482 -- integer
1483
1484 genericsBit, ffiBit, parrBit :: Int
1485 genericsBit = 0 -- {| and |}
1486 ffiBit     = 1
1487 parrBit    = 2
1488 arrowsBit  = 4
1489 thBit      = 5
1490 ipBit      = 6
1491 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1492 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1493                 -- (doesn't affect the lexer)
1494 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1495 haddockBit = 10 -- Lex and parse Haddock comments
1496 magicHashBit = 11 -- # in both functions and operators
1497 kindSigsBit = 12 -- Kind signatures on type variables
1498 recursiveDoBit = 13 -- mdo
1499 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1500 unboxedTuplesBit = 15 -- (# and #)
1501 standaloneDerivingBit = 16 -- standalone instance deriving declarations
1502
1503 genericsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1504 always           _     = True
1505 genericsEnabled  flags = testBit flags genericsBit
1506 ffiEnabled       flags = testBit flags ffiBit
1507 parrEnabled      flags = testBit flags parrBit
1508 arrowsEnabled    flags = testBit flags arrowsBit
1509 thEnabled        flags = testBit flags thBit
1510 ipEnabled        flags = testBit flags ipBit
1511 explicitForallEnabled flags = testBit flags explicitForallBit
1512 bangPatEnabled   flags = testBit flags bangPatBit
1513 tyFamEnabled     flags = testBit flags tyFamBit
1514 haddockEnabled   flags = testBit flags haddockBit
1515 magicHashEnabled flags = testBit flags magicHashBit
1516 kindSigsEnabled  flags = testBit flags kindSigsBit
1517 recursiveDoEnabled flags = testBit flags recursiveDoBit
1518 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1519 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1520 standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
1521
1522 -- PState for parsing options pragmas
1523 --
1524 pragState :: StringBuffer -> SrcLoc -> PState
1525 pragState buf loc  =
1526   PState {
1527       buffer          = buf,
1528       messages      = emptyMessages,
1529       -- XXX defaultDynFlags is not right, but we don't have a real
1530       -- dflags handy
1531       dflags        = defaultDynFlags,
1532       last_loc      = mkSrcSpan loc loc,
1533       last_offs     = 0,
1534       last_len      = 0,
1535       last_line_len = 0,
1536       loc           = loc,
1537       extsBitmap    = 0,
1538       context       = [],
1539       lex_state     = [bol, option_prags, 0]
1540     }
1541
1542
1543 -- create a parse state
1544 --
1545 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1546 mkPState buf loc flags  =
1547   PState {
1548       buffer          = buf,
1549       dflags        = flags,
1550       messages      = emptyMessages,
1551       last_loc      = mkSrcSpan loc loc,
1552       last_offs     = 0,
1553       last_len      = 0,
1554       last_line_len = 0,
1555       loc           = loc,
1556       extsBitmap    = fromIntegral bitmap,
1557       context       = [],
1558       lex_state     = [bol, 0]
1559         -- we begin in the layout state if toplev_layout is set
1560     }
1561     where
1562       bitmap = genericsBit `setBitIf` dopt Opt_Generics flags
1563                .|. ffiBit       `setBitIf` dopt Opt_ForeignFunctionInterface flags
1564                .|. parrBit      `setBitIf` dopt Opt_PArr         flags
1565                .|. arrowsBit    `setBitIf` dopt Opt_Arrows       flags
1566                .|. thBit        `setBitIf` dopt Opt_TemplateHaskell flags
1567                .|. ipBit        `setBitIf` dopt Opt_ImplicitParams flags
1568                .|. explicitForallBit `setBitIf` dopt Opt_ScopedTypeVariables flags
1569                .|. explicitForallBit `setBitIf` dopt Opt_PolymorphicComponents flags
1570                .|. explicitForallBit `setBitIf` dopt Opt_ExistentialQuantification flags
1571                .|. explicitForallBit `setBitIf` dopt Opt_Rank2Types flags
1572                .|. explicitForallBit `setBitIf` dopt Opt_RankNTypes flags
1573                .|. bangPatBit   `setBitIf` dopt Opt_BangPatterns flags
1574                .|. tyFamBit     `setBitIf` dopt Opt_TypeFamilies flags
1575                .|. haddockBit   `setBitIf` dopt Opt_Haddock      flags
1576                .|. magicHashBit `setBitIf` dopt Opt_MagicHash    flags
1577                .|. kindSigsBit  `setBitIf` dopt Opt_KindSignatures flags
1578                .|. recursiveDoBit `setBitIf` dopt Opt_RecursiveDo flags
1579                .|. unicodeSyntaxBit `setBitIf` dopt Opt_UnicodeSyntax flags
1580                .|. unboxedTuplesBit `setBitIf` dopt Opt_UnboxedTuples flags
1581                .|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags
1582       --
1583       setBitIf :: Int -> Bool -> Int
1584       b `setBitIf` cond | cond      = bit b
1585                         | otherwise = 0
1586
1587 addWarning :: DynFlag -> WarnMsg -> P ()
1588 addWarning option w
1589  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1590        let ws' = if dopt option d then ws `snocBag` w else ws
1591        in POk s{messages=(ws', es)} ()
1592
1593 getMessages :: PState -> Messages
1594 getMessages PState{messages=ms} = ms
1595
1596 getContext :: P [LayoutContext]
1597 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1598
1599 setContext :: [LayoutContext] -> P ()
1600 setContext ctx = P $ \s -> POk s{context=ctx} ()
1601
1602 popContext :: P ()
1603 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1604                            loc = loc, last_len = len, last_loc = last_loc }) ->
1605   case ctx of
1606         (_:tl) -> POk s{ context = tl } ()
1607         []     -> PFailed last_loc (srcParseErr buf len)
1608
1609 -- Push a new layout context at the indentation of the last token read.
1610 -- This is only used at the outer level of a module when the 'module'
1611 -- keyword is missing.
1612 pushCurrentContext :: P ()
1613 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1614     POk s{context = Layout (offs-len) : ctx} ()
1615 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1616
1617 getOffside :: P Ordering
1618 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1619                 let ord = case stk of
1620                         (Layout n:_) -> compare offs n
1621                         _            -> GT
1622                 in POk s ord
1623
1624 -- ---------------------------------------------------------------------------
1625 -- Construct a parse error
1626
1627 srcParseErr
1628   :: StringBuffer       -- current buffer (placed just after the last token)
1629   -> Int                -- length of the previous token
1630   -> Message
1631 srcParseErr buf len
1632   = hcat [ if null token
1633              then ptext SLIT("parse error (possibly incorrect indentation)")
1634              else hcat [ptext SLIT("parse error on input "),
1635                         char '`', text token, char '\'']
1636     ]
1637   where token = lexemeToString (offsetBytes (-len) buf) len
1638
1639 -- Report a parse failure, giving the span of the previous token as
1640 -- the location of the error.  This is the entry point for errors
1641 -- detected during parsing.
1642 srcParseFail :: P a
1643 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1644                             last_loc = last_loc } ->
1645     PFailed last_loc (srcParseErr buf len)
1646
1647 -- A lexical error is reported at a particular position in the source file,
1648 -- not over a token range.
1649 lexError :: String -> P a
1650 lexError str = do
1651   loc <- getSrcLoc
1652   i@(AI end _ buf) <- getInput
1653   reportLexError loc end buf str
1654
1655 -- -----------------------------------------------------------------------------
1656 -- This is the top-level function: called from the parser each time a
1657 -- new token is to be read from the input.
1658
1659 lexer :: (Located Token -> P a) -> P a
1660 lexer cont = do
1661   tok@(L span tok__) <- lexToken
1662 --  trace ("token: " ++ show tok__) $ do
1663   cont tok
1664
1665 lexToken :: P (Located Token)
1666 lexToken = do
1667   inp@(AI loc1 _ buf) <- getInput
1668   sc <- getLexState
1669   exts <- getExts
1670   case alexScanUser exts inp sc of
1671     AlexEOF -> do let span = mkSrcSpan loc1 loc1
1672                   setLastToken span 0 0
1673                   return (L span ITeof)
1674     AlexError (AI loc2 _ buf) -> do
1675         reportLexError loc1 loc2 buf "lexical error"
1676     AlexSkip inp2 _ -> do
1677         setInput inp2
1678         lexToken
1679     AlexToken inp2@(AI end _ buf2) len t -> do
1680     setInput inp2
1681     let span = mkSrcSpan loc1 end
1682     let bytes = byteDiff buf buf2
1683     span `seq` setLastToken span bytes bytes
1684     t span buf bytes
1685
1686 reportLexError loc1 loc2 buf str
1687   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1688   | otherwise =
1689   let
1690         c = fst (nextChar buf)
1691   in
1692   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1693     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1694     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1695 }