compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - parsing integers is a bit slow
  16 --    - readRational is a bit slow
  17 --
  18 --   Known bugs, that were also in the previous version:
  19 --    - M... should be 3 tokens, not 1.
  20 --    - pragma-end should be only valid in a pragma
  21
  22 --   qualified operator NOTES.
  23 --
  24 --   - If M.(+) is a single lexeme, then..
  25 --     - Probably (+) should be a single lexeme too, for consistency.
  26 --       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
  27 --     - But we have to rule out reserved operators, otherwise (..) becomes
  28 --       a different lexeme.
  29 --     - Should we therefore also rule out reserved operators in the qualified
  30 --       form?  This is quite difficult to achieve.  We don't do it for
  31 --       qualified varids.
  32
  33 {
  34 {-# OPTIONS -Wwarn -w #-}
  35 -- The above -Wwarn supression flag is a temporary kludge.
  36 -- While working on this module you are encouraged to remove it and fix
  37 -- any warnings in the module. See
  38 --     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
  39 -- for details
  40 --
  41 -- Note that Alex itself generates code with with some unused bindings and
  42 -- without type signatures, so removing the flag might not be possible.
  43
  44 {-# OPTIONS_GHC -funbox-strict-fields #-}
  45
  46 module Lexer (
  47    Token(..), lexer, pragState, mkPState, PState(..),
  48    P(..), ParseResult(..), getSrcLoc,
  49    getPState,
  50    failLocMsgP, failSpanMsgP, srcParseFail,
  51    getMessages,
  52    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  53    getLexState, popLexState, pushLexState,
  54    extension, standaloneDerivingEnabled, bangPatEnabled,
  55    addWarning,
  56    lexTokenStream
  57   ) where
  58
  59 import Bag
  60 import ErrUtils
  61 import Outputable
  62 import StringBuffer
  63 import FastString
  64 import SrcLoc
  65 import UniqFM
  66 import DynFlags
  67 import Ctype
  68 import Util             ( readRational )
  69
  70 import Control.Monad
  71 import Data.Bits
  72 import Data.Char
  73 import Data.List
  74 import Data.Maybe
  75 import Data.Map (Map)
  76 import qualified Data.Map as Map
  77 import Data.Ratio
  78 }
  79
  80 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  81 $whitechar   = [\ \n\r\f\v $unispace]
  82 $white_no_nl = $whitechar # \n
  83 $tab         = \t
  84
  85 $ascdigit  = 0-9
  86 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  87 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  88 $digit     = [$ascdigit $unidigit]
  89
  90 $special   = [\(\)\,\;\[\]\`\{\}]
  91 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  92 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  93 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  94
  95 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
  96 $asclarge  = [A-Z]
  97 $large     = [$asclarge $unilarge]
  98
  99 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
 100 $ascsmall  = [a-z]
 101 $small     = [$ascsmall $unismall \_]
 102
 103 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
 104 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
 105
 106 $octit     = 0-7
 107 $hexit     = [$decdigit A-F a-f]
 108 $symchar   = [$symbol \:]
 109 $nl        = [\n\r]
 110 $idchar    = [$small $large $digit \']
 111
 112 $pragmachar = [$small $large $digit]
 113
 114 $docsym    = [\| \^ \* \$]
 115
 116 @varid     = $small $idchar*
 117 @conid     = $large $idchar*
 118
 119 @varsym    = $symbol $symchar*
 120 @consym    = \: $symchar*
 121
 122 @decimal     = $decdigit+
 123 @octal       = $octit+
 124 @hexadecimal = $hexit+
 125 @exponent    = [eE] [\-\+]? @decimal
 126
 127 -- we support the hierarchical module name extension:
 128 @qual = (@conid \.)+
 129
 130 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 131
 132 -- normal signed numerical literals can only be explicitly negative,
 133 -- not explicitly positive (contrast @exponent)
 134 @negative = \-
 135 @signed = @negative ?
 136
 137 haskell :-
 138
 139 -- everywhere: skip whitespace and comments
 140 $white_no_nl+                           ;
 141 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 142
 143 -- Everywhere: deal with nested comments.  We explicitly rule out
 144 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 145 -- (this can happen even though pragmas will normally take precedence due to
 146 -- longest-match, because pragmas aren't valid in every state, but comments
 147 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 148 -- set.
 149
 150 "{-" / { isNormalComment } { nested_comment lexToken }
 151
 152 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 153 -- more dashes followed by a symbol should be parsed as a varsym, so we
 154 -- have to exclude those.
 155
 156 -- Since Haddock comments aren't valid in every state, we need to rule them
 157 -- out here.
 158
 159 -- The following two rules match comments that begin with two dashes, but
 160 -- continue with a different character. The rules test that this character
 161 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 162 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 163 -- have a Haddock comment). The rules then munch the rest of the line.
 164
 165 "-- " ~[$docsym \#] .* { lineCommentToken }
 166 "--" [^$symbol : \ ] .* { lineCommentToken }
 167
 168 -- Next, match Haddock comments if no -haddock flag
 169
 170 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
 171
 172 -- Now, when we've matched comments that begin with 2 dashes and continue
 173 -- with a different character, we need to match comments that begin with three
 174 -- or more dashes (which clearly can't be Haddock comments). We only need to
 175 -- make sure that the first non-dash character isn't a symbol, and munch the
 176 -- rest of the line.
 177
 178 "---"\-* [^$symbol :] .* { lineCommentToken }
 179
 180 -- Since the previous rules all match dashes followed by at least one
 181 -- character, we also need to match a whole line filled with just dashes.
 182
 183 "--"\-* / { atEOL } { lineCommentToken }
 184
 185 -- We need this rule since none of the other single line comment rules
 186 -- actually match this case.
 187
 188 "-- " / { atEOL } { lineCommentToken }
 189
 190 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 191 -- blank lines) until we find a non-whitespace character, then do layout
 192 -- processing.
 193 --
 194 -- One slight wibble here: what if the line begins with {-#? In
 195 -- theory, we have to lex the pragma to see if it's one we recognise,
 196 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 197 -- as a nested comment.  We don't bother with this: if the line begins
 198 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 199 <bol> {
 200   \n                                    ;
 201   ^\# (line)?                           { begin line_prag1 }
 202   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 203   ^\# \! .* \n                          ; -- #!, for scripts
 204   ()                                    { do_bol }
 205 }
 206
 207 -- after a layout keyword (let, where, do, of), we begin a new layout
 208 -- context if the curly brace is missing.
 209 -- Careful! This stuff is quite delicate.
 210 <layout, layout_do> {
 211   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 212         -- we might encounter {-# here, but {- has been handled already
 213   \n                                    ;
 214   ^\# (line)?                           { begin line_prag1 }
 215 }
 216
 217 -- do is treated in a subtly different way, see new_layout_context
 218 <layout>    ()                          { new_layout_context True }
 219 <layout_do> ()                          { new_layout_context False }
 220
 221 -- after a new layout context which was found to be to the left of the
 222 -- previous context, we have generated a '{' token, and we now need to
 223 -- generate a matching '}' token.
 224 <layout_left>  ()                       { do_layout_left }
 225
 226 <0,option_prags> \n                             { begin bol }
 227
 228 "{-#" $whitechar* $pragmachar+ / { known_pragma linePrags }
 229                                 { dispatch_pragmas linePrags }
 230
 231 -- single-line line pragmas, of the form
 232 --    # <line> "<file>" <extra-stuff> \n
 233 <line_prag1> $decdigit+                 { setLine line_prag1a }
 234 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 235 <line_prag1b> .*                        { pop }
 236
 237 -- Haskell-style line pragmas, of the form
 238 --    {-# LINE <line> "<file>" #-}
 239 <line_prag2> $decdigit+                 { setLine line_prag2a }
 240 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 241 <line_prag2b> "#-}"|"-}"                { pop }
 242    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 243    -- with older versions of GHC which generated these.
 244
 245 <0,option_prags> {
 246   "{-#" $whitechar* $pragmachar+
 247         $whitechar+ $pragmachar+ / { known_pragma twoWordPrags }
 248                                  { dispatch_pragmas twoWordPrags }
 249
 250   "{-#" $whitechar* $pragmachar+ / { known_pragma oneWordPrags }
 251                                  { dispatch_pragmas oneWordPrags }
 252
 253   -- We ignore all these pragmas, but don't generate a warning for them
 254   "{-#" $whitechar* $pragmachar+ / { known_pragma ignoredPrags }
 255                                  { dispatch_pragmas ignoredPrags }
 256
 257   -- ToDo: should only be valid inside a pragma:
 258   "#-}"                                 { endPrag }
 259 }
 260
 261 <option_prags> {
 262   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 263                                    { dispatch_pragmas fileHeaderPrags }
 264
 265   "-- #"                                 { multiline_doc_comment }
 266 }
 267
 268 <0> {
 269   -- In the "0" mode we ignore these pragmas
 270   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 271                      { nested_comment lexToken }
 272 }
 273
 274 <0> {
 275   "-- #" .* { lineCommentToken }
 276 }
 277
 278 <0,option_prags> {
 279   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 280                     (nested_comment lexToken) }
 281 }
 282
 283 -- '0' state: ordinary lexemes
 284
 285 -- Haddock comments
 286
 287 <0> {
 288   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 289   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 290 }
 291
 292 -- "special" symbols
 293
 294 <0> {
 295   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 296   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 297 }
 298
 299 <0> {
 300   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 301   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 302   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 303   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 304   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 305   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 306   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 307   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 308
 309   "[$" @varid "|"  / { ifExtension qqEnabled }
 310                      { lex_quasiquote_tok }
 311 }
 312
 313 <0> {
 314   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 315                                         { special IToparenbar }
 316   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 317 }
 318
 319 <0> {
 320   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 321 }
 322
 323 <0> {
 324   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 325          { token IToubxparen }
 326   "#)" / { ifExtension unboxedTuplesEnabled }
 327          { token ITcubxparen }
 328 }
 329
 330 <0> {
 331   "{|" / { ifExtension genericsEnabled } { token ITocurlybar }
 332   "|}" / { ifExtension genericsEnabled } { token ITccurlybar }
 333 }
 334
 335 <0,option_prags> {
 336   \(                                    { special IToparen }
 337   \)                                    { special ITcparen }
 338   \[                                    { special ITobrack }
 339   \]                                    { special ITcbrack }
 340   \,                                    { special ITcomma }
 341   \;                                    { special ITsemi }
 342   \`                                    { special ITbackquote }
 343
 344   \{                                    { open_brace }
 345   \}                                    { close_brace }
 346 }
 347
 348 <0,option_prags> {
 349   @qual @varid                  { idtoken qvarid }
 350   @qual @conid                  { idtoken qconid }
 351   @varid                        { varid }
 352   @conid                        { idtoken conid }
 353 }
 354
 355 <0> {
 356   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 357   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 358   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 359   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 360 }
 361
 362 -- ToDo: - move `var` and (sym) into lexical syntax?
 363 --       - remove backquote from $special?
 364 <0> {
 365   @qual @varsym       / { ifExtension oldQualOps } { idtoken qvarsym }
 366   @qual @consym       / { ifExtension oldQualOps } { idtoken qconsym }
 367   @qual \( @varsym \) / { ifExtension newQualOps } { idtoken prefixqvarsym }
 368   @qual \( @consym \) / { ifExtension newQualOps } { idtoken prefixqconsym }
 369   @varsym                                          { varsym }
 370   @consym                                          { consym }
 371 }
 372
 373 -- For the normal boxed literals we need to be careful
 374 -- when trying to be close to Haskell98
 375 <0> {
 376   -- Normal integral literals (:: Num a => a, from Integer)
 377   @decimal           { tok_num positive 0 0 decimal }
 378   0[oO] @octal       { tok_num positive 2 2 octal }
 379   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 380
 381   -- Normal rational literals (:: Fractional a => a, from Rational)
 382   @floating_point    { strtoken tok_float }
 383 }
 384
 385 <0> {
 386   -- Unboxed ints (:: Int#) and words (:: Word#)
 387   -- It's simpler (and faster?) to give separate cases to the negatives,
 388   -- especially considering octal/hexadecimal prefixes.
 389   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 390   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 391   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 392   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 393   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 394   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 395
 396   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 397   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 398   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 399
 400   -- Unboxed floats and doubles (:: Float#, :: Double#)
 401   -- prim_{float,double} work with signed literals
 402   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 403   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 404 }
 405
 406 -- Strings and chars are lexed by hand-written code.  The reason is
 407 -- that even if we recognise the string or char here in the regex
 408 -- lexer, we would still have to parse the string afterward in order
 409 -- to convert it to a String.
 410 <0> {
 411   \'                            { lex_char_tok }
 412   \"                            { lex_string_tok }
 413 }
 414
 415 {
 416 -- -----------------------------------------------------------------------------
 417 -- The token type
 418
 419 data Token
 420   = ITas                        -- Haskell keywords
 421   | ITcase
 422   | ITclass
 423   | ITdata
 424   | ITdefault
 425   | ITderiving
 426   | ITdo
 427   | ITelse
 428   | IThiding
 429   | ITif
 430   | ITimport
 431   | ITin
 432   | ITinfix
 433   | ITinfixl
 434   | ITinfixr
 435   | ITinstance
 436   | ITlet
 437   | ITmodule
 438   | ITnewtype
 439   | ITof
 440   | ITqualified
 441   | ITthen
 442   | ITtype
 443   | ITwhere
 444   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 445
 446   | ITforall                    -- GHC extension keywords
 447   | ITforeign
 448   | ITexport
 449   | ITlabel
 450   | ITdynamic
 451   | ITsafe
 452   | ITthreadsafe
 453   | ITunsafe
 454   | ITstdcallconv
 455   | ITccallconv
 456   | ITprimcallconv
 457   | ITmdo
 458   | ITfamily
 459   | ITgroup
 460   | ITby
 461   | ITusing
 462
 463         -- Pragmas
 464   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 465   | ITinline_conlike_prag Bool  -- same
 466   | ITspec_prag                 -- SPECIALISE
 467   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 468   | ITsource_prag
 469   | ITrules_prag
 470   | ITwarning_prag
 471   | ITdeprecated_prag
 472   | ITline_prag
 473   | ITscc_prag
 474   | ITgenerated_prag
 475   | ITcore_prag                 -- hdaume: core annotations
 476   | ITunpack_prag
 477   | ITann_prag
 478   | ITclose_prag
 479   | IToptions_prag String
 480   | ITinclude_prag String
 481   | ITlanguage_prag
 482
 483   | ITdotdot                    -- reserved symbols
 484   | ITcolon
 485   | ITdcolon
 486   | ITequal
 487   | ITlam
 488   | ITvbar
 489   | ITlarrow
 490   | ITrarrow
 491   | ITat
 492   | ITtilde
 493   | ITdarrow
 494   | ITminus
 495   | ITbang
 496   | ITstar
 497   | ITdot
 498
 499   | ITbiglam                    -- GHC-extension symbols
 500
 501   | ITocurly                    -- special symbols
 502   | ITccurly
 503   | ITocurlybar                 -- {|, for type applications
 504   | ITccurlybar                 -- |}, for type applications
 505   | ITvocurly
 506   | ITvccurly
 507   | ITobrack
 508   | ITopabrack                  -- [:, for parallel arrays with -XParr
 509   | ITcpabrack                  -- :], for parallel arrays with -XParr
 510   | ITcbrack
 511   | IToparen
 512   | ITcparen
 513   | IToubxparen
 514   | ITcubxparen
 515   | ITsemi
 516   | ITcomma
 517   | ITunderscore
 518   | ITbackquote
 519
 520   | ITvarid   FastString        -- identifiers
 521   | ITconid   FastString
 522   | ITvarsym  FastString
 523   | ITconsym  FastString
 524   | ITqvarid  (FastString,FastString)
 525   | ITqconid  (FastString,FastString)
 526   | ITqvarsym (FastString,FastString)
 527   | ITqconsym (FastString,FastString)
 528   | ITprefixqvarsym (FastString,FastString)
 529   | ITprefixqconsym (FastString,FastString)
 530
 531   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 532
 533   | ITchar       Char
 534   | ITstring     FastString
 535   | ITinteger    Integer
 536   | ITrational   Rational
 537
 538   | ITprimchar   Char
 539   | ITprimstring FastString
 540   | ITprimint    Integer
 541   | ITprimword   Integer
 542   | ITprimfloat  Rational
 543   | ITprimdouble Rational
 544
 545   -- Template Haskell extension tokens
 546   | ITopenExpQuote              --  [| or [e|
 547   | ITopenPatQuote              --  [p|
 548   | ITopenDecQuote              --  [d|
 549   | ITopenTypQuote              --  [t|
 550   | ITcloseQuote                --  |]
 551   | ITidEscape   FastString     --  $x
 552   | ITparenEscape               --  $(
 553   | ITvarQuote                  --  '
 554   | ITtyQuote                   --  ''
 555   | ITquasiQuote (FastString,FastString,SrcSpan) --  [:...|...|]
 556
 557   -- Arrow notation extension
 558   | ITproc
 559   | ITrec
 560   | IToparenbar                 --  (|
 561   | ITcparenbar                 --  |)
 562   | ITlarrowtail                --  -<
 563   | ITrarrowtail                --  >-
 564   | ITLarrowtail                --  -<<
 565   | ITRarrowtail                --  >>-
 566
 567   | ITunknown String            -- Used when the lexer can't make sense of it
 568   | ITeof                       -- end of file token
 569
 570   -- Documentation annotations
 571   | ITdocCommentNext  String     -- something beginning '-- |'
 572   | ITdocCommentPrev  String     -- something beginning '-- ^'
 573   | ITdocCommentNamed String     -- something beginning '-- $'
 574   | ITdocSection      Int String -- a section heading
 575   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 576   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 577   | ITlineComment     String     -- comment starting by "--"
 578   | ITblockComment    String     -- comment in {- -}
 579
 580 #ifdef DEBUG
 581   deriving Show -- debugging
 582 #endif
 583
 584 {-
 585 isSpecial :: Token -> Bool
 586 -- If we see M.x, where x is a keyword, but
 587 -- is special, we treat is as just plain M.x,
 588 -- not as a keyword.
 589 isSpecial ITas          = True
 590 isSpecial IThiding      = True
 591 isSpecial ITqualified   = True
 592 isSpecial ITforall      = True
 593 isSpecial ITexport      = True
 594 isSpecial ITlabel       = True
 595 isSpecial ITdynamic     = True
 596 isSpecial ITsafe        = True
 597 isSpecial ITthreadsafe  = True
 598 isSpecial ITunsafe      = True
 599 isSpecial ITccallconv   = True
 600 isSpecial ITstdcallconv = True
 601 isSpecial ITprimcallconv = True
 602 isSpecial ITmdo         = True
 603 isSpecial ITfamily      = True
 604 isSpecial ITgroup   = True
 605 isSpecial ITby      = True
 606 isSpecial ITusing   = True
 607 isSpecial _             = False
 608 -}
 609
 610 -- the bitmap provided as the third component indicates whether the
 611 -- corresponding extension keyword is valid under the extension options
 612 -- provided to the compiler; if the extension corresponding to *any* of the
 613 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 614 -- facilitates using a keyword in two different extensions that can be
 615 -- activated independently)
 616 --
 617 reservedWordsFM :: UniqFM (Token, Int)
 618 reservedWordsFM = listToUFM $
 619         map (\(x, y, z) -> (mkFastString x, (y, z)))
 620        [( "_",          ITunderscore,   0 ),
 621         ( "as",         ITas,           0 ),
 622         ( "case",       ITcase,         0 ),
 623         ( "class",      ITclass,        0 ),
 624         ( "data",       ITdata,         0 ),
 625         ( "default",    ITdefault,      0 ),
 626         ( "deriving",   ITderiving,     0 ),
 627         ( "do",         ITdo,           0 ),
 628         ( "else",       ITelse,         0 ),
 629         ( "hiding",     IThiding,       0 ),
 630         ( "if",         ITif,           0 ),
 631         ( "import",     ITimport,       0 ),
 632         ( "in",         ITin,           0 ),
 633         ( "infix",      ITinfix,        0 ),
 634         ( "infixl",     ITinfixl,       0 ),
 635         ( "infixr",     ITinfixr,       0 ),
 636         ( "instance",   ITinstance,     0 ),
 637         ( "let",        ITlet,          0 ),
 638         ( "module",     ITmodule,       0 ),
 639         ( "newtype",    ITnewtype,      0 ),
 640         ( "of",         ITof,           0 ),
 641         ( "qualified",  ITqualified,    0 ),
 642         ( "then",       ITthen,         0 ),
 643         ( "type",       ITtype,         0 ),
 644         ( "where",      ITwhere,        0 ),
 645         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 646
 647     ( "forall", ITforall,        bit explicitForallBit .|. bit inRulePragBit),
 648         ( "mdo",        ITmdo,           bit recursiveDoBit),
 649         ( "family",     ITfamily,        bit tyFamBit),
 650     ( "group",  ITgroup,     bit transformComprehensionsBit),
 651     ( "by",     ITby,        bit transformComprehensionsBit),
 652     ( "using",  ITusing,     bit transformComprehensionsBit),
 653
 654         ( "foreign",    ITforeign,       bit ffiBit),
 655         ( "export",     ITexport,        bit ffiBit),
 656         ( "label",      ITlabel,         bit ffiBit),
 657         ( "dynamic",    ITdynamic,       bit ffiBit),
 658         ( "safe",       ITsafe,          bit ffiBit),
 659         ( "threadsafe", ITthreadsafe,    bit ffiBit),  -- ToDo: remove
 660         ( "unsafe",     ITunsafe,        bit ffiBit),
 661         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 662         ( "ccall",      ITccallconv,     bit ffiBit),
 663         ( "prim",       ITprimcallconv,  bit ffiBit),
 664
 665         ( "rec",        ITrec,           bit recBit),
 666         ( "proc",       ITproc,          bit arrowsBit)
 667      ]
 668
 669 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 670 reservedSymsFM = listToUFM $
 671     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 672       [ ("..",  ITdotdot,   always)
 673         -- (:) is a reserved op, meaning only list cons
 674        ,(":",   ITcolon,    always)
 675        ,("::",  ITdcolon,   always)
 676        ,("=",   ITequal,    always)
 677        ,("\\",  ITlam,      always)
 678        ,("|",   ITvbar,     always)
 679        ,("<-",  ITlarrow,   always)
 680        ,("->",  ITrarrow,   always)
 681        ,("@",   ITat,       always)
 682        ,("~",   ITtilde,    always)
 683        ,("=>",  ITdarrow,   always)
 684        ,("-",   ITminus,    always)
 685        ,("!",   ITbang,     always)
 686
 687         -- For data T (a::*) = MkT
 688        ,("*", ITstar, always) -- \i -> kindSigsEnabled i || tyFamEnabled i)
 689         -- For 'forall a . t'
 690        ,(".", ITdot,  always) -- \i -> explicitForallEnabled i || inRulePrag i)
 691
 692        ,("-<",  ITlarrowtail, arrowsEnabled)
 693        ,(">-",  ITrarrowtail, arrowsEnabled)
 694        ,("-<<", ITLarrowtail, arrowsEnabled)
 695        ,(">>-", ITRarrowtail, arrowsEnabled)
 696
 697        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 698        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 699        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 700                                 explicitForallEnabled i)
 701        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 702        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 703        ,("⋯",   ITdotdot, unicodeSyntaxEnabled)
 704         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 705         -- form part of a large operator.  This would let us have a better
 706         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 707        ]
 708
 709 -- -----------------------------------------------------------------------------
 710 -- Lexer actions
 711
 712 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 713
 714 special :: Token -> Action
 715 special tok span _buf _len = return (L span tok)
 716
 717 token, layout_token :: Token -> Action
 718 token t span _buf _len = return (L span t)
 719 layout_token t span _buf _len = pushLexState layout >> return (L span t)
 720
 721 idtoken :: (StringBuffer -> Int -> Token) -> Action
 722 idtoken f span buf len = return (L span $! (f buf len))
 723
 724 skip_one_varid :: (FastString -> Token) -> Action
 725 skip_one_varid f span buf len
 726   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 727
 728 strtoken :: (String -> Token) -> Action
 729 strtoken f span buf len =
 730   return (L span $! (f $! lexemeToString buf len))
 731
 732 init_strtoken :: Int -> (String -> Token) -> Action
 733 -- like strtoken, but drops the last N character(s)
 734 init_strtoken drop f span buf len =
 735   return (L span $! (f $! lexemeToString buf (len-drop)))
 736
 737 begin :: Int -> Action
 738 begin code _span _str _len = do pushLexState code; lexToken
 739
 740 pop :: Action
 741 pop _span _buf _len = do popLexState; lexToken
 742
 743 pop_and :: Action -> Action
 744 pop_and act span buf len = do popLexState; act span buf len
 745
 746 {-# INLINE nextCharIs #-}
 747 nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
 748 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 749
 750 notFollowedBy :: Char -> AlexAccPred Int
 751 notFollowedBy char _ _ _ (AI _ _ buf)
 752   = nextCharIs buf (/=char)
 753
 754 notFollowedBySymbol :: AlexAccPred Int
 755 notFollowedBySymbol _ _ _ (AI _ _ buf)
 756   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 757
 758 -- We must reject doc comments as being ordinary comments everywhere.
 759 -- In some cases the doc comment will be selected as the lexeme due to
 760 -- maximal munch, but not always, because the nested comment rule is
 761 -- valid in all states, but the doc-comment rules are only valid in
 762 -- the non-layout states.
 763 isNormalComment :: AlexAccPred Int
 764 isNormalComment bits _ _ (AI _ _ buf)
 765   | haddockEnabled bits = notFollowedByDocOrPragma
 766   | otherwise           = nextCharIs buf (/='#')
 767   where
 768     notFollowedByDocOrPragma
 769        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 770
 771 spaceAndP :: StringBuffer -> (StringBuffer -> Bool) -> Bool
 772 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 773
 774 {-
 775 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 776   = if haddockEnabled bits then False else (p buf)
 777 -}
 778
 779 atEOL :: AlexAccPred Int
 780 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 781
 782 ifExtension :: (Int -> Bool) -> AlexAccPred Int
 783 ifExtension pred bits _ _ _ = pred bits
 784
 785 multiline_doc_comment :: Action
 786 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 787   where
 788     worker commentAcc input docType oneLine = case alexGetChar input of
 789       Just ('\n', input')
 790         | oneLine -> docCommentEnd input commentAcc docType buf span
 791         | otherwise -> case checkIfCommentLine input' of
 792           Just input -> worker ('\n':commentAcc) input docType False
 793           Nothing -> docCommentEnd input commentAcc docType buf span
 794       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 795       Nothing -> docCommentEnd input commentAcc docType buf span
 796
 797     checkIfCommentLine input = check (dropNonNewlineSpace input)
 798       where
 799         check input = case alexGetChar input of
 800           Just ('-', input) -> case alexGetChar input of
 801             Just ('-', input) -> case alexGetChar input of
 802               Just (c, _) | c /= '-' -> Just input
 803               _ -> Nothing
 804             _ -> Nothing
 805           _ -> Nothing
 806
 807         dropNonNewlineSpace input = case alexGetChar input of
 808           Just (c, input')
 809             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 810             | otherwise -> input
 811           Nothing -> input
 812
 813 lineCommentToken :: Action
 814 lineCommentToken span buf len = do
 815   b <- extension rawTokenStreamEnabled
 816   if b then strtoken ITlineComment span buf len else lexToken
 817
 818 {-
 819   nested comments require traversing by hand, they can't be parsed
 820   using regular expressions.
 821 -}
 822 nested_comment :: P (Located Token) -> Action
 823 nested_comment cont span _str _len = do
 824   input <- getInput
 825   go "" (1::Int) input
 826   where
 827     go commentAcc 0 input = do setInput input
 828                                b <- extension rawTokenStreamEnabled
 829                                if b
 830                                  then docCommentEnd input commentAcc ITblockComment _str span
 831                                  else cont
 832     go commentAcc n input = case alexGetChar input of
 833       Nothing -> errBrace input span
 834       Just ('-',input) -> case alexGetChar input of
 835         Nothing  -> errBrace input span
 836         Just ('\125',input) -> go commentAcc (n-1) input
 837         Just (_,_)          -> go ('-':commentAcc) n input
 838       Just ('\123',input) -> case alexGetChar input of
 839         Nothing  -> errBrace input span
 840         Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
 841         Just (_,_)       -> go ('\123':commentAcc) n input
 842       Just (c,input) -> go (c:commentAcc) n input
 843
 844 nested_doc_comment :: Action
 845 nested_doc_comment span buf _len = withLexedDocType (go "")
 846   where
 847     go commentAcc input docType _ = case alexGetChar input of
 848       Nothing -> errBrace input span
 849       Just ('-',input) -> case alexGetChar input of
 850         Nothing -> errBrace input span
 851         Just ('\125',input) ->
 852           docCommentEnd input commentAcc docType buf span
 853         Just (_,_) -> go ('-':commentAcc) input docType False
 854       Just ('\123', input) -> case alexGetChar input of
 855         Nothing  -> errBrace input span
 856         Just ('-',input) -> do
 857           setInput input
 858           let cont = do input <- getInput; go commentAcc input docType False
 859           nested_comment cont span buf _len
 860         Just (_,_) -> go ('\123':commentAcc) input docType False
 861       Just (c,input) -> go (c:commentAcc) input docType False
 862
 863 withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (Located Token))
 864                  -> P (Located Token)
 865 withLexedDocType lexDocComment = do
 866   input@(AI _ _ buf) <- getInput
 867   case prevChar buf ' ' of
 868     '|' -> lexDocComment input ITdocCommentNext False
 869     '^' -> lexDocComment input ITdocCommentPrev False
 870     '$' -> lexDocComment input ITdocCommentNamed False
 871     '*' -> lexDocSection 1 input
 872     '#' -> lexDocComment input ITdocOptionsOld False
 873     _ -> panic "withLexedDocType: Bad doc type"
 874  where
 875     lexDocSection n input = case alexGetChar input of
 876       Just ('*', input) -> lexDocSection (n+1) input
 877       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 878       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 879
 880 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 881 -- off again at the end of the pragma.
 882 rulePrag :: Action
 883 rulePrag span _buf _len = do
 884   setExts (.|. bit inRulePragBit)
 885   return (L span ITrules_prag)
 886
 887 endPrag :: Action
 888 endPrag span _buf _len = do
 889   setExts (.&. complement (bit inRulePragBit))
 890   return (L span ITclose_prag)
 891
 892 -- docCommentEnd
 893 -------------------------------------------------------------------------------
 894 -- This function is quite tricky. We can't just return a new token, we also
 895 -- need to update the state of the parser. Why? Because the token is longer
 896 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 897 -- it writes the wrong token length to the parser state. This function is
 898 -- called afterwards, so it can just update the state.
 899
 900 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 901 -- which is something that the original lexer didn't account for.
 902 -- I have added last_line_len in the parser state which represents the length
 903 -- of the part of the token that is on the last line. It is now used for layout
 904 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 905 -- was before, the full length of the token, and it is now only used for error
 906 -- messages. /Waern
 907
 908 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 909                  SrcSpan -> P (Located Token)
 910 docCommentEnd input commentAcc docType buf span = do
 911   setInput input
 912   let (AI loc last_offs nextBuf) = input
 913       comment = reverse commentAcc
 914       span' = mkSrcSpan (srcSpanStart span) loc
 915       last_len = byteDiff buf nextBuf
 916
 917       last_line_len = if (last_offs - last_len < 0)
 918         then last_offs
 919         else last_len
 920
 921   span `seq` setLastToken span' last_len last_line_len
 922   return (L span' (docType comment))
 923
 924 errBrace :: AlexInput -> SrcSpan -> P a
 925 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 926
 927 open_brace, close_brace :: Action
 928 open_brace span _str _len = do
 929   ctx <- getContext
 930   setContext (NoLayout:ctx)
 931   return (L span ITocurly)
 932 close_brace span _str _len = do
 933   popContext
 934   return (L span ITccurly)
 935
 936 qvarid, qconid :: StringBuffer -> Int -> Token
 937 qvarid buf len = ITqvarid $! splitQualName buf len False
 938 qconid buf len = ITqconid $! splitQualName buf len False
 939
 940 splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
 941 -- takes a StringBuffer and a length, and returns the module name
 942 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 943 -- because of hierarchical module names.
 944 splitQualName orig_buf len parens = split orig_buf orig_buf
 945   where
 946     split buf dot_buf
 947         | orig_buf `byteDiff` buf >= len  = done dot_buf
 948         | c == '.'                        = found_dot buf'
 949         | otherwise                       = split buf' dot_buf
 950       where
 951        (c,buf') = nextChar buf
 952
 953     -- careful, we might get names like M....
 954     -- so, if the character after the dot is not upper-case, this is
 955     -- the end of the qualifier part.
 956     found_dot buf -- buf points after the '.'
 957         | isUpper c    = split buf' buf
 958         | otherwise    = done buf
 959       where
 960        (c,buf') = nextChar buf
 961
 962     done dot_buf =
 963         (lexemeToFastString orig_buf (qual_size - 1),
 964          if parens -- Prelude.(+)
 965             then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
 966             else lexemeToFastString dot_buf (len - qual_size))
 967       where
 968         qual_size = orig_buf `byteDiff` dot_buf
 969
 970 varid :: Action
 971 varid span buf len =
 972   fs `seq`
 973   case lookupUFM reservedWordsFM fs of
 974         Just (keyword,0)    -> do
 975                 maybe_layout keyword
 976                 return (L span keyword)
 977         Just (keyword,exts) -> do
 978                 b <- extension (\i -> exts .&. i /= 0)
 979                 if b then do maybe_layout keyword
 980                              return (L span keyword)
 981                      else return (L span (ITvarid fs))
 982         _other -> return (L span (ITvarid fs))
 983   where
 984         fs = lexemeToFastString buf len
 985
 986 conid :: StringBuffer -> Int -> Token
 987 conid buf len = ITconid fs
 988   where fs = lexemeToFastString buf len
 989
 990 qvarsym, qconsym, prefixqvarsym, prefixqconsym :: StringBuffer -> Int -> Token
 991 qvarsym buf len = ITqvarsym $! splitQualName buf len False
 992 qconsym buf len = ITqconsym $! splitQualName buf len False
 993 prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
 994 prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
 995
 996 varsym, consym :: Action
 997 varsym = sym ITvarsym
 998 consym = sym ITconsym
 999
1000 sym :: (FastString -> Token) -> SrcSpan -> StringBuffer -> Int
1001     -> P (Located Token)
1002 sym con span buf len =
1003   case lookupUFM reservedSymsFM fs of
1004         Just (keyword,exts) -> do
1005                 b <- extension exts
1006                 if b then return (L span keyword)
1007                      else return (L span $! con fs)
1008         _other -> return (L span $! con fs)
1009   where
1010         fs = lexemeToFastString buf len
1011
1012 -- Variations on the integral numeric literal.
1013 tok_integral :: (Integer -> Token)
1014      -> (Integer -> Integer)
1015  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
1016      -> Int -> Int
1017      -> (Integer, (Char->Int)) -> Action
1018 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
1019   return $ L span $ itint $! transint $ parseUnsignedInteger
1020      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
1021
1022 -- some conveniences for use with tok_integral
1023 tok_num :: (Integer -> Integer)
1024         -> Int -> Int
1025         -> (Integer, (Char->Int)) -> Action
1026 tok_num = tok_integral ITinteger
1027 tok_primint :: (Integer -> Integer)
1028             -> Int -> Int
1029             -> (Integer, (Char->Int)) -> Action
1030 tok_primint = tok_integral ITprimint
1031 tok_primword :: Int -> Int
1032              -> (Integer, (Char->Int)) -> Action
1033 tok_primword = tok_integral ITprimword positive
1034 positive, negative :: (Integer -> Integer)
1035 positive = id
1036 negative = negate
1037 decimal, octal, hexadecimal :: (Integer, Char -> Int)
1038 decimal = (10,octDecDigit)
1039 octal = (8,octDecDigit)
1040 hexadecimal = (16,hexDigit)
1041
1042 -- readRational can understand negative rationals, exponents, everything.
1043 tok_float, tok_primfloat, tok_primdouble :: String -> Token
1044 tok_float        str = ITrational   $! readRational str
1045 tok_primfloat    str = ITprimfloat  $! readRational str
1046 tok_primdouble   str = ITprimdouble $! readRational str
1047
1048 -- -----------------------------------------------------------------------------
1049 -- Layout processing
1050
1051 -- we're at the first token on a line, insert layout tokens if necessary
1052 do_bol :: Action
1053 do_bol span _str _len = do
1054         pos <- getOffside
1055         case pos of
1056             LT -> do
1057                 --trace "layout: inserting '}'" $ do
1058                 popContext
1059                 -- do NOT pop the lex state, we might have a ';' to insert
1060                 return (L span ITvccurly)
1061             EQ -> do
1062                 --trace "layout: inserting ';'" $ do
1063                 popLexState
1064                 return (L span ITsemi)
1065             GT -> do
1066                 popLexState
1067                 lexToken
1068
1069 -- certain keywords put us in the "layout" state, where we might
1070 -- add an opening curly brace.
1071 maybe_layout :: Token -> P ()
1072 maybe_layout ITdo       = pushLexState layout_do
1073 maybe_layout ITmdo      = pushLexState layout_do
1074 maybe_layout ITof       = pushLexState layout
1075 maybe_layout ITlet      = pushLexState layout
1076 maybe_layout ITwhere    = pushLexState layout
1077 maybe_layout ITrec      = pushLexState layout
1078 maybe_layout _          = return ()
1079
1080 -- Pushing a new implicit layout context.  If the indentation of the
1081 -- next token is not greater than the previous layout context, then
1082 -- Haskell 98 says that the new layout context should be empty; that is
1083 -- the lexer must generate {}.
1084 --
1085 -- We are slightly more lenient than this: when the new context is started
1086 -- by a 'do', then we allow the new context to be at the same indentation as
1087 -- the previous context.  This is what the 'strict' argument is for.
1088 --
1089 new_layout_context :: Bool -> Action
1090 new_layout_context strict span _buf _len = do
1091     popLexState
1092     (AI _ offset _) <- getInput
1093     ctx <- getContext
1094     case ctx of
1095         Layout prev_off : _  |
1096            (strict     && prev_off >= offset  ||
1097             not strict && prev_off > offset) -> do
1098                 -- token is indented to the left of the previous context.
1099                 -- we must generate a {} sequence now.
1100                 pushLexState layout_left
1101                 return (L span ITvocurly)
1102         _ -> do
1103                 setContext (Layout offset : ctx)
1104                 return (L span ITvocurly)
1105
1106 do_layout_left :: Action
1107 do_layout_left span _buf _len = do
1108     popLexState
1109     pushLexState bol  -- we must be at the start of a line
1110     return (L span ITvccurly)
1111
1112 -- -----------------------------------------------------------------------------
1113 -- LINE pragmas
1114
1115 setLine :: Int -> Action
1116 setLine code span buf len = do
1117   let line = parseUnsignedInteger buf len 10 octDecDigit
1118   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
1119         -- subtract one: the line number refers to the *following* line
1120   popLexState
1121   pushLexState code
1122   lexToken
1123
1124 setFile :: Int -> Action
1125 setFile code span buf len = do
1126   let file = lexemeToFastString (stepOn buf) (len-2)
1127   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1128   popLexState
1129   pushLexState code
1130   lexToken
1131
1132
1133 -- -----------------------------------------------------------------------------
1134 -- Options, includes and language pragmas.
1135
1136 lex_string_prag :: (String -> Token) -> Action
1137 lex_string_prag mkTok span _buf _len
1138     = do input <- getInput
1139          start <- getSrcLoc
1140          tok <- go [] input
1141          end <- getSrcLoc
1142          return (L (mkSrcSpan start end) tok)
1143     where go acc input
1144               = if isString input "#-}"
1145                    then do setInput input
1146                            return (mkTok (reverse acc))
1147                    else case alexGetChar input of
1148                           Just (c,i) -> go (c:acc) i
1149                           Nothing -> err input
1150           isString _ [] = True
1151           isString i (x:xs)
1152               = case alexGetChar i of
1153                   Just (c,i') | c == x    -> isString i' xs
1154                   _other -> False
1155           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1156
1157
1158 -- -----------------------------------------------------------------------------
1159 -- Strings & Chars
1160
1161 -- This stuff is horrible.  I hates it.
1162
1163 lex_string_tok :: Action
1164 lex_string_tok span _buf _len = do
1165   tok <- lex_string ""
1166   end <- getSrcLoc
1167   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1168
1169 lex_string :: String -> P Token
1170 lex_string s = do
1171   i <- getInput
1172   case alexGetChar' i of
1173     Nothing -> lit_error
1174
1175     Just ('"',i)  -> do
1176         setInput i
1177         magicHash <- extension magicHashEnabled
1178         if magicHash
1179           then do
1180             i <- getInput
1181             case alexGetChar' i of
1182               Just ('#',i) -> do
1183                    setInput i
1184                    if any (> '\xFF') s
1185                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1186                     else let s' = mkZFastString (reverse s) in
1187                          return (ITprimstring s')
1188                         -- mkZFastString is a hack to avoid encoding the
1189                         -- string in UTF-8.  We just want the exact bytes.
1190               _other ->
1191                 return (ITstring (mkFastString (reverse s)))
1192           else
1193                 return (ITstring (mkFastString (reverse s)))
1194
1195     Just ('\\',i)
1196         | Just ('&',i) <- next -> do
1197                 setInput i; lex_string s
1198         | Just (c,i) <- next, is_space c -> do
1199                 setInput i; lex_stringgap s
1200         where next = alexGetChar' i
1201
1202     Just (c, i) -> do
1203         c' <- lex_char c i
1204         lex_string (c':s)
1205
1206 lex_stringgap :: String -> P Token
1207 lex_stringgap s = do
1208   c <- getCharOrFail
1209   case c of
1210     '\\' -> lex_string s
1211     c | is_space c -> lex_stringgap s
1212     _other -> lit_error
1213
1214
1215 lex_char_tok :: Action
1216 -- Here we are basically parsing character literals, such as 'x' or '\n'
1217 -- but, when Template Haskell is on, we additionally spot
1218 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1219 -- but WITHOUT CONSUMING the x or T part  (the parser does that).
1220 -- So we have to do two characters of lookahead: when we see 'x we need to
1221 -- see if there's a trailing quote
1222 lex_char_tok span _buf _len = do        -- We've seen '
1223    i1 <- getInput       -- Look ahead to first character
1224    let loc = srcSpanStart span
1225    case alexGetChar' i1 of
1226         Nothing -> lit_error
1227
1228         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
1229                   th_exts <- extension thEnabled
1230                   if th_exts then do
1231                         setInput i2
1232                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1233                    else lit_error
1234
1235         Just ('\\', i2@(AI _end2 _ _)) -> do    -- We've seen 'backslash
1236                   setInput i2
1237                   lit_ch <- lex_escape
1238                   mc <- getCharOrFail   -- Trailing quote
1239                   if mc == '\'' then finish_char_tok loc lit_ch
1240                                 else do setInput i2; lit_error
1241
1242         Just (c, i2@(AI _end2 _ _))
1243                 | not (isAny c) -> lit_error
1244                 | otherwise ->
1245
1246                 -- We've seen 'x, where x is a valid character
1247                 --  (i.e. not newline etc) but not a quote or backslash
1248            case alexGetChar' i2 of      -- Look ahead one more character
1249                 Just ('\'', i3) -> do   -- We've seen 'x'
1250                         setInput i3
1251                         finish_char_tok loc c
1252                 _other -> do            -- We've seen 'x not followed by quote
1253                                         -- (including the possibility of EOF)
1254                                         -- If TH is on, just parse the quote only
1255                         th_exts <- extension thEnabled
1256                         let (AI end _ _) = i1
1257                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1258                                    else do setInput i2; lit_error
1259
1260 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1261 finish_char_tok loc ch  -- We've already seen the closing quote
1262                         -- Just need to check for trailing #
1263   = do  magicHash <- extension magicHashEnabled
1264         i@(AI end _ _) <- getInput
1265         if magicHash then do
1266                 case alexGetChar' i of
1267                         Just ('#',i@(AI end _ _)) -> do
1268                                 setInput i
1269                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1270                         _other ->
1271                                 return (L (mkSrcSpan loc end) (ITchar ch))
1272                 else do
1273                    return (L (mkSrcSpan loc end) (ITchar ch))
1274
1275 lex_char :: Char -> AlexInput -> P Char
1276 lex_char c inp = do
1277   case c of
1278       '\\' -> do setInput inp; lex_escape
1279       c | isAny c -> do setInput inp; return c
1280       _other -> lit_error
1281
1282 isAny :: Char -> Bool
1283 isAny c | c > '\x7f' = isPrint c
1284         | otherwise  = is_any c
1285
1286 lex_escape :: P Char
1287 lex_escape = do
1288   c <- getCharOrFail
1289   case c of
1290         'a'   -> return '\a'
1291         'b'   -> return '\b'
1292         'f'   -> return '\f'
1293         'n'   -> return '\n'
1294         'r'   -> return '\r'
1295         't'   -> return '\t'
1296         'v'   -> return '\v'
1297         '\\'  -> return '\\'
1298         '"'   -> return '\"'
1299         '\''  -> return '\''
1300         '^'   -> do c <- getCharOrFail
1301                     if c >= '@' && c <= '_'
1302                         then return (chr (ord c - ord '@'))
1303                         else lit_error
1304
1305         'x'   -> readNum is_hexdigit 16 hexDigit
1306         'o'   -> readNum is_octdigit  8 octDecDigit
1307         x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1308
1309         c1 ->  do
1310            i <- getInput
1311            case alexGetChar' i of
1312             Nothing -> lit_error
1313             Just (c2,i2) ->
1314               case alexGetChar' i2 of
1315                 Nothing -> do setInput i2; lit_error
1316                 Just (c3,i3) ->
1317                    let str = [c1,c2,c3] in
1318                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1319                                      Just rest <- [stripPrefix p str] ] of
1320                           (escape_char,[]):_ -> do
1321                                 setInput i3
1322                                 return escape_char
1323                           (escape_char,_:_):_ -> do
1324                                 setInput i2
1325                                 return escape_char
1326                           [] -> lit_error
1327
1328 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1329 readNum is_digit base conv = do
1330   i <- getInput
1331   c <- getCharOrFail
1332   if is_digit c
1333         then readNum2 is_digit base conv (conv c)
1334         else do setInput i; lit_error
1335
1336 readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char
1337 readNum2 is_digit base conv i = do
1338   input <- getInput
1339   read i input
1340   where read i input = do
1341           case alexGetChar' input of
1342             Just (c,input') | is_digit c -> do
1343                 read (i*base + conv c) input'
1344             _other -> do
1345                 if i >= 0 && i <= 0x10FFFF
1346                    then do setInput input; return (chr i)
1347                    else lit_error
1348
1349 silly_escape_chars :: [(String, Char)]
1350 silly_escape_chars = [
1351         ("NUL", '\NUL'),
1352         ("SOH", '\SOH'),
1353         ("STX", '\STX'),
1354         ("ETX", '\ETX'),
1355         ("EOT", '\EOT'),
1356         ("ENQ", '\ENQ'),
1357         ("ACK", '\ACK'),
1358         ("BEL", '\BEL'),
1359         ("BS", '\BS'),
1360         ("HT", '\HT'),
1361         ("LF", '\LF'),
1362         ("VT", '\VT'),
1363         ("FF", '\FF'),
1364         ("CR", '\CR'),
1365         ("SO", '\SO'),
1366         ("SI", '\SI'),
1367         ("DLE", '\DLE'),
1368         ("DC1", '\DC1'),
1369         ("DC2", '\DC2'),
1370         ("DC3", '\DC3'),
1371         ("DC4", '\DC4'),
1372         ("NAK", '\NAK'),
1373         ("SYN", '\SYN'),
1374         ("ETB", '\ETB'),
1375         ("CAN", '\CAN'),
1376         ("EM", '\EM'),
1377         ("SUB", '\SUB'),
1378         ("ESC", '\ESC'),
1379         ("FS", '\FS'),
1380         ("GS", '\GS'),
1381         ("RS", '\RS'),
1382         ("US", '\US'),
1383         ("SP", '\SP'),
1384         ("DEL", '\DEL')
1385         ]
1386
1387 -- before calling lit_error, ensure that the current input is pointing to
1388 -- the position of the error in the buffer.  This is so that we can report
1389 -- a correct location to the user, but also so we can detect UTF-8 decoding
1390 -- errors if they occur.
1391 lit_error :: P a
1392 lit_error = lexError "lexical error in string/character literal"
1393
1394 getCharOrFail :: P Char
1395 getCharOrFail =  do
1396   i <- getInput
1397   case alexGetChar' i of
1398         Nothing -> lexError "unexpected end-of-file in string/character literal"
1399         Just (c,i)  -> do setInput i; return c
1400
1401 -- -----------------------------------------------------------------------------
1402 -- QuasiQuote
1403
1404 lex_quasiquote_tok :: Action
1405 lex_quasiquote_tok span buf len = do
1406   let quoter = reverse $ takeWhile (/= '$')
1407                $ reverse $ lexemeToString buf (len - 1)
1408   quoteStart <- getSrcLoc
1409   quote <- lex_quasiquote ""
1410   end <- getSrcLoc
1411   return (L (mkSrcSpan (srcSpanStart span) end)
1412            (ITquasiQuote (mkFastString quoter,
1413                           mkFastString (reverse quote),
1414                           mkSrcSpan quoteStart end)))
1415
1416 lex_quasiquote :: String -> P String
1417 lex_quasiquote s = do
1418   i <- getInput
1419   case alexGetChar' i of
1420     Nothing -> lit_error
1421
1422     Just ('\\',i)
1423         | Just ('|',i) <- next -> do
1424                 setInput i; lex_quasiquote ('|' : s)
1425         | Just (']',i) <- next -> do
1426                 setInput i; lex_quasiquote (']' : s)
1427         where next = alexGetChar' i
1428
1429     Just ('|',i)
1430         | Just (']',i) <- next -> do
1431                 setInput i; return s
1432         where next = alexGetChar' i
1433
1434     Just (c, i) -> do
1435          setInput i; lex_quasiquote (c : s)
1436
1437 -- -----------------------------------------------------------------------------
1438 -- Warnings
1439
1440 warn :: DynFlag -> SDoc -> Action
1441 warn option warning srcspan _buf _len = do
1442     addWarning option srcspan warning
1443     lexToken
1444
1445 warnThen :: DynFlag -> SDoc -> Action -> Action
1446 warnThen option warning action srcspan buf len = do
1447     addWarning option srcspan warning
1448     action srcspan buf len
1449
1450 -- -----------------------------------------------------------------------------
1451 -- The Parse Monad
1452
1453 data LayoutContext
1454   = NoLayout
1455   | Layout !Int
1456   deriving Show
1457
1458 data ParseResult a
1459   = POk PState a
1460   | PFailed
1461         SrcSpan         -- The start and end of the text span related to
1462                         -- the error.  Might be used in environments which can
1463                         -- show this span, e.g. by highlighting it.
1464         Message         -- The error message
1465
1466 data PState = PState {
1467         buffer     :: StringBuffer,
1468         dflags     :: DynFlags,
1469         messages   :: Messages,
1470         last_loc   :: SrcSpan,  -- pos of previous token
1471         last_offs  :: !Int,     -- offset of the previous token from the
1472                                 -- beginning of  the current line.
1473                                 -- \t is equal to 8 spaces.
1474         last_len   :: !Int,     -- len of previous token
1475         last_line_len :: !Int,
1476         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1477         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1478         context    :: [LayoutContext],
1479         lex_state  :: [Int]
1480      }
1481         -- last_loc and last_len are used when generating error messages,
1482         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1483         -- current token to happyError, we could at least get rid of last_len.
1484         -- Getting rid of last_loc would require finding another way to
1485         -- implement pushCurrentContext (which is only called from one place).
1486
1487 newtype P a = P { unP :: PState -> ParseResult a }
1488
1489 instance Monad P where
1490   return = returnP
1491   (>>=) = thenP
1492   fail = failP
1493
1494 returnP :: a -> P a
1495 returnP a = a `seq` (P $ \s -> POk s a)
1496
1497 thenP :: P a -> (a -> P b) -> P b
1498 (P m) `thenP` k = P $ \ s ->
1499         case m s of
1500                 POk s1 a         -> (unP (k a)) s1
1501                 PFailed span err -> PFailed span err
1502
1503 failP :: String -> P a
1504 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1505
1506 failMsgP :: String -> P a
1507 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1508
1509 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1510 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (mkSrcSpan loc1 loc2) (text str)
1511
1512 failSpanMsgP :: SrcSpan -> SDoc -> P a
1513 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1514
1515 getPState :: P PState
1516 getPState = P $ \s -> POk s s
1517
1518 extension :: (Int -> Bool) -> P Bool
1519 extension p = P $ \s -> POk s (p $! extsBitmap s)
1520
1521 getExts :: P Int
1522 getExts = P $ \s -> POk s (extsBitmap s)
1523
1524 setExts :: (Int -> Int) -> P ()
1525 setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
1526
1527 setSrcLoc :: SrcLoc -> P ()
1528 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1529
1530 getSrcLoc :: P SrcLoc
1531 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1532
1533 setLastToken :: SrcSpan -> Int -> Int -> P ()
1534 setLastToken loc len line_len = P $ \s -> POk s {
1535   last_loc=loc,
1536   last_len=len,
1537   last_line_len=line_len
1538 } ()
1539
1540 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1541
1542 alexInputPrevChar :: AlexInput -> Char
1543 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1544
1545 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1546 alexGetChar (AI loc ofs s)
1547   | atEnd s   = Nothing
1548   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1549                 --trace (show (ord c)) $
1550                 Just (adj_c, (AI loc' ofs' s'))
1551   where (c,s') = nextChar s
1552         loc'   = advanceSrcLoc loc c
1553         ofs'   = advanceOffs c ofs
1554
1555         non_graphic     = '\x0'
1556         upper           = '\x1'
1557         lower           = '\x2'
1558         digit           = '\x3'
1559         symbol          = '\x4'
1560         space           = '\x5'
1561         other_graphic   = '\x6'
1562
1563         adj_c
1564           | c <= '\x06' = non_graphic
1565           | c <= '\x7f' = c
1566           -- Alex doesn't handle Unicode, so when Unicode
1567           -- character is encountered we output these values
1568           -- with the actual character value hidden in the state.
1569           | otherwise =
1570                 case generalCategory c of
1571                   UppercaseLetter       -> upper
1572                   LowercaseLetter       -> lower
1573                   TitlecaseLetter       -> upper
1574                   ModifierLetter        -> other_graphic
1575                   OtherLetter           -> lower -- see #1103
1576                   NonSpacingMark        -> other_graphic
1577                   SpacingCombiningMark  -> other_graphic
1578                   EnclosingMark         -> other_graphic
1579                   DecimalNumber         -> digit
1580                   LetterNumber          -> other_graphic
1581                   OtherNumber           -> other_graphic
1582                   ConnectorPunctuation  -> symbol
1583                   DashPunctuation       -> symbol
1584                   OpenPunctuation       -> other_graphic
1585                   ClosePunctuation      -> other_graphic
1586                   InitialQuote          -> other_graphic
1587                   FinalQuote            -> other_graphic
1588                   OtherPunctuation      -> symbol
1589                   MathSymbol            -> symbol
1590                   CurrencySymbol        -> symbol
1591                   ModifierSymbol        -> symbol
1592                   OtherSymbol           -> symbol
1593                   Space                 -> space
1594                   _other                -> non_graphic
1595
1596 -- This version does not squash unicode characters, it is used when
1597 -- lexing strings.
1598 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1599 alexGetChar' (AI loc ofs s)
1600   | atEnd s   = Nothing
1601   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1602                 --trace (show (ord c)) $
1603                 Just (c, (AI loc' ofs' s'))
1604   where (c,s') = nextChar s
1605         loc'   = advanceSrcLoc loc c
1606         ofs'   = advanceOffs c ofs
1607
1608 advanceOffs :: Char -> Int -> Int
1609 advanceOffs '\n' _    = 0
1610 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1611 advanceOffs _    offs = offs + 1
1612
1613 getInput :: P AlexInput
1614 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1615
1616 setInput :: AlexInput -> P ()
1617 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1618
1619 pushLexState :: Int -> P ()
1620 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1621
1622 popLexState :: P Int
1623 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1624
1625 getLexState :: P Int
1626 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1627
1628 -- for reasons of efficiency, flags indicating language extensions (eg,
1629 -- -fglasgow-exts or -XParr) are represented by a bitmap stored in an unboxed
1630 -- integer
1631
1632 genericsBit :: Int
1633 genericsBit = 0 -- {| and |}
1634 ffiBit :: Int
1635 ffiBit     = 1
1636 parrBit :: Int
1637 parrBit    = 2
1638 arrowsBit :: Int
1639 arrowsBit  = 4
1640 thBit :: Int
1641 thBit      = 5
1642 ipBit :: Int
1643 ipBit      = 6
1644 explicitForallBit :: Int
1645 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1646 bangPatBit :: Int
1647 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1648                 -- (doesn't affect the lexer)
1649 tyFamBit :: Int
1650 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1651 haddockBit :: Int
1652 haddockBit = 10 -- Lex and parse Haddock comments
1653 magicHashBit :: Int
1654 magicHashBit = 11 -- "#" in both functions and operators
1655 kindSigsBit :: Int
1656 kindSigsBit = 12 -- Kind signatures on type variables
1657 recursiveDoBit :: Int
1658 recursiveDoBit = 13 -- mdo
1659 unicodeSyntaxBit :: Int
1660 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1661 unboxedTuplesBit :: Int
1662 unboxedTuplesBit = 15 -- (# and #)
1663 standaloneDerivingBit :: Int
1664 standaloneDerivingBit = 16 -- standalone instance deriving declarations
1665 transformComprehensionsBit :: Int
1666 transformComprehensionsBit = 17
1667 qqBit :: Int
1668 qqBit      = 18 -- enable quasiquoting
1669 inRulePragBit :: Int
1670 inRulePragBit = 19
1671 rawTokenStreamBit :: Int
1672 rawTokenStreamBit = 20 -- producing a token stream with all comments included
1673 newQualOpsBit :: Int
1674 newQualOpsBit = 21 -- Haskell' qualified operator syntax, e.g. Prelude.(+)
1675 recBit :: Int
1676 recBit = 22 -- rec
1677
1678 always :: Int -> Bool
1679 always           _     = True
1680 genericsEnabled :: Int -> Bool
1681 genericsEnabled  flags = testBit flags genericsBit
1682 parrEnabled :: Int -> Bool
1683 parrEnabled      flags = testBit flags parrBit
1684 arrowsEnabled :: Int -> Bool
1685 arrowsEnabled    flags = testBit flags arrowsBit
1686 thEnabled :: Int -> Bool
1687 thEnabled        flags = testBit flags thBit
1688 ipEnabled :: Int -> Bool
1689 ipEnabled        flags = testBit flags ipBit
1690 explicitForallEnabled :: Int -> Bool
1691 explicitForallEnabled flags = testBit flags explicitForallBit
1692 bangPatEnabled :: Int -> Bool
1693 bangPatEnabled   flags = testBit flags bangPatBit
1694 -- tyFamEnabled :: Int -> Bool
1695 -- tyFamEnabled     flags = testBit flags tyFamBit
1696 haddockEnabled :: Int -> Bool
1697 haddockEnabled   flags = testBit flags haddockBit
1698 magicHashEnabled :: Int -> Bool
1699 magicHashEnabled flags = testBit flags magicHashBit
1700 -- kindSigsEnabled :: Int -> Bool
1701 -- kindSigsEnabled  flags = testBit flags kindSigsBit
1702 unicodeSyntaxEnabled :: Int -> Bool
1703 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1704 unboxedTuplesEnabled :: Int -> Bool
1705 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1706 standaloneDerivingEnabled :: Int -> Bool
1707 standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
1708 qqEnabled :: Int -> Bool
1709 qqEnabled        flags = testBit flags qqBit
1710 -- inRulePrag :: Int -> Bool
1711 -- inRulePrag       flags = testBit flags inRulePragBit
1712 rawTokenStreamEnabled :: Int -> Bool
1713 rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
1714 newQualOps :: Int -> Bool
1715 newQualOps       flags = testBit flags newQualOpsBit
1716 oldQualOps :: Int -> Bool
1717 oldQualOps flags = not (newQualOps flags)
1718
1719 -- PState for parsing options pragmas
1720 --
1721 pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1722 pragState dynflags buf loc =
1723   PState {
1724       buffer        = buf,
1725       messages      = emptyMessages,
1726       dflags        = dynflags,
1727       last_loc      = mkSrcSpan loc loc,
1728       last_offs     = 0,
1729       last_len      = 0,
1730       last_line_len = 0,
1731       loc           = loc,
1732       extsBitmap    = 0,
1733       context       = [],
1734       lex_state     = [bol, option_prags, 0]
1735     }
1736
1737
1738 -- create a parse state
1739 --
1740 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1741 mkPState buf loc flags  =
1742   PState {
1743       buffer          = buf,
1744       dflags        = flags,
1745       messages      = emptyMessages,
1746       last_loc      = mkSrcSpan loc loc,
1747       last_offs     = 0,
1748       last_len      = 0,
1749       last_line_len = 0,
1750       loc           = loc,
1751       extsBitmap    = fromIntegral bitmap,
1752       context       = [],
1753       lex_state     = [bol, 0]
1754         -- we begin in the layout state if toplev_layout is set
1755     }
1756     where
1757       bitmap = genericsBit `setBitIf` dopt Opt_Generics flags
1758                .|. ffiBit            `setBitIf` dopt Opt_ForeignFunctionInterface flags
1759                .|. parrBit           `setBitIf` dopt Opt_PArr         flags
1760                .|. arrowsBit         `setBitIf` dopt Opt_Arrows       flags
1761                .|. thBit             `setBitIf` dopt Opt_TemplateHaskell flags
1762                .|. qqBit             `setBitIf` dopt Opt_QuasiQuotes flags
1763                .|. ipBit             `setBitIf` dopt Opt_ImplicitParams flags
1764                .|. explicitForallBit `setBitIf` dopt Opt_ExplicitForAll flags
1765                .|. bangPatBit        `setBitIf` dopt Opt_BangPatterns flags
1766                .|. tyFamBit          `setBitIf` dopt Opt_TypeFamilies flags
1767                .|. haddockBit        `setBitIf` dopt Opt_Haddock      flags
1768                .|. magicHashBit      `setBitIf` dopt Opt_MagicHash    flags
1769                .|. kindSigsBit       `setBitIf` dopt Opt_KindSignatures flags
1770                .|. recursiveDoBit    `setBitIf` dopt Opt_RecursiveDo flags
1771                .|. recBit            `setBitIf` dopt Opt_DoRec  flags
1772                .|. recBit            `setBitIf` dopt Opt_Arrows flags
1773                .|. unicodeSyntaxBit  `setBitIf` dopt Opt_UnicodeSyntax flags
1774                .|. unboxedTuplesBit  `setBitIf` dopt Opt_UnboxedTuples flags
1775                .|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags
1776                .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags
1777                .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
1778                .|. newQualOpsBit `setBitIf` dopt Opt_NewQualifiedOperators flags
1779       --
1780       setBitIf :: Int -> Bool -> Int
1781       b `setBitIf` cond | cond      = bit b
1782                         | otherwise = 0
1783
1784 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1785 addWarning option srcspan warning
1786  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1787        let warning' = mkWarnMsg srcspan alwaysQualify warning
1788            ws' = if dopt option d then ws `snocBag` warning' else ws
1789        in POk s{messages=(ws', es)} ()
1790
1791 getMessages :: PState -> Messages
1792 getMessages PState{messages=ms} = ms
1793
1794 getContext :: P [LayoutContext]
1795 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1796
1797 setContext :: [LayoutContext] -> P ()
1798 setContext ctx = P $ \s -> POk s{context=ctx} ()
1799
1800 popContext :: P ()
1801 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1802                               last_len = len, last_loc = last_loc }) ->
1803   case ctx of
1804         (_:tl) -> POk s{ context = tl } ()
1805         []     -> PFailed last_loc (srcParseErr buf len)
1806
1807 -- Push a new layout context at the indentation of the last token read.
1808 -- This is only used at the outer level of a module when the 'module'
1809 -- keyword is missing.
1810 pushCurrentContext :: P ()
1811 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1812     POk s{context = Layout (offs-len) : ctx} ()
1813 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1814
1815 getOffside :: P Ordering
1816 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1817                 let ord = case stk of
1818                         (Layout n:_) -> compare offs n
1819                         _            -> GT
1820                 in POk s ord
1821
1822 -- ---------------------------------------------------------------------------
1823 -- Construct a parse error
1824
1825 srcParseErr
1826   :: StringBuffer       -- current buffer (placed just after the last token)
1827   -> Int                -- length of the previous token
1828   -> Message
1829 srcParseErr buf len
1830   = hcat [ if null token
1831              then ptext (sLit "parse error (possibly incorrect indentation)")
1832              else hcat [ptext (sLit "parse error on input "),
1833                         char '`', text token, char '\'']
1834     ]
1835   where token = lexemeToString (offsetBytes (-len) buf) len
1836
1837 -- Report a parse failure, giving the span of the previous token as
1838 -- the location of the error.  This is the entry point for errors
1839 -- detected during parsing.
1840 srcParseFail :: P a
1841 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1842                             last_loc = last_loc } ->
1843     PFailed last_loc (srcParseErr buf len)
1844
1845 -- A lexical error is reported at a particular position in the source file,
1846 -- not over a token range.
1847 lexError :: String -> P a
1848 lexError str = do
1849   loc <- getSrcLoc
1850   (AI end _ buf) <- getInput
1851   reportLexError loc end buf str
1852
1853 -- -----------------------------------------------------------------------------
1854 -- This is the top-level function: called from the parser each time a
1855 -- new token is to be read from the input.
1856
1857 lexer :: (Located Token -> P a) -> P a
1858 lexer cont = do
1859   tok@(L _span _tok__) <- lexToken
1860 --  trace ("token: " ++ show tok__) $ do
1861   cont tok
1862
1863 lexToken :: P (Located Token)
1864 lexToken = do
1865   inp@(AI loc1 _ buf) <- getInput
1866   sc <- getLexState
1867   exts <- getExts
1868   case alexScanUser exts inp sc of
1869     AlexEOF -> do
1870         let span = mkSrcSpan loc1 loc1
1871         setLastToken span 0 0
1872         return (L span ITeof)
1873     AlexError (AI loc2 _ buf) ->
1874         reportLexError loc1 loc2 buf "lexical error"
1875     AlexSkip inp2 _ -> do
1876         setInput inp2
1877         lexToken
1878     AlexToken inp2@(AI end _ buf2) _ t -> do
1879         setInput inp2
1880         let span = mkSrcSpan loc1 end
1881         let bytes = byteDiff buf buf2
1882         span `seq` setLastToken span bytes bytes
1883         t span buf bytes
1884
1885 reportLexError :: SrcLoc -> SrcLoc -> StringBuffer -> [Char] -> P a
1886 reportLexError loc1 loc2 buf str
1887   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1888   | otherwise =
1889   let
1890         c = fst (nextChar buf)
1891   in
1892   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1893     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1894     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1895
1896 lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token]
1897 lexTokenStream buf loc dflags = unP go initState
1898     where initState = mkPState buf loc (dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream)
1899           go = do
1900             ltok <- lexer return
1901             case ltok of
1902               L _ ITeof -> return []
1903               _ -> liftM (ltok:) go
1904
1905 linePrags = Map.singleton "line" (begin line_prag2)
1906
1907 fileHeaderPrags = Map.fromList([("options", lex_string_prag IToptions_prag),
1908                                  ("options_ghc", lex_string_prag IToptions_prag),
1909                                  ("options_haddock", lex_string_prag ITdocOptions),
1910                                  ("language", token ITlanguage_prag),
1911                                  ("include", lex_string_prag ITinclude_prag)])
1912
1913 ignoredPrags = Map.fromList (map ignored pragmas)
1914                where ignored opt = (opt, nested_comment lexToken)
1915                      impls = ["hugs", "nhc98", "jhc", "yhc", "catch", "derive"]
1916                      options_pragmas = map ("options_" ++) impls
1917                      -- CFILES is a hugs-only thing.
1918                      pragmas = options_pragmas ++ ["cfiles", "contract"]
1919
1920 oneWordPrags = Map.fromList([("rules", rulePrag),
1921                            ("inline", token (ITinline_prag True)),
1922                            ("notinline", token (ITinline_prag False)),
1923                            ("specialize", token ITspec_prag),
1924                            ("source", token ITsource_prag),
1925                            ("warning", token ITwarning_prag),
1926                            ("deprecated", token ITdeprecated_prag),
1927                            ("scc", token ITscc_prag),
1928                            ("generated", token ITgenerated_prag),
1929                            ("core", token ITcore_prag),
1930                            ("unpack", token ITunpack_prag),
1931                            ("ann", token ITann_prag)])
1932
1933 twoWordPrags = Map.fromList([("inline conlike", token (ITinline_conlike_prag True)),
1934                              ("notinline conlike", token (ITinline_conlike_prag False)),
1935                              ("specialize inline", token (ITspec_inline_prag True)),
1936                              ("specialize notinline", token (ITspec_inline_prag False))])
1937
1938
1939 dispatch_pragmas :: Map String Action -> Action
1940 dispatch_pragmas prags span buf len = case Map.lookup (clean_pragma (lexemeToString buf len)) prags of
1941                                        Just found -> found span buf len
1942                                        Nothing -> lexError "unknown pragma"
1943
1944 known_pragma :: Map String Action -> AlexAccPred Int
1945 known_pragma prags _ _ len (AI _ _ buf) = (isJust $ Map.lookup (clean_pragma (lexemeToString (offsetBytes (- len) buf) len)) prags)
1946                                           && (nextCharIs buf (\c -> not (isAlphaNum c || c == '_')))
1947
1948 clean_pragma :: String -> String
1949 clean_pragma prag = canon_ws (map toLower (unprefix prag))
1950                     where unprefix prag' = case stripPrefix "{-#" prag' of
1951                                              Just rest -> rest
1952                                              Nothing -> prag'
1953                           canonical prag' = case prag' of
1954                                               "noinline" -> "notinline"
1955                                               "specialise" -> "specialize"
1956                                               "constructorlike" -> "conlike"
1957                                               otherwise -> prag'
1958                           canon_ws s = unwords (map canonical (words s))
1959 }