compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - parsing integers is a bit slow
  16 --    - readRational is a bit slow
  17 --
  18 --   Known bugs, that were also in the previous version:
  19 --    - M... should be 3 tokens, not 1.
  20 --    - pragma-end should be only valid in a pragma
  21
  22 --   qualified operator NOTES.
  23 --
  24 --   - If M.(+) is a single lexeme, then..
  25 --     - Probably (+) should be a single lexeme too, for consistency.
  26 --       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
  27 --     - But we have to rule out reserved operators, otherwise (..) becomes
  28 --       a different lexeme.
  29 --     - Should we therefore also rule out reserved operators in the qualified
  30 --       form?  This is quite difficult to achieve.  We don't do it for
  31 --       qualified varids.
  32
  33 {
  34 {-# OPTIONS -Wwarn -w #-}
  35 -- The above warning supression flag is a temporary kludge.
  36 -- While working on this module you are encouraged to remove it and fix
  37 -- any warnings in the module. See
  38 --     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
  39 -- for details
  40 --
  41 -- Note that Alex itself generates code with with some unused bindings and
  42 -- without type signatures, so removing the flag might not be possible.
  43
  44 {-# OPTIONS_GHC -funbox-strict-fields #-}
  45
  46 module Lexer (
  47    Token(..), lexer, pragState, mkPState, PState(..),
  48    P(..), ParseResult(..), getSrcLoc,
  49    getPState,
  50    failLocMsgP, failSpanMsgP, srcParseFail,
  51    getMessages,
  52    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  53    getLexState, popLexState, pushLexState,
  54    extension, standaloneDerivingEnabled, bangPatEnabled,
  55    addWarning,
  56    lexTokenStream
  57   ) where
  58
  59 import Bag
  60 import ErrUtils
  61 import Outputable
  62 import StringBuffer
  63 import FastString
  64 import SrcLoc
  65 import UniqFM
  66 import DynFlags
  67 import Ctype
  68 import Util             ( readRational )
  69
  70 import Control.Monad
  71 import Data.Bits
  72 import Data.Char
  73 import Data.List
  74 import Data.Maybe
  75 import Data.Map (Map)
  76 import qualified Data.Map as Map
  77 import Data.Ratio
  78 }
  79
  80 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  81 $whitechar   = [\ \n\r\f\v $unispace]
  82 $white_no_nl = $whitechar # \n
  83 $tab         = \t
  84
  85 $ascdigit  = 0-9
  86 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  87 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  88 $digit     = [$ascdigit $unidigit]
  89
  90 $special   = [\(\)\,\;\[\]\`\{\}]
  91 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  92 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  93 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  94
  95 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
  96 $asclarge  = [A-Z]
  97 $large     = [$asclarge $unilarge]
  98
  99 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
 100 $ascsmall  = [a-z]
 101 $small     = [$ascsmall $unismall \_]
 102
 103 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
 104 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
 105
 106 $octit     = 0-7
 107 $hexit     = [$decdigit A-F a-f]
 108 $symchar   = [$symbol \:]
 109 $nl        = [\n\r]
 110 $idchar    = [$small $large $digit \']
 111
 112 $pragmachar = [$small $large $digit]
 113
 114 $docsym    = [\| \^ \* \$]
 115
 116 @varid     = $small $idchar*
 117 @conid     = $large $idchar*
 118
 119 @varsym    = $symbol $symchar*
 120 @consym    = \: $symchar*
 121
 122 @decimal     = $decdigit+
 123 @octal       = $octit+
 124 @hexadecimal = $hexit+
 125 @exponent    = [eE] [\-\+]? @decimal
 126
 127 -- we support the hierarchical module name extension:
 128 @qual = (@conid \.)+
 129
 130 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 131
 132 -- normal signed numerical literals can only be explicitly negative,
 133 -- not explicitly positive (contrast @exponent)
 134 @negative = \-
 135 @signed = @negative ?
 136
 137 haskell :-
 138
 139 -- everywhere: skip whitespace and comments
 140 $white_no_nl+                           ;
 141 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 142
 143 -- Everywhere: deal with nested comments.  We explicitly rule out
 144 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 145 -- (this can happen even though pragmas will normally take precedence due to
 146 -- longest-match, because pragmas aren't valid in every state, but comments
 147 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 148 -- set.
 149
 150 "{-" / { isNormalComment } { nested_comment lexToken }
 151
 152 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 153 -- more dashes followed by a symbol should be parsed as a varsym, so we
 154 -- have to exclude those.
 155
 156 -- Since Haddock comments aren't valid in every state, we need to rule them
 157 -- out here.
 158
 159 -- The following two rules match comments that begin with two dashes, but
 160 -- continue with a different character. The rules test that this character
 161 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 162 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 163 -- have a Haddock comment). The rules then munch the rest of the line.
 164
 165 "-- " ~[$docsym \#] .* { lineCommentToken }
 166 "--" [^$symbol : \ ] .* { lineCommentToken }
 167
 168 -- Next, match Haddock comments if no -haddock flag
 169
 170 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
 171
 172 -- Now, when we've matched comments that begin with 2 dashes and continue
 173 -- with a different character, we need to match comments that begin with three
 174 -- or more dashes (which clearly can't be Haddock comments). We only need to
 175 -- make sure that the first non-dash character isn't a symbol, and munch the
 176 -- rest of the line.
 177
 178 "---"\-* [^$symbol :] .* { lineCommentToken }
 179
 180 -- Since the previous rules all match dashes followed by at least one
 181 -- character, we also need to match a whole line filled with just dashes.
 182
 183 "--"\-* / { atEOL } { lineCommentToken }
 184
 185 -- We need this rule since none of the other single line comment rules
 186 -- actually match this case.
 187
 188 "-- " / { atEOL } { lineCommentToken }
 189
 190 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 191 -- blank lines) until we find a non-whitespace character, then do layout
 192 -- processing.
 193 --
 194 -- One slight wibble here: what if the line begins with {-#? In
 195 -- theory, we have to lex the pragma to see if it's one we recognise,
 196 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 197 -- as a nested comment.  We don't bother with this: if the line begins
 198 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 199 <bol> {
 200   \n                                    ;
 201   ^\# (line)?                           { begin line_prag1 }
 202   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 203   ^\# \! .* \n                          ; -- #!, for scripts
 204   ()                                    { do_bol }
 205 }
 206
 207 -- after a layout keyword (let, where, do, of), we begin a new layout
 208 -- context if the curly brace is missing.
 209 -- Careful! This stuff is quite delicate.
 210 <layout, layout_do> {
 211   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 212         -- we might encounter {-# here, but {- has been handled already
 213   \n                                    ;
 214   ^\# (line)?                           { begin line_prag1 }
 215 }
 216
 217 -- do is treated in a subtly different way, see new_layout_context
 218 <layout>    ()                          { new_layout_context True }
 219 <layout_do> ()                          { new_layout_context False }
 220
 221 -- after a new layout context which was found to be to the left of the
 222 -- previous context, we have generated a '{' token, and we now need to
 223 -- generate a matching '}' token.
 224 <layout_left>  ()                       { do_layout_left }
 225
 226 <0,option_prags> \n                             { begin bol }
 227
 228 "{-#" $whitechar* $pragmachar+ / { known_pragma linePrags }
 229                                 { dispatch_pragmas linePrags }
 230
 231 -- single-line line pragmas, of the form
 232 --    # <line> "<file>" <extra-stuff> \n
 233 <line_prag1> $decdigit+                 { setLine line_prag1a }
 234 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 235 <line_prag1b> .*                        { pop }
 236
 237 -- Haskell-style line pragmas, of the form
 238 --    {-# LINE <line> "<file>" #-}
 239 <line_prag2> $decdigit+                 { setLine line_prag2a }
 240 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 241 <line_prag2b> "#-}"|"-}"                { pop }
 242    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 243    -- with older versions of GHC which generated these.
 244
 245 <0,option_prags> {
 246   "{-#" $whitechar* $pragmachar+
 247         $whitechar+ $pragmachar+ / { known_pragma twoWordPrags }
 248                                  { dispatch_pragmas twoWordPrags }
 249
 250   "{-#" $whitechar* $pragmachar+ / { known_pragma oneWordPrags }
 251                                  { dispatch_pragmas oneWordPrags }
 252
 253   -- We ignore all these pragmas, but don't generate a warning for them
 254   "{-#" $whitechar* $pragmachar+ / { known_pragma ignoredPrags }
 255                                  { dispatch_pragmas ignoredPrags }
 256
 257   -- ToDo: should only be valid inside a pragma:
 258   "#-}"                                 { endPrag }
 259 }
 260
 261 <option_prags> {
 262   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 263                                    { dispatch_pragmas fileHeaderPrags }
 264
 265   "-- #"                                 { multiline_doc_comment }
 266 }
 267
 268 <0> {
 269   -- In the "0" mode we ignore these pragmas
 270   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 271                      { nested_comment lexToken }
 272 }
 273
 274 <0> {
 275   "-- #" .* { lineCommentToken }
 276 }
 277
 278 <0,option_prags> {
 279   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 280                     (nested_comment lexToken) }
 281 }
 282
 283 -- '0' state: ordinary lexemes
 284
 285 -- Haddock comments
 286
 287 <0> {
 288   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 289   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 290 }
 291
 292 -- "special" symbols
 293
 294 <0> {
 295   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 296   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 297 }
 298
 299 <0> {
 300   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 301   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 302   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 303   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 304   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 305   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 306   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 307   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 308
 309   "[$" @varid "|"  / { ifExtension qqEnabled }
 310                      { lex_quasiquote_tok }
 311 }
 312
 313 <0> {
 314   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 315                                         { special IToparenbar }
 316   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 317 }
 318
 319 <0> {
 320   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 321 }
 322
 323 <0> {
 324   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 325          { token IToubxparen }
 326   "#)" / { ifExtension unboxedTuplesEnabled }
 327          { token ITcubxparen }
 328 }
 329
 330 <0> {
 331   "{|" / { ifExtension genericsEnabled } { token ITocurlybar }
 332   "|}" / { ifExtension genericsEnabled } { token ITccurlybar }
 333 }
 334
 335 <0,option_prags> {
 336   \(                                    { special IToparen }
 337   \)                                    { special ITcparen }
 338   \[                                    { special ITobrack }
 339   \]                                    { special ITcbrack }
 340   \,                                    { special ITcomma }
 341   \;                                    { special ITsemi }
 342   \`                                    { special ITbackquote }
 343
 344   \{                                    { open_brace }
 345   \}                                    { close_brace }
 346 }
 347
 348 <0,option_prags> {
 349   @qual @varid                  { idtoken qvarid }
 350   @qual @conid                  { idtoken qconid }
 351   @varid                        { varid }
 352   @conid                        { idtoken conid }
 353 }
 354
 355 <0> {
 356   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 357   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 358   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 359   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 360 }
 361
 362 -- ToDo: - move `var` and (sym) into lexical syntax?
 363 --       - remove backquote from $special?
 364 <0> {
 365   @qual @varsym       / { ifExtension oldQualOps } { idtoken qvarsym }
 366   @qual @consym       / { ifExtension oldQualOps } { idtoken qconsym }
 367   @qual \( @varsym \) / { ifExtension newQualOps } { idtoken prefixqvarsym }
 368   @qual \( @consym \) / { ifExtension newQualOps } { idtoken prefixqconsym }
 369   @varsym                                          { varsym }
 370   @consym                                          { consym }
 371 }
 372
 373 -- For the normal boxed literals we need to be careful
 374 -- when trying to be close to Haskell98
 375 <0> {
 376   -- Normal integral literals (:: Num a => a, from Integer)
 377   @decimal           { tok_num positive 0 0 decimal }
 378   0[oO] @octal       { tok_num positive 2 2 octal }
 379   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 380
 381   -- Normal rational literals (:: Fractional a => a, from Rational)
 382   @floating_point    { strtoken tok_float }
 383 }
 384
 385 <0> {
 386   -- Unboxed ints (:: Int#) and words (:: Word#)
 387   -- It's simpler (and faster?) to give separate cases to the negatives,
 388   -- especially considering octal/hexadecimal prefixes.
 389   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 390   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 391   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 392   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 393   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 394   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 395
 396   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 397   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 398   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 399
 400   -- Unboxed floats and doubles (:: Float#, :: Double#)
 401   -- prim_{float,double} work with signed literals
 402   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 403   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 404 }
 405
 406 -- Strings and chars are lexed by hand-written code.  The reason is
 407 -- that even if we recognise the string or char here in the regex
 408 -- lexer, we would still have to parse the string afterward in order
 409 -- to convert it to a String.
 410 <0> {
 411   \'                            { lex_char_tok }
 412   \"                            { lex_string_tok }
 413 }
 414
 415 {
 416 -- -----------------------------------------------------------------------------
 417 -- The token type
 418
 419 data Token
 420   = ITas                        -- Haskell keywords
 421   | ITcase
 422   | ITclass
 423   | ITdata
 424   | ITdefault
 425   | ITderiving
 426   | ITdo
 427   | ITelse
 428   | IThiding
 429   | ITif
 430   | ITimport
 431   | ITin
 432   | ITinfix
 433   | ITinfixl
 434   | ITinfixr
 435   | ITinstance
 436   | ITlet
 437   | ITmodule
 438   | ITnewtype
 439   | ITof
 440   | ITqualified
 441   | ITthen
 442   | ITtype
 443   | ITwhere
 444   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 445
 446   | ITforall                    -- GHC extension keywords
 447   | ITforeign
 448   | ITexport
 449   | ITlabel
 450   | ITdynamic
 451   | ITsafe
 452   | ITthreadsafe
 453   | ITunsafe
 454   | ITstdcallconv
 455   | ITccallconv
 456   | ITprimcallconv
 457   | ITdotnet
 458   | ITmdo
 459   | ITfamily
 460   | ITgroup
 461   | ITby
 462   | ITusing
 463
 464         -- Pragmas
 465   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 466   | ITinline_conlike_prag Bool  -- same
 467   | ITspec_prag                 -- SPECIALISE
 468   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 469   | ITsource_prag
 470   | ITrules_prag
 471   | ITwarning_prag
 472   | ITdeprecated_prag
 473   | ITline_prag
 474   | ITscc_prag
 475   | ITgenerated_prag
 476   | ITcore_prag                 -- hdaume: core annotations
 477   | ITunpack_prag
 478   | ITann_prag
 479   | ITclose_prag
 480   | IToptions_prag String
 481   | ITinclude_prag String
 482   | ITlanguage_prag
 483
 484   | ITdotdot                    -- reserved symbols
 485   | ITcolon
 486   | ITdcolon
 487   | ITequal
 488   | ITlam
 489   | ITvbar
 490   | ITlarrow
 491   | ITrarrow
 492   | ITat
 493   | ITtilde
 494   | ITdarrow
 495   | ITminus
 496   | ITbang
 497   | ITstar
 498   | ITdot
 499
 500   | ITbiglam                    -- GHC-extension symbols
 501
 502   | ITocurly                    -- special symbols
 503   | ITccurly
 504   | ITocurlybar                 -- {|, for type applications
 505   | ITccurlybar                 -- |}, for type applications
 506   | ITvocurly
 507   | ITvccurly
 508   | ITobrack
 509   | ITopabrack                  -- [:, for parallel arrays with -XParr
 510   | ITcpabrack                  -- :], for parallel arrays with -XParr
 511   | ITcbrack
 512   | IToparen
 513   | ITcparen
 514   | IToubxparen
 515   | ITcubxparen
 516   | ITsemi
 517   | ITcomma
 518   | ITunderscore
 519   | ITbackquote
 520
 521   | ITvarid   FastString        -- identifiers
 522   | ITconid   FastString
 523   | ITvarsym  FastString
 524   | ITconsym  FastString
 525   | ITqvarid  (FastString,FastString)
 526   | ITqconid  (FastString,FastString)
 527   | ITqvarsym (FastString,FastString)
 528   | ITqconsym (FastString,FastString)
 529   | ITprefixqvarsym (FastString,FastString)
 530   | ITprefixqconsym (FastString,FastString)
 531
 532   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 533
 534   | ITpragma StringBuffer
 535
 536   | ITchar       Char
 537   | ITstring     FastString
 538   | ITinteger    Integer
 539   | ITrational   Rational
 540
 541   | ITprimchar   Char
 542   | ITprimstring FastString
 543   | ITprimint    Integer
 544   | ITprimword   Integer
 545   | ITprimfloat  Rational
 546   | ITprimdouble Rational
 547
 548   -- MetaHaskell extension tokens
 549   | ITopenExpQuote              --  [| or [e|
 550   | ITopenPatQuote              --  [p|
 551   | ITopenDecQuote              --  [d|
 552   | ITopenTypQuote              --  [t|
 553   | ITcloseQuote                --  |]
 554   | ITidEscape   FastString     --  $x
 555   | ITparenEscape               --  $(
 556   | ITvarQuote                  --  '
 557   | ITtyQuote                   --  ''
 558   | ITquasiQuote (FastString,FastString,SrcSpan) --  [:...|...|]
 559
 560   -- Arrow notation extension
 561   | ITproc
 562   | ITrec
 563   | IToparenbar                 --  (|
 564   | ITcparenbar                 --  |)
 565   | ITlarrowtail                --  -<
 566   | ITrarrowtail                --  >-
 567   | ITLarrowtail                --  -<<
 568   | ITRarrowtail                --  >>-
 569
 570   | ITunknown String            -- Used when the lexer can't make sense of it
 571   | ITeof                       -- end of file token
 572
 573   -- Documentation annotations
 574   | ITdocCommentNext  String     -- something beginning '-- |'
 575   | ITdocCommentPrev  String     -- something beginning '-- ^'
 576   | ITdocCommentNamed String     -- something beginning '-- $'
 577   | ITdocSection      Int String -- a section heading
 578   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 579   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 580   | ITlineComment     String     -- comment starting by "--"
 581   | ITblockComment    String     -- comment in {- -}
 582
 583 #ifdef DEBUG
 584   deriving Show -- debugging
 585 #endif
 586
 587 {-
 588 isSpecial :: Token -> Bool
 589 -- If we see M.x, where x is a keyword, but
 590 -- is special, we treat is as just plain M.x,
 591 -- not as a keyword.
 592 isSpecial ITas          = True
 593 isSpecial IThiding      = True
 594 isSpecial ITqualified   = True
 595 isSpecial ITforall      = True
 596 isSpecial ITexport      = True
 597 isSpecial ITlabel       = True
 598 isSpecial ITdynamic     = True
 599 isSpecial ITsafe        = True
 600 isSpecial ITthreadsafe  = True
 601 isSpecial ITunsafe      = True
 602 isSpecial ITccallconv   = True
 603 isSpecial ITstdcallconv = True
 604 isSpecial ITprimcallconv = True
 605 isSpecial ITmdo         = True
 606 isSpecial ITfamily      = True
 607 isSpecial ITgroup   = True
 608 isSpecial ITby      = True
 609 isSpecial ITusing   = True
 610 isSpecial _             = False
 611 -}
 612
 613 -- the bitmap provided as the third component indicates whether the
 614 -- corresponding extension keyword is valid under the extension options
 615 -- provided to the compiler; if the extension corresponding to *any* of the
 616 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 617 -- facilitates using a keyword in two different extensions that can be
 618 -- activated independently)
 619 --
 620 reservedWordsFM :: UniqFM (Token, Int)
 621 reservedWordsFM = listToUFM $
 622         map (\(x, y, z) -> (mkFastString x, (y, z)))
 623        [( "_",          ITunderscore,   0 ),
 624         ( "as",         ITas,           0 ),
 625         ( "case",       ITcase,         0 ),
 626         ( "class",      ITclass,        0 ),
 627         ( "data",       ITdata,         0 ),
 628         ( "default",    ITdefault,      0 ),
 629         ( "deriving",   ITderiving,     0 ),
 630         ( "do",         ITdo,           0 ),
 631         ( "else",       ITelse,         0 ),
 632         ( "hiding",     IThiding,       0 ),
 633         ( "if",         ITif,           0 ),
 634         ( "import",     ITimport,       0 ),
 635         ( "in",         ITin,           0 ),
 636         ( "infix",      ITinfix,        0 ),
 637         ( "infixl",     ITinfixl,       0 ),
 638         ( "infixr",     ITinfixr,       0 ),
 639         ( "instance",   ITinstance,     0 ),
 640         ( "let",        ITlet,          0 ),
 641         ( "module",     ITmodule,       0 ),
 642         ( "newtype",    ITnewtype,      0 ),
 643         ( "of",         ITof,           0 ),
 644         ( "qualified",  ITqualified,    0 ),
 645         ( "then",       ITthen,         0 ),
 646         ( "type",       ITtype,         0 ),
 647         ( "where",      ITwhere,        0 ),
 648         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 649
 650     ( "forall", ITforall,        bit explicitForallBit .|. bit inRulePragBit),
 651         ( "mdo",        ITmdo,           bit recursiveDoBit),
 652         ( "family",     ITfamily,        bit tyFamBit),
 653     ( "group",  ITgroup,     bit transformComprehensionsBit),
 654     ( "by",     ITby,        bit transformComprehensionsBit),
 655     ( "using",  ITusing,     bit transformComprehensionsBit),
 656
 657         ( "foreign",    ITforeign,       bit ffiBit),
 658         ( "export",     ITexport,        bit ffiBit),
 659         ( "label",      ITlabel,         bit ffiBit),
 660         ( "dynamic",    ITdynamic,       bit ffiBit),
 661         ( "safe",       ITsafe,          bit ffiBit),
 662         ( "threadsafe", ITthreadsafe,    bit ffiBit),  -- ToDo: remove
 663         ( "unsafe",     ITunsafe,        bit ffiBit),
 664         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 665         ( "ccall",      ITccallconv,     bit ffiBit),
 666         ( "prim",       ITprimcallconv,  bit ffiBit),
 667         ( "dotnet",     ITdotnet,        bit ffiBit),
 668
 669         ( "rec",        ITrec,           bit arrowsBit),
 670         ( "proc",       ITproc,          bit arrowsBit)
 671      ]
 672
 673 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 674 reservedSymsFM = listToUFM $
 675     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 676       [ ("..",  ITdotdot,   always)
 677         -- (:) is a reserved op, meaning only list cons
 678        ,(":",   ITcolon,    always)
 679        ,("::",  ITdcolon,   always)
 680        ,("=",   ITequal,    always)
 681        ,("\\",  ITlam,      always)
 682        ,("|",   ITvbar,     always)
 683        ,("<-",  ITlarrow,   always)
 684        ,("->",  ITrarrow,   always)
 685        ,("@",   ITat,       always)
 686        ,("~",   ITtilde,    always)
 687        ,("=>",  ITdarrow,   always)
 688        ,("-",   ITminus,    always)
 689        ,("!",   ITbang,     always)
 690
 691         -- For data T (a::*) = MkT
 692        ,("*", ITstar, always) -- \i -> kindSigsEnabled i || tyFamEnabled i)
 693         -- For 'forall a . t'
 694        ,(".", ITdot,  always) -- \i -> explicitForallEnabled i || inRulePrag i)
 695
 696        ,("-<",  ITlarrowtail, arrowsEnabled)
 697        ,(">-",  ITrarrowtail, arrowsEnabled)
 698        ,("-<<", ITLarrowtail, arrowsEnabled)
 699        ,(">>-", ITRarrowtail, arrowsEnabled)
 700
 701        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 702        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 703        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 704                                 explicitForallEnabled i)
 705        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 706        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 707        ,("⋯",   ITdotdot, unicodeSyntaxEnabled)
 708         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 709         -- form part of a large operator.  This would let us have a better
 710         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 711        ]
 712
 713 -- -----------------------------------------------------------------------------
 714 -- Lexer actions
 715
 716 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 717
 718 special :: Token -> Action
 719 special tok span _buf _len = return (L span tok)
 720
 721 token, layout_token :: Token -> Action
 722 token t span _buf _len = return (L span t)
 723 layout_token t span _buf _len = pushLexState layout >> return (L span t)
 724
 725 idtoken :: (StringBuffer -> Int -> Token) -> Action
 726 idtoken f span buf len = return (L span $! (f buf len))
 727
 728 skip_one_varid :: (FastString -> Token) -> Action
 729 skip_one_varid f span buf len
 730   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 731
 732 strtoken :: (String -> Token) -> Action
 733 strtoken f span buf len =
 734   return (L span $! (f $! lexemeToString buf len))
 735
 736 init_strtoken :: Int -> (String -> Token) -> Action
 737 -- like strtoken, but drops the last N character(s)
 738 init_strtoken drop f span buf len =
 739   return (L span $! (f $! lexemeToString buf (len-drop)))
 740
 741 begin :: Int -> Action
 742 begin code _span _str _len = do pushLexState code; lexToken
 743
 744 pop :: Action
 745 pop _span _buf _len = do popLexState; lexToken
 746
 747 pop_and :: Action -> Action
 748 pop_and act span buf len = do popLexState; act span buf len
 749
 750 {-# INLINE nextCharIs #-}
 751 nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
 752 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 753
 754 notFollowedBy :: Char -> AlexAccPred Int
 755 notFollowedBy char _ _ _ (AI _ _ buf)
 756   = nextCharIs buf (/=char)
 757
 758 notFollowedBySymbol :: AlexAccPred Int
 759 notFollowedBySymbol _ _ _ (AI _ _ buf)
 760   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 761
 762 -- We must reject doc comments as being ordinary comments everywhere.
 763 -- In some cases the doc comment will be selected as the lexeme due to
 764 -- maximal munch, but not always, because the nested comment rule is
 765 -- valid in all states, but the doc-comment rules are only valid in
 766 -- the non-layout states.
 767 isNormalComment :: AlexAccPred Int
 768 isNormalComment bits _ _ (AI _ _ buf)
 769   | haddockEnabled bits = notFollowedByDocOrPragma
 770   | otherwise           = nextCharIs buf (/='#')
 771   where
 772     notFollowedByDocOrPragma
 773        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 774
 775 spaceAndP :: StringBuffer -> (StringBuffer -> Bool) -> Bool
 776 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 777
 778 {-
 779 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 780   = if haddockEnabled bits then False else (p buf)
 781 -}
 782
 783 atEOL :: AlexAccPred Int
 784 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 785
 786 ifExtension :: (Int -> Bool) -> AlexAccPred Int
 787 ifExtension pred bits _ _ _ = pred bits
 788
 789 multiline_doc_comment :: Action
 790 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 791   where
 792     worker commentAcc input docType oneLine = case alexGetChar input of
 793       Just ('\n', input')
 794         | oneLine -> docCommentEnd input commentAcc docType buf span
 795         | otherwise -> case checkIfCommentLine input' of
 796           Just input -> worker ('\n':commentAcc) input docType False
 797           Nothing -> docCommentEnd input commentAcc docType buf span
 798       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 799       Nothing -> docCommentEnd input commentAcc docType buf span
 800
 801     checkIfCommentLine input = check (dropNonNewlineSpace input)
 802       where
 803         check input = case alexGetChar input of
 804           Just ('-', input) -> case alexGetChar input of
 805             Just ('-', input) -> case alexGetChar input of
 806               Just (c, _) | c /= '-' -> Just input
 807               _ -> Nothing
 808             _ -> Nothing
 809           _ -> Nothing
 810
 811         dropNonNewlineSpace input = case alexGetChar input of
 812           Just (c, input')
 813             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 814             | otherwise -> input
 815           Nothing -> input
 816
 817 lineCommentToken :: Action
 818 lineCommentToken span buf len = do
 819   b <- extension rawTokenStreamEnabled
 820   if b then strtoken ITlineComment span buf len else lexToken
 821
 822 {-
 823   nested comments require traversing by hand, they can't be parsed
 824   using regular expressions.
 825 -}
 826 nested_comment :: P (Located Token) -> Action
 827 nested_comment cont span _str _len = do
 828   input <- getInput
 829   go "" (1::Int) input
 830   where
 831     go commentAcc 0 input = do setInput input
 832                                b <- extension rawTokenStreamEnabled
 833                                if b
 834                                  then docCommentEnd input commentAcc ITblockComment _str span
 835                                  else cont
 836     go commentAcc n input = case alexGetChar input of
 837       Nothing -> errBrace input span
 838       Just ('-',input) -> case alexGetChar input of
 839         Nothing  -> errBrace input span
 840         Just ('\125',input) -> go commentAcc (n-1) input
 841         Just (_,_)          -> go ('-':commentAcc) n input
 842       Just ('\123',input) -> case alexGetChar input of
 843         Nothing  -> errBrace input span
 844         Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
 845         Just (_,_)       -> go ('\123':commentAcc) n input
 846       Just (c,input) -> go (c:commentAcc) n input
 847
 848 nested_doc_comment :: Action
 849 nested_doc_comment span buf _len = withLexedDocType (go "")
 850   where
 851     go commentAcc input docType _ = case alexGetChar input of
 852       Nothing -> errBrace input span
 853       Just ('-',input) -> case alexGetChar input of
 854         Nothing -> errBrace input span
 855         Just ('\125',input) ->
 856           docCommentEnd input commentAcc docType buf span
 857         Just (_,_) -> go ('-':commentAcc) input docType False
 858       Just ('\123', input) -> case alexGetChar input of
 859         Nothing  -> errBrace input span
 860         Just ('-',input) -> do
 861           setInput input
 862           let cont = do input <- getInput; go commentAcc input docType False
 863           nested_comment cont span buf _len
 864         Just (_,_) -> go ('\123':commentAcc) input docType False
 865       Just (c,input) -> go (c:commentAcc) input docType False
 866
 867 withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (Located Token))
 868                  -> P (Located Token)
 869 withLexedDocType lexDocComment = do
 870   input@(AI _ _ buf) <- getInput
 871   case prevChar buf ' ' of
 872     '|' -> lexDocComment input ITdocCommentNext False
 873     '^' -> lexDocComment input ITdocCommentPrev False
 874     '$' -> lexDocComment input ITdocCommentNamed False
 875     '*' -> lexDocSection 1 input
 876     '#' -> lexDocComment input ITdocOptionsOld False
 877     _ -> panic "withLexedDocType: Bad doc type"
 878  where
 879     lexDocSection n input = case alexGetChar input of
 880       Just ('*', input) -> lexDocSection (n+1) input
 881       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 882       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 883
 884 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 885 -- off again at the end of the pragma.
 886 rulePrag :: Action
 887 rulePrag span _ _ = do
 888   setExts (.|. bit inRulePragBit)
 889   return (L span ITrules_prag)
 890
 891 endPrag :: Action
 892 endPrag span _ _ = do
 893   setExts (.&. complement (bit inRulePragBit))
 894   return (L span ITclose_prag)
 895
 896 -- docCommentEnd
 897 -------------------------------------------------------------------------------
 898 -- This function is quite tricky. We can't just return a new token, we also
 899 -- need to update the state of the parser. Why? Because the token is longer
 900 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 901 -- it writes the wrong token length to the parser state. This function is
 902 -- called afterwards, so it can just update the state.
 903
 904 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 905 -- which is something that the original lexer didn't account for.
 906 -- I have added last_line_len in the parser state which represents the length
 907 -- of the part of the token that is on the last line. It is now used for layout
 908 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 909 -- was before, the full length of the token, and it is now only used for error
 910 -- messages. /Waern
 911
 912 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 913                  SrcSpan -> P (Located Token)
 914 docCommentEnd input commentAcc docType buf span = do
 915   setInput input
 916   let (AI loc last_offs nextBuf) = input
 917       comment = reverse commentAcc
 918       span' = mkSrcSpan (srcSpanStart span) loc
 919       last_len = byteDiff buf nextBuf
 920
 921       last_line_len = if (last_offs - last_len < 0)
 922         then last_offs
 923         else last_len
 924
 925   span `seq` setLastToken span' last_len last_line_len
 926   return (L span' (docType comment))
 927
 928 errBrace :: AlexInput -> SrcSpan -> P a
 929 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 930
 931 open_brace, close_brace :: Action
 932 open_brace span _str _len = do
 933   ctx <- getContext
 934   setContext (NoLayout:ctx)
 935   return (L span ITocurly)
 936 close_brace span _str _len = do
 937   popContext
 938   return (L span ITccurly)
 939
 940 qvarid, qconid :: StringBuffer -> Int -> Token
 941 qvarid buf len = ITqvarid $! splitQualName buf len False
 942 qconid buf len = ITqconid $! splitQualName buf len False
 943
 944 splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
 945 -- takes a StringBuffer and a length, and returns the module name
 946 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 947 -- because of hierarchical module names.
 948 splitQualName orig_buf len parens = split orig_buf orig_buf
 949   where
 950     split buf dot_buf
 951         | orig_buf `byteDiff` buf >= len  = done dot_buf
 952         | c == '.'                        = found_dot buf'
 953         | otherwise                       = split buf' dot_buf
 954       where
 955        (c,buf') = nextChar buf
 956
 957     -- careful, we might get names like M....
 958     -- so, if the character after the dot is not upper-case, this is
 959     -- the end of the qualifier part.
 960     found_dot buf -- buf points after the '.'
 961         | isUpper c    = split buf' buf
 962         | otherwise    = done buf
 963       where
 964        (c,buf') = nextChar buf
 965
 966     done dot_buf =
 967         (lexemeToFastString orig_buf (qual_size - 1),
 968          if parens -- Prelude.(+)
 969             then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
 970             else lexemeToFastString dot_buf (len - qual_size))
 971       where
 972         qual_size = orig_buf `byteDiff` dot_buf
 973
 974 varid :: Action
 975 varid span buf len =
 976   fs `seq`
 977   case lookupUFM reservedWordsFM fs of
 978         Just (keyword,0)    -> do
 979                 maybe_layout keyword
 980                 return (L span keyword)
 981         Just (keyword,exts) -> do
 982                 b <- extension (\i -> exts .&. i /= 0)
 983                 if b then do maybe_layout keyword
 984                              return (L span keyword)
 985                      else return (L span (ITvarid fs))
 986         _other -> return (L span (ITvarid fs))
 987   where
 988         fs = lexemeToFastString buf len
 989
 990 conid :: StringBuffer -> Int -> Token
 991 conid buf len = ITconid fs
 992   where fs = lexemeToFastString buf len
 993
 994 qvarsym, qconsym, prefixqvarsym, prefixqconsym :: StringBuffer -> Int -> Token
 995 qvarsym buf len = ITqvarsym $! splitQualName buf len False
 996 qconsym buf len = ITqconsym $! splitQualName buf len False
 997 prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
 998 prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
 999
1000 varsym, consym :: Action
1001 varsym = sym ITvarsym
1002 consym = sym ITconsym
1003
1004 sym :: (FastString -> Token) -> SrcSpan -> StringBuffer -> Int
1005     -> P (Located Token)
1006 sym con span buf len =
1007   case lookupUFM reservedSymsFM fs of
1008         Just (keyword,exts) -> do
1009                 b <- extension exts
1010                 if b then return (L span keyword)
1011                      else return (L span $! con fs)
1012         _other -> return (L span $! con fs)
1013   where
1014         fs = lexemeToFastString buf len
1015
1016 -- Variations on the integral numeric literal.
1017 tok_integral :: (Integer -> Token)
1018      -> (Integer -> Integer)
1019  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
1020      -> Int -> Int
1021      -> (Integer, (Char->Int)) -> Action
1022 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
1023   return $ L span $ itint $! transint $ parseUnsignedInteger
1024      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
1025
1026 -- some conveniences for use with tok_integral
1027 tok_num :: (Integer -> Integer)
1028         -> Int -> Int
1029         -> (Integer, (Char->Int)) -> Action
1030 tok_num = tok_integral ITinteger
1031 tok_primint :: (Integer -> Integer)
1032             -> Int -> Int
1033             -> (Integer, (Char->Int)) -> Action
1034 tok_primint = tok_integral ITprimint
1035 tok_primword :: Int -> Int
1036              -> (Integer, (Char->Int)) -> Action
1037 tok_primword = tok_integral ITprimword positive
1038 positive, negative :: (Integer -> Integer)
1039 positive = id
1040 negative = negate
1041 decimal, octal, hexadecimal :: (Integer, Char -> Int)
1042 decimal = (10,octDecDigit)
1043 octal = (8,octDecDigit)
1044 hexadecimal = (16,hexDigit)
1045
1046 -- readRational can understand negative rationals, exponents, everything.
1047 tok_float, tok_primfloat, tok_primdouble :: String -> Token
1048 tok_float        str = ITrational   $! readRational str
1049 tok_primfloat    str = ITprimfloat  $! readRational str
1050 tok_primdouble   str = ITprimdouble $! readRational str
1051
1052 -- -----------------------------------------------------------------------------
1053 -- Layout processing
1054
1055 -- we're at the first token on a line, insert layout tokens if necessary
1056 do_bol :: Action
1057 do_bol span _str _len = do
1058         pos <- getOffside
1059         case pos of
1060             LT -> do
1061                 --trace "layout: inserting '}'" $ do
1062                 popContext
1063                 -- do NOT pop the lex state, we might have a ';' to insert
1064                 return (L span ITvccurly)
1065             EQ -> do
1066                 --trace "layout: inserting ';'" $ do
1067                 popLexState
1068                 return (L span ITsemi)
1069             GT -> do
1070                 popLexState
1071                 lexToken
1072
1073 -- certain keywords put us in the "layout" state, where we might
1074 -- add an opening curly brace.
1075 maybe_layout :: Token -> P ()
1076 maybe_layout ITdo       = pushLexState layout_do
1077 maybe_layout ITmdo      = pushLexState layout_do
1078 maybe_layout ITof       = pushLexState layout
1079 maybe_layout ITlet      = pushLexState layout
1080 maybe_layout ITwhere    = pushLexState layout
1081 maybe_layout ITrec      = pushLexState layout
1082 maybe_layout _          = return ()
1083
1084 -- Pushing a new implicit layout context.  If the indentation of the
1085 -- next token is not greater than the previous layout context, then
1086 -- Haskell 98 says that the new layout context should be empty; that is
1087 -- the lexer must generate {}.
1088 --
1089 -- We are slightly more lenient than this: when the new context is started
1090 -- by a 'do', then we allow the new context to be at the same indentation as
1091 -- the previous context.  This is what the 'strict' argument is for.
1092 --
1093 new_layout_context :: Bool -> Action
1094 new_layout_context strict span _buf _len = do
1095     popLexState
1096     (AI _ offset _) <- getInput
1097     ctx <- getContext
1098     case ctx of
1099         Layout prev_off : _  |
1100            (strict     && prev_off >= offset  ||
1101             not strict && prev_off > offset) -> do
1102                 -- token is indented to the left of the previous context.
1103                 -- we must generate a {} sequence now.
1104                 pushLexState layout_left
1105                 return (L span ITvocurly)
1106         _ -> do
1107                 setContext (Layout offset : ctx)
1108                 return (L span ITvocurly)
1109
1110 do_layout_left :: Action
1111 do_layout_left span _buf _len = do
1112     popLexState
1113     pushLexState bol  -- we must be at the start of a line
1114     return (L span ITvccurly)
1115
1116 -- -----------------------------------------------------------------------------
1117 -- LINE pragmas
1118
1119 setLine :: Int -> Action
1120 setLine code span buf len = do
1121   let line = parseUnsignedInteger buf len 10 octDecDigit
1122   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
1123         -- subtract one: the line number refers to the *following* line
1124   popLexState
1125   pushLexState code
1126   lexToken
1127
1128 setFile :: Int -> Action
1129 setFile code span buf len = do
1130   let file = lexemeToFastString (stepOn buf) (len-2)
1131   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1132   popLexState
1133   pushLexState code
1134   lexToken
1135
1136
1137 -- -----------------------------------------------------------------------------
1138 -- Options, includes and language pragmas.
1139
1140 lex_string_prag :: (String -> Token) -> Action
1141 lex_string_prag mkTok span _buf _len
1142     = do input <- getInput
1143          start <- getSrcLoc
1144          tok <- go [] input
1145          end <- getSrcLoc
1146          return (L (mkSrcSpan start end) tok)
1147     where go acc input
1148               = if isString input "#-}"
1149                    then do setInput input
1150                            return (mkTok (reverse acc))
1151                    else case alexGetChar input of
1152                           Just (c,i) -> go (c:acc) i
1153                           Nothing -> err input
1154           isString _ [] = True
1155           isString i (x:xs)
1156               = case alexGetChar i of
1157                   Just (c,i') | c == x    -> isString i' xs
1158                   _other -> False
1159           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1160
1161
1162 -- -----------------------------------------------------------------------------
1163 -- Strings & Chars
1164
1165 -- This stuff is horrible.  I hates it.
1166
1167 lex_string_tok :: Action
1168 lex_string_tok span _buf _len = do
1169   tok <- lex_string ""
1170   end <- getSrcLoc
1171   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1172
1173 lex_string :: String -> P Token
1174 lex_string s = do
1175   i <- getInput
1176   case alexGetChar' i of
1177     Nothing -> lit_error
1178
1179     Just ('"',i)  -> do
1180         setInput i
1181         magicHash <- extension magicHashEnabled
1182         if magicHash
1183           then do
1184             i <- getInput
1185             case alexGetChar' i of
1186               Just ('#',i) -> do
1187                    setInput i
1188                    if any (> '\xFF') s
1189                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1190                     else let s' = mkZFastString (reverse s) in
1191                          return (ITprimstring s')
1192                         -- mkZFastString is a hack to avoid encoding the
1193                         -- string in UTF-8.  We just want the exact bytes.
1194               _other ->
1195                 return (ITstring (mkFastString (reverse s)))
1196           else
1197                 return (ITstring (mkFastString (reverse s)))
1198
1199     Just ('\\',i)
1200         | Just ('&',i) <- next -> do
1201                 setInput i; lex_string s
1202         | Just (c,i) <- next, is_space c -> do
1203                 setInput i; lex_stringgap s
1204         where next = alexGetChar' i
1205
1206     Just (c, i) -> do
1207         c' <- lex_char c i
1208         lex_string (c':s)
1209
1210 lex_stringgap :: String -> P Token
1211 lex_stringgap s = do
1212   c <- getCharOrFail
1213   case c of
1214     '\\' -> lex_string s
1215     c | is_space c -> lex_stringgap s
1216     _other -> lit_error
1217
1218
1219 lex_char_tok :: Action
1220 -- Here we are basically parsing character literals, such as 'x' or '\n'
1221 -- but, when Template Haskell is on, we additionally spot
1222 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1223 -- but WITHOUT CONSUMING the x or T part  (the parser does that).
1224 -- So we have to do two characters of lookahead: when we see 'x we need to
1225 -- see if there's a trailing quote
1226 lex_char_tok span _buf _len = do        -- We've seen '
1227    i1 <- getInput       -- Look ahead to first character
1228    let loc = srcSpanStart span
1229    case alexGetChar' i1 of
1230         Nothing -> lit_error
1231
1232         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
1233                   th_exts <- extension thEnabled
1234                   if th_exts then do
1235                         setInput i2
1236                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1237                    else lit_error
1238
1239         Just ('\\', i2@(AI _end2 _ _)) -> do    -- We've seen 'backslash
1240                   setInput i2
1241                   lit_ch <- lex_escape
1242                   mc <- getCharOrFail   -- Trailing quote
1243                   if mc == '\'' then finish_char_tok loc lit_ch
1244                                 else do setInput i2; lit_error
1245
1246         Just (c, i2@(AI _end2 _ _))
1247                 | not (isAny c) -> lit_error
1248                 | otherwise ->
1249
1250                 -- We've seen 'x, where x is a valid character
1251                 --  (i.e. not newline etc) but not a quote or backslash
1252            case alexGetChar' i2 of      -- Look ahead one more character
1253                 Just ('\'', i3) -> do   -- We've seen 'x'
1254                         setInput i3
1255                         finish_char_tok loc c
1256                 _other -> do            -- We've seen 'x not followed by quote
1257                                         -- (including the possibility of EOF)
1258                                         -- If TH is on, just parse the quote only
1259                         th_exts <- extension thEnabled
1260                         let (AI end _ _) = i1
1261                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1262                                    else do setInput i2; lit_error
1263
1264 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1265 finish_char_tok loc ch  -- We've already seen the closing quote
1266                         -- Just need to check for trailing #
1267   = do  magicHash <- extension magicHashEnabled
1268         i@(AI end _ _) <- getInput
1269         if magicHash then do
1270                 case alexGetChar' i of
1271                         Just ('#',i@(AI end _ _)) -> do
1272                                 setInput i
1273                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1274                         _other ->
1275                                 return (L (mkSrcSpan loc end) (ITchar ch))
1276                 else do
1277                    return (L (mkSrcSpan loc end) (ITchar ch))
1278
1279 lex_char :: Char -> AlexInput -> P Char
1280 lex_char c inp = do
1281   case c of
1282       '\\' -> do setInput inp; lex_escape
1283       c | isAny c -> do setInput inp; return c
1284       _other -> lit_error
1285
1286 isAny :: Char -> Bool
1287 isAny c | c > '\x7f' = isPrint c
1288         | otherwise  = is_any c
1289
1290 lex_escape :: P Char
1291 lex_escape = do
1292   c <- getCharOrFail
1293   case c of
1294         'a'   -> return '\a'
1295         'b'   -> return '\b'
1296         'f'   -> return '\f'
1297         'n'   -> return '\n'
1298         'r'   -> return '\r'
1299         't'   -> return '\t'
1300         'v'   -> return '\v'
1301         '\\'  -> return '\\'
1302         '"'   -> return '\"'
1303         '\''  -> return '\''
1304         '^'   -> do c <- getCharOrFail
1305                     if c >= '@' && c <= '_'
1306                         then return (chr (ord c - ord '@'))
1307                         else lit_error
1308
1309         'x'   -> readNum is_hexdigit 16 hexDigit
1310         'o'   -> readNum is_octdigit  8 octDecDigit
1311         x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1312
1313         c1 ->  do
1314            i <- getInput
1315            case alexGetChar' i of
1316             Nothing -> lit_error
1317             Just (c2,i2) ->
1318               case alexGetChar' i2 of
1319                 Nothing -> do setInput i2; lit_error
1320                 Just (c3,i3) ->
1321                    let str = [c1,c2,c3] in
1322                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1323                                      Just rest <- [stripPrefix p str] ] of
1324                           (escape_char,[]):_ -> do
1325                                 setInput i3
1326                                 return escape_char
1327                           (escape_char,_:_):_ -> do
1328                                 setInput i2
1329                                 return escape_char
1330                           [] -> lit_error
1331
1332 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1333 readNum is_digit base conv = do
1334   i <- getInput
1335   c <- getCharOrFail
1336   if is_digit c
1337         then readNum2 is_digit base conv (conv c)
1338         else do setInput i; lit_error
1339
1340 readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char
1341 readNum2 is_digit base conv i = do
1342   input <- getInput
1343   read i input
1344   where read i input = do
1345           case alexGetChar' input of
1346             Just (c,input') | is_digit c -> do
1347                 read (i*base + conv c) input'
1348             _other -> do
1349                 if i >= 0 && i <= 0x10FFFF
1350                    then do setInput input; return (chr i)
1351                    else lit_error
1352
1353 silly_escape_chars :: [(String, Char)]
1354 silly_escape_chars = [
1355         ("NUL", '\NUL'),
1356         ("SOH", '\SOH'),
1357         ("STX", '\STX'),
1358         ("ETX", '\ETX'),
1359         ("EOT", '\EOT'),
1360         ("ENQ", '\ENQ'),
1361         ("ACK", '\ACK'),
1362         ("BEL", '\BEL'),
1363         ("BS", '\BS'),
1364         ("HT", '\HT'),
1365         ("LF", '\LF'),
1366         ("VT", '\VT'),
1367         ("FF", '\FF'),
1368         ("CR", '\CR'),
1369         ("SO", '\SO'),
1370         ("SI", '\SI'),
1371         ("DLE", '\DLE'),
1372         ("DC1", '\DC1'),
1373         ("DC2", '\DC2'),
1374         ("DC3", '\DC3'),
1375         ("DC4", '\DC4'),
1376         ("NAK", '\NAK'),
1377         ("SYN", '\SYN'),
1378         ("ETB", '\ETB'),
1379         ("CAN", '\CAN'),
1380         ("EM", '\EM'),
1381         ("SUB", '\SUB'),
1382         ("ESC", '\ESC'),
1383         ("FS", '\FS'),
1384         ("GS", '\GS'),
1385         ("RS", '\RS'),
1386         ("US", '\US'),
1387         ("SP", '\SP'),
1388         ("DEL", '\DEL')
1389         ]
1390
1391 -- before calling lit_error, ensure that the current input is pointing to
1392 -- the position of the error in the buffer.  This is so that we can report
1393 -- a correct location to the user, but also so we can detect UTF-8 decoding
1394 -- errors if they occur.
1395 lit_error :: P a
1396 lit_error = lexError "lexical error in string/character literal"
1397
1398 getCharOrFail :: P Char
1399 getCharOrFail =  do
1400   i <- getInput
1401   case alexGetChar' i of
1402         Nothing -> lexError "unexpected end-of-file in string/character literal"
1403         Just (c,i)  -> do setInput i; return c
1404
1405 -- -----------------------------------------------------------------------------
1406 -- QuasiQuote
1407
1408 lex_quasiquote_tok :: Action
1409 lex_quasiquote_tok span buf len = do
1410   let quoter = reverse $ takeWhile (/= '$')
1411                $ reverse $ lexemeToString buf (len - 1)
1412   quoteStart <- getSrcLoc
1413   quote <- lex_quasiquote ""
1414   end <- getSrcLoc
1415   return (L (mkSrcSpan (srcSpanStart span) end)
1416            (ITquasiQuote (mkFastString quoter,
1417                           mkFastString (reverse quote),
1418                           mkSrcSpan quoteStart end)))
1419
1420 lex_quasiquote :: String -> P String
1421 lex_quasiquote s = do
1422   i <- getInput
1423   case alexGetChar' i of
1424     Nothing -> lit_error
1425
1426     Just ('\\',i)
1427         | Just ('|',i) <- next -> do
1428                 setInput i; lex_quasiquote ('|' : s)
1429         | Just (']',i) <- next -> do
1430                 setInput i; lex_quasiquote (']' : s)
1431         where next = alexGetChar' i
1432
1433     Just ('|',i)
1434         | Just (']',i) <- next -> do
1435                 setInput i; return s
1436         where next = alexGetChar' i
1437
1438     Just (c, i) -> do
1439          setInput i; lex_quasiquote (c : s)
1440
1441 -- -----------------------------------------------------------------------------
1442 -- Warnings
1443
1444 warn :: DynFlag -> SDoc -> Action
1445 warn option warning srcspan _buf _len = do
1446     addWarning option srcspan warning
1447     lexToken
1448
1449 warnThen :: DynFlag -> SDoc -> Action -> Action
1450 warnThen option warning action srcspan buf len = do
1451     addWarning option srcspan warning
1452     action srcspan buf len
1453
1454 -- -----------------------------------------------------------------------------
1455 -- The Parse Monad
1456
1457 data LayoutContext
1458   = NoLayout
1459   | Layout !Int
1460   deriving Show
1461
1462 data ParseResult a
1463   = POk PState a
1464   | PFailed
1465         SrcSpan         -- The start and end of the text span related to
1466                         -- the error.  Might be used in environments which can
1467                         -- show this span, e.g. by highlighting it.
1468         Message         -- The error message
1469
1470 data PState = PState {
1471         buffer     :: StringBuffer,
1472         dflags     :: DynFlags,
1473         messages   :: Messages,
1474         last_loc   :: SrcSpan,  -- pos of previous token
1475         last_offs  :: !Int,     -- offset of the previous token from the
1476                                 -- beginning of  the current line.
1477                                 -- \t is equal to 8 spaces.
1478         last_len   :: !Int,     -- len of previous token
1479         last_line_len :: !Int,
1480         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1481         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1482         context    :: [LayoutContext],
1483         lex_state  :: [Int]
1484      }
1485         -- last_loc and last_len are used when generating error messages,
1486         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1487         -- current token to happyError, we could at least get rid of last_len.
1488         -- Getting rid of last_loc would require finding another way to
1489         -- implement pushCurrentContext (which is only called from one place).
1490
1491 newtype P a = P { unP :: PState -> ParseResult a }
1492
1493 instance Monad P where
1494   return = returnP
1495   (>>=) = thenP
1496   fail = failP
1497
1498 returnP :: a -> P a
1499 returnP a = a `seq` (P $ \s -> POk s a)
1500
1501 thenP :: P a -> (a -> P b) -> P b
1502 (P m) `thenP` k = P $ \ s ->
1503         case m s of
1504                 POk s1 a         -> (unP (k a)) s1
1505                 PFailed span err -> PFailed span err
1506
1507 failP :: String -> P a
1508 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1509
1510 failMsgP :: String -> P a
1511 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1512
1513 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1514 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (mkSrcSpan loc1 loc2) (text str)
1515
1516 failSpanMsgP :: SrcSpan -> SDoc -> P a
1517 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1518
1519 getPState :: P PState
1520 getPState = P $ \s -> POk s s
1521
1522 extension :: (Int -> Bool) -> P Bool
1523 extension p = P $ \s -> POk s (p $! extsBitmap s)
1524
1525 getExts :: P Int
1526 getExts = P $ \s -> POk s (extsBitmap s)
1527
1528 setExts :: (Int -> Int) -> P ()
1529 setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
1530
1531 setSrcLoc :: SrcLoc -> P ()
1532 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1533
1534 getSrcLoc :: P SrcLoc
1535 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1536
1537 setLastToken :: SrcSpan -> Int -> Int -> P ()
1538 setLastToken loc len line_len = P $ \s -> POk s {
1539   last_loc=loc,
1540   last_len=len,
1541   last_line_len=line_len
1542 } ()
1543
1544 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1545
1546 alexInputPrevChar :: AlexInput -> Char
1547 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1548
1549 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1550 alexGetChar (AI loc ofs s)
1551   | atEnd s   = Nothing
1552   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1553                 --trace (show (ord c)) $
1554                 Just (adj_c, (AI loc' ofs' s'))
1555   where (c,s') = nextChar s
1556         loc'   = advanceSrcLoc loc c
1557         ofs'   = advanceOffs c ofs
1558
1559         non_graphic     = '\x0'
1560         upper           = '\x1'
1561         lower           = '\x2'
1562         digit           = '\x3'
1563         symbol          = '\x4'
1564         space           = '\x5'
1565         other_graphic   = '\x6'
1566
1567         adj_c
1568           | c <= '\x06' = non_graphic
1569           | c <= '\x7f' = c
1570           -- Alex doesn't handle Unicode, so when Unicode
1571           -- character is encountered we output these values
1572           -- with the actual character value hidden in the state.
1573           | otherwise =
1574                 case generalCategory c of
1575                   UppercaseLetter       -> upper
1576                   LowercaseLetter       -> lower
1577                   TitlecaseLetter       -> upper
1578                   ModifierLetter        -> other_graphic
1579                   OtherLetter           -> lower -- see #1103
1580                   NonSpacingMark        -> other_graphic
1581                   SpacingCombiningMark  -> other_graphic
1582                   EnclosingMark         -> other_graphic
1583                   DecimalNumber         -> digit
1584                   LetterNumber          -> other_graphic
1585                   OtherNumber           -> other_graphic
1586                   ConnectorPunctuation  -> symbol
1587                   DashPunctuation       -> symbol
1588                   OpenPunctuation       -> other_graphic
1589                   ClosePunctuation      -> other_graphic
1590                   InitialQuote          -> other_graphic
1591                   FinalQuote            -> other_graphic
1592                   OtherPunctuation      -> symbol
1593                   MathSymbol            -> symbol
1594                   CurrencySymbol        -> symbol
1595                   ModifierSymbol        -> symbol
1596                   OtherSymbol           -> symbol
1597                   Space                 -> space
1598                   _other                -> non_graphic
1599
1600 -- This version does not squash unicode characters, it is used when
1601 -- lexing strings.
1602 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1603 alexGetChar' (AI loc ofs s)
1604   | atEnd s   = Nothing
1605   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1606                 --trace (show (ord c)) $
1607                 Just (c, (AI loc' ofs' s'))
1608   where (c,s') = nextChar s
1609         loc'   = advanceSrcLoc loc c
1610         ofs'   = advanceOffs c ofs
1611
1612 advanceOffs :: Char -> Int -> Int
1613 advanceOffs '\n' _    = 0
1614 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1615 advanceOffs _    offs = offs + 1
1616
1617 getInput :: P AlexInput
1618 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1619
1620 setInput :: AlexInput -> P ()
1621 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1622
1623 pushLexState :: Int -> P ()
1624 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1625
1626 popLexState :: P Int
1627 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1628
1629 getLexState :: P Int
1630 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1631
1632 -- for reasons of efficiency, flags indicating language extensions (eg,
1633 -- -fglasgow-exts or -XParr) are represented by a bitmap stored in an unboxed
1634 -- integer
1635
1636 genericsBit :: Int
1637 genericsBit = 0 -- {| and |}
1638 ffiBit :: Int
1639 ffiBit     = 1
1640 parrBit :: Int
1641 parrBit    = 2
1642 arrowsBit :: Int
1643 arrowsBit  = 4
1644 thBit :: Int
1645 thBit      = 5
1646 ipBit :: Int
1647 ipBit      = 6
1648 explicitForallBit :: Int
1649 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1650 bangPatBit :: Int
1651 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1652                 -- (doesn't affect the lexer)
1653 tyFamBit :: Int
1654 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1655 haddockBit :: Int
1656 haddockBit = 10 -- Lex and parse Haddock comments
1657 magicHashBit :: Int
1658 magicHashBit = 11 -- "#" in both functions and operators
1659 kindSigsBit :: Int
1660 kindSigsBit = 12 -- Kind signatures on type variables
1661 recursiveDoBit :: Int
1662 recursiveDoBit = 13 -- mdo
1663 unicodeSyntaxBit :: Int
1664 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1665 unboxedTuplesBit :: Int
1666 unboxedTuplesBit = 15 -- (# and #)
1667 standaloneDerivingBit :: Int
1668 standaloneDerivingBit = 16 -- standalone instance deriving declarations
1669 transformComprehensionsBit :: Int
1670 transformComprehensionsBit = 17
1671 qqBit :: Int
1672 qqBit      = 18 -- enable quasiquoting
1673 inRulePragBit :: Int
1674 inRulePragBit = 19
1675 rawTokenStreamBit :: Int
1676 rawTokenStreamBit = 20 -- producing a token stream with all comments included
1677 newQualOpsBit :: Int
1678 newQualOpsBit = 21 -- Haskell' qualified operator syntax, e.g. Prelude.(+)
1679
1680 always :: Int -> Bool
1681 always           _     = True
1682 genericsEnabled :: Int -> Bool
1683 genericsEnabled  flags = testBit flags genericsBit
1684 parrEnabled :: Int -> Bool
1685 parrEnabled      flags = testBit flags parrBit
1686 arrowsEnabled :: Int -> Bool
1687 arrowsEnabled    flags = testBit flags arrowsBit
1688 thEnabled :: Int -> Bool
1689 thEnabled        flags = testBit flags thBit
1690 ipEnabled :: Int -> Bool
1691 ipEnabled        flags = testBit flags ipBit
1692 explicitForallEnabled :: Int -> Bool
1693 explicitForallEnabled flags = testBit flags explicitForallBit
1694 bangPatEnabled :: Int -> Bool
1695 bangPatEnabled   flags = testBit flags bangPatBit
1696 -- tyFamEnabled :: Int -> Bool
1697 -- tyFamEnabled     flags = testBit flags tyFamBit
1698 haddockEnabled :: Int -> Bool
1699 haddockEnabled   flags = testBit flags haddockBit
1700 magicHashEnabled :: Int -> Bool
1701 magicHashEnabled flags = testBit flags magicHashBit
1702 -- kindSigsEnabled :: Int -> Bool
1703 -- kindSigsEnabled  flags = testBit flags kindSigsBit
1704 unicodeSyntaxEnabled :: Int -> Bool
1705 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1706 unboxedTuplesEnabled :: Int -> Bool
1707 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1708 standaloneDerivingEnabled :: Int -> Bool
1709 standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
1710 qqEnabled :: Int -> Bool
1711 qqEnabled        flags = testBit flags qqBit
1712 -- inRulePrag :: Int -> Bool
1713 -- inRulePrag       flags = testBit flags inRulePragBit
1714 rawTokenStreamEnabled :: Int -> Bool
1715 rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
1716 newQualOps :: Int -> Bool
1717 newQualOps       flags = testBit flags newQualOpsBit
1718 oldQualOps :: Int -> Bool
1719 oldQualOps flags = not (newQualOps flags)
1720
1721 -- PState for parsing options pragmas
1722 --
1723 pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1724 pragState dynflags buf loc =
1725   PState {
1726       buffer        = buf,
1727       messages      = emptyMessages,
1728       dflags        = dynflags,
1729       last_loc      = mkSrcSpan loc loc,
1730       last_offs     = 0,
1731       last_len      = 0,
1732       last_line_len = 0,
1733       loc           = loc,
1734       extsBitmap    = 0,
1735       context       = [],
1736       lex_state     = [bol, option_prags, 0]
1737     }
1738
1739
1740 -- create a parse state
1741 --
1742 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1743 mkPState buf loc flags  =
1744   PState {
1745       buffer          = buf,
1746       dflags        = flags,
1747       messages      = emptyMessages,
1748       last_loc      = mkSrcSpan loc loc,
1749       last_offs     = 0,
1750       last_len      = 0,
1751       last_line_len = 0,
1752       loc           = loc,
1753       extsBitmap    = fromIntegral bitmap,
1754       context       = [],
1755       lex_state     = [bol, 0]
1756         -- we begin in the layout state if toplev_layout is set
1757     }
1758     where
1759       bitmap = genericsBit `setBitIf` dopt Opt_Generics flags
1760                .|. ffiBit       `setBitIf` dopt Opt_ForeignFunctionInterface flags
1761                .|. parrBit      `setBitIf` dopt Opt_PArr         flags
1762                .|. arrowsBit    `setBitIf` dopt Opt_Arrows       flags
1763                .|. thBit        `setBitIf` dopt Opt_TemplateHaskell flags
1764                .|. qqBit        `setBitIf` dopt Opt_QuasiQuotes flags
1765                .|. ipBit        `setBitIf` dopt Opt_ImplicitParams flags
1766                .|. explicitForallBit `setBitIf` dopt Opt_ScopedTypeVariables flags
1767                .|. explicitForallBit `setBitIf` dopt Opt_LiberalTypeSynonyms flags
1768                .|. explicitForallBit `setBitIf` dopt Opt_PolymorphicComponents flags
1769                .|. explicitForallBit `setBitIf` dopt Opt_ExistentialQuantification flags
1770                .|. explicitForallBit `setBitIf` dopt Opt_Rank2Types flags
1771                .|. explicitForallBit `setBitIf` dopt Opt_RankNTypes flags
1772                .|. bangPatBit   `setBitIf` dopt Opt_BangPatterns flags
1773                .|. tyFamBit     `setBitIf` dopt Opt_TypeFamilies flags
1774                .|. haddockBit   `setBitIf` dopt Opt_Haddock      flags
1775                .|. magicHashBit `setBitIf` dopt Opt_MagicHash    flags
1776                .|. kindSigsBit  `setBitIf` dopt Opt_KindSignatures flags
1777                .|. recursiveDoBit `setBitIf` dopt Opt_RecursiveDo flags
1778                .|. unicodeSyntaxBit `setBitIf` dopt Opt_UnicodeSyntax flags
1779                .|. unboxedTuplesBit `setBitIf` dopt Opt_UnboxedTuples flags
1780                .|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags
1781                .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags
1782                .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
1783                .|. newQualOpsBit `setBitIf` dopt Opt_NewQualifiedOperators flags
1784       --
1785       setBitIf :: Int -> Bool -> Int
1786       b `setBitIf` cond | cond      = bit b
1787                         | otherwise = 0
1788
1789 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1790 addWarning option srcspan warning
1791  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1792        let warning' = mkWarnMsg srcspan alwaysQualify warning
1793            ws' = if dopt option d then ws `snocBag` warning' else ws
1794        in POk s{messages=(ws', es)} ()
1795
1796 getMessages :: PState -> Messages
1797 getMessages PState{messages=ms} = ms
1798
1799 getContext :: P [LayoutContext]
1800 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1801
1802 setContext :: [LayoutContext] -> P ()
1803 setContext ctx = P $ \s -> POk s{context=ctx} ()
1804
1805 popContext :: P ()
1806 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1807                               last_len = len, last_loc = last_loc }) ->
1808   case ctx of
1809         (_:tl) -> POk s{ context = tl } ()
1810         []     -> PFailed last_loc (srcParseErr buf len)
1811
1812 -- Push a new layout context at the indentation of the last token read.
1813 -- This is only used at the outer level of a module when the 'module'
1814 -- keyword is missing.
1815 pushCurrentContext :: P ()
1816 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1817     POk s{context = Layout (offs-len) : ctx} ()
1818 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1819
1820 getOffside :: P Ordering
1821 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1822                 let ord = case stk of
1823                         (Layout n:_) -> compare offs n
1824                         _            -> GT
1825                 in POk s ord
1826
1827 -- ---------------------------------------------------------------------------
1828 -- Construct a parse error
1829
1830 srcParseErr
1831   :: StringBuffer       -- current buffer (placed just after the last token)
1832   -> Int                -- length of the previous token
1833   -> Message
1834 srcParseErr buf len
1835   = hcat [ if null token
1836              then ptext (sLit "parse error (possibly incorrect indentation)")
1837              else hcat [ptext (sLit "parse error on input "),
1838                         char '`', text token, char '\'']
1839     ]
1840   where token = lexemeToString (offsetBytes (-len) buf) len
1841
1842 -- Report a parse failure, giving the span of the previous token as
1843 -- the location of the error.  This is the entry point for errors
1844 -- detected during parsing.
1845 srcParseFail :: P a
1846 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1847                             last_loc = last_loc } ->
1848     PFailed last_loc (srcParseErr buf len)
1849
1850 -- A lexical error is reported at a particular position in the source file,
1851 -- not over a token range.
1852 lexError :: String -> P a
1853 lexError str = do
1854   loc <- getSrcLoc
1855   (AI end _ buf) <- getInput
1856   reportLexError loc end buf str
1857
1858 -- -----------------------------------------------------------------------------
1859 -- This is the top-level function: called from the parser each time a
1860 -- new token is to be read from the input.
1861
1862 lexer :: (Located Token -> P a) -> P a
1863 lexer cont = do
1864   tok@(L _span _tok__) <- lexToken
1865 --  trace ("token: " ++ show tok__) $ do
1866   cont tok
1867
1868 lexToken :: P (Located Token)
1869 lexToken = do
1870   inp@(AI loc1 _ buf) <- getInput
1871   sc <- getLexState
1872   exts <- getExts
1873   case alexScanUser exts inp sc of
1874     AlexEOF -> do
1875         let span = mkSrcSpan loc1 loc1
1876         setLastToken span 0 0
1877         return (L span ITeof)
1878     AlexError (AI loc2 _ buf) ->
1879         reportLexError loc1 loc2 buf "lexical error"
1880     AlexSkip inp2 _ -> do
1881         setInput inp2
1882         lexToken
1883     AlexToken inp2@(AI end _ buf2) _ t -> do
1884         setInput inp2
1885         let span = mkSrcSpan loc1 end
1886         let bytes = byteDiff buf buf2
1887         span `seq` setLastToken span bytes bytes
1888         t span buf bytes
1889
1890 reportLexError :: SrcLoc -> SrcLoc -> StringBuffer -> [Char] -> P a
1891 reportLexError loc1 loc2 buf str
1892   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1893   | otherwise =
1894   let
1895         c = fst (nextChar buf)
1896   in
1897   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1898     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1899     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1900
1901 lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token]
1902 lexTokenStream buf loc dflags = unP go initState
1903     where initState = mkPState buf loc (dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream)
1904           go = do
1905             ltok <- lexer return
1906             case ltok of
1907               L _ ITeof -> return []
1908               _ -> liftM (ltok:) go
1909
1910 linePrags = Map.singleton "line" (begin line_prag2)
1911
1912 fileHeaderPrags = Map.fromList([("options", lex_string_prag IToptions_prag),
1913                                  ("options_ghc", lex_string_prag IToptions_prag),
1914                                  ("options_haddock", lex_string_prag ITdocOptions),
1915                                  ("language", token ITlanguage_prag),
1916                                  ("include", lex_string_prag ITinclude_prag)])
1917
1918 ignoredPrags = Map.fromList (map ignored pragmas)
1919                where ignored opt = (opt, nested_comment lexToken)
1920                      impls = ["hugs", "nhc98", "jhc", "yhc", "catch", "derive"]
1921                      options_pragmas = map ("options_" ++) impls
1922                      -- CFILES is a hugs-only thing.
1923                      pragmas = options_pragmas ++ ["cfiles", "contract"]
1924
1925 oneWordPrags = Map.fromList([("rules", rulePrag),
1926                            ("inline", token (ITinline_prag True)),
1927                            ("notinline", token (ITinline_prag False)),
1928                            ("specialize", token ITspec_prag),
1929                            ("source", token ITsource_prag),
1930                            ("warning", token ITwarning_prag),
1931                            ("deprecated", token ITdeprecated_prag),
1932                            ("scc", token ITscc_prag),
1933                            ("generated", token ITgenerated_prag),
1934                            ("core", token ITcore_prag),
1935                            ("unpack", token ITunpack_prag),
1936                            ("ann", token ITann_prag)])
1937
1938 twoWordPrags = Map.fromList([("inline conlike", token (ITinline_conlike_prag True)),
1939                              ("notinline conlike", token (ITinline_conlike_prag False)),
1940                              ("specialize inline", token (ITspec_inline_prag True)),
1941                              ("specialize notinline", token (ITspec_inline_prag False))])
1942
1943
1944 dispatch_pragmas :: Map String Action -> Action
1945 dispatch_pragmas prags span buf len = case Map.lookup (clean_pragma (lexemeToString buf len)) prags of
1946                                        Just found -> found span buf len
1947                                        Nothing -> lexError "unknown pragma"
1948
1949 known_pragma :: Map String Action -> AlexAccPred Int
1950 known_pragma prags _ _ len (AI _ _ buf) = (isJust $ Map.lookup (clean_pragma (lexemeToString (offsetBytes (- len) buf) len)) prags)
1951                                           && (nextCharIs buf (\c -> not (isAlphaNum c || c == '_')))
1952
1953 clean_pragma :: String -> String
1954 clean_pragma prag = canon_ws (map toLower (unprefix prag))
1955                     where unprefix prag' = case stripPrefix "{-#" prag' of
1956                                              Just rest -> rest
1957                                              Nothing -> prag'
1958                           canonical prag' = case prag' of
1959                                               "noinline" -> "notinline"
1960                                               "specialise" -> "specialize"
1961                                               "constructorlike" -> "conlike"
1962                                               otherwise -> prag'
1963                           canon_ws s = unwords (map canonical (words s))
1964 }