compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - parsing integers is a bit slow
  16 --    - readRational is a bit slow
  17 --
  18 --   Known bugs, that were also in the previous version:
  19 --    - M... should be 3 tokens, not 1.
  20 --    - pragma-end should be only valid in a pragma
  21
  22 --   qualified operator NOTES.
  23 --
  24 --   - If M.(+) is a single lexeme, then..
  25 --     - Probably (+) should be a single lexeme too, for consistency.
  26 --       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
  27 --     - But we have to rule out reserved operators, otherwise (..) becomes
  28 --       a different lexeme.
  29 --     - Should we therefore also rule out reserved operators in the qualified
  30 --       form?  This is quite difficult to achieve.  We don't do it for
  31 --       qualified varids.
  32
  33 {
  34 {-# OPTIONS -Wwarn -w #-}
  35 -- The above warning supression flag is a temporary kludge.
  36 -- While working on this module you are encouraged to remove it and fix
  37 -- any warnings in the module. See
  38 --     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
  39 -- for details
  40 --
  41 -- Note that Alex itself generates code with with some unused bindings and
  42 -- without type signatures, so removing the flag might not be possible.
  43
  44 {-# OPTIONS_GHC -funbox-strict-fields #-}
  45
  46 module Lexer (
  47    Token(..), lexer, pragState, mkPState, PState(..),
  48    P(..), ParseResult(..), getSrcLoc,
  49    getPState,
  50    failLocMsgP, failSpanMsgP, srcParseFail,
  51    getMessages,
  52    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  53    getLexState, popLexState, pushLexState,
  54    extension, standaloneDerivingEnabled, bangPatEnabled,
  55    addWarning,
  56    lexTokenStream
  57   ) where
  58
  59 import Bag
  60 import ErrUtils
  61 import Outputable
  62 import StringBuffer
  63 import FastString
  64 import SrcLoc
  65 import UniqFM
  66 import DynFlags
  67 import Ctype
  68 import Util             ( readRational )
  69
  70 import Control.Monad
  71 import Data.Bits
  72 import Data.Char
  73 import Data.List
  74 import Data.Maybe
  75 import Data.Map (Map)
  76 import qualified Data.Map as Map
  77 import Data.Ratio
  78 }
  79
  80 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  81 $whitechar   = [\ \n\r\f\v $unispace]
  82 $white_no_nl = $whitechar # \n
  83 $tab         = \t
  84
  85 $ascdigit  = 0-9
  86 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  87 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  88 $digit     = [$ascdigit $unidigit]
  89
  90 $special   = [\(\)\,\;\[\]\`\{\}]
  91 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  92 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  93 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  94
  95 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
  96 $asclarge  = [A-Z]
  97 $large     = [$asclarge $unilarge]
  98
  99 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
 100 $ascsmall  = [a-z]
 101 $small     = [$ascsmall $unismall \_]
 102
 103 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
 104 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
 105
 106 $octit     = 0-7
 107 $hexit     = [$decdigit A-F a-f]
 108 $symchar   = [$symbol \:]
 109 $nl        = [\n\r]
 110 $idchar    = [$small $large $digit \']
 111
 112 $pragmachar = [$small $large $digit]
 113
 114 $docsym    = [\| \^ \* \$]
 115
 116 @varid     = $small $idchar*
 117 @conid     = $large $idchar*
 118
 119 @varsym    = $symbol $symchar*
 120 @consym    = \: $symchar*
 121
 122 @decimal     = $decdigit+
 123 @octal       = $octit+
 124 @hexadecimal = $hexit+
 125 @exponent    = [eE] [\-\+]? @decimal
 126
 127 -- we support the hierarchical module name extension:
 128 @qual = (@conid \.)+
 129
 130 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 131
 132 -- normal signed numerical literals can only be explicitly negative,
 133 -- not explicitly positive (contrast @exponent)
 134 @negative = \-
 135 @signed = @negative ?
 136
 137 haskell :-
 138
 139 -- everywhere: skip whitespace and comments
 140 $white_no_nl+                           ;
 141 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 142
 143 -- Everywhere: deal with nested comments.  We explicitly rule out
 144 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 145 -- (this can happen even though pragmas will normally take precedence due to
 146 -- longest-match, because pragmas aren't valid in every state, but comments
 147 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 148 -- set.
 149
 150 "{-" / { isNormalComment } { nested_comment lexToken }
 151
 152 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 153 -- more dashes followed by a symbol should be parsed as a varsym, so we
 154 -- have to exclude those.
 155
 156 -- Since Haddock comments aren't valid in every state, we need to rule them
 157 -- out here.
 158
 159 -- The following two rules match comments that begin with two dashes, but
 160 -- continue with a different character. The rules test that this character
 161 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 162 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 163 -- have a Haddock comment). The rules then munch the rest of the line.
 164
 165 "-- " ~[$docsym \#] .* { lineCommentToken }
 166 "--" [^$symbol : \ ] .* { lineCommentToken }
 167
 168 -- Next, match Haddock comments if no -haddock flag
 169
 170 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
 171
 172 -- Now, when we've matched comments that begin with 2 dashes and continue
 173 -- with a different character, we need to match comments that begin with three
 174 -- or more dashes (which clearly can't be Haddock comments). We only need to
 175 -- make sure that the first non-dash character isn't a symbol, and munch the
 176 -- rest of the line.
 177
 178 "---"\-* [^$symbol :] .* { lineCommentToken }
 179
 180 -- Since the previous rules all match dashes followed by at least one
 181 -- character, we also need to match a whole line filled with just dashes.
 182
 183 "--"\-* / { atEOL } { lineCommentToken }
 184
 185 -- We need this rule since none of the other single line comment rules
 186 -- actually match this case.
 187
 188 "-- " / { atEOL } { lineCommentToken }
 189
 190 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 191 -- blank lines) until we find a non-whitespace character, then do layout
 192 -- processing.
 193 --
 194 -- One slight wibble here: what if the line begins with {-#? In
 195 -- theory, we have to lex the pragma to see if it's one we recognise,
 196 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 197 -- as a nested comment.  We don't bother with this: if the line begins
 198 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 199 <bol> {
 200   \n                                    ;
 201   ^\# (line)?                           { begin line_prag1 }
 202   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 203   ^\# \! .* \n                          ; -- #!, for scripts
 204   ()                                    { do_bol }
 205 }
 206
 207 -- after a layout keyword (let, where, do, of), we begin a new layout
 208 -- context if the curly brace is missing.
 209 -- Careful! This stuff is quite delicate.
 210 <layout, layout_do> {
 211   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 212         -- we might encounter {-# here, but {- has been handled already
 213   \n                                    ;
 214   ^\# (line)?                           { begin line_prag1 }
 215 }
 216
 217 -- do is treated in a subtly different way, see new_layout_context
 218 <layout>    ()                          { new_layout_context True }
 219 <layout_do> ()                          { new_layout_context False }
 220
 221 -- after a new layout context which was found to be to the left of the
 222 -- previous context, we have generated a '{' token, and we now need to
 223 -- generate a matching '}' token.
 224 <layout_left>  ()                       { do_layout_left }
 225
 226 <0,option_prags> \n                             { begin bol }
 227
 228 "{-#" $whitechar* $pragmachar+ / { known_pragma linePrags }
 229                                 { dispatch_pragmas linePrags }
 230
 231 -- single-line line pragmas, of the form
 232 --    # <line> "<file>" <extra-stuff> \n
 233 <line_prag1> $decdigit+                 { setLine line_prag1a }
 234 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 235 <line_prag1b> .*                        { pop }
 236
 237 -- Haskell-style line pragmas, of the form
 238 --    {-# LINE <line> "<file>" #-}
 239 <line_prag2> $decdigit+                 { setLine line_prag2a }
 240 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 241 <line_prag2b> "#-}"|"-}"                { pop }
 242    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 243    -- with older versions of GHC which generated these.
 244
 245 <0,option_prags> {
 246   "{-#" $whitechar* $pragmachar+
 247         $whitechar+ $pragmachar+ / { known_pragma twoWordPrags }
 248                                  { dispatch_pragmas twoWordPrags }
 249
 250   "{-#" $whitechar* $pragmachar+ / { known_pragma oneWordPrags }
 251                                  { dispatch_pragmas oneWordPrags }
 252
 253   -- We ignore all these pragmas, but don't generate a warning for them
 254   "{-#" $whitechar* $pragmachar+ / { known_pragma ignoredPrags }
 255                                  { dispatch_pragmas ignoredPrags }
 256
 257   -- ToDo: should only be valid inside a pragma:
 258   "#-}"                                 { endPrag }
 259 }
 260
 261 <option_prags> {
 262   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 263                                    { dispatch_pragmas fileHeaderPrags }
 264
 265   "-- #"                                 { multiline_doc_comment }
 266 }
 267
 268 <0> {
 269   -- In the "0" mode we ignore these pragmas
 270   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 271                      { nested_comment lexToken }
 272 }
 273
 274 <0> {
 275   "-- #" .* { lineCommentToken }
 276 }
 277
 278 <0,option_prags> {
 279   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 280                     (nested_comment lexToken) }
 281 }
 282
 283 -- '0' state: ordinary lexemes
 284
 285 -- Haddock comments
 286
 287 <0> {
 288   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 289   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 290 }
 291
 292 -- "special" symbols
 293
 294 <0> {
 295   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 296   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 297 }
 298
 299 <0> {
 300   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 301   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 302   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 303   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 304   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 305   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 306   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 307   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 308
 309   "[$" @varid "|"  / { ifExtension qqEnabled }
 310                      { lex_quasiquote_tok }
 311 }
 312
 313 <0> {
 314   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 315                                         { special IToparenbar }
 316   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 317 }
 318
 319 <0> {
 320   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 321 }
 322
 323 <0> {
 324   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 325          { token IToubxparen }
 326   "#)" / { ifExtension unboxedTuplesEnabled }
 327          { token ITcubxparen }
 328 }
 329
 330 <0> {
 331   "{|" / { ifExtension genericsEnabled } { token ITocurlybar }
 332   "|}" / { ifExtension genericsEnabled } { token ITccurlybar }
 333 }
 334
 335 <0,option_prags> {
 336   \(                                    { special IToparen }
 337   \)                                    { special ITcparen }
 338   \[                                    { special ITobrack }
 339   \]                                    { special ITcbrack }
 340   \,                                    { special ITcomma }
 341   \;                                    { special ITsemi }
 342   \`                                    { special ITbackquote }
 343
 344   \{                                    { open_brace }
 345   \}                                    { close_brace }
 346 }
 347
 348 <0,option_prags> {
 349   @qual @varid                  { idtoken qvarid }
 350   @qual @conid                  { idtoken qconid }
 351   @varid                        { varid }
 352   @conid                        { idtoken conid }
 353 }
 354
 355 <0> {
 356   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 357   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 358   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 359   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 360 }
 361
 362 -- ToDo: - move `var` and (sym) into lexical syntax?
 363 --       - remove backquote from $special?
 364 <0> {
 365   @qual @varsym       / { ifExtension oldQualOps } { idtoken qvarsym }
 366   @qual @consym       / { ifExtension oldQualOps } { idtoken qconsym }
 367   @qual \( @varsym \) / { ifExtension newQualOps } { idtoken prefixqvarsym }
 368   @qual \( @consym \) / { ifExtension newQualOps } { idtoken prefixqconsym }
 369   @varsym                                          { varsym }
 370   @consym                                          { consym }
 371 }
 372
 373 -- For the normal boxed literals we need to be careful
 374 -- when trying to be close to Haskell98
 375 <0> {
 376   -- Normal integral literals (:: Num a => a, from Integer)
 377   @decimal           { tok_num positive 0 0 decimal }
 378   0[oO] @octal       { tok_num positive 2 2 octal }
 379   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 380
 381   -- Normal rational literals (:: Fractional a => a, from Rational)
 382   @floating_point    { strtoken tok_float }
 383 }
 384
 385 <0> {
 386   -- Unboxed ints (:: Int#) and words (:: Word#)
 387   -- It's simpler (and faster?) to give separate cases to the negatives,
 388   -- especially considering octal/hexadecimal prefixes.
 389   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 390   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 391   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 392   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 393   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 394   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 395
 396   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 397   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 398   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 399
 400   -- Unboxed floats and doubles (:: Float#, :: Double#)
 401   -- prim_{float,double} work with signed literals
 402   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 403   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 404 }
 405
 406 -- Strings and chars are lexed by hand-written code.  The reason is
 407 -- that even if we recognise the string or char here in the regex
 408 -- lexer, we would still have to parse the string afterward in order
 409 -- to convert it to a String.
 410 <0> {
 411   \'                            { lex_char_tok }
 412   \"                            { lex_string_tok }
 413 }
 414
 415 {
 416 -- -----------------------------------------------------------------------------
 417 -- The token type
 418
 419 data Token
 420   = ITas                        -- Haskell keywords
 421   | ITcase
 422   | ITclass
 423   | ITdata
 424   | ITdefault
 425   | ITderiving
 426   | ITdo
 427   | ITelse
 428   | IThiding
 429   | ITif
 430   | ITimport
 431   | ITin
 432   | ITinfix
 433   | ITinfixl
 434   | ITinfixr
 435   | ITinstance
 436   | ITlet
 437   | ITmodule
 438   | ITnewtype
 439   | ITof
 440   | ITqualified
 441   | ITthen
 442   | ITtype
 443   | ITwhere
 444   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 445
 446   | ITforall                    -- GHC extension keywords
 447   | ITforeign
 448   | ITexport
 449   | ITlabel
 450   | ITdynamic
 451   | ITsafe
 452   | ITthreadsafe
 453   | ITunsafe
 454   | ITstdcallconv
 455   | ITccallconv
 456   | ITprimcallconv
 457   | ITmdo
 458   | ITfamily
 459   | ITgroup
 460   | ITby
 461   | ITusing
 462
 463         -- Pragmas
 464   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 465   | ITinline_conlike_prag Bool  -- same
 466   | ITspec_prag                 -- SPECIALISE
 467   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 468   | ITsource_prag
 469   | ITrules_prag
 470   | ITwarning_prag
 471   | ITdeprecated_prag
 472   | ITline_prag
 473   | ITscc_prag
 474   | ITgenerated_prag
 475   | ITcore_prag                 -- hdaume: core annotations
 476   | ITunpack_prag
 477   | ITann_prag
 478   | ITclose_prag
 479   | IToptions_prag String
 480   | ITinclude_prag String
 481   | ITlanguage_prag
 482
 483   | ITdotdot                    -- reserved symbols
 484   | ITcolon
 485   | ITdcolon
 486   | ITequal
 487   | ITlam
 488   | ITvbar
 489   | ITlarrow
 490   | ITrarrow
 491   | ITat
 492   | ITtilde
 493   | ITdarrow
 494   | ITminus
 495   | ITbang
 496   | ITstar
 497   | ITdot
 498
 499   | ITbiglam                    -- GHC-extension symbols
 500
 501   | ITocurly                    -- special symbols
 502   | ITccurly
 503   | ITocurlybar                 -- {|, for type applications
 504   | ITccurlybar                 -- |}, for type applications
 505   | ITvocurly
 506   | ITvccurly
 507   | ITobrack
 508   | ITopabrack                  -- [:, for parallel arrays with -XParr
 509   | ITcpabrack                  -- :], for parallel arrays with -XParr
 510   | ITcbrack
 511   | IToparen
 512   | ITcparen
 513   | IToubxparen
 514   | ITcubxparen
 515   | ITsemi
 516   | ITcomma
 517   | ITunderscore
 518   | ITbackquote
 519
 520   | ITvarid   FastString        -- identifiers
 521   | ITconid   FastString
 522   | ITvarsym  FastString
 523   | ITconsym  FastString
 524   | ITqvarid  (FastString,FastString)
 525   | ITqconid  (FastString,FastString)
 526   | ITqvarsym (FastString,FastString)
 527   | ITqconsym (FastString,FastString)
 528   | ITprefixqvarsym (FastString,FastString)
 529   | ITprefixqconsym (FastString,FastString)
 530
 531   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 532
 533   | ITpragma StringBuffer
 534
 535   | ITchar       Char
 536   | ITstring     FastString
 537   | ITinteger    Integer
 538   | ITrational   Rational
 539
 540   | ITprimchar   Char
 541   | ITprimstring FastString
 542   | ITprimint    Integer
 543   | ITprimword   Integer
 544   | ITprimfloat  Rational
 545   | ITprimdouble Rational
 546
 547   -- MetaHaskell extension tokens
 548   | ITopenExpQuote              --  [| or [e|
 549   | ITopenPatQuote              --  [p|
 550   | ITopenDecQuote              --  [d|
 551   | ITopenTypQuote              --  [t|
 552   | ITcloseQuote                --  |]
 553   | ITidEscape   FastString     --  $x
 554   | ITparenEscape               --  $(
 555   | ITvarQuote                  --  '
 556   | ITtyQuote                   --  ''
 557   | ITquasiQuote (FastString,FastString,SrcSpan) --  [:...|...|]
 558
 559   -- Arrow notation extension
 560   | ITproc
 561   | ITrec
 562   | IToparenbar                 --  (|
 563   | ITcparenbar                 --  |)
 564   | ITlarrowtail                --  -<
 565   | ITrarrowtail                --  >-
 566   | ITLarrowtail                --  -<<
 567   | ITRarrowtail                --  >>-
 568
 569   | ITunknown String            -- Used when the lexer can't make sense of it
 570   | ITeof                       -- end of file token
 571
 572   -- Documentation annotations
 573   | ITdocCommentNext  String     -- something beginning '-- |'
 574   | ITdocCommentPrev  String     -- something beginning '-- ^'
 575   | ITdocCommentNamed String     -- something beginning '-- $'
 576   | ITdocSection      Int String -- a section heading
 577   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 578   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 579   | ITlineComment     String     -- comment starting by "--"
 580   | ITblockComment    String     -- comment in {- -}
 581
 582 #ifdef DEBUG
 583   deriving Show -- debugging
 584 #endif
 585
 586 {-
 587 isSpecial :: Token -> Bool
 588 -- If we see M.x, where x is a keyword, but
 589 -- is special, we treat is as just plain M.x,
 590 -- not as a keyword.
 591 isSpecial ITas          = True
 592 isSpecial IThiding      = True
 593 isSpecial ITqualified   = True
 594 isSpecial ITforall      = True
 595 isSpecial ITexport      = True
 596 isSpecial ITlabel       = True
 597 isSpecial ITdynamic     = True
 598 isSpecial ITsafe        = True
 599 isSpecial ITthreadsafe  = True
 600 isSpecial ITunsafe      = True
 601 isSpecial ITccallconv   = True
 602 isSpecial ITstdcallconv = True
 603 isSpecial ITprimcallconv = True
 604 isSpecial ITmdo         = True
 605 isSpecial ITfamily      = True
 606 isSpecial ITgroup   = True
 607 isSpecial ITby      = True
 608 isSpecial ITusing   = True
 609 isSpecial _             = False
 610 -}
 611
 612 -- the bitmap provided as the third component indicates whether the
 613 -- corresponding extension keyword is valid under the extension options
 614 -- provided to the compiler; if the extension corresponding to *any* of the
 615 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 616 -- facilitates using a keyword in two different extensions that can be
 617 -- activated independently)
 618 --
 619 reservedWordsFM :: UniqFM (Token, Int)
 620 reservedWordsFM = listToUFM $
 621         map (\(x, y, z) -> (mkFastString x, (y, z)))
 622        [( "_",          ITunderscore,   0 ),
 623         ( "as",         ITas,           0 ),
 624         ( "case",       ITcase,         0 ),
 625         ( "class",      ITclass,        0 ),
 626         ( "data",       ITdata,         0 ),
 627         ( "default",    ITdefault,      0 ),
 628         ( "deriving",   ITderiving,     0 ),
 629         ( "do",         ITdo,           0 ),
 630         ( "else",       ITelse,         0 ),
 631         ( "hiding",     IThiding,       0 ),
 632         ( "if",         ITif,           0 ),
 633         ( "import",     ITimport,       0 ),
 634         ( "in",         ITin,           0 ),
 635         ( "infix",      ITinfix,        0 ),
 636         ( "infixl",     ITinfixl,       0 ),
 637         ( "infixr",     ITinfixr,       0 ),
 638         ( "instance",   ITinstance,     0 ),
 639         ( "let",        ITlet,          0 ),
 640         ( "module",     ITmodule,       0 ),
 641         ( "newtype",    ITnewtype,      0 ),
 642         ( "of",         ITof,           0 ),
 643         ( "qualified",  ITqualified,    0 ),
 644         ( "then",       ITthen,         0 ),
 645         ( "type",       ITtype,         0 ),
 646         ( "where",      ITwhere,        0 ),
 647         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 648
 649     ( "forall", ITforall,        bit explicitForallBit .|. bit inRulePragBit),
 650         ( "mdo",        ITmdo,           bit recursiveDoBit),
 651         ( "family",     ITfamily,        bit tyFamBit),
 652     ( "group",  ITgroup,     bit transformComprehensionsBit),
 653     ( "by",     ITby,        bit transformComprehensionsBit),
 654     ( "using",  ITusing,     bit transformComprehensionsBit),
 655
 656         ( "foreign",    ITforeign,       bit ffiBit),
 657         ( "export",     ITexport,        bit ffiBit),
 658         ( "label",      ITlabel,         bit ffiBit),
 659         ( "dynamic",    ITdynamic,       bit ffiBit),
 660         ( "safe",       ITsafe,          bit ffiBit),
 661         ( "threadsafe", ITthreadsafe,    bit ffiBit),  -- ToDo: remove
 662         ( "unsafe",     ITunsafe,        bit ffiBit),
 663         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 664         ( "ccall",      ITccallconv,     bit ffiBit),
 665         ( "prim",       ITprimcallconv,  bit ffiBit),
 666
 667         ( "rec",        ITrec,           bit arrowsBit),
 668         ( "proc",       ITproc,          bit arrowsBit)
 669      ]
 670
 671 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 672 reservedSymsFM = listToUFM $
 673     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 674       [ ("..",  ITdotdot,   always)
 675         -- (:) is a reserved op, meaning only list cons
 676        ,(":",   ITcolon,    always)
 677        ,("::",  ITdcolon,   always)
 678        ,("=",   ITequal,    always)
 679        ,("\\",  ITlam,      always)
 680        ,("|",   ITvbar,     always)
 681        ,("<-",  ITlarrow,   always)
 682        ,("->",  ITrarrow,   always)
 683        ,("@",   ITat,       always)
 684        ,("~",   ITtilde,    always)
 685        ,("=>",  ITdarrow,   always)
 686        ,("-",   ITminus,    always)
 687        ,("!",   ITbang,     always)
 688
 689         -- For data T (a::*) = MkT
 690        ,("*", ITstar, always) -- \i -> kindSigsEnabled i || tyFamEnabled i)
 691         -- For 'forall a . t'
 692        ,(".", ITdot,  always) -- \i -> explicitForallEnabled i || inRulePrag i)
 693
 694        ,("-<",  ITlarrowtail, arrowsEnabled)
 695        ,(">-",  ITrarrowtail, arrowsEnabled)
 696        ,("-<<", ITLarrowtail, arrowsEnabled)
 697        ,(">>-", ITRarrowtail, arrowsEnabled)
 698
 699        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 700        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 701        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 702                                 explicitForallEnabled i)
 703        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 704        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 705        ,("⋯",   ITdotdot, unicodeSyntaxEnabled)
 706         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 707         -- form part of a large operator.  This would let us have a better
 708         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 709        ]
 710
 711 -- -----------------------------------------------------------------------------
 712 -- Lexer actions
 713
 714 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 715
 716 special :: Token -> Action
 717 special tok span _buf _len = return (L span tok)
 718
 719 token, layout_token :: Token -> Action
 720 token t span _buf _len = return (L span t)
 721 layout_token t span _buf _len = pushLexState layout >> return (L span t)
 722
 723 idtoken :: (StringBuffer -> Int -> Token) -> Action
 724 idtoken f span buf len = return (L span $! (f buf len))
 725
 726 skip_one_varid :: (FastString -> Token) -> Action
 727 skip_one_varid f span buf len
 728   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 729
 730 strtoken :: (String -> Token) -> Action
 731 strtoken f span buf len =
 732   return (L span $! (f $! lexemeToString buf len))
 733
 734 init_strtoken :: Int -> (String -> Token) -> Action
 735 -- like strtoken, but drops the last N character(s)
 736 init_strtoken drop f span buf len =
 737   return (L span $! (f $! lexemeToString buf (len-drop)))
 738
 739 begin :: Int -> Action
 740 begin code _span _str _len = do pushLexState code; lexToken
 741
 742 pop :: Action
 743 pop _span _buf _len = do popLexState; lexToken
 744
 745 pop_and :: Action -> Action
 746 pop_and act span buf len = do popLexState; act span buf len
 747
 748 {-# INLINE nextCharIs #-}
 749 nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
 750 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 751
 752 notFollowedBy :: Char -> AlexAccPred Int
 753 notFollowedBy char _ _ _ (AI _ _ buf)
 754   = nextCharIs buf (/=char)
 755
 756 notFollowedBySymbol :: AlexAccPred Int
 757 notFollowedBySymbol _ _ _ (AI _ _ buf)
 758   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 759
 760 -- We must reject doc comments as being ordinary comments everywhere.
 761 -- In some cases the doc comment will be selected as the lexeme due to
 762 -- maximal munch, but not always, because the nested comment rule is
 763 -- valid in all states, but the doc-comment rules are only valid in
 764 -- the non-layout states.
 765 isNormalComment :: AlexAccPred Int
 766 isNormalComment bits _ _ (AI _ _ buf)
 767   | haddockEnabled bits = notFollowedByDocOrPragma
 768   | otherwise           = nextCharIs buf (/='#')
 769   where
 770     notFollowedByDocOrPragma
 771        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 772
 773 spaceAndP :: StringBuffer -> (StringBuffer -> Bool) -> Bool
 774 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 775
 776 {-
 777 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 778   = if haddockEnabled bits then False else (p buf)
 779 -}
 780
 781 atEOL :: AlexAccPred Int
 782 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 783
 784 ifExtension :: (Int -> Bool) -> AlexAccPred Int
 785 ifExtension pred bits _ _ _ = pred bits
 786
 787 multiline_doc_comment :: Action
 788 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 789   where
 790     worker commentAcc input docType oneLine = case alexGetChar input of
 791       Just ('\n', input')
 792         | oneLine -> docCommentEnd input commentAcc docType buf span
 793         | otherwise -> case checkIfCommentLine input' of
 794           Just input -> worker ('\n':commentAcc) input docType False
 795           Nothing -> docCommentEnd input commentAcc docType buf span
 796       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 797       Nothing -> docCommentEnd input commentAcc docType buf span
 798
 799     checkIfCommentLine input = check (dropNonNewlineSpace input)
 800       where
 801         check input = case alexGetChar input of
 802           Just ('-', input) -> case alexGetChar input of
 803             Just ('-', input) -> case alexGetChar input of
 804               Just (c, _) | c /= '-' -> Just input
 805               _ -> Nothing
 806             _ -> Nothing
 807           _ -> Nothing
 808
 809         dropNonNewlineSpace input = case alexGetChar input of
 810           Just (c, input')
 811             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 812             | otherwise -> input
 813           Nothing -> input
 814
 815 lineCommentToken :: Action
 816 lineCommentToken span buf len = do
 817   b <- extension rawTokenStreamEnabled
 818   if b then strtoken ITlineComment span buf len else lexToken
 819
 820 {-
 821   nested comments require traversing by hand, they can't be parsed
 822   using regular expressions.
 823 -}
 824 nested_comment :: P (Located Token) -> Action
 825 nested_comment cont span _str _len = do
 826   input <- getInput
 827   go "" (1::Int) input
 828   where
 829     go commentAcc 0 input = do setInput input
 830                                b <- extension rawTokenStreamEnabled
 831                                if b
 832                                  then docCommentEnd input commentAcc ITblockComment _str span
 833                                  else cont
 834     go commentAcc n input = case alexGetChar input of
 835       Nothing -> errBrace input span
 836       Just ('-',input) -> case alexGetChar input of
 837         Nothing  -> errBrace input span
 838         Just ('\125',input) -> go commentAcc (n-1) input
 839         Just (_,_)          -> go ('-':commentAcc) n input
 840       Just ('\123',input) -> case alexGetChar input of
 841         Nothing  -> errBrace input span
 842         Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
 843         Just (_,_)       -> go ('\123':commentAcc) n input
 844       Just (c,input) -> go (c:commentAcc) n input
 845
 846 nested_doc_comment :: Action
 847 nested_doc_comment span buf _len = withLexedDocType (go "")
 848   where
 849     go commentAcc input docType _ = case alexGetChar input of
 850       Nothing -> errBrace input span
 851       Just ('-',input) -> case alexGetChar input of
 852         Nothing -> errBrace input span
 853         Just ('\125',input) ->
 854           docCommentEnd input commentAcc docType buf span
 855         Just (_,_) -> go ('-':commentAcc) input docType False
 856       Just ('\123', input) -> case alexGetChar input of
 857         Nothing  -> errBrace input span
 858         Just ('-',input) -> do
 859           setInput input
 860           let cont = do input <- getInput; go commentAcc input docType False
 861           nested_comment cont span buf _len
 862         Just (_,_) -> go ('\123':commentAcc) input docType False
 863       Just (c,input) -> go (c:commentAcc) input docType False
 864
 865 withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (Located Token))
 866                  -> P (Located Token)
 867 withLexedDocType lexDocComment = do
 868   input@(AI _ _ buf) <- getInput
 869   case prevChar buf ' ' of
 870     '|' -> lexDocComment input ITdocCommentNext False
 871     '^' -> lexDocComment input ITdocCommentPrev False
 872     '$' -> lexDocComment input ITdocCommentNamed False
 873     '*' -> lexDocSection 1 input
 874     '#' -> lexDocComment input ITdocOptionsOld False
 875     _ -> panic "withLexedDocType: Bad doc type"
 876  where
 877     lexDocSection n input = case alexGetChar input of
 878       Just ('*', input) -> lexDocSection (n+1) input
 879       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 880       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 881
 882 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 883 -- off again at the end of the pragma.
 884 rulePrag :: Action
 885 rulePrag span _ _ = do
 886   setExts (.|. bit inRulePragBit)
 887   return (L span ITrules_prag)
 888
 889 endPrag :: Action
 890 endPrag span _ _ = do
 891   setExts (.&. complement (bit inRulePragBit))
 892   return (L span ITclose_prag)
 893
 894 -- docCommentEnd
 895 -------------------------------------------------------------------------------
 896 -- This function is quite tricky. We can't just return a new token, we also
 897 -- need to update the state of the parser. Why? Because the token is longer
 898 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 899 -- it writes the wrong token length to the parser state. This function is
 900 -- called afterwards, so it can just update the state.
 901
 902 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 903 -- which is something that the original lexer didn't account for.
 904 -- I have added last_line_len in the parser state which represents the length
 905 -- of the part of the token that is on the last line. It is now used for layout
 906 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 907 -- was before, the full length of the token, and it is now only used for error
 908 -- messages. /Waern
 909
 910 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 911                  SrcSpan -> P (Located Token)
 912 docCommentEnd input commentAcc docType buf span = do
 913   setInput input
 914   let (AI loc last_offs nextBuf) = input
 915       comment = reverse commentAcc
 916       span' = mkSrcSpan (srcSpanStart span) loc
 917       last_len = byteDiff buf nextBuf
 918
 919       last_line_len = if (last_offs - last_len < 0)
 920         then last_offs
 921         else last_len
 922
 923   span `seq` setLastToken span' last_len last_line_len
 924   return (L span' (docType comment))
 925
 926 errBrace :: AlexInput -> SrcSpan -> P a
 927 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 928
 929 open_brace, close_brace :: Action
 930 open_brace span _str _len = do
 931   ctx <- getContext
 932   setContext (NoLayout:ctx)
 933   return (L span ITocurly)
 934 close_brace span _str _len = do
 935   popContext
 936   return (L span ITccurly)
 937
 938 qvarid, qconid :: StringBuffer -> Int -> Token
 939 qvarid buf len = ITqvarid $! splitQualName buf len False
 940 qconid buf len = ITqconid $! splitQualName buf len False
 941
 942 splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
 943 -- takes a StringBuffer and a length, and returns the module name
 944 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 945 -- because of hierarchical module names.
 946 splitQualName orig_buf len parens = split orig_buf orig_buf
 947   where
 948     split buf dot_buf
 949         | orig_buf `byteDiff` buf >= len  = done dot_buf
 950         | c == '.'                        = found_dot buf'
 951         | otherwise                       = split buf' dot_buf
 952       where
 953        (c,buf') = nextChar buf
 954
 955     -- careful, we might get names like M....
 956     -- so, if the character after the dot is not upper-case, this is
 957     -- the end of the qualifier part.
 958     found_dot buf -- buf points after the '.'
 959         | isUpper c    = split buf' buf
 960         | otherwise    = done buf
 961       where
 962        (c,buf') = nextChar buf
 963
 964     done dot_buf =
 965         (lexemeToFastString orig_buf (qual_size - 1),
 966          if parens -- Prelude.(+)
 967             then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
 968             else lexemeToFastString dot_buf (len - qual_size))
 969       where
 970         qual_size = orig_buf `byteDiff` dot_buf
 971
 972 varid :: Action
 973 varid span buf len =
 974   fs `seq`
 975   case lookupUFM reservedWordsFM fs of
 976         Just (keyword,0)    -> do
 977                 maybe_layout keyword
 978                 return (L span keyword)
 979         Just (keyword,exts) -> do
 980                 b <- extension (\i -> exts .&. i /= 0)
 981                 if b then do maybe_layout keyword
 982                              return (L span keyword)
 983                      else return (L span (ITvarid fs))
 984         _other -> return (L span (ITvarid fs))
 985   where
 986         fs = lexemeToFastString buf len
 987
 988 conid :: StringBuffer -> Int -> Token
 989 conid buf len = ITconid fs
 990   where fs = lexemeToFastString buf len
 991
 992 qvarsym, qconsym, prefixqvarsym, prefixqconsym :: StringBuffer -> Int -> Token
 993 qvarsym buf len = ITqvarsym $! splitQualName buf len False
 994 qconsym buf len = ITqconsym $! splitQualName buf len False
 995 prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
 996 prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
 997
 998 varsym, consym :: Action
 999 varsym = sym ITvarsym
1000 consym = sym ITconsym
1001
1002 sym :: (FastString -> Token) -> SrcSpan -> StringBuffer -> Int
1003     -> P (Located Token)
1004 sym con span buf len =
1005   case lookupUFM reservedSymsFM fs of
1006         Just (keyword,exts) -> do
1007                 b <- extension exts
1008                 if b then return (L span keyword)
1009                      else return (L span $! con fs)
1010         _other -> return (L span $! con fs)
1011   where
1012         fs = lexemeToFastString buf len
1013
1014 -- Variations on the integral numeric literal.
1015 tok_integral :: (Integer -> Token)
1016      -> (Integer -> Integer)
1017  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
1018      -> Int -> Int
1019      -> (Integer, (Char->Int)) -> Action
1020 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
1021   return $ L span $ itint $! transint $ parseUnsignedInteger
1022      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
1023
1024 -- some conveniences for use with tok_integral
1025 tok_num :: (Integer -> Integer)
1026         -> Int -> Int
1027         -> (Integer, (Char->Int)) -> Action
1028 tok_num = tok_integral ITinteger
1029 tok_primint :: (Integer -> Integer)
1030             -> Int -> Int
1031             -> (Integer, (Char->Int)) -> Action
1032 tok_primint = tok_integral ITprimint
1033 tok_primword :: Int -> Int
1034              -> (Integer, (Char->Int)) -> Action
1035 tok_primword = tok_integral ITprimword positive
1036 positive, negative :: (Integer -> Integer)
1037 positive = id
1038 negative = negate
1039 decimal, octal, hexadecimal :: (Integer, Char -> Int)
1040 decimal = (10,octDecDigit)
1041 octal = (8,octDecDigit)
1042 hexadecimal = (16,hexDigit)
1043
1044 -- readRational can understand negative rationals, exponents, everything.
1045 tok_float, tok_primfloat, tok_primdouble :: String -> Token
1046 tok_float        str = ITrational   $! readRational str
1047 tok_primfloat    str = ITprimfloat  $! readRational str
1048 tok_primdouble   str = ITprimdouble $! readRational str
1049
1050 -- -----------------------------------------------------------------------------
1051 -- Layout processing
1052
1053 -- we're at the first token on a line, insert layout tokens if necessary
1054 do_bol :: Action
1055 do_bol span _str _len = do
1056         pos <- getOffside
1057         case pos of
1058             LT -> do
1059                 --trace "layout: inserting '}'" $ do
1060                 popContext
1061                 -- do NOT pop the lex state, we might have a ';' to insert
1062                 return (L span ITvccurly)
1063             EQ -> do
1064                 --trace "layout: inserting ';'" $ do
1065                 popLexState
1066                 return (L span ITsemi)
1067             GT -> do
1068                 popLexState
1069                 lexToken
1070
1071 -- certain keywords put us in the "layout" state, where we might
1072 -- add an opening curly brace.
1073 maybe_layout :: Token -> P ()
1074 maybe_layout ITdo       = pushLexState layout_do
1075 maybe_layout ITmdo      = pushLexState layout_do
1076 maybe_layout ITof       = pushLexState layout
1077 maybe_layout ITlet      = pushLexState layout
1078 maybe_layout ITwhere    = pushLexState layout
1079 maybe_layout ITrec      = pushLexState layout
1080 maybe_layout _          = return ()
1081
1082 -- Pushing a new implicit layout context.  If the indentation of the
1083 -- next token is not greater than the previous layout context, then
1084 -- Haskell 98 says that the new layout context should be empty; that is
1085 -- the lexer must generate {}.
1086 --
1087 -- We are slightly more lenient than this: when the new context is started
1088 -- by a 'do', then we allow the new context to be at the same indentation as
1089 -- the previous context.  This is what the 'strict' argument is for.
1090 --
1091 new_layout_context :: Bool -> Action
1092 new_layout_context strict span _buf _len = do
1093     popLexState
1094     (AI _ offset _) <- getInput
1095     ctx <- getContext
1096     case ctx of
1097         Layout prev_off : _  |
1098            (strict     && prev_off >= offset  ||
1099             not strict && prev_off > offset) -> do
1100                 -- token is indented to the left of the previous context.
1101                 -- we must generate a {} sequence now.
1102                 pushLexState layout_left
1103                 return (L span ITvocurly)
1104         _ -> do
1105                 setContext (Layout offset : ctx)
1106                 return (L span ITvocurly)
1107
1108 do_layout_left :: Action
1109 do_layout_left span _buf _len = do
1110     popLexState
1111     pushLexState bol  -- we must be at the start of a line
1112     return (L span ITvccurly)
1113
1114 -- -----------------------------------------------------------------------------
1115 -- LINE pragmas
1116
1117 setLine :: Int -> Action
1118 setLine code span buf len = do
1119   let line = parseUnsignedInteger buf len 10 octDecDigit
1120   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
1121         -- subtract one: the line number refers to the *following* line
1122   popLexState
1123   pushLexState code
1124   lexToken
1125
1126 setFile :: Int -> Action
1127 setFile code span buf len = do
1128   let file = lexemeToFastString (stepOn buf) (len-2)
1129   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1130   popLexState
1131   pushLexState code
1132   lexToken
1133
1134
1135 -- -----------------------------------------------------------------------------
1136 -- Options, includes and language pragmas.
1137
1138 lex_string_prag :: (String -> Token) -> Action
1139 lex_string_prag mkTok span _buf _len
1140     = do input <- getInput
1141          start <- getSrcLoc
1142          tok <- go [] input
1143          end <- getSrcLoc
1144          return (L (mkSrcSpan start end) tok)
1145     where go acc input
1146               = if isString input "#-}"
1147                    then do setInput input
1148                            return (mkTok (reverse acc))
1149                    else case alexGetChar input of
1150                           Just (c,i) -> go (c:acc) i
1151                           Nothing -> err input
1152           isString _ [] = True
1153           isString i (x:xs)
1154               = case alexGetChar i of
1155                   Just (c,i') | c == x    -> isString i' xs
1156                   _other -> False
1157           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1158
1159
1160 -- -----------------------------------------------------------------------------
1161 -- Strings & Chars
1162
1163 -- This stuff is horrible.  I hates it.
1164
1165 lex_string_tok :: Action
1166 lex_string_tok span _buf _len = do
1167   tok <- lex_string ""
1168   end <- getSrcLoc
1169   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1170
1171 lex_string :: String -> P Token
1172 lex_string s = do
1173   i <- getInput
1174   case alexGetChar' i of
1175     Nothing -> lit_error
1176
1177     Just ('"',i)  -> do
1178         setInput i
1179         magicHash <- extension magicHashEnabled
1180         if magicHash
1181           then do
1182             i <- getInput
1183             case alexGetChar' i of
1184               Just ('#',i) -> do
1185                    setInput i
1186                    if any (> '\xFF') s
1187                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1188                     else let s' = mkZFastString (reverse s) in
1189                          return (ITprimstring s')
1190                         -- mkZFastString is a hack to avoid encoding the
1191                         -- string in UTF-8.  We just want the exact bytes.
1192               _other ->
1193                 return (ITstring (mkFastString (reverse s)))
1194           else
1195                 return (ITstring (mkFastString (reverse s)))
1196
1197     Just ('\\',i)
1198         | Just ('&',i) <- next -> do
1199                 setInput i; lex_string s
1200         | Just (c,i) <- next, is_space c -> do
1201                 setInput i; lex_stringgap s
1202         where next = alexGetChar' i
1203
1204     Just (c, i) -> do
1205         c' <- lex_char c i
1206         lex_string (c':s)
1207
1208 lex_stringgap :: String -> P Token
1209 lex_stringgap s = do
1210   c <- getCharOrFail
1211   case c of
1212     '\\' -> lex_string s
1213     c | is_space c -> lex_stringgap s
1214     _other -> lit_error
1215
1216
1217 lex_char_tok :: Action
1218 -- Here we are basically parsing character literals, such as 'x' or '\n'
1219 -- but, when Template Haskell is on, we additionally spot
1220 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1221 -- but WITHOUT CONSUMING the x or T part  (the parser does that).
1222 -- So we have to do two characters of lookahead: when we see 'x we need to
1223 -- see if there's a trailing quote
1224 lex_char_tok span _buf _len = do        -- We've seen '
1225    i1 <- getInput       -- Look ahead to first character
1226    let loc = srcSpanStart span
1227    case alexGetChar' i1 of
1228         Nothing -> lit_error
1229
1230         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
1231                   th_exts <- extension thEnabled
1232                   if th_exts then do
1233                         setInput i2
1234                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1235                    else lit_error
1236
1237         Just ('\\', i2@(AI _end2 _ _)) -> do    -- We've seen 'backslash
1238                   setInput i2
1239                   lit_ch <- lex_escape
1240                   mc <- getCharOrFail   -- Trailing quote
1241                   if mc == '\'' then finish_char_tok loc lit_ch
1242                                 else do setInput i2; lit_error
1243
1244         Just (c, i2@(AI _end2 _ _))
1245                 | not (isAny c) -> lit_error
1246                 | otherwise ->
1247
1248                 -- We've seen 'x, where x is a valid character
1249                 --  (i.e. not newline etc) but not a quote or backslash
1250            case alexGetChar' i2 of      -- Look ahead one more character
1251                 Just ('\'', i3) -> do   -- We've seen 'x'
1252                         setInput i3
1253                         finish_char_tok loc c
1254                 _other -> do            -- We've seen 'x not followed by quote
1255                                         -- (including the possibility of EOF)
1256                                         -- If TH is on, just parse the quote only
1257                         th_exts <- extension thEnabled
1258                         let (AI end _ _) = i1
1259                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1260                                    else do setInput i2; lit_error
1261
1262 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1263 finish_char_tok loc ch  -- We've already seen the closing quote
1264                         -- Just need to check for trailing #
1265   = do  magicHash <- extension magicHashEnabled
1266         i@(AI end _ _) <- getInput
1267         if magicHash then do
1268                 case alexGetChar' i of
1269                         Just ('#',i@(AI end _ _)) -> do
1270                                 setInput i
1271                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1272                         _other ->
1273                                 return (L (mkSrcSpan loc end) (ITchar ch))
1274                 else do
1275                    return (L (mkSrcSpan loc end) (ITchar ch))
1276
1277 lex_char :: Char -> AlexInput -> P Char
1278 lex_char c inp = do
1279   case c of
1280       '\\' -> do setInput inp; lex_escape
1281       c | isAny c -> do setInput inp; return c
1282       _other -> lit_error
1283
1284 isAny :: Char -> Bool
1285 isAny c | c > '\x7f' = isPrint c
1286         | otherwise  = is_any c
1287
1288 lex_escape :: P Char
1289 lex_escape = do
1290   c <- getCharOrFail
1291   case c of
1292         'a'   -> return '\a'
1293         'b'   -> return '\b'
1294         'f'   -> return '\f'
1295         'n'   -> return '\n'
1296         'r'   -> return '\r'
1297         't'   -> return '\t'
1298         'v'   -> return '\v'
1299         '\\'  -> return '\\'
1300         '"'   -> return '\"'
1301         '\''  -> return '\''
1302         '^'   -> do c <- getCharOrFail
1303                     if c >= '@' && c <= '_'
1304                         then return (chr (ord c - ord '@'))
1305                         else lit_error
1306
1307         'x'   -> readNum is_hexdigit 16 hexDigit
1308         'o'   -> readNum is_octdigit  8 octDecDigit
1309         x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1310
1311         c1 ->  do
1312            i <- getInput
1313            case alexGetChar' i of
1314             Nothing -> lit_error
1315             Just (c2,i2) ->
1316               case alexGetChar' i2 of
1317                 Nothing -> do setInput i2; lit_error
1318                 Just (c3,i3) ->
1319                    let str = [c1,c2,c3] in
1320                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1321                                      Just rest <- [stripPrefix p str] ] of
1322                           (escape_char,[]):_ -> do
1323                                 setInput i3
1324                                 return escape_char
1325                           (escape_char,_:_):_ -> do
1326                                 setInput i2
1327                                 return escape_char
1328                           [] -> lit_error
1329
1330 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1331 readNum is_digit base conv = do
1332   i <- getInput
1333   c <- getCharOrFail
1334   if is_digit c
1335         then readNum2 is_digit base conv (conv c)
1336         else do setInput i; lit_error
1337
1338 readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char
1339 readNum2 is_digit base conv i = do
1340   input <- getInput
1341   read i input
1342   where read i input = do
1343           case alexGetChar' input of
1344             Just (c,input') | is_digit c -> do
1345                 read (i*base + conv c) input'
1346             _other -> do
1347                 if i >= 0 && i <= 0x10FFFF
1348                    then do setInput input; return (chr i)
1349                    else lit_error
1350
1351 silly_escape_chars :: [(String, Char)]
1352 silly_escape_chars = [
1353         ("NUL", '\NUL'),
1354         ("SOH", '\SOH'),
1355         ("STX", '\STX'),
1356         ("ETX", '\ETX'),
1357         ("EOT", '\EOT'),
1358         ("ENQ", '\ENQ'),
1359         ("ACK", '\ACK'),
1360         ("BEL", '\BEL'),
1361         ("BS", '\BS'),
1362         ("HT", '\HT'),
1363         ("LF", '\LF'),
1364         ("VT", '\VT'),
1365         ("FF", '\FF'),
1366         ("CR", '\CR'),
1367         ("SO", '\SO'),
1368         ("SI", '\SI'),
1369         ("DLE", '\DLE'),
1370         ("DC1", '\DC1'),
1371         ("DC2", '\DC2'),
1372         ("DC3", '\DC3'),
1373         ("DC4", '\DC4'),
1374         ("NAK", '\NAK'),
1375         ("SYN", '\SYN'),
1376         ("ETB", '\ETB'),
1377         ("CAN", '\CAN'),
1378         ("EM", '\EM'),
1379         ("SUB", '\SUB'),
1380         ("ESC", '\ESC'),
1381         ("FS", '\FS'),
1382         ("GS", '\GS'),
1383         ("RS", '\RS'),
1384         ("US", '\US'),
1385         ("SP", '\SP'),
1386         ("DEL", '\DEL')
1387         ]
1388
1389 -- before calling lit_error, ensure that the current input is pointing to
1390 -- the position of the error in the buffer.  This is so that we can report
1391 -- a correct location to the user, but also so we can detect UTF-8 decoding
1392 -- errors if they occur.
1393 lit_error :: P a
1394 lit_error = lexError "lexical error in string/character literal"
1395
1396 getCharOrFail :: P Char
1397 getCharOrFail =  do
1398   i <- getInput
1399   case alexGetChar' i of
1400         Nothing -> lexError "unexpected end-of-file in string/character literal"
1401         Just (c,i)  -> do setInput i; return c
1402
1403 -- -----------------------------------------------------------------------------
1404 -- QuasiQuote
1405
1406 lex_quasiquote_tok :: Action
1407 lex_quasiquote_tok span buf len = do
1408   let quoter = reverse $ takeWhile (/= '$')
1409                $ reverse $ lexemeToString buf (len - 1)
1410   quoteStart <- getSrcLoc
1411   quote <- lex_quasiquote ""
1412   end <- getSrcLoc
1413   return (L (mkSrcSpan (srcSpanStart span) end)
1414            (ITquasiQuote (mkFastString quoter,
1415                           mkFastString (reverse quote),
1416                           mkSrcSpan quoteStart end)))
1417
1418 lex_quasiquote :: String -> P String
1419 lex_quasiquote s = do
1420   i <- getInput
1421   case alexGetChar' i of
1422     Nothing -> lit_error
1423
1424     Just ('\\',i)
1425         | Just ('|',i) <- next -> do
1426                 setInput i; lex_quasiquote ('|' : s)
1427         | Just (']',i) <- next -> do
1428                 setInput i; lex_quasiquote (']' : s)
1429         where next = alexGetChar' i
1430
1431     Just ('|',i)
1432         | Just (']',i) <- next -> do
1433                 setInput i; return s
1434         where next = alexGetChar' i
1435
1436     Just (c, i) -> do
1437          setInput i; lex_quasiquote (c : s)
1438
1439 -- -----------------------------------------------------------------------------
1440 -- Warnings
1441
1442 warn :: DynFlag -> SDoc -> Action
1443 warn option warning srcspan _buf _len = do
1444     addWarning option srcspan warning
1445     lexToken
1446
1447 warnThen :: DynFlag -> SDoc -> Action -> Action
1448 warnThen option warning action srcspan buf len = do
1449     addWarning option srcspan warning
1450     action srcspan buf len
1451
1452 -- -----------------------------------------------------------------------------
1453 -- The Parse Monad
1454
1455 data LayoutContext
1456   = NoLayout
1457   | Layout !Int
1458   deriving Show
1459
1460 data ParseResult a
1461   = POk PState a
1462   | PFailed
1463         SrcSpan         -- The start and end of the text span related to
1464                         -- the error.  Might be used in environments which can
1465                         -- show this span, e.g. by highlighting it.
1466         Message         -- The error message
1467
1468 data PState = PState {
1469         buffer     :: StringBuffer,
1470         dflags     :: DynFlags,
1471         messages   :: Messages,
1472         last_loc   :: SrcSpan,  -- pos of previous token
1473         last_offs  :: !Int,     -- offset of the previous token from the
1474                                 -- beginning of  the current line.
1475                                 -- \t is equal to 8 spaces.
1476         last_len   :: !Int,     -- len of previous token
1477         last_line_len :: !Int,
1478         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1479         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1480         context    :: [LayoutContext],
1481         lex_state  :: [Int]
1482      }
1483         -- last_loc and last_len are used when generating error messages,
1484         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1485         -- current token to happyError, we could at least get rid of last_len.
1486         -- Getting rid of last_loc would require finding another way to
1487         -- implement pushCurrentContext (which is only called from one place).
1488
1489 newtype P a = P { unP :: PState -> ParseResult a }
1490
1491 instance Monad P where
1492   return = returnP
1493   (>>=) = thenP
1494   fail = failP
1495
1496 returnP :: a -> P a
1497 returnP a = a `seq` (P $ \s -> POk s a)
1498
1499 thenP :: P a -> (a -> P b) -> P b
1500 (P m) `thenP` k = P $ \ s ->
1501         case m s of
1502                 POk s1 a         -> (unP (k a)) s1
1503                 PFailed span err -> PFailed span err
1504
1505 failP :: String -> P a
1506 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1507
1508 failMsgP :: String -> P a
1509 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1510
1511 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1512 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (mkSrcSpan loc1 loc2) (text str)
1513
1514 failSpanMsgP :: SrcSpan -> SDoc -> P a
1515 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1516
1517 getPState :: P PState
1518 getPState = P $ \s -> POk s s
1519
1520 extension :: (Int -> Bool) -> P Bool
1521 extension p = P $ \s -> POk s (p $! extsBitmap s)
1522
1523 getExts :: P Int
1524 getExts = P $ \s -> POk s (extsBitmap s)
1525
1526 setExts :: (Int -> Int) -> P ()
1527 setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
1528
1529 setSrcLoc :: SrcLoc -> P ()
1530 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1531
1532 getSrcLoc :: P SrcLoc
1533 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1534
1535 setLastToken :: SrcSpan -> Int -> Int -> P ()
1536 setLastToken loc len line_len = P $ \s -> POk s {
1537   last_loc=loc,
1538   last_len=len,
1539   last_line_len=line_len
1540 } ()
1541
1542 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1543
1544 alexInputPrevChar :: AlexInput -> Char
1545 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1546
1547 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1548 alexGetChar (AI loc ofs s)
1549   | atEnd s   = Nothing
1550   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1551                 --trace (show (ord c)) $
1552                 Just (adj_c, (AI loc' ofs' s'))
1553   where (c,s') = nextChar s
1554         loc'   = advanceSrcLoc loc c
1555         ofs'   = advanceOffs c ofs
1556
1557         non_graphic     = '\x0'
1558         upper           = '\x1'
1559         lower           = '\x2'
1560         digit           = '\x3'
1561         symbol          = '\x4'
1562         space           = '\x5'
1563         other_graphic   = '\x6'
1564
1565         adj_c
1566           | c <= '\x06' = non_graphic
1567           | c <= '\x7f' = c
1568           -- Alex doesn't handle Unicode, so when Unicode
1569           -- character is encountered we output these values
1570           -- with the actual character value hidden in the state.
1571           | otherwise =
1572                 case generalCategory c of
1573                   UppercaseLetter       -> upper
1574                   LowercaseLetter       -> lower
1575                   TitlecaseLetter       -> upper
1576                   ModifierLetter        -> other_graphic
1577                   OtherLetter           -> lower -- see #1103
1578                   NonSpacingMark        -> other_graphic
1579                   SpacingCombiningMark  -> other_graphic
1580                   EnclosingMark         -> other_graphic
1581                   DecimalNumber         -> digit
1582                   LetterNumber          -> other_graphic
1583                   OtherNumber           -> other_graphic
1584                   ConnectorPunctuation  -> symbol
1585                   DashPunctuation       -> symbol
1586                   OpenPunctuation       -> other_graphic
1587                   ClosePunctuation      -> other_graphic
1588                   InitialQuote          -> other_graphic
1589                   FinalQuote            -> other_graphic
1590                   OtherPunctuation      -> symbol
1591                   MathSymbol            -> symbol
1592                   CurrencySymbol        -> symbol
1593                   ModifierSymbol        -> symbol
1594                   OtherSymbol           -> symbol
1595                   Space                 -> space
1596                   _other                -> non_graphic
1597
1598 -- This version does not squash unicode characters, it is used when
1599 -- lexing strings.
1600 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1601 alexGetChar' (AI loc ofs s)
1602   | atEnd s   = Nothing
1603   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1604                 --trace (show (ord c)) $
1605                 Just (c, (AI loc' ofs' s'))
1606   where (c,s') = nextChar s
1607         loc'   = advanceSrcLoc loc c
1608         ofs'   = advanceOffs c ofs
1609
1610 advanceOffs :: Char -> Int -> Int
1611 advanceOffs '\n' _    = 0
1612 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1613 advanceOffs _    offs = offs + 1
1614
1615 getInput :: P AlexInput
1616 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1617
1618 setInput :: AlexInput -> P ()
1619 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1620
1621 pushLexState :: Int -> P ()
1622 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1623
1624 popLexState :: P Int
1625 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1626
1627 getLexState :: P Int
1628 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1629
1630 -- for reasons of efficiency, flags indicating language extensions (eg,
1631 -- -fglasgow-exts or -XParr) are represented by a bitmap stored in an unboxed
1632 -- integer
1633
1634 genericsBit :: Int
1635 genericsBit = 0 -- {| and |}
1636 ffiBit :: Int
1637 ffiBit     = 1
1638 parrBit :: Int
1639 parrBit    = 2
1640 arrowsBit :: Int
1641 arrowsBit  = 4
1642 thBit :: Int
1643 thBit      = 5
1644 ipBit :: Int
1645 ipBit      = 6
1646 explicitForallBit :: Int
1647 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1648 bangPatBit :: Int
1649 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1650                 -- (doesn't affect the lexer)
1651 tyFamBit :: Int
1652 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1653 haddockBit :: Int
1654 haddockBit = 10 -- Lex and parse Haddock comments
1655 magicHashBit :: Int
1656 magicHashBit = 11 -- "#" in both functions and operators
1657 kindSigsBit :: Int
1658 kindSigsBit = 12 -- Kind signatures on type variables
1659 recursiveDoBit :: Int
1660 recursiveDoBit = 13 -- mdo
1661 unicodeSyntaxBit :: Int
1662 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1663 unboxedTuplesBit :: Int
1664 unboxedTuplesBit = 15 -- (# and #)
1665 standaloneDerivingBit :: Int
1666 standaloneDerivingBit = 16 -- standalone instance deriving declarations
1667 transformComprehensionsBit :: Int
1668 transformComprehensionsBit = 17
1669 qqBit :: Int
1670 qqBit      = 18 -- enable quasiquoting
1671 inRulePragBit :: Int
1672 inRulePragBit = 19
1673 rawTokenStreamBit :: Int
1674 rawTokenStreamBit = 20 -- producing a token stream with all comments included
1675 newQualOpsBit :: Int
1676 newQualOpsBit = 21 -- Haskell' qualified operator syntax, e.g. Prelude.(+)
1677
1678 always :: Int -> Bool
1679 always           _     = True
1680 genericsEnabled :: Int -> Bool
1681 genericsEnabled  flags = testBit flags genericsBit
1682 parrEnabled :: Int -> Bool
1683 parrEnabled      flags = testBit flags parrBit
1684 arrowsEnabled :: Int -> Bool
1685 arrowsEnabled    flags = testBit flags arrowsBit
1686 thEnabled :: Int -> Bool
1687 thEnabled        flags = testBit flags thBit
1688 ipEnabled :: Int -> Bool
1689 ipEnabled        flags = testBit flags ipBit
1690 explicitForallEnabled :: Int -> Bool
1691 explicitForallEnabled flags = testBit flags explicitForallBit
1692 bangPatEnabled :: Int -> Bool
1693 bangPatEnabled   flags = testBit flags bangPatBit
1694 -- tyFamEnabled :: Int -> Bool
1695 -- tyFamEnabled     flags = testBit flags tyFamBit
1696 haddockEnabled :: Int -> Bool
1697 haddockEnabled   flags = testBit flags haddockBit
1698 magicHashEnabled :: Int -> Bool
1699 magicHashEnabled flags = testBit flags magicHashBit
1700 -- kindSigsEnabled :: Int -> Bool
1701 -- kindSigsEnabled  flags = testBit flags kindSigsBit
1702 unicodeSyntaxEnabled :: Int -> Bool
1703 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1704 unboxedTuplesEnabled :: Int -> Bool
1705 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1706 standaloneDerivingEnabled :: Int -> Bool
1707 standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
1708 qqEnabled :: Int -> Bool
1709 qqEnabled        flags = testBit flags qqBit
1710 -- inRulePrag :: Int -> Bool
1711 -- inRulePrag       flags = testBit flags inRulePragBit
1712 rawTokenStreamEnabled :: Int -> Bool
1713 rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
1714 newQualOps :: Int -> Bool
1715 newQualOps       flags = testBit flags newQualOpsBit
1716 oldQualOps :: Int -> Bool
1717 oldQualOps flags = not (newQualOps flags)
1718
1719 -- PState for parsing options pragmas
1720 --
1721 pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1722 pragState dynflags buf loc =
1723   PState {
1724       buffer        = buf,
1725       messages      = emptyMessages,
1726       dflags        = dynflags,
1727       last_loc      = mkSrcSpan loc loc,
1728       last_offs     = 0,
1729       last_len      = 0,
1730       last_line_len = 0,
1731       loc           = loc,
1732       extsBitmap    = 0,
1733       context       = [],
1734       lex_state     = [bol, option_prags, 0]
1735     }
1736
1737
1738 -- create a parse state
1739 --
1740 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1741 mkPState buf loc flags  =
1742   PState {
1743       buffer          = buf,
1744       dflags        = flags,
1745       messages      = emptyMessages,
1746       last_loc      = mkSrcSpan loc loc,
1747       last_offs     = 0,
1748       last_len      = 0,
1749       last_line_len = 0,
1750       loc           = loc,
1751       extsBitmap    = fromIntegral bitmap,
1752       context       = [],
1753       lex_state     = [bol, 0]
1754         -- we begin in the layout state if toplev_layout is set
1755     }
1756     where
1757       bitmap = genericsBit `setBitIf` dopt Opt_Generics flags
1758                .|. ffiBit       `setBitIf` dopt Opt_ForeignFunctionInterface flags
1759                .|. parrBit      `setBitIf` dopt Opt_PArr         flags
1760                .|. arrowsBit    `setBitIf` dopt Opt_Arrows       flags
1761                .|. thBit        `setBitIf` dopt Opt_TemplateHaskell flags
1762                .|. qqBit        `setBitIf` dopt Opt_QuasiQuotes flags
1763                .|. ipBit        `setBitIf` dopt Opt_ImplicitParams flags
1764                .|. explicitForallBit `setBitIf` dopt Opt_ScopedTypeVariables flags
1765                .|. explicitForallBit `setBitIf` dopt Opt_LiberalTypeSynonyms flags
1766                .|. explicitForallBit `setBitIf` dopt Opt_PolymorphicComponents flags
1767                .|. explicitForallBit `setBitIf` dopt Opt_ExistentialQuantification flags
1768                .|. explicitForallBit `setBitIf` dopt Opt_Rank2Types flags
1769                .|. explicitForallBit `setBitIf` dopt Opt_RankNTypes flags
1770                .|. bangPatBit   `setBitIf` dopt Opt_BangPatterns flags
1771                .|. tyFamBit     `setBitIf` dopt Opt_TypeFamilies flags
1772                .|. haddockBit   `setBitIf` dopt Opt_Haddock      flags
1773                .|. magicHashBit `setBitIf` dopt Opt_MagicHash    flags
1774                .|. kindSigsBit  `setBitIf` dopt Opt_KindSignatures flags
1775                .|. recursiveDoBit `setBitIf` dopt Opt_RecursiveDo flags
1776                .|. unicodeSyntaxBit `setBitIf` dopt Opt_UnicodeSyntax flags
1777                .|. unboxedTuplesBit `setBitIf` dopt Opt_UnboxedTuples flags
1778                .|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags
1779                .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags
1780                .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
1781                .|. newQualOpsBit `setBitIf` dopt Opt_NewQualifiedOperators flags
1782       --
1783       setBitIf :: Int -> Bool -> Int
1784       b `setBitIf` cond | cond      = bit b
1785                         | otherwise = 0
1786
1787 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1788 addWarning option srcspan warning
1789  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1790        let warning' = mkWarnMsg srcspan alwaysQualify warning
1791            ws' = if dopt option d then ws `snocBag` warning' else ws
1792        in POk s{messages=(ws', es)} ()
1793
1794 getMessages :: PState -> Messages
1795 getMessages PState{messages=ms} = ms
1796
1797 getContext :: P [LayoutContext]
1798 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1799
1800 setContext :: [LayoutContext] -> P ()
1801 setContext ctx = P $ \s -> POk s{context=ctx} ()
1802
1803 popContext :: P ()
1804 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1805                               last_len = len, last_loc = last_loc }) ->
1806   case ctx of
1807         (_:tl) -> POk s{ context = tl } ()
1808         []     -> PFailed last_loc (srcParseErr buf len)
1809
1810 -- Push a new layout context at the indentation of the last token read.
1811 -- This is only used at the outer level of a module when the 'module'
1812 -- keyword is missing.
1813 pushCurrentContext :: P ()
1814 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1815     POk s{context = Layout (offs-len) : ctx} ()
1816 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1817
1818 getOffside :: P Ordering
1819 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1820                 let ord = case stk of
1821                         (Layout n:_) -> compare offs n
1822                         _            -> GT
1823                 in POk s ord
1824
1825 -- ---------------------------------------------------------------------------
1826 -- Construct a parse error
1827
1828 srcParseErr
1829   :: StringBuffer       -- current buffer (placed just after the last token)
1830   -> Int                -- length of the previous token
1831   -> Message
1832 srcParseErr buf len
1833   = hcat [ if null token
1834              then ptext (sLit "parse error (possibly incorrect indentation)")
1835              else hcat [ptext (sLit "parse error on input "),
1836                         char '`', text token, char '\'']
1837     ]
1838   where token = lexemeToString (offsetBytes (-len) buf) len
1839
1840 -- Report a parse failure, giving the span of the previous token as
1841 -- the location of the error.  This is the entry point for errors
1842 -- detected during parsing.
1843 srcParseFail :: P a
1844 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1845                             last_loc = last_loc } ->
1846     PFailed last_loc (srcParseErr buf len)
1847
1848 -- A lexical error is reported at a particular position in the source file,
1849 -- not over a token range.
1850 lexError :: String -> P a
1851 lexError str = do
1852   loc <- getSrcLoc
1853   (AI end _ buf) <- getInput
1854   reportLexError loc end buf str
1855
1856 -- -----------------------------------------------------------------------------
1857 -- This is the top-level function: called from the parser each time a
1858 -- new token is to be read from the input.
1859
1860 lexer :: (Located Token -> P a) -> P a
1861 lexer cont = do
1862   tok@(L _span _tok__) <- lexToken
1863 --  trace ("token: " ++ show tok__) $ do
1864   cont tok
1865
1866 lexToken :: P (Located Token)
1867 lexToken = do
1868   inp@(AI loc1 _ buf) <- getInput
1869   sc <- getLexState
1870   exts <- getExts
1871   case alexScanUser exts inp sc of
1872     AlexEOF -> do
1873         let span = mkSrcSpan loc1 loc1
1874         setLastToken span 0 0
1875         return (L span ITeof)
1876     AlexError (AI loc2 _ buf) ->
1877         reportLexError loc1 loc2 buf "lexical error"
1878     AlexSkip inp2 _ -> do
1879         setInput inp2
1880         lexToken
1881     AlexToken inp2@(AI end _ buf2) _ t -> do
1882         setInput inp2
1883         let span = mkSrcSpan loc1 end
1884         let bytes = byteDiff buf buf2
1885         span `seq` setLastToken span bytes bytes
1886         t span buf bytes
1887
1888 reportLexError :: SrcLoc -> SrcLoc -> StringBuffer -> [Char] -> P a
1889 reportLexError loc1 loc2 buf str
1890   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1891   | otherwise =
1892   let
1893         c = fst (nextChar buf)
1894   in
1895   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1896     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1897     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1898
1899 lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token]
1900 lexTokenStream buf loc dflags = unP go initState
1901     where initState = mkPState buf loc (dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream)
1902           go = do
1903             ltok <- lexer return
1904             case ltok of
1905               L _ ITeof -> return []
1906               _ -> liftM (ltok:) go
1907
1908 linePrags = Map.singleton "line" (begin line_prag2)
1909
1910 fileHeaderPrags = Map.fromList([("options", lex_string_prag IToptions_prag),
1911                                  ("options_ghc", lex_string_prag IToptions_prag),
1912                                  ("options_haddock", lex_string_prag ITdocOptions),
1913                                  ("language", token ITlanguage_prag),
1914                                  ("include", lex_string_prag ITinclude_prag)])
1915
1916 ignoredPrags = Map.fromList (map ignored pragmas)
1917                where ignored opt = (opt, nested_comment lexToken)
1918                      impls = ["hugs", "nhc98", "jhc", "yhc", "catch", "derive"]
1919                      options_pragmas = map ("options_" ++) impls
1920                      -- CFILES is a hugs-only thing.
1921                      pragmas = options_pragmas ++ ["cfiles", "contract"]
1922
1923 oneWordPrags = Map.fromList([("rules", rulePrag),
1924                            ("inline", token (ITinline_prag True)),
1925                            ("notinline", token (ITinline_prag False)),
1926                            ("specialize", token ITspec_prag),
1927                            ("source", token ITsource_prag),
1928                            ("warning", token ITwarning_prag),
1929                            ("deprecated", token ITdeprecated_prag),
1930                            ("scc", token ITscc_prag),
1931                            ("generated", token ITgenerated_prag),
1932                            ("core", token ITcore_prag),
1933                            ("unpack", token ITunpack_prag),
1934                            ("ann", token ITann_prag)])
1935
1936 twoWordPrags = Map.fromList([("inline conlike", token (ITinline_conlike_prag True)),
1937                              ("notinline conlike", token (ITinline_conlike_prag False)),
1938                              ("specialize inline", token (ITspec_inline_prag True)),
1939                              ("specialize notinline", token (ITspec_inline_prag False))])
1940
1941
1942 dispatch_pragmas :: Map String Action -> Action
1943 dispatch_pragmas prags span buf len = case Map.lookup (clean_pragma (lexemeToString buf len)) prags of
1944                                        Just found -> found span buf len
1945                                        Nothing -> lexError "unknown pragma"
1946
1947 known_pragma :: Map String Action -> AlexAccPred Int
1948 known_pragma prags _ _ len (AI _ _ buf) = (isJust $ Map.lookup (clean_pragma (lexemeToString (offsetBytes (- len) buf) len)) prags)
1949                                           && (nextCharIs buf (\c -> not (isAlphaNum c || c == '_')))
1950
1951 clean_pragma :: String -> String
1952 clean_pragma prag = canon_ws (map toLower (unprefix prag))
1953                     where unprefix prag' = case stripPrefix "{-#" prag' of
1954                                              Just rest -> rest
1955                                              Nothing -> prag'
1956                           canonical prag' = case prag' of
1957                                               "noinline" -> "notinline"
1958                                               "specialise" -> "specialize"
1959                                               "constructorlike" -> "conlike"
1960                                               otherwise -> prag'
1961                           canon_ws s = unwords (map canonical (words s))
1962 }