compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - parsing integers is a bit slow
  16 --    - readRational is a bit slow
  17 --
  18 --   Known bugs, that were also in the previous version:
  19 --    - M... should be 3 tokens, not 1.
  20 --    - pragma-end should be only valid in a pragma
  21
  22 --   qualified operator NOTES.
  23 --
  24 --   - If M.(+) is a single lexeme, then..
  25 --     - Probably (+) should be a single lexeme too, for consistency.
  26 --       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
  27 --     - But we have to rule out reserved operators, otherwise (..) becomes
  28 --       a different lexeme.
  29 --     - Should we therefore also rule out reserved operators in the qualified
  30 --       form?  This is quite difficult to achieve.  We don't do it for
  31 --       qualified varids.
  32
  33 {
  34 {-# OPTIONS -Wwarn -w #-}
  35 -- The above -Wwarn supression flag is a temporary kludge.
  36 -- While working on this module you are encouraged to remove it and fix
  37 -- any warnings in the module. See
  38 --     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
  39 -- for details
  40 --
  41 -- Note that Alex itself generates code with with some unused bindings and
  42 -- without type signatures, so removing the flag might not be possible.
  43
  44 {-# OPTIONS_GHC -funbox-strict-fields #-}
  45
  46 module Lexer (
  47    Token(..), lexer, pragState, mkPState, PState(..),
  48    P(..), ParseResult(..), getSrcLoc,
  49    getPState, getDynFlags, withThisPackage,
  50    failLocMsgP, failSpanMsgP, srcParseFail,
  51    getMessages,
  52    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  53    getLexState, popLexState, pushLexState,
  54    extension, standaloneDerivingEnabled, bangPatEnabled,
  55    addWarning,
  56    lexTokenStream
  57   ) where
  58
  59 import Bag
  60 import ErrUtils
  61 import Outputable
  62 import StringBuffer
  63 import FastString
  64 import SrcLoc
  65 import UniqFM
  66 import DynFlags
  67 import Module
  68 import Ctype
  69 import Util             ( readRational )
  70
  71 import Control.Monad
  72 import Data.Bits
  73 import Data.Char
  74 import Data.List
  75 import Data.Maybe
  76 import Data.Map (Map)
  77 import qualified Data.Map as Map
  78 import Data.Ratio
  79 }
  80
  81 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  82 $whitechar   = [\ \n\r\f\v $unispace]
  83 $white_no_nl = $whitechar # \n
  84 $tab         = \t
  85
  86 $ascdigit  = 0-9
  87 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  88 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  89 $digit     = [$ascdigit $unidigit]
  90
  91 $special   = [\(\)\,\;\[\]\`\{\}]
  92 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  93 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  94 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  95
  96 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
  97 $asclarge  = [A-Z]
  98 $large     = [$asclarge $unilarge]
  99
 100 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
 101 $ascsmall  = [a-z]
 102 $small     = [$ascsmall $unismall \_]
 103
 104 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
 105 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
 106
 107 $octit     = 0-7
 108 $hexit     = [$decdigit A-F a-f]
 109 $symchar   = [$symbol \:]
 110 $nl        = [\n\r]
 111 $idchar    = [$small $large $digit \']
 112
 113 $pragmachar = [$small $large $digit]
 114
 115 $docsym    = [\| \^ \* \$]
 116
 117 @varid     = $small $idchar*
 118 @conid     = $large $idchar*
 119
 120 @varsym    = $symbol $symchar*
 121 @consym    = \: $symchar*
 122
 123 @decimal     = $decdigit+
 124 @octal       = $octit+
 125 @hexadecimal = $hexit+
 126 @exponent    = [eE] [\-\+]? @decimal
 127
 128 -- we support the hierarchical module name extension:
 129 @qual = (@conid \.)+
 130
 131 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 132
 133 -- normal signed numerical literals can only be explicitly negative,
 134 -- not explicitly positive (contrast @exponent)
 135 @negative = \-
 136 @signed = @negative ?
 137
 138 haskell :-
 139
 140 -- everywhere: skip whitespace and comments
 141 $white_no_nl+                           ;
 142 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 143
 144 -- Everywhere: deal with nested comments.  We explicitly rule out
 145 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 146 -- (this can happen even though pragmas will normally take precedence due to
 147 -- longest-match, because pragmas aren't valid in every state, but comments
 148 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 149 -- set.
 150
 151 "{-" / { isNormalComment } { nested_comment lexToken }
 152
 153 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 154 -- more dashes followed by a symbol should be parsed as a varsym, so we
 155 -- have to exclude those.
 156
 157 -- Since Haddock comments aren't valid in every state, we need to rule them
 158 -- out here.
 159
 160 -- The following two rules match comments that begin with two dashes, but
 161 -- continue with a different character. The rules test that this character
 162 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 163 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 164 -- have a Haddock comment). The rules then munch the rest of the line.
 165
 166 "-- " ~[$docsym \#] .* { lineCommentToken }
 167 "--" [^$symbol : \ ] .* { lineCommentToken }
 168
 169 -- Next, match Haddock comments if no -haddock flag
 170
 171 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
 172
 173 -- Now, when we've matched comments that begin with 2 dashes and continue
 174 -- with a different character, we need to match comments that begin with three
 175 -- or more dashes (which clearly can't be Haddock comments). We only need to
 176 -- make sure that the first non-dash character isn't a symbol, and munch the
 177 -- rest of the line.
 178
 179 "---"\-* [^$symbol :] .* { lineCommentToken }
 180
 181 -- Since the previous rules all match dashes followed by at least one
 182 -- character, we also need to match a whole line filled with just dashes.
 183
 184 "--"\-* / { atEOL } { lineCommentToken }
 185
 186 -- We need this rule since none of the other single line comment rules
 187 -- actually match this case.
 188
 189 "-- " / { atEOL } { lineCommentToken }
 190
 191 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 192 -- blank lines) until we find a non-whitespace character, then do layout
 193 -- processing.
 194 --
 195 -- One slight wibble here: what if the line begins with {-#? In
 196 -- theory, we have to lex the pragma to see if it's one we recognise,
 197 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 198 -- as a nested comment.  We don't bother with this: if the line begins
 199 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 200 <bol> {
 201   \n                                    ;
 202   ^\# (line)?                           { begin line_prag1 }
 203   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 204   ^\# \! .* \n                          ; -- #!, for scripts
 205   ()                                    { do_bol }
 206 }
 207
 208 -- after a layout keyword (let, where, do, of), we begin a new layout
 209 -- context if the curly brace is missing.
 210 -- Careful! This stuff is quite delicate.
 211 <layout, layout_do> {
 212   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 213         -- we might encounter {-# here, but {- has been handled already
 214   \n                                    ;
 215   ^\# (line)?                           { begin line_prag1 }
 216 }
 217
 218 -- do is treated in a subtly different way, see new_layout_context
 219 <layout>    ()                          { new_layout_context True }
 220 <layout_do> ()                          { new_layout_context False }
 221
 222 -- after a new layout context which was found to be to the left of the
 223 -- previous context, we have generated a '{' token, and we now need to
 224 -- generate a matching '}' token.
 225 <layout_left>  ()                       { do_layout_left }
 226
 227 <0,option_prags> \n                             { begin bol }
 228
 229 "{-#" $whitechar* $pragmachar+ / { known_pragma linePrags }
 230                                 { dispatch_pragmas linePrags }
 231
 232 -- single-line line pragmas, of the form
 233 --    # <line> "<file>" <extra-stuff> \n
 234 <line_prag1> $decdigit+                 { setLine line_prag1a }
 235 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 236 <line_prag1b> .*                        { pop }
 237
 238 -- Haskell-style line pragmas, of the form
 239 --    {-# LINE <line> "<file>" #-}
 240 <line_prag2> $decdigit+                 { setLine line_prag2a }
 241 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 242 <line_prag2b> "#-}"|"-}"                { pop }
 243    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 244    -- with older versions of GHC which generated these.
 245
 246 <0,option_prags> {
 247   "{-#" $whitechar* $pragmachar+
 248         $whitechar+ $pragmachar+ / { known_pragma twoWordPrags }
 249                                  { dispatch_pragmas twoWordPrags }
 250
 251   "{-#" $whitechar* $pragmachar+ / { known_pragma oneWordPrags }
 252                                  { dispatch_pragmas oneWordPrags }
 253
 254   -- We ignore all these pragmas, but don't generate a warning for them
 255   "{-#" $whitechar* $pragmachar+ / { known_pragma ignoredPrags }
 256                                  { dispatch_pragmas ignoredPrags }
 257
 258   -- ToDo: should only be valid inside a pragma:
 259   "#-}"                                 { endPrag }
 260 }
 261
 262 <option_prags> {
 263   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 264                                    { dispatch_pragmas fileHeaderPrags }
 265
 266   "-- #"                                 { multiline_doc_comment }
 267 }
 268
 269 <0> {
 270   -- In the "0" mode we ignore these pragmas
 271   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 272                      { nested_comment lexToken }
 273 }
 274
 275 <0> {
 276   "-- #" .* { lineCommentToken }
 277 }
 278
 279 <0,option_prags> {
 280   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 281                     (nested_comment lexToken) }
 282 }
 283
 284 -- '0' state: ordinary lexemes
 285
 286 -- Haddock comments
 287
 288 <0> {
 289   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 290   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 291 }
 292
 293 -- "special" symbols
 294
 295 <0> {
 296   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 297   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 298 }
 299
 300 <0> {
 301   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 302   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 303   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 304   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 305   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 306   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 307   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 308   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 309
 310   "[$" @varid "|"  / { ifExtension qqEnabled }
 311                      { lex_quasiquote_tok }
 312 }
 313
 314 <0> {
 315   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 316                                         { special IToparenbar }
 317   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 318 }
 319
 320 <0> {
 321   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 322 }
 323
 324 <0> {
 325   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 326          { token IToubxparen }
 327   "#)" / { ifExtension unboxedTuplesEnabled }
 328          { token ITcubxparen }
 329 }
 330
 331 <0> {
 332   "{|" / { ifExtension genericsEnabled } { token ITocurlybar }
 333   "|}" / { ifExtension genericsEnabled } { token ITccurlybar }
 334 }
 335
 336 <0,option_prags> {
 337   \(                                    { special IToparen }
 338   \)                                    { special ITcparen }
 339   \[                                    { special ITobrack }
 340   \]                                    { special ITcbrack }
 341   \,                                    { special ITcomma }
 342   \;                                    { special ITsemi }
 343   \`                                    { special ITbackquote }
 344
 345   \{                                    { open_brace }
 346   \}                                    { close_brace }
 347 }
 348
 349 <0,option_prags> {
 350   @qual @varid                  { idtoken qvarid }
 351   @qual @conid                  { idtoken qconid }
 352   @varid                        { varid }
 353   @conid                        { idtoken conid }
 354 }
 355
 356 <0> {
 357   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 358   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 359   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 360   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 361 }
 362
 363 -- ToDo: - move `var` and (sym) into lexical syntax?
 364 --       - remove backquote from $special?
 365 <0> {
 366   @qual @varsym       / { ifExtension oldQualOps } { idtoken qvarsym }
 367   @qual @consym       / { ifExtension oldQualOps } { idtoken qconsym }
 368   @qual \( @varsym \) / { ifExtension newQualOps } { idtoken prefixqvarsym }
 369   @qual \( @consym \) / { ifExtension newQualOps } { idtoken prefixqconsym }
 370   @varsym                                          { varsym }
 371   @consym                                          { consym }
 372 }
 373
 374 -- For the normal boxed literals we need to be careful
 375 -- when trying to be close to Haskell98
 376 <0> {
 377   -- Normal integral literals (:: Num a => a, from Integer)
 378   @decimal           { tok_num positive 0 0 decimal }
 379   0[oO] @octal       { tok_num positive 2 2 octal }
 380   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 381
 382   -- Normal rational literals (:: Fractional a => a, from Rational)
 383   @floating_point    { strtoken tok_float }
 384 }
 385
 386 <0> {
 387   -- Unboxed ints (:: Int#) and words (:: Word#)
 388   -- It's simpler (and faster?) to give separate cases to the negatives,
 389   -- especially considering octal/hexadecimal prefixes.
 390   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 391   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 392   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 393   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 394   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 395   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 396
 397   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 398   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 399   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 400
 401   -- Unboxed floats and doubles (:: Float#, :: Double#)
 402   -- prim_{float,double} work with signed literals
 403   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 404   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 405 }
 406
 407 -- Strings and chars are lexed by hand-written code.  The reason is
 408 -- that even if we recognise the string or char here in the regex
 409 -- lexer, we would still have to parse the string afterward in order
 410 -- to convert it to a String.
 411 <0> {
 412   \'                            { lex_char_tok }
 413   \"                            { lex_string_tok }
 414 }
 415
 416 {
 417 -- -----------------------------------------------------------------------------
 418 -- The token type
 419
 420 data Token
 421   = ITas                        -- Haskell keywords
 422   | ITcase
 423   | ITclass
 424   | ITdata
 425   | ITdefault
 426   | ITderiving
 427   | ITdo
 428   | ITelse
 429   | IThiding
 430   | ITif
 431   | ITimport
 432   | ITin
 433   | ITinfix
 434   | ITinfixl
 435   | ITinfixr
 436   | ITinstance
 437   | ITlet
 438   | ITmodule
 439   | ITnewtype
 440   | ITof
 441   | ITqualified
 442   | ITthen
 443   | ITtype
 444   | ITwhere
 445   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 446
 447   | ITforall                    -- GHC extension keywords
 448   | ITforeign
 449   | ITexport
 450   | ITlabel
 451   | ITdynamic
 452   | ITsafe
 453   | ITthreadsafe
 454   | ITunsafe
 455   | ITstdcallconv
 456   | ITccallconv
 457   | ITprimcallconv
 458   | ITmdo
 459   | ITfamily
 460   | ITgroup
 461   | ITby
 462   | ITusing
 463
 464         -- Pragmas
 465   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 466   | ITinline_conlike_prag Bool  -- same
 467   | ITspec_prag                 -- SPECIALISE
 468   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 469   | ITsource_prag
 470   | ITrules_prag
 471   | ITwarning_prag
 472   | ITdeprecated_prag
 473   | ITline_prag
 474   | ITscc_prag
 475   | ITgenerated_prag
 476   | ITcore_prag                 -- hdaume: core annotations
 477   | ITunpack_prag
 478   | ITann_prag
 479   | ITclose_prag
 480   | IToptions_prag String
 481   | ITinclude_prag String
 482   | ITlanguage_prag
 483
 484   | ITdotdot                    -- reserved symbols
 485   | ITcolon
 486   | ITdcolon
 487   | ITequal
 488   | ITlam
 489   | ITvbar
 490   | ITlarrow
 491   | ITrarrow
 492   | ITat
 493   | ITtilde
 494   | ITdarrow
 495   | ITminus
 496   | ITbang
 497   | ITstar
 498   | ITdot
 499
 500   | ITbiglam                    -- GHC-extension symbols
 501
 502   | ITocurly                    -- special symbols
 503   | ITccurly
 504   | ITocurlybar                 -- {|, for type applications
 505   | ITccurlybar                 -- |}, for type applications
 506   | ITvocurly
 507   | ITvccurly
 508   | ITobrack
 509   | ITopabrack                  -- [:, for parallel arrays with -XParr
 510   | ITcpabrack                  -- :], for parallel arrays with -XParr
 511   | ITcbrack
 512   | IToparen
 513   | ITcparen
 514   | IToubxparen
 515   | ITcubxparen
 516   | ITsemi
 517   | ITcomma
 518   | ITunderscore
 519   | ITbackquote
 520
 521   | ITvarid   FastString        -- identifiers
 522   | ITconid   FastString
 523   | ITvarsym  FastString
 524   | ITconsym  FastString
 525   | ITqvarid  (FastString,FastString)
 526   | ITqconid  (FastString,FastString)
 527   | ITqvarsym (FastString,FastString)
 528   | ITqconsym (FastString,FastString)
 529   | ITprefixqvarsym (FastString,FastString)
 530   | ITprefixqconsym (FastString,FastString)
 531
 532   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 533
 534   | ITchar       Char
 535   | ITstring     FastString
 536   | ITinteger    Integer
 537   | ITrational   Rational
 538
 539   | ITprimchar   Char
 540   | ITprimstring FastString
 541   | ITprimint    Integer
 542   | ITprimword   Integer
 543   | ITprimfloat  Rational
 544   | ITprimdouble Rational
 545
 546   -- Template Haskell extension tokens
 547   | ITopenExpQuote              --  [| or [e|
 548   | ITopenPatQuote              --  [p|
 549   | ITopenDecQuote              --  [d|
 550   | ITopenTypQuote              --  [t|
 551   | ITcloseQuote                --  |]
 552   | ITidEscape   FastString     --  $x
 553   | ITparenEscape               --  $(
 554   | ITvarQuote                  --  '
 555   | ITtyQuote                   --  ''
 556   | ITquasiQuote (FastString,FastString,SrcSpan) --  [:...|...|]
 557
 558   -- Arrow notation extension
 559   | ITproc
 560   | ITrec
 561   | IToparenbar                 --  (|
 562   | ITcparenbar                 --  |)
 563   | ITlarrowtail                --  -<
 564   | ITrarrowtail                --  >-
 565   | ITLarrowtail                --  -<<
 566   | ITRarrowtail                --  >>-
 567
 568   | ITunknown String            -- Used when the lexer can't make sense of it
 569   | ITeof                       -- end of file token
 570
 571   -- Documentation annotations
 572   | ITdocCommentNext  String     -- something beginning '-- |'
 573   | ITdocCommentPrev  String     -- something beginning '-- ^'
 574   | ITdocCommentNamed String     -- something beginning '-- $'
 575   | ITdocSection      Int String -- a section heading
 576   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 577   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 578   | ITlineComment     String     -- comment starting by "--"
 579   | ITblockComment    String     -- comment in {- -}
 580
 581 #ifdef DEBUG
 582   deriving Show -- debugging
 583 #endif
 584
 585 {-
 586 isSpecial :: Token -> Bool
 587 -- If we see M.x, where x is a keyword, but
 588 -- is special, we treat is as just plain M.x,
 589 -- not as a keyword.
 590 isSpecial ITas          = True
 591 isSpecial IThiding      = True
 592 isSpecial ITqualified   = True
 593 isSpecial ITforall      = True
 594 isSpecial ITexport      = True
 595 isSpecial ITlabel       = True
 596 isSpecial ITdynamic     = True
 597 isSpecial ITsafe        = True
 598 isSpecial ITthreadsafe  = True
 599 isSpecial ITunsafe      = True
 600 isSpecial ITccallconv   = True
 601 isSpecial ITstdcallconv = True
 602 isSpecial ITprimcallconv = True
 603 isSpecial ITmdo         = True
 604 isSpecial ITfamily      = True
 605 isSpecial ITgroup   = True
 606 isSpecial ITby      = True
 607 isSpecial ITusing   = True
 608 isSpecial _             = False
 609 -}
 610
 611 -- the bitmap provided as the third component indicates whether the
 612 -- corresponding extension keyword is valid under the extension options
 613 -- provided to the compiler; if the extension corresponding to *any* of the
 614 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 615 -- facilitates using a keyword in two different extensions that can be
 616 -- activated independently)
 617 --
 618 reservedWordsFM :: UniqFM (Token, Int)
 619 reservedWordsFM = listToUFM $
 620         map (\(x, y, z) -> (mkFastString x, (y, z)))
 621        [( "_",          ITunderscore,   0 ),
 622         ( "as",         ITas,           0 ),
 623         ( "case",       ITcase,         0 ),
 624         ( "class",      ITclass,        0 ),
 625         ( "data",       ITdata,         0 ),
 626         ( "default",    ITdefault,      0 ),
 627         ( "deriving",   ITderiving,     0 ),
 628         ( "do",         ITdo,           0 ),
 629         ( "else",       ITelse,         0 ),
 630         ( "hiding",     IThiding,       0 ),
 631         ( "if",         ITif,           0 ),
 632         ( "import",     ITimport,       0 ),
 633         ( "in",         ITin,           0 ),
 634         ( "infix",      ITinfix,        0 ),
 635         ( "infixl",     ITinfixl,       0 ),
 636         ( "infixr",     ITinfixr,       0 ),
 637         ( "instance",   ITinstance,     0 ),
 638         ( "let",        ITlet,          0 ),
 639         ( "module",     ITmodule,       0 ),
 640         ( "newtype",    ITnewtype,      0 ),
 641         ( "of",         ITof,           0 ),
 642         ( "qualified",  ITqualified,    0 ),
 643         ( "then",       ITthen,         0 ),
 644         ( "type",       ITtype,         0 ),
 645         ( "where",      ITwhere,        0 ),
 646         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 647
 648     ( "forall", ITforall,        bit explicitForallBit .|. bit inRulePragBit),
 649         ( "mdo",        ITmdo,           bit recursiveDoBit),
 650         ( "family",     ITfamily,        bit tyFamBit),
 651     ( "group",  ITgroup,     bit transformComprehensionsBit),
 652     ( "by",     ITby,        bit transformComprehensionsBit),
 653     ( "using",  ITusing,     bit transformComprehensionsBit),
 654
 655         ( "foreign",    ITforeign,       bit ffiBit),
 656         ( "export",     ITexport,        bit ffiBit),
 657         ( "label",      ITlabel,         bit ffiBit),
 658         ( "dynamic",    ITdynamic,       bit ffiBit),
 659         ( "safe",       ITsafe,          bit ffiBit),
 660         ( "threadsafe", ITthreadsafe,    bit ffiBit),  -- ToDo: remove
 661         ( "unsafe",     ITunsafe,        bit ffiBit),
 662         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 663         ( "ccall",      ITccallconv,     bit ffiBit),
 664         ( "prim",       ITprimcallconv,  bit ffiBit),
 665
 666         ( "rec",        ITrec,           bit recBit),
 667         ( "proc",       ITproc,          bit arrowsBit)
 668      ]
 669
 670 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 671 reservedSymsFM = listToUFM $
 672     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 673       [ ("..",  ITdotdot,   always)
 674         -- (:) is a reserved op, meaning only list cons
 675        ,(":",   ITcolon,    always)
 676        ,("::",  ITdcolon,   always)
 677        ,("=",   ITequal,    always)
 678        ,("\\",  ITlam,      always)
 679        ,("|",   ITvbar,     always)
 680        ,("<-",  ITlarrow,   always)
 681        ,("->",  ITrarrow,   always)
 682        ,("@",   ITat,       always)
 683        ,("~",   ITtilde,    always)
 684        ,("=>",  ITdarrow,   always)
 685        ,("-",   ITminus,    always)
 686        ,("!",   ITbang,     always)
 687
 688         -- For data T (a::*) = MkT
 689        ,("*", ITstar, always) -- \i -> kindSigsEnabled i || tyFamEnabled i)
 690         -- For 'forall a . t'
 691        ,(".", ITdot,  always) -- \i -> explicitForallEnabled i || inRulePrag i)
 692
 693        ,("-<",  ITlarrowtail, arrowsEnabled)
 694        ,(">-",  ITrarrowtail, arrowsEnabled)
 695        ,("-<<", ITLarrowtail, arrowsEnabled)
 696        ,(">>-", ITRarrowtail, arrowsEnabled)
 697
 698        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 699        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 700        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 701                                 explicitForallEnabled i)
 702        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 703        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 704        ,("⋯",   ITdotdot, unicodeSyntaxEnabled)
 705         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 706         -- form part of a large operator.  This would let us have a better
 707         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 708        ]
 709
 710 -- -----------------------------------------------------------------------------
 711 -- Lexer actions
 712
 713 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 714
 715 special :: Token -> Action
 716 special tok span _buf _len = return (L span tok)
 717
 718 token, layout_token :: Token -> Action
 719 token t span _buf _len = return (L span t)
 720 layout_token t span _buf _len = pushLexState layout >> return (L span t)
 721
 722 idtoken :: (StringBuffer -> Int -> Token) -> Action
 723 idtoken f span buf len = return (L span $! (f buf len))
 724
 725 skip_one_varid :: (FastString -> Token) -> Action
 726 skip_one_varid f span buf len
 727   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 728
 729 strtoken :: (String -> Token) -> Action
 730 strtoken f span buf len =
 731   return (L span $! (f $! lexemeToString buf len))
 732
 733 init_strtoken :: Int -> (String -> Token) -> Action
 734 -- like strtoken, but drops the last N character(s)
 735 init_strtoken drop f span buf len =
 736   return (L span $! (f $! lexemeToString buf (len-drop)))
 737
 738 begin :: Int -> Action
 739 begin code _span _str _len = do pushLexState code; lexToken
 740
 741 pop :: Action
 742 pop _span _buf _len = do popLexState; lexToken
 743
 744 pop_and :: Action -> Action
 745 pop_and act span buf len = do popLexState; act span buf len
 746
 747 {-# INLINE nextCharIs #-}
 748 nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
 749 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 750
 751 notFollowedBy :: Char -> AlexAccPred Int
 752 notFollowedBy char _ _ _ (AI _ _ buf)
 753   = nextCharIs buf (/=char)
 754
 755 notFollowedBySymbol :: AlexAccPred Int
 756 notFollowedBySymbol _ _ _ (AI _ _ buf)
 757   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 758
 759 -- We must reject doc comments as being ordinary comments everywhere.
 760 -- In some cases the doc comment will be selected as the lexeme due to
 761 -- maximal munch, but not always, because the nested comment rule is
 762 -- valid in all states, but the doc-comment rules are only valid in
 763 -- the non-layout states.
 764 isNormalComment :: AlexAccPred Int
 765 isNormalComment bits _ _ (AI _ _ buf)
 766   | haddockEnabled bits = notFollowedByDocOrPragma
 767   | otherwise           = nextCharIs buf (/='#')
 768   where
 769     notFollowedByDocOrPragma
 770        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 771
 772 spaceAndP :: StringBuffer -> (StringBuffer -> Bool) -> Bool
 773 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 774
 775 {-
 776 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 777   = if haddockEnabled bits then False else (p buf)
 778 -}
 779
 780 atEOL :: AlexAccPred Int
 781 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 782
 783 ifExtension :: (Int -> Bool) -> AlexAccPred Int
 784 ifExtension pred bits _ _ _ = pred bits
 785
 786 multiline_doc_comment :: Action
 787 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 788   where
 789     worker commentAcc input docType oneLine = case alexGetChar input of
 790       Just ('\n', input')
 791         | oneLine -> docCommentEnd input commentAcc docType buf span
 792         | otherwise -> case checkIfCommentLine input' of
 793           Just input -> worker ('\n':commentAcc) input docType False
 794           Nothing -> docCommentEnd input commentAcc docType buf span
 795       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 796       Nothing -> docCommentEnd input commentAcc docType buf span
 797
 798     checkIfCommentLine input = check (dropNonNewlineSpace input)
 799       where
 800         check input = case alexGetChar input of
 801           Just ('-', input) -> case alexGetChar input of
 802             Just ('-', input) -> case alexGetChar input of
 803               Just (c, _) | c /= '-' -> Just input
 804               _ -> Nothing
 805             _ -> Nothing
 806           _ -> Nothing
 807
 808         dropNonNewlineSpace input = case alexGetChar input of
 809           Just (c, input')
 810             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 811             | otherwise -> input
 812           Nothing -> input
 813
 814 lineCommentToken :: Action
 815 lineCommentToken span buf len = do
 816   b <- extension rawTokenStreamEnabled
 817   if b then strtoken ITlineComment span buf len else lexToken
 818
 819 {-
 820   nested comments require traversing by hand, they can't be parsed
 821   using regular expressions.
 822 -}
 823 nested_comment :: P (Located Token) -> Action
 824 nested_comment cont span _str _len = do
 825   input <- getInput
 826   go "" (1::Int) input
 827   where
 828     go commentAcc 0 input = do setInput input
 829                                b <- extension rawTokenStreamEnabled
 830                                if b
 831                                  then docCommentEnd input commentAcc ITblockComment _str span
 832                                  else cont
 833     go commentAcc n input = case alexGetChar input of
 834       Nothing -> errBrace input span
 835       Just ('-',input) -> case alexGetChar input of
 836         Nothing  -> errBrace input span
 837         Just ('\125',input) -> go commentAcc (n-1) input
 838         Just (_,_)          -> go ('-':commentAcc) n input
 839       Just ('\123',input) -> case alexGetChar input of
 840         Nothing  -> errBrace input span
 841         Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
 842         Just (_,_)       -> go ('\123':commentAcc) n input
 843       Just (c,input) -> go (c:commentAcc) n input
 844
 845 nested_doc_comment :: Action
 846 nested_doc_comment span buf _len = withLexedDocType (go "")
 847   where
 848     go commentAcc input docType _ = case alexGetChar input of
 849       Nothing -> errBrace input span
 850       Just ('-',input) -> case alexGetChar input of
 851         Nothing -> errBrace input span
 852         Just ('\125',input) ->
 853           docCommentEnd input commentAcc docType buf span
 854         Just (_,_) -> go ('-':commentAcc) input docType False
 855       Just ('\123', input) -> case alexGetChar input of
 856         Nothing  -> errBrace input span
 857         Just ('-',input) -> do
 858           setInput input
 859           let cont = do input <- getInput; go commentAcc input docType False
 860           nested_comment cont span buf _len
 861         Just (_,_) -> go ('\123':commentAcc) input docType False
 862       Just (c,input) -> go (c:commentAcc) input docType False
 863
 864 withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (Located Token))
 865                  -> P (Located Token)
 866 withLexedDocType lexDocComment = do
 867   input@(AI _ _ buf) <- getInput
 868   case prevChar buf ' ' of
 869     '|' -> lexDocComment input ITdocCommentNext False
 870     '^' -> lexDocComment input ITdocCommentPrev False
 871     '$' -> lexDocComment input ITdocCommentNamed False
 872     '*' -> lexDocSection 1 input
 873     '#' -> lexDocComment input ITdocOptionsOld False
 874     _ -> panic "withLexedDocType: Bad doc type"
 875  where
 876     lexDocSection n input = case alexGetChar input of
 877       Just ('*', input) -> lexDocSection (n+1) input
 878       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 879       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 880
 881 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 882 -- off again at the end of the pragma.
 883 rulePrag :: Action
 884 rulePrag span _buf _len = do
 885   setExts (.|. bit inRulePragBit)
 886   return (L span ITrules_prag)
 887
 888 endPrag :: Action
 889 endPrag span _buf _len = do
 890   setExts (.&. complement (bit inRulePragBit))
 891   return (L span ITclose_prag)
 892
 893 -- docCommentEnd
 894 -------------------------------------------------------------------------------
 895 -- This function is quite tricky. We can't just return a new token, we also
 896 -- need to update the state of the parser. Why? Because the token is longer
 897 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 898 -- it writes the wrong token length to the parser state. This function is
 899 -- called afterwards, so it can just update the state.
 900
 901 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 902 -- which is something that the original lexer didn't account for.
 903 -- I have added last_line_len in the parser state which represents the length
 904 -- of the part of the token that is on the last line. It is now used for layout
 905 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 906 -- was before, the full length of the token, and it is now only used for error
 907 -- messages. /Waern
 908
 909 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 910                  SrcSpan -> P (Located Token)
 911 docCommentEnd input commentAcc docType buf span = do
 912   setInput input
 913   let (AI loc last_offs nextBuf) = input
 914       comment = reverse commentAcc
 915       span' = mkSrcSpan (srcSpanStart span) loc
 916       last_len = byteDiff buf nextBuf
 917
 918       last_line_len = if (last_offs - last_len < 0)
 919         then last_offs
 920         else last_len
 921
 922   span `seq` setLastToken span' last_len last_line_len
 923   return (L span' (docType comment))
 924
 925 errBrace :: AlexInput -> SrcSpan -> P a
 926 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 927
 928 open_brace, close_brace :: Action
 929 open_brace span _str _len = do
 930   ctx <- getContext
 931   setContext (NoLayout:ctx)
 932   return (L span ITocurly)
 933 close_brace span _str _len = do
 934   popContext
 935   return (L span ITccurly)
 936
 937 qvarid, qconid :: StringBuffer -> Int -> Token
 938 qvarid buf len = ITqvarid $! splitQualName buf len False
 939 qconid buf len = ITqconid $! splitQualName buf len False
 940
 941 splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
 942 -- takes a StringBuffer and a length, and returns the module name
 943 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 944 -- because of hierarchical module names.
 945 splitQualName orig_buf len parens = split orig_buf orig_buf
 946   where
 947     split buf dot_buf
 948         | orig_buf `byteDiff` buf >= len  = done dot_buf
 949         | c == '.'                        = found_dot buf'
 950         | otherwise                       = split buf' dot_buf
 951       where
 952        (c,buf') = nextChar buf
 953
 954     -- careful, we might get names like M....
 955     -- so, if the character after the dot is not upper-case, this is
 956     -- the end of the qualifier part.
 957     found_dot buf -- buf points after the '.'
 958         | isUpper c    = split buf' buf
 959         | otherwise    = done buf
 960       where
 961        (c,buf') = nextChar buf
 962
 963     done dot_buf =
 964         (lexemeToFastString orig_buf (qual_size - 1),
 965          if parens -- Prelude.(+)
 966             then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
 967             else lexemeToFastString dot_buf (len - qual_size))
 968       where
 969         qual_size = orig_buf `byteDiff` dot_buf
 970
 971 varid :: Action
 972 varid span buf len =
 973   fs `seq`
 974   case lookupUFM reservedWordsFM fs of
 975         Just (keyword,0)    -> do
 976                 maybe_layout keyword
 977                 return (L span keyword)
 978         Just (keyword,exts) -> do
 979                 b <- extension (\i -> exts .&. i /= 0)
 980                 if b then do maybe_layout keyword
 981                              return (L span keyword)
 982                      else return (L span (ITvarid fs))
 983         _other -> return (L span (ITvarid fs))
 984   where
 985         fs = lexemeToFastString buf len
 986
 987 conid :: StringBuffer -> Int -> Token
 988 conid buf len = ITconid fs
 989   where fs = lexemeToFastString buf len
 990
 991 qvarsym, qconsym, prefixqvarsym, prefixqconsym :: StringBuffer -> Int -> Token
 992 qvarsym buf len = ITqvarsym $! splitQualName buf len False
 993 qconsym buf len = ITqconsym $! splitQualName buf len False
 994 prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
 995 prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
 996
 997 varsym, consym :: Action
 998 varsym = sym ITvarsym
 999 consym = sym ITconsym
1000
1001 sym :: (FastString -> Token) -> SrcSpan -> StringBuffer -> Int
1002     -> P (Located Token)
1003 sym con span buf len =
1004   case lookupUFM reservedSymsFM fs of
1005         Just (keyword,exts) -> do
1006                 b <- extension exts
1007                 if b then return (L span keyword)
1008                      else return (L span $! con fs)
1009         _other -> return (L span $! con fs)
1010   where
1011         fs = lexemeToFastString buf len
1012
1013 -- Variations on the integral numeric literal.
1014 tok_integral :: (Integer -> Token)
1015      -> (Integer -> Integer)
1016  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
1017      -> Int -> Int
1018      -> (Integer, (Char->Int)) -> Action
1019 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
1020   return $ L span $ itint $! transint $ parseUnsignedInteger
1021      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
1022
1023 -- some conveniences for use with tok_integral
1024 tok_num :: (Integer -> Integer)
1025         -> Int -> Int
1026         -> (Integer, (Char->Int)) -> Action
1027 tok_num = tok_integral ITinteger
1028 tok_primint :: (Integer -> Integer)
1029             -> Int -> Int
1030             -> (Integer, (Char->Int)) -> Action
1031 tok_primint = tok_integral ITprimint
1032 tok_primword :: Int -> Int
1033              -> (Integer, (Char->Int)) -> Action
1034 tok_primword = tok_integral ITprimword positive
1035 positive, negative :: (Integer -> Integer)
1036 positive = id
1037 negative = negate
1038 decimal, octal, hexadecimal :: (Integer, Char -> Int)
1039 decimal = (10,octDecDigit)
1040 octal = (8,octDecDigit)
1041 hexadecimal = (16,hexDigit)
1042
1043 -- readRational can understand negative rationals, exponents, everything.
1044 tok_float, tok_primfloat, tok_primdouble :: String -> Token
1045 tok_float        str = ITrational   $! readRational str
1046 tok_primfloat    str = ITprimfloat  $! readRational str
1047 tok_primdouble   str = ITprimdouble $! readRational str
1048
1049 -- -----------------------------------------------------------------------------
1050 -- Layout processing
1051
1052 -- we're at the first token on a line, insert layout tokens if necessary
1053 do_bol :: Action
1054 do_bol span _str _len = do
1055         pos <- getOffside
1056         case pos of
1057             LT -> do
1058                 --trace "layout: inserting '}'" $ do
1059                 popContext
1060                 -- do NOT pop the lex state, we might have a ';' to insert
1061                 return (L span ITvccurly)
1062             EQ -> do
1063                 --trace "layout: inserting ';'" $ do
1064                 popLexState
1065                 return (L span ITsemi)
1066             GT -> do
1067                 popLexState
1068                 lexToken
1069
1070 -- certain keywords put us in the "layout" state, where we might
1071 -- add an opening curly brace.
1072 maybe_layout :: Token -> P ()
1073 maybe_layout ITdo       = pushLexState layout_do
1074 maybe_layout ITmdo      = pushLexState layout_do
1075 maybe_layout ITof       = pushLexState layout
1076 maybe_layout ITlet      = pushLexState layout
1077 maybe_layout ITwhere    = pushLexState layout
1078 maybe_layout ITrec      = pushLexState layout
1079 maybe_layout _          = return ()
1080
1081 -- Pushing a new implicit layout context.  If the indentation of the
1082 -- next token is not greater than the previous layout context, then
1083 -- Haskell 98 says that the new layout context should be empty; that is
1084 -- the lexer must generate {}.
1085 --
1086 -- We are slightly more lenient than this: when the new context is started
1087 -- by a 'do', then we allow the new context to be at the same indentation as
1088 -- the previous context.  This is what the 'strict' argument is for.
1089 --
1090 new_layout_context :: Bool -> Action
1091 new_layout_context strict span _buf _len = do
1092     popLexState
1093     (AI _ offset _) <- getInput
1094     ctx <- getContext
1095     case ctx of
1096         Layout prev_off : _  |
1097            (strict     && prev_off >= offset  ||
1098             not strict && prev_off > offset) -> do
1099                 -- token is indented to the left of the previous context.
1100                 -- we must generate a {} sequence now.
1101                 pushLexState layout_left
1102                 return (L span ITvocurly)
1103         _ -> do
1104                 setContext (Layout offset : ctx)
1105                 return (L span ITvocurly)
1106
1107 do_layout_left :: Action
1108 do_layout_left span _buf _len = do
1109     popLexState
1110     pushLexState bol  -- we must be at the start of a line
1111     return (L span ITvccurly)
1112
1113 -- -----------------------------------------------------------------------------
1114 -- LINE pragmas
1115
1116 setLine :: Int -> Action
1117 setLine code span buf len = do
1118   let line = parseUnsignedInteger buf len 10 octDecDigit
1119   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
1120         -- subtract one: the line number refers to the *following* line
1121   popLexState
1122   pushLexState code
1123   lexToken
1124
1125 setFile :: Int -> Action
1126 setFile code span buf len = do
1127   let file = lexemeToFastString (stepOn buf) (len-2)
1128   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1129   popLexState
1130   pushLexState code
1131   lexToken
1132
1133
1134 -- -----------------------------------------------------------------------------
1135 -- Options, includes and language pragmas.
1136
1137 lex_string_prag :: (String -> Token) -> Action
1138 lex_string_prag mkTok span _buf _len
1139     = do input <- getInput
1140          start <- getSrcLoc
1141          tok <- go [] input
1142          end <- getSrcLoc
1143          return (L (mkSrcSpan start end) tok)
1144     where go acc input
1145               = if isString input "#-}"
1146                    then do setInput input
1147                            return (mkTok (reverse acc))
1148                    else case alexGetChar input of
1149                           Just (c,i) -> go (c:acc) i
1150                           Nothing -> err input
1151           isString _ [] = True
1152           isString i (x:xs)
1153               = case alexGetChar i of
1154                   Just (c,i') | c == x    -> isString i' xs
1155                   _other -> False
1156           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1157
1158
1159 -- -----------------------------------------------------------------------------
1160 -- Strings & Chars
1161
1162 -- This stuff is horrible.  I hates it.
1163
1164 lex_string_tok :: Action
1165 lex_string_tok span _buf _len = do
1166   tok <- lex_string ""
1167   end <- getSrcLoc
1168   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1169
1170 lex_string :: String -> P Token
1171 lex_string s = do
1172   i <- getInput
1173   case alexGetChar' i of
1174     Nothing -> lit_error
1175
1176     Just ('"',i)  -> do
1177         setInput i
1178         magicHash <- extension magicHashEnabled
1179         if magicHash
1180           then do
1181             i <- getInput
1182             case alexGetChar' i of
1183               Just ('#',i) -> do
1184                    setInput i
1185                    if any (> '\xFF') s
1186                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1187                     else let s' = mkZFastString (reverse s) in
1188                          return (ITprimstring s')
1189                         -- mkZFastString is a hack to avoid encoding the
1190                         -- string in UTF-8.  We just want the exact bytes.
1191               _other ->
1192                 return (ITstring (mkFastString (reverse s)))
1193           else
1194                 return (ITstring (mkFastString (reverse s)))
1195
1196     Just ('\\',i)
1197         | Just ('&',i) <- next -> do
1198                 setInput i; lex_string s
1199         | Just (c,i) <- next, is_space c -> do
1200                 setInput i; lex_stringgap s
1201         where next = alexGetChar' i
1202
1203     Just (c, i) -> do
1204         c' <- lex_char c i
1205         lex_string (c':s)
1206
1207 lex_stringgap :: String -> P Token
1208 lex_stringgap s = do
1209   c <- getCharOrFail
1210   case c of
1211     '\\' -> lex_string s
1212     c | is_space c -> lex_stringgap s
1213     _other -> lit_error
1214
1215
1216 lex_char_tok :: Action
1217 -- Here we are basically parsing character literals, such as 'x' or '\n'
1218 -- but, when Template Haskell is on, we additionally spot
1219 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1220 -- but WITHOUT CONSUMING the x or T part  (the parser does that).
1221 -- So we have to do two characters of lookahead: when we see 'x we need to
1222 -- see if there's a trailing quote
1223 lex_char_tok span _buf _len = do        -- We've seen '
1224    i1 <- getInput       -- Look ahead to first character
1225    let loc = srcSpanStart span
1226    case alexGetChar' i1 of
1227         Nothing -> lit_error
1228
1229         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
1230                   th_exts <- extension thEnabled
1231                   if th_exts then do
1232                         setInput i2
1233                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1234                    else lit_error
1235
1236         Just ('\\', i2@(AI _end2 _ _)) -> do    -- We've seen 'backslash
1237                   setInput i2
1238                   lit_ch <- lex_escape
1239                   mc <- getCharOrFail   -- Trailing quote
1240                   if mc == '\'' then finish_char_tok loc lit_ch
1241                                 else do setInput i2; lit_error
1242
1243         Just (c, i2@(AI _end2 _ _))
1244                 | not (isAny c) -> lit_error
1245                 | otherwise ->
1246
1247                 -- We've seen 'x, where x is a valid character
1248                 --  (i.e. not newline etc) but not a quote or backslash
1249            case alexGetChar' i2 of      -- Look ahead one more character
1250                 Just ('\'', i3) -> do   -- We've seen 'x'
1251                         setInput i3
1252                         finish_char_tok loc c
1253                 _other -> do            -- We've seen 'x not followed by quote
1254                                         -- (including the possibility of EOF)
1255                                         -- If TH is on, just parse the quote only
1256                         th_exts <- extension thEnabled
1257                         let (AI end _ _) = i1
1258                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1259                                    else do setInput i2; lit_error
1260
1261 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1262 finish_char_tok loc ch  -- We've already seen the closing quote
1263                         -- Just need to check for trailing #
1264   = do  magicHash <- extension magicHashEnabled
1265         i@(AI end _ _) <- getInput
1266         if magicHash then do
1267                 case alexGetChar' i of
1268                         Just ('#',i@(AI end _ _)) -> do
1269                                 setInput i
1270                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1271                         _other ->
1272                                 return (L (mkSrcSpan loc end) (ITchar ch))
1273                 else do
1274                    return (L (mkSrcSpan loc end) (ITchar ch))
1275
1276 lex_char :: Char -> AlexInput -> P Char
1277 lex_char c inp = do
1278   case c of
1279       '\\' -> do setInput inp; lex_escape
1280       c | isAny c -> do setInput inp; return c
1281       _other -> lit_error
1282
1283 isAny :: Char -> Bool
1284 isAny c | c > '\x7f' = isPrint c
1285         | otherwise  = is_any c
1286
1287 lex_escape :: P Char
1288 lex_escape = do
1289   c <- getCharOrFail
1290   case c of
1291         'a'   -> return '\a'
1292         'b'   -> return '\b'
1293         'f'   -> return '\f'
1294         'n'   -> return '\n'
1295         'r'   -> return '\r'
1296         't'   -> return '\t'
1297         'v'   -> return '\v'
1298         '\\'  -> return '\\'
1299         '"'   -> return '\"'
1300         '\''  -> return '\''
1301         '^'   -> do c <- getCharOrFail
1302                     if c >= '@' && c <= '_'
1303                         then return (chr (ord c - ord '@'))
1304                         else lit_error
1305
1306         'x'   -> readNum is_hexdigit 16 hexDigit
1307         'o'   -> readNum is_octdigit  8 octDecDigit
1308         x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1309
1310         c1 ->  do
1311            i <- getInput
1312            case alexGetChar' i of
1313             Nothing -> lit_error
1314             Just (c2,i2) ->
1315               case alexGetChar' i2 of
1316                 Nothing -> do setInput i2; lit_error
1317                 Just (c3,i3) ->
1318                    let str = [c1,c2,c3] in
1319                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1320                                      Just rest <- [stripPrefix p str] ] of
1321                           (escape_char,[]):_ -> do
1322                                 setInput i3
1323                                 return escape_char
1324                           (escape_char,_:_):_ -> do
1325                                 setInput i2
1326                                 return escape_char
1327                           [] -> lit_error
1328
1329 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1330 readNum is_digit base conv = do
1331   i <- getInput
1332   c <- getCharOrFail
1333   if is_digit c
1334         then readNum2 is_digit base conv (conv c)
1335         else do setInput i; lit_error
1336
1337 readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char
1338 readNum2 is_digit base conv i = do
1339   input <- getInput
1340   read i input
1341   where read i input = do
1342           case alexGetChar' input of
1343             Just (c,input') | is_digit c -> do
1344                 read (i*base + conv c) input'
1345             _other -> do
1346                 if i >= 0 && i <= 0x10FFFF
1347                    then do setInput input; return (chr i)
1348                    else lit_error
1349
1350 silly_escape_chars :: [(String, Char)]
1351 silly_escape_chars = [
1352         ("NUL", '\NUL'),
1353         ("SOH", '\SOH'),
1354         ("STX", '\STX'),
1355         ("ETX", '\ETX'),
1356         ("EOT", '\EOT'),
1357         ("ENQ", '\ENQ'),
1358         ("ACK", '\ACK'),
1359         ("BEL", '\BEL'),
1360         ("BS", '\BS'),
1361         ("HT", '\HT'),
1362         ("LF", '\LF'),
1363         ("VT", '\VT'),
1364         ("FF", '\FF'),
1365         ("CR", '\CR'),
1366         ("SO", '\SO'),
1367         ("SI", '\SI'),
1368         ("DLE", '\DLE'),
1369         ("DC1", '\DC1'),
1370         ("DC2", '\DC2'),
1371         ("DC3", '\DC3'),
1372         ("DC4", '\DC4'),
1373         ("NAK", '\NAK'),
1374         ("SYN", '\SYN'),
1375         ("ETB", '\ETB'),
1376         ("CAN", '\CAN'),
1377         ("EM", '\EM'),
1378         ("SUB", '\SUB'),
1379         ("ESC", '\ESC'),
1380         ("FS", '\FS'),
1381         ("GS", '\GS'),
1382         ("RS", '\RS'),
1383         ("US", '\US'),
1384         ("SP", '\SP'),
1385         ("DEL", '\DEL')
1386         ]
1387
1388 -- before calling lit_error, ensure that the current input is pointing to
1389 -- the position of the error in the buffer.  This is so that we can report
1390 -- a correct location to the user, but also so we can detect UTF-8 decoding
1391 -- errors if they occur.
1392 lit_error :: P a
1393 lit_error = lexError "lexical error in string/character literal"
1394
1395 getCharOrFail :: P Char
1396 getCharOrFail =  do
1397   i <- getInput
1398   case alexGetChar' i of
1399         Nothing -> lexError "unexpected end-of-file in string/character literal"
1400         Just (c,i)  -> do setInput i; return c
1401
1402 -- -----------------------------------------------------------------------------
1403 -- QuasiQuote
1404
1405 lex_quasiquote_tok :: Action
1406 lex_quasiquote_tok span buf len = do
1407   let quoter = reverse $ takeWhile (/= '$')
1408                $ reverse $ lexemeToString buf (len - 1)
1409   quoteStart <- getSrcLoc
1410   quote <- lex_quasiquote ""
1411   end <- getSrcLoc
1412   return (L (mkSrcSpan (srcSpanStart span) end)
1413            (ITquasiQuote (mkFastString quoter,
1414                           mkFastString (reverse quote),
1415                           mkSrcSpan quoteStart end)))
1416
1417 lex_quasiquote :: String -> P String
1418 lex_quasiquote s = do
1419   i <- getInput
1420   case alexGetChar' i of
1421     Nothing -> lit_error
1422
1423     Just ('\\',i)
1424         | Just ('|',i) <- next -> do
1425                 setInput i; lex_quasiquote ('|' : s)
1426         | Just (']',i) <- next -> do
1427                 setInput i; lex_quasiquote (']' : s)
1428         where next = alexGetChar' i
1429
1430     Just ('|',i)
1431         | Just (']',i) <- next -> do
1432                 setInput i; return s
1433         where next = alexGetChar' i
1434
1435     Just (c, i) -> do
1436          setInput i; lex_quasiquote (c : s)
1437
1438 -- -----------------------------------------------------------------------------
1439 -- Warnings
1440
1441 warn :: DynFlag -> SDoc -> Action
1442 warn option warning srcspan _buf _len = do
1443     addWarning option srcspan warning
1444     lexToken
1445
1446 warnThen :: DynFlag -> SDoc -> Action -> Action
1447 warnThen option warning action srcspan buf len = do
1448     addWarning option srcspan warning
1449     action srcspan buf len
1450
1451 -- -----------------------------------------------------------------------------
1452 -- The Parse Monad
1453
1454 data LayoutContext
1455   = NoLayout
1456   | Layout !Int
1457   deriving Show
1458
1459 data ParseResult a
1460   = POk PState a
1461   | PFailed
1462         SrcSpan         -- The start and end of the text span related to
1463                         -- the error.  Might be used in environments which can
1464                         -- show this span, e.g. by highlighting it.
1465         Message         -- The error message
1466
1467 data PState = PState {
1468         buffer     :: StringBuffer,
1469         dflags     :: DynFlags,
1470         messages   :: Messages,
1471         last_loc   :: SrcSpan,  -- pos of previous token
1472         last_offs  :: !Int,     -- offset of the previous token from the
1473                                 -- beginning of  the current line.
1474                                 -- \t is equal to 8 spaces.
1475         last_len   :: !Int,     -- len of previous token
1476         last_line_len :: !Int,
1477         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1478         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1479         context    :: [LayoutContext],
1480         lex_state  :: [Int]
1481      }
1482         -- last_loc and last_len are used when generating error messages,
1483         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1484         -- current token to happyError, we could at least get rid of last_len.
1485         -- Getting rid of last_loc would require finding another way to
1486         -- implement pushCurrentContext (which is only called from one place).
1487
1488 newtype P a = P { unP :: PState -> ParseResult a }
1489
1490 instance Monad P where
1491   return = returnP
1492   (>>=) = thenP
1493   fail = failP
1494
1495 returnP :: a -> P a
1496 returnP a = a `seq` (P $ \s -> POk s a)
1497
1498 thenP :: P a -> (a -> P b) -> P b
1499 (P m) `thenP` k = P $ \ s ->
1500         case m s of
1501                 POk s1 a         -> (unP (k a)) s1
1502                 PFailed span err -> PFailed span err
1503
1504 failP :: String -> P a
1505 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1506
1507 failMsgP :: String -> P a
1508 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1509
1510 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1511 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (mkSrcSpan loc1 loc2) (text str)
1512
1513 failSpanMsgP :: SrcSpan -> SDoc -> P a
1514 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1515
1516 getPState :: P PState
1517 getPState = P $ \s -> POk s s
1518
1519 getDynFlags :: P DynFlags
1520 getDynFlags = P $ \s -> POk s (dflags s)
1521
1522 withThisPackage :: (PackageId -> a) -> P a
1523 withThisPackage f
1524  = do   pkg     <- liftM thisPackage getDynFlags
1525         return  $ f pkg
1526
1527 extension :: (Int -> Bool) -> P Bool
1528 extension p = P $ \s -> POk s (p $! extsBitmap s)
1529
1530 getExts :: P Int
1531 getExts = P $ \s -> POk s (extsBitmap s)
1532
1533 setExts :: (Int -> Int) -> P ()
1534 setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
1535
1536 setSrcLoc :: SrcLoc -> P ()
1537 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1538
1539 getSrcLoc :: P SrcLoc
1540 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1541
1542 setLastToken :: SrcSpan -> Int -> Int -> P ()
1543 setLastToken loc len line_len = P $ \s -> POk s {
1544   last_loc=loc,
1545   last_len=len,
1546   last_line_len=line_len
1547 } ()
1548
1549 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1550
1551 alexInputPrevChar :: AlexInput -> Char
1552 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1553
1554 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1555 alexGetChar (AI loc ofs s)
1556   | atEnd s   = Nothing
1557   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1558                 --trace (show (ord c)) $
1559                 Just (adj_c, (AI loc' ofs' s'))
1560   where (c,s') = nextChar s
1561         loc'   = advanceSrcLoc loc c
1562         ofs'   = advanceOffs c ofs
1563
1564         non_graphic     = '\x0'
1565         upper           = '\x1'
1566         lower           = '\x2'
1567         digit           = '\x3'
1568         symbol          = '\x4'
1569         space           = '\x5'
1570         other_graphic   = '\x6'
1571
1572         adj_c
1573           | c <= '\x06' = non_graphic
1574           | c <= '\x7f' = c
1575           -- Alex doesn't handle Unicode, so when Unicode
1576           -- character is encountered we output these values
1577           -- with the actual character value hidden in the state.
1578           | otherwise =
1579                 case generalCategory c of
1580                   UppercaseLetter       -> upper
1581                   LowercaseLetter       -> lower
1582                   TitlecaseLetter       -> upper
1583                   ModifierLetter        -> other_graphic
1584                   OtherLetter           -> lower -- see #1103
1585                   NonSpacingMark        -> other_graphic
1586                   SpacingCombiningMark  -> other_graphic
1587                   EnclosingMark         -> other_graphic
1588                   DecimalNumber         -> digit
1589                   LetterNumber          -> other_graphic
1590                   OtherNumber           -> other_graphic
1591                   ConnectorPunctuation  -> symbol
1592                   DashPunctuation       -> symbol
1593                   OpenPunctuation       -> other_graphic
1594                   ClosePunctuation      -> other_graphic
1595                   InitialQuote          -> other_graphic
1596                   FinalQuote            -> other_graphic
1597                   OtherPunctuation      -> symbol
1598                   MathSymbol            -> symbol
1599                   CurrencySymbol        -> symbol
1600                   ModifierSymbol        -> symbol
1601                   OtherSymbol           -> symbol
1602                   Space                 -> space
1603                   _other                -> non_graphic
1604
1605 -- This version does not squash unicode characters, it is used when
1606 -- lexing strings.
1607 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1608 alexGetChar' (AI loc ofs s)
1609   | atEnd s   = Nothing
1610   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1611                 --trace (show (ord c)) $
1612                 Just (c, (AI loc' ofs' s'))
1613   where (c,s') = nextChar s
1614         loc'   = advanceSrcLoc loc c
1615         ofs'   = advanceOffs c ofs
1616
1617 advanceOffs :: Char -> Int -> Int
1618 advanceOffs '\n' _    = 0
1619 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1620 advanceOffs _    offs = offs + 1
1621
1622 getInput :: P AlexInput
1623 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1624
1625 setInput :: AlexInput -> P ()
1626 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1627
1628 pushLexState :: Int -> P ()
1629 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1630
1631 popLexState :: P Int
1632 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1633
1634 getLexState :: P Int
1635 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1636
1637 -- for reasons of efficiency, flags indicating language extensions (eg,
1638 -- -fglasgow-exts or -XParr) are represented by a bitmap stored in an unboxed
1639 -- integer
1640
1641 genericsBit :: Int
1642 genericsBit = 0 -- {| and |}
1643 ffiBit :: Int
1644 ffiBit     = 1
1645 parrBit :: Int
1646 parrBit    = 2
1647 arrowsBit :: Int
1648 arrowsBit  = 4
1649 thBit :: Int
1650 thBit      = 5
1651 ipBit :: Int
1652 ipBit      = 6
1653 explicitForallBit :: Int
1654 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1655 bangPatBit :: Int
1656 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1657                 -- (doesn't affect the lexer)
1658 tyFamBit :: Int
1659 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1660 haddockBit :: Int
1661 haddockBit = 10 -- Lex and parse Haddock comments
1662 magicHashBit :: Int
1663 magicHashBit = 11 -- "#" in both functions and operators
1664 kindSigsBit :: Int
1665 kindSigsBit = 12 -- Kind signatures on type variables
1666 recursiveDoBit :: Int
1667 recursiveDoBit = 13 -- mdo
1668 unicodeSyntaxBit :: Int
1669 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1670 unboxedTuplesBit :: Int
1671 unboxedTuplesBit = 15 -- (# and #)
1672 standaloneDerivingBit :: Int
1673 standaloneDerivingBit = 16 -- standalone instance deriving declarations
1674 transformComprehensionsBit :: Int
1675 transformComprehensionsBit = 17
1676 qqBit :: Int
1677 qqBit      = 18 -- enable quasiquoting
1678 inRulePragBit :: Int
1679 inRulePragBit = 19
1680 rawTokenStreamBit :: Int
1681 rawTokenStreamBit = 20 -- producing a token stream with all comments included
1682 newQualOpsBit :: Int
1683 newQualOpsBit = 21 -- Haskell' qualified operator syntax, e.g. Prelude.(+)
1684 recBit :: Int
1685 recBit = 22 -- rec
1686
1687 always :: Int -> Bool
1688 always           _     = True
1689 genericsEnabled :: Int -> Bool
1690 genericsEnabled  flags = testBit flags genericsBit
1691 parrEnabled :: Int -> Bool
1692 parrEnabled      flags = testBit flags parrBit
1693 arrowsEnabled :: Int -> Bool
1694 arrowsEnabled    flags = testBit flags arrowsBit
1695 thEnabled :: Int -> Bool
1696 thEnabled        flags = testBit flags thBit
1697 ipEnabled :: Int -> Bool
1698 ipEnabled        flags = testBit flags ipBit
1699 explicitForallEnabled :: Int -> Bool
1700 explicitForallEnabled flags = testBit flags explicitForallBit
1701 bangPatEnabled :: Int -> Bool
1702 bangPatEnabled   flags = testBit flags bangPatBit
1703 -- tyFamEnabled :: Int -> Bool
1704 -- tyFamEnabled     flags = testBit flags tyFamBit
1705 haddockEnabled :: Int -> Bool
1706 haddockEnabled   flags = testBit flags haddockBit
1707 magicHashEnabled :: Int -> Bool
1708 magicHashEnabled flags = testBit flags magicHashBit
1709 -- kindSigsEnabled :: Int -> Bool
1710 -- kindSigsEnabled  flags = testBit flags kindSigsBit
1711 unicodeSyntaxEnabled :: Int -> Bool
1712 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1713 unboxedTuplesEnabled :: Int -> Bool
1714 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1715 standaloneDerivingEnabled :: Int -> Bool
1716 standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
1717 qqEnabled :: Int -> Bool
1718 qqEnabled        flags = testBit flags qqBit
1719 -- inRulePrag :: Int -> Bool
1720 -- inRulePrag       flags = testBit flags inRulePragBit
1721 rawTokenStreamEnabled :: Int -> Bool
1722 rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
1723 newQualOps :: Int -> Bool
1724 newQualOps       flags = testBit flags newQualOpsBit
1725 oldQualOps :: Int -> Bool
1726 oldQualOps flags = not (newQualOps flags)
1727
1728 -- PState for parsing options pragmas
1729 --
1730 pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1731 pragState dynflags buf loc =
1732   PState {
1733       buffer        = buf,
1734       messages      = emptyMessages,
1735       dflags        = dynflags,
1736       last_loc      = mkSrcSpan loc loc,
1737       last_offs     = 0,
1738       last_len      = 0,
1739       last_line_len = 0,
1740       loc           = loc,
1741       extsBitmap    = 0,
1742       context       = [],
1743       lex_state     = [bol, option_prags, 0]
1744     }
1745
1746
1747 -- create a parse state
1748 --
1749 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1750 mkPState buf loc flags  =
1751   PState {
1752       buffer          = buf,
1753       dflags        = flags,
1754       messages      = emptyMessages,
1755       last_loc      = mkSrcSpan loc loc,
1756       last_offs     = 0,
1757       last_len      = 0,
1758       last_line_len = 0,
1759       loc           = loc,
1760       extsBitmap    = fromIntegral bitmap,
1761       context       = [],
1762       lex_state     = [bol, 0]
1763         -- we begin in the layout state if toplev_layout is set
1764     }
1765     where
1766       bitmap = genericsBit `setBitIf` dopt Opt_Generics flags
1767                .|. ffiBit            `setBitIf` dopt Opt_ForeignFunctionInterface flags
1768                .|. parrBit           `setBitIf` dopt Opt_PArr         flags
1769                .|. arrowsBit         `setBitIf` dopt Opt_Arrows       flags
1770                .|. thBit             `setBitIf` dopt Opt_TemplateHaskell flags
1771                .|. qqBit             `setBitIf` dopt Opt_QuasiQuotes flags
1772                .|. ipBit             `setBitIf` dopt Opt_ImplicitParams flags
1773                .|. explicitForallBit `setBitIf` dopt Opt_ExplicitForAll flags
1774                .|. bangPatBit        `setBitIf` dopt Opt_BangPatterns flags
1775                .|. tyFamBit          `setBitIf` dopt Opt_TypeFamilies flags
1776                .|. haddockBit        `setBitIf` dopt Opt_Haddock      flags
1777                .|. magicHashBit      `setBitIf` dopt Opt_MagicHash    flags
1778                .|. kindSigsBit       `setBitIf` dopt Opt_KindSignatures flags
1779                .|. recursiveDoBit    `setBitIf` dopt Opt_RecursiveDo flags
1780                .|. recBit            `setBitIf` dopt Opt_DoRec  flags
1781                .|. recBit            `setBitIf` dopt Opt_Arrows flags
1782                .|. unicodeSyntaxBit  `setBitIf` dopt Opt_UnicodeSyntax flags
1783                .|. unboxedTuplesBit  `setBitIf` dopt Opt_UnboxedTuples flags
1784                .|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags
1785                .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags
1786                .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
1787                .|. newQualOpsBit `setBitIf` dopt Opt_NewQualifiedOperators flags
1788       --
1789       setBitIf :: Int -> Bool -> Int
1790       b `setBitIf` cond | cond      = bit b
1791                         | otherwise = 0
1792
1793 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1794 addWarning option srcspan warning
1795  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1796        let warning' = mkWarnMsg srcspan alwaysQualify warning
1797            ws' = if dopt option d then ws `snocBag` warning' else ws
1798        in POk s{messages=(ws', es)} ()
1799
1800 getMessages :: PState -> Messages
1801 getMessages PState{messages=ms} = ms
1802
1803 getContext :: P [LayoutContext]
1804 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1805
1806 setContext :: [LayoutContext] -> P ()
1807 setContext ctx = P $ \s -> POk s{context=ctx} ()
1808
1809 popContext :: P ()
1810 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1811                               last_len = len, last_loc = last_loc }) ->
1812   case ctx of
1813         (_:tl) -> POk s{ context = tl } ()
1814         []     -> PFailed last_loc (srcParseErr buf len)
1815
1816 -- Push a new layout context at the indentation of the last token read.
1817 -- This is only used at the outer level of a module when the 'module'
1818 -- keyword is missing.
1819 pushCurrentContext :: P ()
1820 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1821     POk s{context = Layout (offs-len) : ctx} ()
1822 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1823
1824 getOffside :: P Ordering
1825 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1826                 let ord = case stk of
1827                         (Layout n:_) -> compare offs n
1828                         _            -> GT
1829                 in POk s ord
1830
1831 -- ---------------------------------------------------------------------------
1832 -- Construct a parse error
1833
1834 srcParseErr
1835   :: StringBuffer       -- current buffer (placed just after the last token)
1836   -> Int                -- length of the previous token
1837   -> Message
1838 srcParseErr buf len
1839   = hcat [ if null token
1840              then ptext (sLit "parse error (possibly incorrect indentation)")
1841              else hcat [ptext (sLit "parse error on input "),
1842                         char '`', text token, char '\'']
1843     ]
1844   where token = lexemeToString (offsetBytes (-len) buf) len
1845
1846 -- Report a parse failure, giving the span of the previous token as
1847 -- the location of the error.  This is the entry point for errors
1848 -- detected during parsing.
1849 srcParseFail :: P a
1850 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1851                             last_loc = last_loc } ->
1852     PFailed last_loc (srcParseErr buf len)
1853
1854 -- A lexical error is reported at a particular position in the source file,
1855 -- not over a token range.
1856 lexError :: String -> P a
1857 lexError str = do
1858   loc <- getSrcLoc
1859   (AI end _ buf) <- getInput
1860   reportLexError loc end buf str
1861
1862 -- -----------------------------------------------------------------------------
1863 -- This is the top-level function: called from the parser each time a
1864 -- new token is to be read from the input.
1865
1866 lexer :: (Located Token -> P a) -> P a
1867 lexer cont = do
1868   tok@(L _span _tok__) <- lexToken
1869 --  trace ("token: " ++ show tok__) $ do
1870   cont tok
1871
1872 lexToken :: P (Located Token)
1873 lexToken = do
1874   inp@(AI loc1 _ buf) <- getInput
1875   sc <- getLexState
1876   exts <- getExts
1877   case alexScanUser exts inp sc of
1878     AlexEOF -> do
1879         let span = mkSrcSpan loc1 loc1
1880         setLastToken span 0 0
1881         return (L span ITeof)
1882     AlexError (AI loc2 _ buf) ->
1883         reportLexError loc1 loc2 buf "lexical error"
1884     AlexSkip inp2 _ -> do
1885         setInput inp2
1886         lexToken
1887     AlexToken inp2@(AI end _ buf2) _ t -> do
1888         setInput inp2
1889         let span = mkSrcSpan loc1 end
1890         let bytes = byteDiff buf buf2
1891         span `seq` setLastToken span bytes bytes
1892         t span buf bytes
1893
1894 reportLexError :: SrcLoc -> SrcLoc -> StringBuffer -> [Char] -> P a
1895 reportLexError loc1 loc2 buf str
1896   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1897   | otherwise =
1898   let
1899         c = fst (nextChar buf)
1900   in
1901   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1902     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1903     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1904
1905 lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token]
1906 lexTokenStream buf loc dflags = unP go initState
1907     where initState = mkPState buf loc (dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream)
1908           go = do
1909             ltok <- lexer return
1910             case ltok of
1911               L _ ITeof -> return []
1912               _ -> liftM (ltok:) go
1913
1914 linePrags = Map.singleton "line" (begin line_prag2)
1915
1916 fileHeaderPrags = Map.fromList([("options", lex_string_prag IToptions_prag),
1917                                  ("options_ghc", lex_string_prag IToptions_prag),
1918                                  ("options_haddock", lex_string_prag ITdocOptions),
1919                                  ("language", token ITlanguage_prag),
1920                                  ("include", lex_string_prag ITinclude_prag)])
1921
1922 ignoredPrags = Map.fromList (map ignored pragmas)
1923                where ignored opt = (opt, nested_comment lexToken)
1924                      impls = ["hugs", "nhc98", "jhc", "yhc", "catch", "derive"]
1925                      options_pragmas = map ("options_" ++) impls
1926                      -- CFILES is a hugs-only thing.
1927                      pragmas = options_pragmas ++ ["cfiles", "contract"]
1928
1929 oneWordPrags = Map.fromList([("rules", rulePrag),
1930                            ("inline", token (ITinline_prag True)),
1931                            ("notinline", token (ITinline_prag False)),
1932                            ("specialize", token ITspec_prag),
1933                            ("source", token ITsource_prag),
1934                            ("warning", token ITwarning_prag),
1935                            ("deprecated", token ITdeprecated_prag),
1936                            ("scc", token ITscc_prag),
1937                            ("generated", token ITgenerated_prag),
1938                            ("core", token ITcore_prag),
1939                            ("unpack", token ITunpack_prag),
1940                            ("ann", token ITann_prag)])
1941
1942 twoWordPrags = Map.fromList([("inline conlike", token (ITinline_conlike_prag True)),
1943                              ("notinline conlike", token (ITinline_conlike_prag False)),
1944                              ("specialize inline", token (ITspec_inline_prag True)),
1945                              ("specialize notinline", token (ITspec_inline_prag False))])
1946
1947
1948 dispatch_pragmas :: Map String Action -> Action
1949 dispatch_pragmas prags span buf len = case Map.lookup (clean_pragma (lexemeToString buf len)) prags of
1950                                        Just found -> found span buf len
1951                                        Nothing -> lexError "unknown pragma"
1952
1953 known_pragma :: Map String Action -> AlexAccPred Int
1954 known_pragma prags _ _ len (AI _ _ buf) = (isJust $ Map.lookup (clean_pragma (lexemeToString (offsetBytes (- len) buf) len)) prags)
1955                                           && (nextCharIs buf (\c -> not (isAlphaNum c || c == '_')))
1956
1957 clean_pragma :: String -> String
1958 clean_pragma prag = canon_ws (map toLower (unprefix prag))
1959                     where unprefix prag' = case stripPrefix "{-#" prag' of
1960                                              Just rest -> rest
1961                                              Nothing -> prag'
1962                           canonical prag' = case prag' of
1963                                               "noinline" -> "notinline"
1964                                               "specialise" -> "specialize"
1965                                               "constructorlike" -> "conlike"
1966                                               otherwise -> prag'
1967                           canon_ws s = unwords (map canonical (words s))
1968 }