compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 --   qualified operator NOTES.
  24 --
  25 --   - If M.(+) is a single lexeme, then..
  26 --     - Probably (+) should be a single lexeme too, for consistency.
  27 --       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
  28 --     - But we have to rule out reserved operators, otherwise (..) becomes
  29 --       a different lexeme.
  30 --     - Should we therefore also rule out reserved operators in the qualified
  31 --       form?  This is quite difficult to achieve.  We don't do it for
  32 --       qualified varids.
  33
  34 {
  35 {-# OPTIONS -w #-}
  36 -- The above warning supression flag is a temporary kludge.
  37 -- While working on this module you are encouraged to remove it and fix
  38 -- any warnings in the module. See
  39 --     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
  40 -- for details
  41 --
  42 -- Note that Alex itself generates code with with some unused bindings and
  43 -- without type signatures, so removing the flag might not be possible.
  44
  45 {-# OPTIONS_GHC -funbox-strict-fields #-}
  46
  47 module Lexer (
  48    Token(..), lexer, pragState, mkPState, PState(..),
  49    P(..), ParseResult(..), getSrcLoc,
  50    failLocMsgP, failSpanMsgP, srcParseFail,
  51    getMessages,
  52    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  53    getLexState, popLexState, pushLexState,
  54    extension, standaloneDerivingEnabled, bangPatEnabled,
  55    addWarning,
  56    lexTokenStream
  57   ) where
  58
  59 import Bag
  60 import ErrUtils
  61 import Outputable
  62 import StringBuffer
  63 import FastString
  64 import FastTypes
  65 import SrcLoc
  66 import UniqFM
  67 import DynFlags
  68 import Ctype
  69 import Util             ( maybePrefixMatch, readRational )
  70
  71 import Control.Monad
  72 import Data.Bits
  73 import Data.Char
  74 import Data.Ratio
  75 import Debug.Trace
  76 }
  77
  78 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  79 $whitechar   = [\ \n\r\f\v $unispace]
  80 $white_no_nl = $whitechar # \n
  81 $tab         = \t
  82
  83 $ascdigit  = 0-9
  84 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  85 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  86 $digit     = [$ascdigit $unidigit]
  87
  88 $special   = [\(\)\,\;\[\]\`\{\}]
  89 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  90 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  91 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  92
  93 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
  94 $asclarge  = [A-Z]
  95 $large     = [$asclarge $unilarge]
  96
  97 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
  98 $ascsmall  = [a-z]
  99 $small     = [$ascsmall $unismall \_]
 100
 101 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
 102 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
 103
 104 $octit     = 0-7
 105 $hexit     = [$decdigit A-F a-f]
 106 $symchar   = [$symbol \:]
 107 $nl        = [\n\r]
 108 $idchar    = [$small $large $digit \']
 109
 110 $docsym    = [\| \^ \* \$]
 111
 112 @varid     = $small $idchar*
 113 @conid     = $large $idchar*
 114
 115 @varsym    = $symbol $symchar*
 116 @consym    = \: $symchar*
 117
 118 @decimal     = $decdigit+
 119 @octal       = $octit+
 120 @hexadecimal = $hexit+
 121 @exponent    = [eE] [\-\+]? @decimal
 122
 123 -- we support the hierarchical module name extension:
 124 @qual = (@conid \.)+
 125
 126 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 127
 128 -- normal signed numerical literals can only be explicitly negative,
 129 -- not explicitly positive (contrast @exponent)
 130 @negative = \-
 131 @signed = @negative ?
 132
 133 haskell :-
 134
 135 -- everywhere: skip whitespace and comments
 136 $white_no_nl+                           ;
 137 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 138
 139 -- Everywhere: deal with nested comments.  We explicitly rule out
 140 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 141 -- (this can happen even though pragmas will normally take precedence due to
 142 -- longest-match, because pragmas aren't valid in every state, but comments
 143 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 144 -- set.
 145
 146 "{-" / { isNormalComment } { nested_comment lexToken }
 147
 148 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 149 -- more dashes followed by a symbol should be parsed as a varsym, so we
 150 -- have to exclude those.
 151
 152 -- Since Haddock comments aren't valid in every state, we need to rule them
 153 -- out here.
 154
 155 -- The following two rules match comments that begin with two dashes, but
 156 -- continue with a different character. The rules test that this character
 157 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 158 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 159 -- have a Haddock comment). The rules then munch the rest of the line.
 160
 161 "-- " ~[$docsym \#] .* { lineCommentToken }
 162 "--" [^$symbol : \ ] .* { lineCommentToken }
 163
 164 -- Next, match Haddock comments if no -haddock flag
 165
 166 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
 167
 168 -- Now, when we've matched comments that begin with 2 dashes and continue
 169 -- with a different character, we need to match comments that begin with three
 170 -- or more dashes (which clearly can't be Haddock comments). We only need to
 171 -- make sure that the first non-dash character isn't a symbol, and munch the
 172 -- rest of the line.
 173
 174 "---"\-* [^$symbol :] .* { lineCommentToken }
 175
 176 -- Since the previous rules all match dashes followed by at least one
 177 -- character, we also need to match a whole line filled with just dashes.
 178
 179 "--"\-* / { atEOL } { lineCommentToken }
 180
 181 -- We need this rule since none of the other single line comment rules
 182 -- actually match this case.
 183
 184 "-- " / { atEOL } { lineCommentToken }
 185
 186 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 187 -- blank lines) until we find a non-whitespace character, then do layout
 188 -- processing.
 189 --
 190 -- One slight wibble here: what if the line begins with {-#? In
 191 -- theory, we have to lex the pragma to see if it's one we recognise,
 192 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 193 -- as a nested comment.  We don't bother with this: if the line begins
 194 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 195 <bol> {
 196   \n                                    ;
 197   ^\# (line)?                           { begin line_prag1 }
 198   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 199   ^\# \! .* \n                          ; -- #!, for scripts
 200   ()                                    { do_bol }
 201 }
 202
 203 -- after a layout keyword (let, where, do, of), we begin a new layout
 204 -- context if the curly brace is missing.
 205 -- Careful! This stuff is quite delicate.
 206 <layout, layout_do> {
 207   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 208         -- we might encounter {-# here, but {- has been handled already
 209   \n                                    ;
 210   ^\# (line)?                           { begin line_prag1 }
 211 }
 212
 213 -- do is treated in a subtly different way, see new_layout_context
 214 <layout>    ()                          { new_layout_context True }
 215 <layout_do> ()                          { new_layout_context False }
 216
 217 -- after a new layout context which was found to be to the left of the
 218 -- previous context, we have generated a '{' token, and we now need to
 219 -- generate a matching '}' token.
 220 <layout_left>  ()                       { do_layout_left }
 221
 222 <0,option_prags> \n                             { begin bol }
 223
 224 "{-#" $whitechar* (line|LINE) / { notFollowedByPragmaChar }
 225                             { begin line_prag2 }
 226
 227 -- single-line line pragmas, of the form
 228 --    # <line> "<file>" <extra-stuff> \n
 229 <line_prag1> $decdigit+                 { setLine line_prag1a }
 230 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 231 <line_prag1b> .*                        { pop }
 232
 233 -- Haskell-style line pragmas, of the form
 234 --    {-# LINE <line> "<file>" #-}
 235 <line_prag2> $decdigit+                 { setLine line_prag2a }
 236 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 237 <line_prag2b> "#-}"|"-}"                { pop }
 238    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 239    -- with older versions of GHC which generated these.
 240
 241 <0,option_prags> {
 242   "{-#" $whitechar* (RULES|rules)  / { notFollowedByPragmaChar } { rulePrag }
 243   "{-#" $whitechar* (INLINE|inline)      / { notFollowedByPragmaChar }
 244                     { token (ITinline_prag True) }
 245   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline) / { notFollowedByPragmaChar }
 246                                         { token (ITinline_prag False) }
 247   "{-#" $whitechar* (INLINE|inline)
 248         $whitechar+ (CONLIKE|conlike) / { notFollowedByPragmaChar }
 249                                         { token (ITinline_conlike_prag True) }
 250   "{-#" $whitechar* (NO(T)?INLINE|no(t?)inline)
 251         $whitechar+ (CONLIKE|constructorlike) / { notFollowedByPragmaChar }
 252                                         { token (ITinline_conlike_prag False) }
 253   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e) / { notFollowedByPragmaChar }
 254                                         { token ITspec_prag }
 255   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 256         $whitechar+ (INLINE|inline) / { notFollowedByPragmaChar }
 257                     { token (ITspec_inline_prag True) }
 258   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 259         $whitechar+ (NO(T?)INLINE|no(t?)inline) / { notFollowedByPragmaChar }
 260                                         { token (ITspec_inline_prag False) }
 261   "{-#" $whitechar* (SOURCE|source) / { notFollowedByPragmaChar }
 262                     { token ITsource_prag }
 263   "{-#" $whitechar* (WARNING|warning) / { notFollowedByPragmaChar }
 264                                         { token ITwarning_prag }
 265   "{-#" $whitechar* (DEPRECATED|deprecated) / { notFollowedByPragmaChar }
 266                                         { token ITdeprecated_prag }
 267   "{-#" $whitechar* (SCC|scc)  / { notFollowedByPragmaChar }
 268                     { token ITscc_prag }
 269   "{-#" $whitechar* (GENERATED|generated) / { notFollowedByPragmaChar }
 270                                         { token ITgenerated_prag }
 271   "{-#" $whitechar* (CORE|core) / { notFollowedByPragmaChar }
 272                     { token ITcore_prag }
 273   "{-#" $whitechar* (UNPACK|unpack) / { notFollowedByPragmaChar }
 274                     { token ITunpack_prag }
 275   "{-#" $whitechar* (ANN|ann) / { notFollowedByPragmaChar }
 276                     { token ITann_prag }
 277
 278   -- We ignore all these pragmas, but don't generate a warning for them
 279   -- CFILES is a hugs-only thing.
 280   "{-#" $whitechar* (OPTIONS_(HUGS|hugs|NHC98|nhc98|JHC|jhc|YHC|yhc|CATCH|catch|DERIVE|derive)|CFILES|cfiles|CONTRACT|contract) / { notFollowedByPragmaChar }
 281                     { nested_comment lexToken }
 282
 283   -- ToDo: should only be valid inside a pragma:
 284   "#-}"                                 { endPrag }
 285 }
 286
 287 <option_prags> {
 288   "{-#"  $whitechar* (OPTIONS|options) / { notFollowedByPragmaChar }
 289                                         { lex_string_prag IToptions_prag }
 290   "{-#"  $whitechar* (OPTIONS_GHC|options_ghc) / { notFollowedByPragmaChar }
 291                                         { lex_string_prag IToptions_prag }
 292   "{-#"  $whitechar* (OPTIONS_HADDOCK|options_haddock)
 293                    / { notFollowedByPragmaChar }
 294                                          { lex_string_prag ITdocOptions }
 295   "-- #"                                 { multiline_doc_comment }
 296   "{-#"  $whitechar* (LANGUAGE|language) / { notFollowedByPragmaChar }
 297                                          { token ITlanguage_prag }
 298   "{-#"  $whitechar* (INCLUDE|include) / { notFollowedByPragmaChar }
 299                                          { lex_string_prag ITinclude_prag }
 300 }
 301
 302 <0> {
 303   -- In the "0" mode we ignore these pragmas
 304   "{-#"  $whitechar* (OPTIONS|options|OPTIONS_GHC|options_ghc|OPTIONS_HADDOCK|options_haddock|LANGUAGE|language|INCLUDE|include) / { notFollowedByPragmaChar }
 305                      { nested_comment lexToken }
 306 }
 307
 308 <0> {
 309   "-- #" .* { lineCommentToken }
 310 }
 311
 312 <0,option_prags> {
 313   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 314                     (nested_comment lexToken) }
 315 }
 316
 317 -- '0' state: ordinary lexemes
 318
 319 -- Haddock comments
 320
 321 <0> {
 322   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 323   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 324 }
 325
 326 -- "special" symbols
 327
 328 <0> {
 329   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 330   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 331 }
 332
 333 <0> {
 334   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 335   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 336   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 337   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 338   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 339   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 340   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 341   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 342
 343   "[$" @varid "|"  / { ifExtension qqEnabled }
 344                      { lex_quasiquote_tok }
 345 }
 346
 347 <0> {
 348   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 349                                         { special IToparenbar }
 350   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 351 }
 352
 353 <0> {
 354   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 355 }
 356
 357 <0> {
 358   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 359          { token IToubxparen }
 360   "#)" / { ifExtension unboxedTuplesEnabled }
 361          { token ITcubxparen }
 362 }
 363
 364 <0> {
 365   "{|" / { ifExtension genericsEnabled } { token ITocurlybar }
 366   "|}" / { ifExtension genericsEnabled } { token ITccurlybar }
 367 }
 368
 369 <0,option_prags> {
 370   \(                                    { special IToparen }
 371   \)                                    { special ITcparen }
 372   \[                                    { special ITobrack }
 373   \]                                    { special ITcbrack }
 374   \,                                    { special ITcomma }
 375   \;                                    { special ITsemi }
 376   \`                                    { special ITbackquote }
 377
 378   \{                                    { open_brace }
 379   \}                                    { close_brace }
 380 }
 381
 382 <0,option_prags> {
 383   @qual @varid                  { idtoken qvarid }
 384   @qual @conid                  { idtoken qconid }
 385   @varid                        { varid }
 386   @conid                        { idtoken conid }
 387 }
 388
 389 <0> {
 390   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 391   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 392   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 393   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 394 }
 395
 396 -- ToDo: - move `var` and (sym) into lexical syntax?
 397 --       - remove backquote from $special?
 398 <0> {
 399   @qual @varsym       / { ifExtension oldQualOps } { idtoken qvarsym }
 400   @qual @consym       / { ifExtension oldQualOps } { idtoken qconsym }
 401   @qual \( @varsym \) / { ifExtension newQualOps } { idtoken prefixqvarsym }
 402   @qual \( @consym \) / { ifExtension newQualOps } { idtoken prefixqconsym }
 403   @varsym                                          { varsym }
 404   @consym                                          { consym }
 405 }
 406
 407 -- For the normal boxed literals we need to be careful
 408 -- when trying to be close to Haskell98
 409 <0> {
 410   -- Normal integral literals (:: Num a => a, from Integer)
 411   @decimal           { tok_num positive 0 0 decimal }
 412   0[oO] @octal       { tok_num positive 2 2 octal }
 413   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 414
 415   -- Normal rational literals (:: Fractional a => a, from Rational)
 416   @floating_point    { strtoken tok_float }
 417 }
 418
 419 <0> {
 420   -- Unboxed ints (:: Int#) and words (:: Word#)
 421   -- It's simpler (and faster?) to give separate cases to the negatives,
 422   -- especially considering octal/hexadecimal prefixes.
 423   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 424   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 425   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 426   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 427   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 428   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 429
 430   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 431   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 432   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 433
 434   -- Unboxed floats and doubles (:: Float#, :: Double#)
 435   -- prim_{float,double} work with signed literals
 436   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 437   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 438 }
 439
 440 -- Strings and chars are lexed by hand-written code.  The reason is
 441 -- that even if we recognise the string or char here in the regex
 442 -- lexer, we would still have to parse the string afterward in order
 443 -- to convert it to a String.
 444 <0> {
 445   \'                            { lex_char_tok }
 446   \"                            { lex_string_tok }
 447 }
 448
 449 {
 450 -- -----------------------------------------------------------------------------
 451 -- The token type
 452
 453 data Token
 454   = ITas                        -- Haskell keywords
 455   | ITcase
 456   | ITclass
 457   | ITdata
 458   | ITdefault
 459   | ITderiving
 460   | ITdo
 461   | ITelse
 462   | IThiding
 463   | ITif
 464   | ITimport
 465   | ITin
 466   | ITinfix
 467   | ITinfixl
 468   | ITinfixr
 469   | ITinstance
 470   | ITlet
 471   | ITmodule
 472   | ITnewtype
 473   | ITof
 474   | ITqualified
 475   | ITthen
 476   | ITtype
 477   | ITwhere
 478   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 479
 480   | ITforall                    -- GHC extension keywords
 481   | ITforeign
 482   | ITexport
 483   | ITlabel
 484   | ITdynamic
 485   | ITsafe
 486   | ITthreadsafe
 487   | ITunsafe
 488   | ITstdcallconv
 489   | ITccallconv
 490   | ITdotnet
 491   | ITmdo
 492   | ITfamily
 493   | ITgroup
 494   | ITby
 495   | ITusing
 496
 497         -- Pragmas
 498   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 499   | ITinline_conlike_prag Bool  -- same
 500   | ITspec_prag                 -- SPECIALISE
 501   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 502   | ITsource_prag
 503   | ITrules_prag
 504   | ITwarning_prag
 505   | ITdeprecated_prag
 506   | ITline_prag
 507   | ITscc_prag
 508   | ITgenerated_prag
 509   | ITcore_prag                 -- hdaume: core annotations
 510   | ITunpack_prag
 511   | ITann_prag
 512   | ITclose_prag
 513   | IToptions_prag String
 514   | ITinclude_prag String
 515   | ITlanguage_prag
 516
 517   | ITdotdot                    -- reserved symbols
 518   | ITcolon
 519   | ITdcolon
 520   | ITequal
 521   | ITlam
 522   | ITvbar
 523   | ITlarrow
 524   | ITrarrow
 525   | ITat
 526   | ITtilde
 527   | ITdarrow
 528   | ITminus
 529   | ITbang
 530   | ITstar
 531   | ITdot
 532
 533   | ITbiglam                    -- GHC-extension symbols
 534
 535   | ITocurly                    -- special symbols
 536   | ITccurly
 537   | ITocurlybar                 -- {|, for type applications
 538   | ITccurlybar                 -- |}, for type applications
 539   | ITvocurly
 540   | ITvccurly
 541   | ITobrack
 542   | ITopabrack                  -- [:, for parallel arrays with -XParr
 543   | ITcpabrack                  -- :], for parallel arrays with -XParr
 544   | ITcbrack
 545   | IToparen
 546   | ITcparen
 547   | IToubxparen
 548   | ITcubxparen
 549   | ITsemi
 550   | ITcomma
 551   | ITunderscore
 552   | ITbackquote
 553
 554   | ITvarid   FastString        -- identifiers
 555   | ITconid   FastString
 556   | ITvarsym  FastString
 557   | ITconsym  FastString
 558   | ITqvarid  (FastString,FastString)
 559   | ITqconid  (FastString,FastString)
 560   | ITqvarsym (FastString,FastString)
 561   | ITqconsym (FastString,FastString)
 562   | ITprefixqvarsym (FastString,FastString)
 563   | ITprefixqconsym (FastString,FastString)
 564
 565   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 566
 567   | ITpragma StringBuffer
 568
 569   | ITchar       Char
 570   | ITstring     FastString
 571   | ITinteger    Integer
 572   | ITrational   Rational
 573
 574   | ITprimchar   Char
 575   | ITprimstring FastString
 576   | ITprimint    Integer
 577   | ITprimword   Integer
 578   | ITprimfloat  Rational
 579   | ITprimdouble Rational
 580
 581   -- MetaHaskell extension tokens
 582   | ITopenExpQuote              --  [| or [e|
 583   | ITopenPatQuote              --  [p|
 584   | ITopenDecQuote              --  [d|
 585   | ITopenTypQuote              --  [t|
 586   | ITcloseQuote                --  |]
 587   | ITidEscape   FastString     --  $x
 588   | ITparenEscape               --  $(
 589   | ITvarQuote                  --  '
 590   | ITtyQuote                   --  ''
 591   | ITquasiQuote (FastString,FastString,SrcSpan) --  [:...|...|]
 592
 593   -- Arrow notation extension
 594   | ITproc
 595   | ITrec
 596   | IToparenbar                 --  (|
 597   | ITcparenbar                 --  |)
 598   | ITlarrowtail                --  -<
 599   | ITrarrowtail                --  >-
 600   | ITLarrowtail                --  -<<
 601   | ITRarrowtail                --  >>-
 602
 603   | ITunknown String            -- Used when the lexer can't make sense of it
 604   | ITeof                       -- end of file token
 605
 606   -- Documentation annotations
 607   | ITdocCommentNext  String     -- something beginning '-- |'
 608   | ITdocCommentPrev  String     -- something beginning '-- ^'
 609   | ITdocCommentNamed String     -- something beginning '-- $'
 610   | ITdocSection      Int String -- a section heading
 611   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 612   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 613   | ITlineComment     String     -- comment starting by "--"
 614   | ITblockComment    String     -- comment in {- -}
 615
 616 #ifdef DEBUG
 617   deriving Show -- debugging
 618 #endif
 619
 620 {-
 621 isSpecial :: Token -> Bool
 622 -- If we see M.x, where x is a keyword, but
 623 -- is special, we treat is as just plain M.x,
 624 -- not as a keyword.
 625 isSpecial ITas          = True
 626 isSpecial IThiding      = True
 627 isSpecial ITqualified   = True
 628 isSpecial ITforall      = True
 629 isSpecial ITexport      = True
 630 isSpecial ITlabel       = True
 631 isSpecial ITdynamic     = True
 632 isSpecial ITsafe        = True
 633 isSpecial ITthreadsafe  = True
 634 isSpecial ITunsafe      = True
 635 isSpecial ITccallconv   = True
 636 isSpecial ITstdcallconv = True
 637 isSpecial ITmdo         = True
 638 isSpecial ITfamily      = True
 639 isSpecial ITgroup   = True
 640 isSpecial ITby      = True
 641 isSpecial ITusing   = True
 642 isSpecial _             = False
 643 -}
 644
 645 -- the bitmap provided as the third component indicates whether the
 646 -- corresponding extension keyword is valid under the extension options
 647 -- provided to the compiler; if the extension corresponding to *any* of the
 648 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 649 -- facilitates using a keyword in two different extensions that can be
 650 -- activated independently)
 651 --
 652 reservedWordsFM = listToUFM $
 653         map (\(x, y, z) -> (mkFastString x, (y, z)))
 654        [( "_",          ITunderscore,   0 ),
 655         ( "as",         ITas,           0 ),
 656         ( "case",       ITcase,         0 ),
 657         ( "class",      ITclass,        0 ),
 658         ( "data",       ITdata,         0 ),
 659         ( "default",    ITdefault,      0 ),
 660         ( "deriving",   ITderiving,     0 ),
 661         ( "do",         ITdo,           0 ),
 662         ( "else",       ITelse,         0 ),
 663         ( "hiding",     IThiding,       0 ),
 664         ( "if",         ITif,           0 ),
 665         ( "import",     ITimport,       0 ),
 666         ( "in",         ITin,           0 ),
 667         ( "infix",      ITinfix,        0 ),
 668         ( "infixl",     ITinfixl,       0 ),
 669         ( "infixr",     ITinfixr,       0 ),
 670         ( "instance",   ITinstance,     0 ),
 671         ( "let",        ITlet,          0 ),
 672         ( "module",     ITmodule,       0 ),
 673         ( "newtype",    ITnewtype,      0 ),
 674         ( "of",         ITof,           0 ),
 675         ( "qualified",  ITqualified,    0 ),
 676         ( "then",       ITthen,         0 ),
 677         ( "type",       ITtype,         0 ),
 678         ( "where",      ITwhere,        0 ),
 679         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 680
 681     ( "forall", ITforall,        bit explicitForallBit .|. bit inRulePragBit),
 682         ( "mdo",        ITmdo,           bit recursiveDoBit),
 683         ( "family",     ITfamily,        bit tyFamBit),
 684     ( "group",  ITgroup,     bit transformComprehensionsBit),
 685     ( "by",     ITby,        bit transformComprehensionsBit),
 686     ( "using",  ITusing,     bit transformComprehensionsBit),
 687
 688         ( "foreign",    ITforeign,       bit ffiBit),
 689         ( "export",     ITexport,        bit ffiBit),
 690         ( "label",      ITlabel,         bit ffiBit),
 691         ( "dynamic",    ITdynamic,       bit ffiBit),
 692         ( "safe",       ITsafe,          bit ffiBit),
 693         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 694         ( "unsafe",     ITunsafe,        bit ffiBit),
 695         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 696         ( "ccall",      ITccallconv,     bit ffiBit),
 697         ( "dotnet",     ITdotnet,        bit ffiBit),
 698
 699         ( "rec",        ITrec,           bit arrowsBit),
 700         ( "proc",       ITproc,          bit arrowsBit)
 701      ]
 702
 703 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 704 reservedSymsFM = listToUFM $
 705     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 706       [ ("..",  ITdotdot,   always)
 707         -- (:) is a reserved op, meaning only list cons
 708        ,(":",   ITcolon,    always)
 709        ,("::",  ITdcolon,   always)
 710        ,("=",   ITequal,    always)
 711        ,("\\",  ITlam,      always)
 712        ,("|",   ITvbar,     always)
 713        ,("<-",  ITlarrow,   always)
 714        ,("->",  ITrarrow,   always)
 715        ,("@",   ITat,       always)
 716        ,("~",   ITtilde,    always)
 717        ,("=>",  ITdarrow,   always)
 718        ,("-",   ITminus,    always)
 719        ,("!",   ITbang,     always)
 720
 721         -- For data T (a::*) = MkT
 722        ,("*", ITstar, \i -> kindSigsEnabled i || tyFamEnabled i)
 723         -- For 'forall a . t'
 724        ,(".", ITdot, \i -> explicitForallEnabled i || inRulePrag i)
 725
 726        ,("-<",  ITlarrowtail, arrowsEnabled)
 727        ,(">-",  ITrarrowtail, arrowsEnabled)
 728        ,("-<<", ITLarrowtail, arrowsEnabled)
 729        ,(">>-", ITRarrowtail, arrowsEnabled)
 730
 731        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 732        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 733        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 734                                 explicitForallEnabled i)
 735        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 736        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 737        ,("⋯",   ITdotdot, unicodeSyntaxEnabled)
 738         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 739         -- form part of a large operator.  This would let us have a better
 740         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 741        ]
 742
 743 -- -----------------------------------------------------------------------------
 744 -- Lexer actions
 745
 746 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 747
 748 special :: Token -> Action
 749 special tok span _buf _len = return (L span tok)
 750
 751 token, layout_token :: Token -> Action
 752 token t span _buf _len = return (L span t)
 753 layout_token t span _buf _len = pushLexState layout >> return (L span t)
 754
 755 idtoken :: (StringBuffer -> Int -> Token) -> Action
 756 idtoken f span buf len = return (L span $! (f buf len))
 757
 758 skip_one_varid :: (FastString -> Token) -> Action
 759 skip_one_varid f span buf len
 760   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 761
 762 strtoken :: (String -> Token) -> Action
 763 strtoken f span buf len =
 764   return (L span $! (f $! lexemeToString buf len))
 765
 766 init_strtoken :: Int -> (String -> Token) -> Action
 767 -- like strtoken, but drops the last N character(s)
 768 init_strtoken drop f span buf len =
 769   return (L span $! (f $! lexemeToString buf (len-drop)))
 770
 771 begin :: Int -> Action
 772 begin code _span _str _len = do pushLexState code; lexToken
 773
 774 pop :: Action
 775 pop _span _buf _len = do popLexState; lexToken
 776
 777 pop_and :: Action -> Action
 778 pop_and act span buf len = do popLexState; act span buf len
 779
 780 {-# INLINE nextCharIs #-}
 781 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 782
 783 notFollowedBy char _ _ _ (AI _ _ buf)
 784   = nextCharIs buf (/=char)
 785
 786 notFollowedBySymbol _ _ _ (AI _ _ buf)
 787   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 788
 789 notFollowedByPragmaChar _ _ _ (AI _ _ buf)
 790   = nextCharIs buf (\c -> not (isAlphaNum c || c == '_'))
 791
 792 -- We must reject doc comments as being ordinary comments everywhere.
 793 -- In some cases the doc comment will be selected as the lexeme due to
 794 -- maximal munch, but not always, because the nested comment rule is
 795 -- valid in all states, but the doc-comment rules are only valid in
 796 -- the non-layout states.
 797 isNormalComment bits _ _ (AI _ _ buf)
 798   | haddockEnabled bits = notFollowedByDocOrPragma
 799   | otherwise           = nextCharIs buf (/='#')
 800   where
 801     notFollowedByDocOrPragma
 802        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 803
 804 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 805
 806 {-
 807 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 808   = if haddockEnabled bits then False else (p buf)
 809 -}
 810
 811 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 812
 813 ifExtension pred bits _ _ _ = pred bits
 814
 815 multiline_doc_comment :: Action
 816 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 817   where
 818     worker commentAcc input docType oneLine = case alexGetChar input of
 819       Just ('\n', input')
 820         | oneLine -> docCommentEnd input commentAcc docType buf span
 821         | otherwise -> case checkIfCommentLine input' of
 822           Just input -> worker ('\n':commentAcc) input docType False
 823           Nothing -> docCommentEnd input commentAcc docType buf span
 824       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 825       Nothing -> docCommentEnd input commentAcc docType buf span
 826
 827     checkIfCommentLine input = check (dropNonNewlineSpace input)
 828       where
 829         check input = case alexGetChar input of
 830           Just ('-', input) -> case alexGetChar input of
 831             Just ('-', input) -> case alexGetChar input of
 832               Just (c, _) | c /= '-' -> Just input
 833               _ -> Nothing
 834             _ -> Nothing
 835           _ -> Nothing
 836
 837         dropNonNewlineSpace input = case alexGetChar input of
 838           Just (c, input')
 839             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 840             | otherwise -> input
 841           Nothing -> input
 842
 843 lineCommentToken :: Action
 844 lineCommentToken span buf len = do
 845   b <- extension rawTokenStreamEnabled
 846   if b then strtoken ITlineComment span buf len else lexToken
 847
 848 {-
 849   nested comments require traversing by hand, they can't be parsed
 850   using regular expressions.
 851 -}
 852 nested_comment :: P (Located Token) -> Action
 853 nested_comment cont span _str _len = do
 854   input <- getInput
 855   go "" (1::Int) input
 856   where
 857     go commentAcc 0 input = do setInput input
 858                                b <- extension rawTokenStreamEnabled
 859                                if b
 860                                  then docCommentEnd input commentAcc ITblockComment _str span
 861                                  else cont
 862     go commentAcc n input = case alexGetChar input of
 863       Nothing -> errBrace input span
 864       Just ('-',input) -> case alexGetChar input of
 865         Nothing  -> errBrace input span
 866         Just ('\125',input) -> go commentAcc (n-1) input
 867         Just (_,_)          -> go ('-':commentAcc) n input
 868       Just ('\123',input) -> case alexGetChar input of
 869         Nothing  -> errBrace input span
 870         Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
 871         Just (_,_)       -> go ('\123':commentAcc) n input
 872       Just (c,input) -> go (c:commentAcc) n input
 873
 874 nested_doc_comment :: Action
 875 nested_doc_comment span buf _len = withLexedDocType (go "")
 876   where
 877     go commentAcc input docType _ = case alexGetChar input of
 878       Nothing -> errBrace input span
 879       Just ('-',input) -> case alexGetChar input of
 880         Nothing -> errBrace input span
 881         Just ('\125',input) ->
 882           docCommentEnd input commentAcc docType buf span
 883         Just (_,_) -> go ('-':commentAcc) input docType False
 884       Just ('\123', input) -> case alexGetChar input of
 885         Nothing  -> errBrace input span
 886         Just ('-',input) -> do
 887           setInput input
 888           let cont = do input <- getInput; go commentAcc input docType False
 889           nested_comment cont span buf _len
 890         Just (_,_) -> go ('\123':commentAcc) input docType False
 891       Just (c,input) -> go (c:commentAcc) input docType False
 892
 893 withLexedDocType lexDocComment = do
 894   input@(AI _ _ buf) <- getInput
 895   case prevChar buf ' ' of
 896     '|' -> lexDocComment input ITdocCommentNext False
 897     '^' -> lexDocComment input ITdocCommentPrev False
 898     '$' -> lexDocComment input ITdocCommentNamed False
 899     '*' -> lexDocSection 1 input
 900     '#' -> lexDocComment input ITdocOptionsOld False
 901  where
 902     lexDocSection n input = case alexGetChar input of
 903       Just ('*', input) -> lexDocSection (n+1) input
 904       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 905       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 906
 907 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 908 -- off again at the end of the pragma.
 909 rulePrag :: Action
 910 rulePrag span buf len = do
 911   setExts (.|. bit inRulePragBit)
 912   return (L span ITrules_prag)
 913
 914 endPrag :: Action
 915 endPrag span buf len = do
 916   setExts (.&. complement (bit inRulePragBit))
 917   return (L span ITclose_prag)
 918
 919 -- docCommentEnd
 920 -------------------------------------------------------------------------------
 921 -- This function is quite tricky. We can't just return a new token, we also
 922 -- need to update the state of the parser. Why? Because the token is longer
 923 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 924 -- it writes the wrong token length to the parser state. This function is
 925 -- called afterwards, so it can just update the state.
 926
 927 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 928 -- which is something that the original lexer didn't account for.
 929 -- I have added last_line_len in the parser state which represents the length
 930 -- of the part of the token that is on the last line. It is now used for layout
 931 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 932 -- was before, the full length of the token, and it is now only used for error
 933 -- messages. /Waern
 934
 935 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 936                  SrcSpan -> P (Located Token)
 937 docCommentEnd input commentAcc docType buf span = do
 938   setInput input
 939   let (AI loc last_offs nextBuf) = input
 940       comment = reverse commentAcc
 941       span' = mkSrcSpan (srcSpanStart span) loc
 942       last_len = byteDiff buf nextBuf
 943
 944       last_line_len = if (last_offs - last_len < 0)
 945         then last_offs
 946         else last_len
 947
 948   span `seq` setLastToken span' last_len last_line_len
 949   return (L span' (docType comment))
 950
 951 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 952
 953 open_brace, close_brace :: Action
 954 open_brace span _str _len = do
 955   ctx <- getContext
 956   setContext (NoLayout:ctx)
 957   return (L span ITocurly)
 958 close_brace span _str _len = do
 959   popContext
 960   return (L span ITccurly)
 961
 962 qvarid buf len = ITqvarid $! splitQualName buf len False
 963 qconid buf len = ITqconid $! splitQualName buf len False
 964
 965 splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
 966 -- takes a StringBuffer and a length, and returns the module name
 967 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 968 -- because of hierarchical module names.
 969 splitQualName orig_buf len parens = split orig_buf orig_buf
 970   where
 971     split buf dot_buf
 972         | orig_buf `byteDiff` buf >= len  = done dot_buf
 973         | c == '.'                        = found_dot buf'
 974         | otherwise                       = split buf' dot_buf
 975       where
 976        (c,buf') = nextChar buf
 977
 978     -- careful, we might get names like M....
 979     -- so, if the character after the dot is not upper-case, this is
 980     -- the end of the qualifier part.
 981     found_dot buf -- buf points after the '.'
 982         | isUpper c    = split buf' buf
 983         | otherwise    = done buf
 984       where
 985        (c,buf') = nextChar buf
 986
 987     done dot_buf =
 988         (lexemeToFastString orig_buf (qual_size - 1),
 989          if parens -- Prelude.(+)
 990             then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
 991             else lexemeToFastString dot_buf (len - qual_size))
 992       where
 993         qual_size = orig_buf `byteDiff` dot_buf
 994
 995 varid span buf len =
 996   fs `seq`
 997   case lookupUFM reservedWordsFM fs of
 998         Just (keyword,0)    -> do
 999                 maybe_layout keyword
1000                 return (L span keyword)
1001         Just (keyword,exts) -> do
1002                 b <- extension (\i -> exts .&. i /= 0)
1003                 if b then do maybe_layout keyword
1004                              return (L span keyword)
1005                      else return (L span (ITvarid fs))
1006         _other -> return (L span (ITvarid fs))
1007   where
1008         fs = lexemeToFastString buf len
1009
1010 conid buf len = ITconid fs
1011   where fs = lexemeToFastString buf len
1012
1013 qvarsym buf len = ITqvarsym $! splitQualName buf len False
1014 qconsym buf len = ITqconsym $! splitQualName buf len False
1015 prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
1016 prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
1017
1018 varsym = sym ITvarsym
1019 consym = sym ITconsym
1020
1021 sym con span buf len =
1022   case lookupUFM reservedSymsFM fs of
1023         Just (keyword,exts) -> do
1024                 b <- extension exts
1025                 if b then return (L span keyword)
1026                      else return (L span $! con fs)
1027         _other -> return (L span $! con fs)
1028   where
1029         fs = lexemeToFastString buf len
1030
1031 -- Variations on the integral numeric literal.
1032 tok_integral :: (Integer -> Token)
1033      -> (Integer -> Integer)
1034  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
1035      -> Int -> Int
1036      -> (Integer, (Char->Int)) -> Action
1037 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
1038   return $ L span $ itint $! transint $ parseUnsignedInteger
1039      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
1040
1041 -- some conveniences for use with tok_integral
1042 tok_num = tok_integral ITinteger
1043 tok_primint = tok_integral ITprimint
1044 tok_primword = tok_integral ITprimword positive
1045 positive = id
1046 negative = negate
1047 decimal = (10,octDecDigit)
1048 octal = (8,octDecDigit)
1049 hexadecimal = (16,hexDigit)
1050
1051 -- readRational can understand negative rationals, exponents, everything.
1052 tok_float        str = ITrational   $! readRational str
1053 tok_primfloat    str = ITprimfloat  $! readRational str
1054 tok_primdouble   str = ITprimdouble $! readRational str
1055
1056 -- -----------------------------------------------------------------------------
1057 -- Layout processing
1058
1059 -- we're at the first token on a line, insert layout tokens if necessary
1060 do_bol :: Action
1061 do_bol span _str _len = do
1062         pos <- getOffside
1063         case pos of
1064             LT -> do
1065                 --trace "layout: inserting '}'" $ do
1066                 popContext
1067                 -- do NOT pop the lex state, we might have a ';' to insert
1068                 return (L span ITvccurly)
1069             EQ -> do
1070                 --trace "layout: inserting ';'" $ do
1071                 popLexState
1072                 return (L span ITsemi)
1073             GT -> do
1074                 popLexState
1075                 lexToken
1076
1077 -- certain keywords put us in the "layout" state, where we might
1078 -- add an opening curly brace.
1079 maybe_layout ITdo       = pushLexState layout_do
1080 maybe_layout ITmdo      = pushLexState layout_do
1081 maybe_layout ITof       = pushLexState layout
1082 maybe_layout ITlet      = pushLexState layout
1083 maybe_layout ITwhere    = pushLexState layout
1084 maybe_layout ITrec      = pushLexState layout
1085 maybe_layout _          = return ()
1086
1087 -- Pushing a new implicit layout context.  If the indentation of the
1088 -- next token is not greater than the previous layout context, then
1089 -- Haskell 98 says that the new layout context should be empty; that is
1090 -- the lexer must generate {}.
1091 --
1092 -- We are slightly more lenient than this: when the new context is started
1093 -- by a 'do', then we allow the new context to be at the same indentation as
1094 -- the previous context.  This is what the 'strict' argument is for.
1095 --
1096 new_layout_context strict span _buf _len = do
1097     popLexState
1098     (AI _ offset _) <- getInput
1099     ctx <- getContext
1100     case ctx of
1101         Layout prev_off : _  |
1102            (strict     && prev_off >= offset  ||
1103             not strict && prev_off > offset) -> do
1104                 -- token is indented to the left of the previous context.
1105                 -- we must generate a {} sequence now.
1106                 pushLexState layout_left
1107                 return (L span ITvocurly)
1108         _ -> do
1109                 setContext (Layout offset : ctx)
1110                 return (L span ITvocurly)
1111
1112 do_layout_left span _buf _len = do
1113     popLexState
1114     pushLexState bol  -- we must be at the start of a line
1115     return (L span ITvccurly)
1116
1117 -- -----------------------------------------------------------------------------
1118 -- LINE pragmas
1119
1120 setLine :: Int -> Action
1121 setLine code span buf len = do
1122   let line = parseUnsignedInteger buf len 10 octDecDigit
1123   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
1124         -- subtract one: the line number refers to the *following* line
1125   popLexState
1126   pushLexState code
1127   lexToken
1128
1129 setFile :: Int -> Action
1130 setFile code span buf len = do
1131   let file = lexemeToFastString (stepOn buf) (len-2)
1132   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1133   popLexState
1134   pushLexState code
1135   lexToken
1136
1137
1138 -- -----------------------------------------------------------------------------
1139 -- Options, includes and language pragmas.
1140
1141 lex_string_prag :: (String -> Token) -> Action
1142 lex_string_prag mkTok span _buf _len
1143     = do input <- getInput
1144          start <- getSrcLoc
1145          tok <- go [] input
1146          end <- getSrcLoc
1147          return (L (mkSrcSpan start end) tok)
1148     where go acc input
1149               = if isString input "#-}"
1150                    then do setInput input
1151                            return (mkTok (reverse acc))
1152                    else case alexGetChar input of
1153                           Just (c,i) -> go (c:acc) i
1154                           Nothing -> err input
1155           isString _ [] = True
1156           isString i (x:xs)
1157               = case alexGetChar i of
1158                   Just (c,i') | c == x    -> isString i' xs
1159                   _other -> False
1160           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1161
1162
1163 -- -----------------------------------------------------------------------------
1164 -- Strings & Chars
1165
1166 -- This stuff is horrible.  I hates it.
1167
1168 lex_string_tok :: Action
1169 lex_string_tok span _buf _len = do
1170   tok <- lex_string ""
1171   end <- getSrcLoc
1172   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1173
1174 lex_string :: String -> P Token
1175 lex_string s = do
1176   i <- getInput
1177   case alexGetChar' i of
1178     Nothing -> lit_error
1179
1180     Just ('"',i)  -> do
1181         setInput i
1182         magicHash <- extension magicHashEnabled
1183         if magicHash
1184           then do
1185             i <- getInput
1186             case alexGetChar' i of
1187               Just ('#',i) -> do
1188                    setInput i
1189                    if any (> '\xFF') s
1190                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1191                     else let s' = mkZFastString (reverse s) in
1192                          return (ITprimstring s')
1193                         -- mkZFastString is a hack to avoid encoding the
1194                         -- string in UTF-8.  We just want the exact bytes.
1195               _other ->
1196                 return (ITstring (mkFastString (reverse s)))
1197           else
1198                 return (ITstring (mkFastString (reverse s)))
1199
1200     Just ('\\',i)
1201         | Just ('&',i) <- next -> do
1202                 setInput i; lex_string s
1203         | Just (c,i) <- next, is_space c -> do
1204                 setInput i; lex_stringgap s
1205         where next = alexGetChar' i
1206
1207     Just (c, i) -> do
1208         c' <- lex_char c i
1209         lex_string (c':s)
1210
1211 lex_stringgap s = do
1212   c <- getCharOrFail
1213   case c of
1214     '\\' -> lex_string s
1215     c | is_space c -> lex_stringgap s
1216     _other -> lit_error
1217
1218
1219 lex_char_tok :: Action
1220 -- Here we are basically parsing character literals, such as 'x' or '\n'
1221 -- but, when Template Haskell is on, we additionally spot
1222 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1223 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
1224 -- So we have to do two characters of lookahead: when we see 'x we need to
1225 -- see if there's a trailing quote
1226 lex_char_tok span _buf _len = do        -- We've seen '
1227    i1 <- getInput       -- Look ahead to first character
1228    let loc = srcSpanStart span
1229    case alexGetChar' i1 of
1230         Nothing -> lit_error
1231
1232         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
1233                   th_exts <- extension thEnabled
1234                   if th_exts then do
1235                         setInput i2
1236                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1237                    else lit_error
1238
1239         Just ('\\', i2@(AI _end2 _ _)) -> do    -- We've seen 'backslash
1240                   setInput i2
1241                   lit_ch <- lex_escape
1242                   mc <- getCharOrFail   -- Trailing quote
1243                   if mc == '\'' then finish_char_tok loc lit_ch
1244                                 else do setInput i2; lit_error
1245
1246         Just (c, i2@(AI _end2 _ _))
1247                 | not (isAny c) -> lit_error
1248                 | otherwise ->
1249
1250                 -- We've seen 'x, where x is a valid character
1251                 --  (i.e. not newline etc) but not a quote or backslash
1252            case alexGetChar' i2 of      -- Look ahead one more character
1253                 Just ('\'', i3) -> do   -- We've seen 'x'
1254                         setInput i3
1255                         finish_char_tok loc c
1256                 _other -> do            -- We've seen 'x not followed by quote
1257                                         -- (including the possibility of EOF)
1258                                         -- If TH is on, just parse the quote only
1259                         th_exts <- extension thEnabled
1260                         let (AI end _ _) = i1
1261                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1262                                    else do setInput i2; lit_error
1263
1264 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1265 finish_char_tok loc ch  -- We've already seen the closing quote
1266                         -- Just need to check for trailing #
1267   = do  magicHash <- extension magicHashEnabled
1268         i@(AI end _ _) <- getInput
1269         if magicHash then do
1270                 case alexGetChar' i of
1271                         Just ('#',i@(AI end _ _)) -> do
1272                                 setInput i
1273                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1274                         _other ->
1275                                 return (L (mkSrcSpan loc end) (ITchar ch))
1276                 else do
1277                    return (L (mkSrcSpan loc end) (ITchar ch))
1278
1279 lex_char :: Char -> AlexInput -> P Char
1280 lex_char c inp = do
1281   case c of
1282       '\\' -> do setInput inp; lex_escape
1283       c | isAny c -> do setInput inp; return c
1284       _other -> lit_error
1285
1286 isAny c | c > '\x7f' = isPrint c
1287         | otherwise  = is_any c
1288
1289 lex_escape :: P Char
1290 lex_escape = do
1291   c <- getCharOrFail
1292   case c of
1293         'a'   -> return '\a'
1294         'b'   -> return '\b'
1295         'f'   -> return '\f'
1296         'n'   -> return '\n'
1297         'r'   -> return '\r'
1298         't'   -> return '\t'
1299         'v'   -> return '\v'
1300         '\\'  -> return '\\'
1301         '"'   -> return '\"'
1302         '\''  -> return '\''
1303         '^'   -> do c <- getCharOrFail
1304                     if c >= '@' && c <= '_'
1305                         then return (chr (ord c - ord '@'))
1306                         else lit_error
1307
1308         'x'   -> readNum is_hexdigit 16 hexDigit
1309         'o'   -> readNum is_octdigit  8 octDecDigit
1310         x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1311
1312         c1 ->  do
1313            i <- getInput
1314            case alexGetChar' i of
1315             Nothing -> lit_error
1316             Just (c2,i2) ->
1317               case alexGetChar' i2 of
1318                 Nothing -> do setInput i2; lit_error
1319                 Just (c3,i3) ->
1320                    let str = [c1,c2,c3] in
1321                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1322                                      Just rest <- [maybePrefixMatch p str] ] of
1323                           (escape_char,[]):_ -> do
1324                                 setInput i3
1325                                 return escape_char
1326                           (escape_char,_:_):_ -> do
1327                                 setInput i2
1328                                 return escape_char
1329                           [] -> lit_error
1330
1331 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1332 readNum is_digit base conv = do
1333   i <- getInput
1334   c <- getCharOrFail
1335   if is_digit c
1336         then readNum2 is_digit base conv (conv c)
1337         else do setInput i; lit_error
1338
1339 readNum2 is_digit base conv i = do
1340   input <- getInput
1341   read i input
1342   where read i input = do
1343           case alexGetChar' input of
1344             Just (c,input') | is_digit c -> do
1345                 read (i*base + conv c) input'
1346             _other -> do
1347                 if i >= 0 && i <= 0x10FFFF
1348                    then do setInput input; return (chr i)
1349                    else lit_error
1350
1351 silly_escape_chars = [
1352         ("NUL", '\NUL'),
1353         ("SOH", '\SOH'),
1354         ("STX", '\STX'),
1355         ("ETX", '\ETX'),
1356         ("EOT", '\EOT'),
1357         ("ENQ", '\ENQ'),
1358         ("ACK", '\ACK'),
1359         ("BEL", '\BEL'),
1360         ("BS", '\BS'),
1361         ("HT", '\HT'),
1362         ("LF", '\LF'),
1363         ("VT", '\VT'),
1364         ("FF", '\FF'),
1365         ("CR", '\CR'),
1366         ("SO", '\SO'),
1367         ("SI", '\SI'),
1368         ("DLE", '\DLE'),
1369         ("DC1", '\DC1'),
1370         ("DC2", '\DC2'),
1371         ("DC3", '\DC3'),
1372         ("DC4", '\DC4'),
1373         ("NAK", '\NAK'),
1374         ("SYN", '\SYN'),
1375         ("ETB", '\ETB'),
1376         ("CAN", '\CAN'),
1377         ("EM", '\EM'),
1378         ("SUB", '\SUB'),
1379         ("ESC", '\ESC'),
1380         ("FS", '\FS'),
1381         ("GS", '\GS'),
1382         ("RS", '\RS'),
1383         ("US", '\US'),
1384         ("SP", '\SP'),
1385         ("DEL", '\DEL')
1386         ]
1387
1388 -- before calling lit_error, ensure that the current input is pointing to
1389 -- the position of the error in the buffer.  This is so that we can report
1390 -- a correct location to the user, but also so we can detect UTF-8 decoding
1391 -- errors if they occur.
1392 lit_error = lexError "lexical error in string/character literal"
1393
1394 getCharOrFail :: P Char
1395 getCharOrFail =  do
1396   i <- getInput
1397   case alexGetChar' i of
1398         Nothing -> lexError "unexpected end-of-file in string/character literal"
1399         Just (c,i)  -> do setInput i; return c
1400
1401 -- -----------------------------------------------------------------------------
1402 -- QuasiQuote
1403
1404 lex_quasiquote_tok :: Action
1405 lex_quasiquote_tok span buf len = do
1406   let quoter = reverse $ takeWhile (/= '$')
1407                $ reverse $ lexemeToString buf (len - 1)
1408   quoteStart <- getSrcLoc
1409   quote <- lex_quasiquote ""
1410   end <- getSrcLoc
1411   return (L (mkSrcSpan (srcSpanStart span) end)
1412            (ITquasiQuote (mkFastString quoter,
1413                           mkFastString (reverse quote),
1414                           mkSrcSpan quoteStart end)))
1415
1416 lex_quasiquote :: String -> P String
1417 lex_quasiquote s = do
1418   i <- getInput
1419   case alexGetChar' i of
1420     Nothing -> lit_error
1421
1422     Just ('\\',i)
1423         | Just ('|',i) <- next -> do
1424                 setInput i; lex_quasiquote ('|' : s)
1425         | Just (']',i) <- next -> do
1426                 setInput i; lex_quasiquote (']' : s)
1427         where next = alexGetChar' i
1428
1429     Just ('|',i)
1430         | Just (']',i) <- next -> do
1431                 setInput i; return s
1432         where next = alexGetChar' i
1433
1434     Just (c, i) -> do
1435          setInput i; lex_quasiquote (c : s)
1436
1437 -- -----------------------------------------------------------------------------
1438 -- Warnings
1439
1440 warn :: DynFlag -> SDoc -> Action
1441 warn option warning srcspan _buf _len = do
1442     addWarning option srcspan warning
1443     lexToken
1444
1445 warnThen :: DynFlag -> SDoc -> Action -> Action
1446 warnThen option warning action srcspan buf len = do
1447     addWarning option srcspan warning
1448     action srcspan buf len
1449
1450 -- -----------------------------------------------------------------------------
1451 -- The Parse Monad
1452
1453 data LayoutContext
1454   = NoLayout
1455   | Layout !Int
1456   deriving Show
1457
1458 data ParseResult a
1459   = POk PState a
1460   | PFailed
1461         SrcSpan         -- The start and end of the text span related to
1462                         -- the error.  Might be used in environments which can
1463                         -- show this span, e.g. by highlighting it.
1464         Message         -- The error message
1465
1466 data PState = PState {
1467         buffer     :: StringBuffer,
1468     dflags     :: DynFlags,
1469     messages   :: Messages,
1470         last_loc   :: SrcSpan,  -- pos of previous token
1471         last_offs  :: !Int,     -- offset of the previous token from the
1472                                 -- beginning of  the current line.
1473                                 -- \t is equal to 8 spaces.
1474         last_len   :: !Int,     -- len of previous token
1475   last_line_len :: !Int,
1476         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1477         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1478         context    :: [LayoutContext],
1479         lex_state  :: [Int]
1480      }
1481         -- last_loc and last_len are used when generating error messages,
1482         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1483         -- current token to happyError, we could at least get rid of last_len.
1484         -- Getting rid of last_loc would require finding another way to
1485         -- implement pushCurrentContext (which is only called from one place).
1486
1487 newtype P a = P { unP :: PState -> ParseResult a }
1488
1489 instance Monad P where
1490   return = returnP
1491   (>>=) = thenP
1492   fail = failP
1493
1494 returnP :: a -> P a
1495 returnP a = a `seq` (P $ \s -> POk s a)
1496
1497 thenP :: P a -> (a -> P b) -> P b
1498 (P m) `thenP` k = P $ \ s ->
1499         case m s of
1500                 POk s1 a         -> (unP (k a)) s1
1501                 PFailed span err -> PFailed span err
1502
1503 failP :: String -> P a
1504 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1505
1506 failMsgP :: String -> P a
1507 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1508
1509 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1510 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (mkSrcSpan loc1 loc2) (text str)
1511
1512 failSpanMsgP :: SrcSpan -> SDoc -> P a
1513 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1514
1515 extension :: (Int -> Bool) -> P Bool
1516 extension p = P $ \s -> POk s (p $! extsBitmap s)
1517
1518 getExts :: P Int
1519 getExts = P $ \s -> POk s (extsBitmap s)
1520
1521 setExts :: (Int -> Int) -> P ()
1522 setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
1523
1524 setSrcLoc :: SrcLoc -> P ()
1525 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1526
1527 getSrcLoc :: P SrcLoc
1528 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1529
1530 setLastToken :: SrcSpan -> Int -> Int -> P ()
1531 setLastToken loc len line_len = P $ \s -> POk s {
1532   last_loc=loc,
1533   last_len=len,
1534   last_line_len=line_len
1535 } ()
1536
1537 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1538
1539 alexInputPrevChar :: AlexInput -> Char
1540 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1541
1542 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1543 alexGetChar (AI loc ofs s)
1544   | atEnd s   = Nothing
1545   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1546                 --trace (show (ord c)) $
1547                 Just (adj_c, (AI loc' ofs' s'))
1548   where (c,s') = nextChar s
1549         loc'   = advanceSrcLoc loc c
1550         ofs'   = advanceOffs c ofs
1551
1552         non_graphic     = '\x0'
1553         upper           = '\x1'
1554         lower           = '\x2'
1555         digit           = '\x3'
1556         symbol          = '\x4'
1557         space           = '\x5'
1558         other_graphic   = '\x6'
1559
1560         adj_c
1561           | c <= '\x06' = non_graphic
1562           | c <= '\x7f' = c
1563           -- Alex doesn't handle Unicode, so when Unicode
1564           -- character is encoutered we output these values
1565           -- with the actual character value hidden in the state.
1566           | otherwise =
1567                 case generalCategory c of
1568                   UppercaseLetter       -> upper
1569                   LowercaseLetter       -> lower
1570                   TitlecaseLetter       -> upper
1571                   ModifierLetter        -> other_graphic
1572                   OtherLetter           -> lower -- see #1103
1573                   NonSpacingMark        -> other_graphic
1574                   SpacingCombiningMark  -> other_graphic
1575                   EnclosingMark         -> other_graphic
1576                   DecimalNumber         -> digit
1577                   LetterNumber          -> other_graphic
1578                   OtherNumber           -> other_graphic
1579                   ConnectorPunctuation  -> symbol
1580                   DashPunctuation       -> symbol
1581                   OpenPunctuation       -> other_graphic
1582                   ClosePunctuation      -> other_graphic
1583                   InitialQuote          -> other_graphic
1584                   FinalQuote            -> other_graphic
1585                   OtherPunctuation      -> symbol
1586                   MathSymbol            -> symbol
1587                   CurrencySymbol        -> symbol
1588                   ModifierSymbol        -> symbol
1589                   OtherSymbol           -> symbol
1590                   Space                 -> space
1591                   _other                -> non_graphic
1592
1593 -- This version does not squash unicode characters, it is used when
1594 -- lexing strings.
1595 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1596 alexGetChar' (AI loc ofs s)
1597   | atEnd s   = Nothing
1598   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1599                 --trace (show (ord c)) $
1600                 Just (c, (AI loc' ofs' s'))
1601   where (c,s') = nextChar s
1602         loc'   = advanceSrcLoc loc c
1603         ofs'   = advanceOffs c ofs
1604
1605 advanceOffs :: Char -> Int -> Int
1606 advanceOffs '\n' _    = 0
1607 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1608 advanceOffs _    offs = offs + 1
1609
1610 getInput :: P AlexInput
1611 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1612
1613 setInput :: AlexInput -> P ()
1614 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1615
1616 pushLexState :: Int -> P ()
1617 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1618
1619 popLexState :: P Int
1620 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1621
1622 getLexState :: P Int
1623 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1624
1625 -- for reasons of efficiency, flags indicating language extensions (eg,
1626 -- -fglasgow-exts or -XParr) are represented by a bitmap stored in an unboxed
1627 -- integer
1628
1629 genericsBit, ffiBit, parrBit :: Int
1630 genericsBit = 0 -- {| and |}
1631 ffiBit     = 1
1632 parrBit    = 2
1633 arrowsBit  = 4
1634 thBit      = 5
1635 ipBit      = 6
1636 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1637 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1638                 -- (doesn't affect the lexer)
1639 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1640 haddockBit = 10 -- Lex and parse Haddock comments
1641 magicHashBit = 11 -- "#" in both functions and operators
1642 kindSigsBit = 12 -- Kind signatures on type variables
1643 recursiveDoBit = 13 -- mdo
1644 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1645 unboxedTuplesBit = 15 -- (# and #)
1646 standaloneDerivingBit = 16 -- standalone instance deriving declarations
1647 transformComprehensionsBit = 17
1648 qqBit      = 18 -- enable quasiquoting
1649 inRulePragBit = 19
1650 rawTokenStreamBit = 20 -- producing a token stream with all comments included
1651 newQualOpsBit = 21 -- Haskell' qualified operator syntax, e.g. Prelude.(+)
1652
1653 genericsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1654 always           _     = True
1655 genericsEnabled  flags = testBit flags genericsBit
1656 ffiEnabled       flags = testBit flags ffiBit
1657 parrEnabled      flags = testBit flags parrBit
1658 arrowsEnabled    flags = testBit flags arrowsBit
1659 thEnabled        flags = testBit flags thBit
1660 ipEnabled        flags = testBit flags ipBit
1661 explicitForallEnabled flags = testBit flags explicitForallBit
1662 bangPatEnabled   flags = testBit flags bangPatBit
1663 tyFamEnabled     flags = testBit flags tyFamBit
1664 haddockEnabled   flags = testBit flags haddockBit
1665 magicHashEnabled flags = testBit flags magicHashBit
1666 kindSigsEnabled  flags = testBit flags kindSigsBit
1667 recursiveDoEnabled flags = testBit flags recursiveDoBit
1668 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1669 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1670 standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
1671 transformComprehensionsEnabled flags = testBit flags transformComprehensionsBit
1672 qqEnabled        flags = testBit flags qqBit
1673 inRulePrag       flags = testBit flags inRulePragBit
1674 rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
1675 newQualOps       flags = testBit flags newQualOpsBit
1676 oldQualOps flags = not (newQualOps flags)
1677
1678 -- PState for parsing options pragmas
1679 --
1680 pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1681 pragState dynflags buf loc =
1682   PState {
1683       buffer        = buf,
1684       messages      = emptyMessages,
1685       dflags        = dynflags,
1686       last_loc      = mkSrcSpan loc loc,
1687       last_offs     = 0,
1688       last_len      = 0,
1689       last_line_len = 0,
1690       loc           = loc,
1691       extsBitmap    = 0,
1692       context       = [],
1693       lex_state     = [bol, option_prags, 0]
1694     }
1695
1696
1697 -- create a parse state
1698 --
1699 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1700 mkPState buf loc flags  =
1701   PState {
1702       buffer          = buf,
1703       dflags        = flags,
1704       messages      = emptyMessages,
1705       last_loc      = mkSrcSpan loc loc,
1706       last_offs     = 0,
1707       last_len      = 0,
1708       last_line_len = 0,
1709       loc           = loc,
1710       extsBitmap    = fromIntegral bitmap,
1711       context       = [],
1712       lex_state     = [bol, 0]
1713         -- we begin in the layout state if toplev_layout is set
1714     }
1715     where
1716       bitmap = genericsBit `setBitIf` dopt Opt_Generics flags
1717                .|. ffiBit       `setBitIf` dopt Opt_ForeignFunctionInterface flags
1718                .|. parrBit      `setBitIf` dopt Opt_PArr         flags
1719                .|. arrowsBit    `setBitIf` dopt Opt_Arrows       flags
1720                .|. thBit        `setBitIf` dopt Opt_TemplateHaskell flags
1721                .|. qqBit        `setBitIf` dopt Opt_QuasiQuotes flags
1722                .|. ipBit        `setBitIf` dopt Opt_ImplicitParams flags
1723                .|. explicitForallBit `setBitIf` dopt Opt_ScopedTypeVariables flags
1724                .|. explicitForallBit `setBitIf` dopt Opt_LiberalTypeSynonyms flags
1725                .|. explicitForallBit `setBitIf` dopt Opt_PolymorphicComponents flags
1726                .|. explicitForallBit `setBitIf` dopt Opt_ExistentialQuantification flags
1727                .|. explicitForallBit `setBitIf` dopt Opt_Rank2Types flags
1728                .|. explicitForallBit `setBitIf` dopt Opt_RankNTypes flags
1729                .|. bangPatBit   `setBitIf` dopt Opt_BangPatterns flags
1730                .|. tyFamBit     `setBitIf` dopt Opt_TypeFamilies flags
1731                .|. haddockBit   `setBitIf` dopt Opt_Haddock      flags
1732                .|. magicHashBit `setBitIf` dopt Opt_MagicHash    flags
1733                .|. kindSigsBit  `setBitIf` dopt Opt_KindSignatures flags
1734                .|. recursiveDoBit `setBitIf` dopt Opt_RecursiveDo flags
1735                .|. unicodeSyntaxBit `setBitIf` dopt Opt_UnicodeSyntax flags
1736                .|. unboxedTuplesBit `setBitIf` dopt Opt_UnboxedTuples flags
1737                .|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags
1738                .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags
1739                .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
1740                .|. newQualOpsBit `setBitIf` dopt Opt_NewQualifiedOperators flags
1741       --
1742       setBitIf :: Int -> Bool -> Int
1743       b `setBitIf` cond | cond      = bit b
1744                         | otherwise = 0
1745
1746 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1747 addWarning option srcspan warning
1748  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1749        let warning' = mkWarnMsg srcspan alwaysQualify warning
1750            ws' = if dopt option d then ws `snocBag` warning' else ws
1751        in POk s{messages=(ws', es)} ()
1752
1753 getMessages :: PState -> Messages
1754 getMessages PState{messages=ms} = ms
1755
1756 getContext :: P [LayoutContext]
1757 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1758
1759 setContext :: [LayoutContext] -> P ()
1760 setContext ctx = P $ \s -> POk s{context=ctx} ()
1761
1762 popContext :: P ()
1763 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1764                               last_len = len, last_loc = last_loc }) ->
1765   case ctx of
1766         (_:tl) -> POk s{ context = tl } ()
1767         []     -> PFailed last_loc (srcParseErr buf len)
1768
1769 -- Push a new layout context at the indentation of the last token read.
1770 -- This is only used at the outer level of a module when the 'module'
1771 -- keyword is missing.
1772 pushCurrentContext :: P ()
1773 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1774     POk s{context = Layout (offs-len) : ctx} ()
1775 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1776
1777 getOffside :: P Ordering
1778 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1779                 let ord = case stk of
1780                         (Layout n:_) -> compare offs n
1781                         _            -> GT
1782                 in POk s ord
1783
1784 -- ---------------------------------------------------------------------------
1785 -- Construct a parse error
1786
1787 srcParseErr
1788   :: StringBuffer       -- current buffer (placed just after the last token)
1789   -> Int                -- length of the previous token
1790   -> Message
1791 srcParseErr buf len
1792   = hcat [ if null token
1793              then ptext (sLit "parse error (possibly incorrect indentation)")
1794              else hcat [ptext (sLit "parse error on input "),
1795                         char '`', text token, char '\'']
1796     ]
1797   where token = lexemeToString (offsetBytes (-len) buf) len
1798
1799 -- Report a parse failure, giving the span of the previous token as
1800 -- the location of the error.  This is the entry point for errors
1801 -- detected during parsing.
1802 srcParseFail :: P a
1803 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1804                             last_loc = last_loc } ->
1805     PFailed last_loc (srcParseErr buf len)
1806
1807 -- A lexical error is reported at a particular position in the source file,
1808 -- not over a token range.
1809 lexError :: String -> P a
1810 lexError str = do
1811   loc <- getSrcLoc
1812   (AI end _ buf) <- getInput
1813   reportLexError loc end buf str
1814
1815 -- -----------------------------------------------------------------------------
1816 -- This is the top-level function: called from the parser each time a
1817 -- new token is to be read from the input.
1818
1819 lexer :: (Located Token -> P a) -> P a
1820 lexer cont = do
1821   tok@(L _span _tok__) <- lexToken
1822 --  trace ("token: " ++ show tok__) $ do
1823   cont tok
1824
1825 lexToken :: P (Located Token)
1826 lexToken = do
1827   inp@(AI loc1 _ buf) <- getInput
1828   sc <- getLexState
1829   exts <- getExts
1830   case alexScanUser exts inp sc of
1831     AlexEOF -> do
1832         let span = mkSrcSpan loc1 loc1
1833         setLastToken span 0 0
1834         return (L span ITeof)
1835     AlexError (AI loc2 _ buf) ->
1836         reportLexError loc1 loc2 buf "lexical error"
1837     AlexSkip inp2 _ -> do
1838         setInput inp2
1839         lexToken
1840     AlexToken inp2@(AI end _ buf2) _ t -> do
1841         setInput inp2
1842         let span = mkSrcSpan loc1 end
1843         let bytes = byteDiff buf buf2
1844         span `seq` setLastToken span bytes bytes
1845         t span buf bytes
1846
1847 reportLexError loc1 loc2 buf str
1848   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1849   | otherwise =
1850   let
1851         c = fst (nextChar buf)
1852   in
1853   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1854     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1855     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1856
1857 lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token]
1858 lexTokenStream buf loc dflags = unP go initState
1859     where initState = mkPState buf loc (dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream)
1860           go = do
1861             ltok <- lexer return
1862             case ltok of
1863               L _ ITeof -> return []
1864               _ -> liftM (ltok:) go
1865 }