compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 --   qualified operator NOTES.
  24 --
  25 --   - If M.(+) is a single lexeme, then..
  26 --     - Probably (+) should be a single lexeme too, for consistency.
  27 --       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
  28 --     - But we have to rule out reserved operators, otherwise (..) becomes
  29 --       a different lexeme.
  30 --     - Should we therefore also rule out reserved operators in the qualified
  31 --       form?  This is quite difficult to achieve.  We don't do it for
  32 --       qualified varids.
  33
  34 {
  35 {-# OPTIONS -w #-}
  36 -- The above warning supression flag is a temporary kludge.
  37 -- While working on this module you are encouraged to remove it and fix
  38 -- any warnings in the module. See
  39 --     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
  40 -- for details
  41 --
  42 -- Note that Alex itself generates code with with some unused bindings and
  43 -- without type signatures, so removing the flag might not be possible.
  44
  45 {-# OPTIONS_GHC -funbox-strict-fields #-}
  46
  47 module Lexer (
  48    Token(..), lexer, pragState, mkPState, PState(..),
  49    P(..), ParseResult(..), getSrcLoc,
  50    failLocMsgP, failSpanMsgP, srcParseFail,
  51    getMessages,
  52    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  53    getLexState, popLexState, pushLexState,
  54    extension, standaloneDerivingEnabled, bangPatEnabled,
  55    addWarning,
  56    lexTokenStream
  57   ) where
  58
  59 import Bag
  60 import ErrUtils
  61 import Outputable
  62 import StringBuffer
  63 import FastString
  64 import FastTypes
  65 import SrcLoc
  66 import UniqFM
  67 import DynFlags
  68 import Ctype
  69 import Util             ( maybePrefixMatch, readRational )
  70
  71 import Control.Monad
  72 import Data.Bits
  73 import Data.Char        ( chr, ord, isSpace )
  74 import Data.Ratio
  75 import Debug.Trace
  76
  77 import Unicode ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  78 }
  79
  80 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  81 $whitechar   = [\ \n\r\f\v $unispace]
  82 $white_no_nl = $whitechar # \n
  83 $tab         = \t
  84
  85 $ascdigit  = 0-9
  86 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  87 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  88 $digit     = [$ascdigit $unidigit]
  89
  90 $special   = [\(\)\,\;\[\]\`\{\}]
  91 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  92 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  93 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  94
  95 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
  96 $asclarge  = [A-Z]
  97 $large     = [$asclarge $unilarge]
  98
  99 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
 100 $ascsmall  = [a-z]
 101 $small     = [$ascsmall $unismall \_]
 102
 103 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
 104 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
 105
 106 $octit     = 0-7
 107 $hexit     = [$decdigit A-F a-f]
 108 $symchar   = [$symbol \:]
 109 $nl        = [\n\r]
 110 $idchar    = [$small $large $digit \']
 111
 112 $docsym    = [\| \^ \* \$]
 113
 114 @varid     = $small $idchar*
 115 @conid     = $large $idchar*
 116
 117 @varsym    = $symbol $symchar*
 118 @consym    = \: $symchar*
 119
 120 @decimal     = $decdigit+
 121 @octal       = $octit+
 122 @hexadecimal = $hexit+
 123 @exponent    = [eE] [\-\+]? @decimal
 124
 125 -- we support the hierarchical module name extension:
 126 @qual = (@conid \.)+
 127
 128 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 129
 130 -- normal signed numerical literals can only be explicitly negative,
 131 -- not explicitly positive (contrast @exponent)
 132 @negative = \-
 133 @signed = @negative ?
 134
 135 haskell :-
 136
 137 -- everywhere: skip whitespace and comments
 138 $white_no_nl+                           ;
 139 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 140
 141 -- Everywhere: deal with nested comments.  We explicitly rule out
 142 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 143 -- (this can happen even though pragmas will normally take precedence due to
 144 -- longest-match, because pragmas aren't valid in every state, but comments
 145 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 146 -- set.
 147
 148 "{-" / { isNormalComment } { nested_comment lexToken }
 149
 150 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 151 -- more dashes followed by a symbol should be parsed as a varsym, so we
 152 -- have to exclude those.
 153
 154 -- Since Haddock comments aren't valid in every state, we need to rule them
 155 -- out here.
 156
 157 -- The following two rules match comments that begin with two dashes, but
 158 -- continue with a different character. The rules test that this character
 159 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 160 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 161 -- have a Haddock comment). The rules then munch the rest of the line.
 162
 163 "-- " ~[$docsym \#] .* { lineCommentToken }
 164 "--" [^$symbol : \ ] .* { lineCommentToken }
 165
 166 -- Next, match Haddock comments if no -haddock flag
 167
 168 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
 169
 170 -- Now, when we've matched comments that begin with 2 dashes and continue
 171 -- with a different character, we need to match comments that begin with three
 172 -- or more dashes (which clearly can't be Haddock comments). We only need to
 173 -- make sure that the first non-dash character isn't a symbol, and munch the
 174 -- rest of the line.
 175
 176 "---"\-* [^$symbol :] .* { lineCommentToken }
 177
 178 -- Since the previous rules all match dashes followed by at least one
 179 -- character, we also need to match a whole line filled with just dashes.
 180
 181 "--"\-* / { atEOL } { lineCommentToken }
 182
 183 -- We need this rule since none of the other single line comment rules
 184 -- actually match this case.
 185
 186 "-- " / { atEOL } { lineCommentToken }
 187
 188 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 189 -- blank lines) until we find a non-whitespace character, then do layout
 190 -- processing.
 191 --
 192 -- One slight wibble here: what if the line begins with {-#? In
 193 -- theory, we have to lex the pragma to see if it's one we recognise,
 194 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 195 -- as a nested comment.  We don't bother with this: if the line begins
 196 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 197 <bol> {
 198   \n                                    ;
 199   ^\# (line)?                           { begin line_prag1 }
 200   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 201   ^\# \! .* \n                          ; -- #!, for scripts
 202   ()                                    { do_bol }
 203 }
 204
 205 -- after a layout keyword (let, where, do, of), we begin a new layout
 206 -- context if the curly brace is missing.
 207 -- Careful! This stuff is quite delicate.
 208 <layout, layout_do> {
 209   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 210         -- we might encounter {-# here, but {- has been handled already
 211   \n                                    ;
 212   ^\# (line)?                           { begin line_prag1 }
 213 }
 214
 215 -- do is treated in a subtly different way, see new_layout_context
 216 <layout>    ()                          { new_layout_context True }
 217 <layout_do> ()                          { new_layout_context False }
 218
 219 -- after a new layout context which was found to be to the left of the
 220 -- previous context, we have generated a '{' token, and we now need to
 221 -- generate a matching '}' token.
 222 <layout_left>  ()                       { do_layout_left }
 223
 224 <0,option_prags> \n                             { begin bol }
 225
 226 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 227
 228 -- single-line line pragmas, of the form
 229 --    # <line> "<file>" <extra-stuff> \n
 230 <line_prag1> $decdigit+                 { setLine line_prag1a }
 231 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 232 <line_prag1b> .*                        { pop }
 233
 234 -- Haskell-style line pragmas, of the form
 235 --    {-# LINE <line> "<file>" #-}
 236 <line_prag2> $decdigit+                 { setLine line_prag2a }
 237 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 238 <line_prag2b> "#-}"|"-}"                { pop }
 239    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 240    -- with older versions of GHC which generated these.
 241
 242 <0,option_prags> {
 243   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 244   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 245   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 246                                         { token (ITinline_prag False) }
 247   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 248                                         { token ITspec_prag }
 249   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 250         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 251   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 252         $whitechar* (NO(T?)INLINE|no(t?)inline)
 253                                         { token (ITspec_inline_prag False) }
 254   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 255   "{-#" $whitechar* (WARNING|warning)
 256                                         { token ITwarning_prag }
 257   "{-#" $whitechar* (DEPRECATED|deprecated)
 258                                         { token ITdeprecated_prag }
 259   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 260   "{-#" $whitechar* (GENERATED|generated)
 261                                         { token ITgenerated_prag }
 262   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 263   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 264
 265   -- We ignore all these pragmas, but don't generate a warning for them
 266   -- CFILES is a hugs-only thing.
 267   "{-#" $whitechar* (OPTIONS_HUGS|options_hugs|OPTIONS_NHC98|options_nhc98|OPTIONS_JHC|options_jhc|CFILES|cfiles)
 268                     { nested_comment lexToken }
 269
 270   -- ToDo: should only be valid inside a pragma:
 271   "#-}"                                 { endPrag }
 272 }
 273
 274 <option_prags> {
 275   "{-#"  $whitechar* (OPTIONS|options)   { lex_string_prag IToptions_prag }
 276   "{-#"  $whitechar* (OPTIONS_GHC|options_ghc)
 277                                         { lex_string_prag IToptions_prag }
 278   "{-#"  $whitechar* (OPTIONS_HADDOCK|options_haddock)
 279                                          { lex_string_prag ITdocOptions }
 280   "-- #"                                 { multiline_doc_comment }
 281   "{-#"  $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
 282   "{-#"  $whitechar* (INCLUDE|include)   { lex_string_prag ITinclude_prag }
 283 }
 284
 285 <0> {
 286   -- In the "0" mode we ignore these pragmas
 287   "{-#"  $whitechar* (OPTIONS|options|OPTIONS_GHC|options_ghc|OPTIONS_HADDOCK|options_haddock|LANGUAGE|language|INCLUDE|include)
 288                      { nested_comment lexToken }
 289 }
 290
 291 <0> {
 292   "-- #" .* { lineCommentToken }
 293 }
 294
 295 <0,option_prags> {
 296   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 297                     (nested_comment lexToken) }
 298 }
 299
 300 -- '0' state: ordinary lexemes
 301
 302 -- Haddock comments
 303
 304 <0> {
 305   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 306   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 307 }
 308
 309 -- "special" symbols
 310
 311 <0> {
 312   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 313   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 314 }
 315
 316 <0> {
 317   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 318   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 319   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 320   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 321   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 322   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 323   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 324   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 325
 326   "[$" @varid "|"  / { ifExtension qqEnabled }
 327                      { lex_quasiquote_tok }
 328 }
 329
 330 <0> {
 331   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 332                                         { special IToparenbar }
 333   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 334 }
 335
 336 <0> {
 337   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 338 }
 339
 340 <0> {
 341   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 342          { token IToubxparen }
 343   "#)" / { ifExtension unboxedTuplesEnabled }
 344          { token ITcubxparen }
 345 }
 346
 347 <0> {
 348   "{|" / { ifExtension genericsEnabled } { token ITocurlybar }
 349   "|}" / { ifExtension genericsEnabled } { token ITccurlybar }
 350 }
 351
 352 <0,option_prags> {
 353   \(                                    { special IToparen }
 354   \)                                    { special ITcparen }
 355   \[                                    { special ITobrack }
 356   \]                                    { special ITcbrack }
 357   \,                                    { special ITcomma }
 358   \;                                    { special ITsemi }
 359   \`                                    { special ITbackquote }
 360
 361   \{                                    { open_brace }
 362   \}                                    { close_brace }
 363 }
 364
 365 <0,option_prags> {
 366   @qual @varid                  { idtoken qvarid }
 367   @qual @conid                  { idtoken qconid }
 368   @varid                        { varid }
 369   @conid                        { idtoken conid }
 370 }
 371
 372 <0> {
 373   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 374   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 375   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 376   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 377 }
 378
 379 -- ToDo: - move `var` and (sym) into lexical syntax?
 380 --       - remove backquote from $special?
 381 <0> {
 382   @qual @varsym       / { ifExtension oldQualOps } { idtoken qvarsym }
 383   @qual @consym       / { ifExtension oldQualOps } { idtoken qconsym }
 384   @qual \( @varsym \) / { ifExtension newQualOps } { idtoken prefixqvarsym }
 385   @qual \( @consym \) / { ifExtension newQualOps } { idtoken prefixqconsym }
 386   @varsym                                          { varsym }
 387   @consym                                          { consym }
 388 }
 389
 390 -- For the normal boxed literals we need to be careful
 391 -- when trying to be close to Haskell98
 392 <0> {
 393   -- Normal integral literals (:: Num a => a, from Integer)
 394   @decimal           { tok_num positive 0 0 decimal }
 395   0[oO] @octal       { tok_num positive 2 2 octal }
 396   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 397
 398   -- Normal rational literals (:: Fractional a => a, from Rational)
 399   @floating_point    { strtoken tok_float }
 400 }
 401
 402 <0> {
 403   -- Unboxed ints (:: Int#) and words (:: Word#)
 404   -- It's simpler (and faster?) to give separate cases to the negatives,
 405   -- especially considering octal/hexadecimal prefixes.
 406   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 407   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 408   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 409   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 410   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 411   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 412
 413   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 414   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 415   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 416
 417   -- Unboxed floats and doubles (:: Float#, :: Double#)
 418   -- prim_{float,double} work with signed literals
 419   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 420   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 421 }
 422
 423 -- Strings and chars are lexed by hand-written code.  The reason is
 424 -- that even if we recognise the string or char here in the regex
 425 -- lexer, we would still have to parse the string afterward in order
 426 -- to convert it to a String.
 427 <0> {
 428   \'                            { lex_char_tok }
 429   \"                            { lex_string_tok }
 430 }
 431
 432 {
 433 -- -----------------------------------------------------------------------------
 434 -- The token type
 435
 436 data Token
 437   = ITas                        -- Haskell keywords
 438   | ITcase
 439   | ITclass
 440   | ITdata
 441   | ITdefault
 442   | ITderiving
 443   | ITdo
 444   | ITelse
 445   | IThiding
 446   | ITif
 447   | ITimport
 448   | ITin
 449   | ITinfix
 450   | ITinfixl
 451   | ITinfixr
 452   | ITinstance
 453   | ITlet
 454   | ITmodule
 455   | ITnewtype
 456   | ITof
 457   | ITqualified
 458   | ITthen
 459   | ITtype
 460   | ITwhere
 461   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 462
 463   | ITforall                    -- GHC extension keywords
 464   | ITforeign
 465   | ITexport
 466   | ITlabel
 467   | ITdynamic
 468   | ITsafe
 469   | ITthreadsafe
 470   | ITunsafe
 471   | ITstdcallconv
 472   | ITccallconv
 473   | ITdotnet
 474   | ITmdo
 475   | ITfamily
 476   | ITgroup
 477   | ITby
 478   | ITusing
 479
 480         -- Pragmas
 481   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 482   | ITspec_prag                 -- SPECIALISE
 483   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 484   | ITsource_prag
 485   | ITrules_prag
 486   | ITwarning_prag
 487   | ITdeprecated_prag
 488   | ITline_prag
 489   | ITscc_prag
 490   | ITgenerated_prag
 491   | ITcore_prag                 -- hdaume: core annotations
 492   | ITunpack_prag
 493   | ITclose_prag
 494   | IToptions_prag String
 495   | ITinclude_prag String
 496   | ITlanguage_prag
 497
 498   | ITdotdot                    -- reserved symbols
 499   | ITcolon
 500   | ITdcolon
 501   | ITequal
 502   | ITlam
 503   | ITvbar
 504   | ITlarrow
 505   | ITrarrow
 506   | ITat
 507   | ITtilde
 508   | ITdarrow
 509   | ITminus
 510   | ITbang
 511   | ITstar
 512   | ITdot
 513
 514   | ITbiglam                    -- GHC-extension symbols
 515
 516   | ITocurly                    -- special symbols
 517   | ITccurly
 518   | ITocurlybar                 -- {|, for type applications
 519   | ITccurlybar                 -- |}, for type applications
 520   | ITvocurly
 521   | ITvccurly
 522   | ITobrack
 523   | ITopabrack                  -- [:, for parallel arrays with -XParr
 524   | ITcpabrack                  -- :], for parallel arrays with -XParr
 525   | ITcbrack
 526   | IToparen
 527   | ITcparen
 528   | IToubxparen
 529   | ITcubxparen
 530   | ITsemi
 531   | ITcomma
 532   | ITunderscore
 533   | ITbackquote
 534
 535   | ITvarid   FastString        -- identifiers
 536   | ITconid   FastString
 537   | ITvarsym  FastString
 538   | ITconsym  FastString
 539   | ITqvarid  (FastString,FastString)
 540   | ITqconid  (FastString,FastString)
 541   | ITqvarsym (FastString,FastString)
 542   | ITqconsym (FastString,FastString)
 543   | ITprefixqvarsym (FastString,FastString)
 544   | ITprefixqconsym (FastString,FastString)
 545
 546   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 547
 548   | ITpragma StringBuffer
 549
 550   | ITchar       Char
 551   | ITstring     FastString
 552   | ITinteger    Integer
 553   | ITrational   Rational
 554
 555   | ITprimchar   Char
 556   | ITprimstring FastString
 557   | ITprimint    Integer
 558   | ITprimword   Integer
 559   | ITprimfloat  Rational
 560   | ITprimdouble Rational
 561
 562   -- MetaHaskell extension tokens
 563   | ITopenExpQuote              --  [| or [e|
 564   | ITopenPatQuote              --  [p|
 565   | ITopenDecQuote              --  [d|
 566   | ITopenTypQuote              --  [t|
 567   | ITcloseQuote                --  |]
 568   | ITidEscape   FastString     --  $x
 569   | ITparenEscape               --  $(
 570   | ITvarQuote                  --  '
 571   | ITtyQuote                   --  ''
 572   | ITquasiQuote (FastString,FastString,SrcSpan) --  [:...|...|]
 573
 574   -- Arrow notation extension
 575   | ITproc
 576   | ITrec
 577   | IToparenbar                 --  (|
 578   | ITcparenbar                 --  |)
 579   | ITlarrowtail                --  -<
 580   | ITrarrowtail                --  >-
 581   | ITLarrowtail                --  -<<
 582   | ITRarrowtail                --  >>-
 583
 584   | ITunknown String            -- Used when the lexer can't make sense of it
 585   | ITeof                       -- end of file token
 586
 587   -- Documentation annotations
 588   | ITdocCommentNext  String     -- something beginning '-- |'
 589   | ITdocCommentPrev  String     -- something beginning '-- ^'
 590   | ITdocCommentNamed String     -- something beginning '-- $'
 591   | ITdocSection      Int String -- a section heading
 592   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 593   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 594   | ITlineComment     String     -- comment starting by "--"
 595   | ITblockComment    String     -- comment in {- -}
 596
 597 #ifdef DEBUG
 598   deriving Show -- debugging
 599 #endif
 600
 601 {-
 602 isSpecial :: Token -> Bool
 603 -- If we see M.x, where x is a keyword, but
 604 -- is special, we treat is as just plain M.x,
 605 -- not as a keyword.
 606 isSpecial ITas          = True
 607 isSpecial IThiding      = True
 608 isSpecial ITqualified   = True
 609 isSpecial ITforall      = True
 610 isSpecial ITexport      = True
 611 isSpecial ITlabel       = True
 612 isSpecial ITdynamic     = True
 613 isSpecial ITsafe        = True
 614 isSpecial ITthreadsafe  = True
 615 isSpecial ITunsafe      = True
 616 isSpecial ITccallconv   = True
 617 isSpecial ITstdcallconv = True
 618 isSpecial ITmdo         = True
 619 isSpecial ITfamily      = True
 620 isSpecial ITgroup   = True
 621 isSpecial ITby      = True
 622 isSpecial ITusing   = True
 623 isSpecial _             = False
 624 -}
 625
 626 -- the bitmap provided as the third component indicates whether the
 627 -- corresponding extension keyword is valid under the extension options
 628 -- provided to the compiler; if the extension corresponding to *any* of the
 629 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 630 -- facilitates using a keyword in two different extensions that can be
 631 -- activated independently)
 632 --
 633 reservedWordsFM = listToUFM $
 634         map (\(x, y, z) -> (mkFastString x, (y, z)))
 635        [( "_",          ITunderscore,   0 ),
 636         ( "as",         ITas,           0 ),
 637         ( "case",       ITcase,         0 ),
 638         ( "class",      ITclass,        0 ),
 639         ( "data",       ITdata,         0 ),
 640         ( "default",    ITdefault,      0 ),
 641         ( "deriving",   ITderiving,     0 ),
 642         ( "do",         ITdo,           0 ),
 643         ( "else",       ITelse,         0 ),
 644         ( "hiding",     IThiding,       0 ),
 645         ( "if",         ITif,           0 ),
 646         ( "import",     ITimport,       0 ),
 647         ( "in",         ITin,           0 ),
 648         ( "infix",      ITinfix,        0 ),
 649         ( "infixl",     ITinfixl,       0 ),
 650         ( "infixr",     ITinfixr,       0 ),
 651         ( "instance",   ITinstance,     0 ),
 652         ( "let",        ITlet,          0 ),
 653         ( "module",     ITmodule,       0 ),
 654         ( "newtype",    ITnewtype,      0 ),
 655         ( "of",         ITof,           0 ),
 656         ( "qualified",  ITqualified,    0 ),
 657         ( "then",       ITthen,         0 ),
 658         ( "type",       ITtype,         0 ),
 659         ( "where",      ITwhere,        0 ),
 660         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 661
 662     ( "forall", ITforall,        bit explicitForallBit .|. bit inRulePragBit),
 663         ( "mdo",        ITmdo,           bit recursiveDoBit),
 664         ( "family",     ITfamily,        bit tyFamBit),
 665     ( "group",  ITgroup,     bit transformComprehensionsBit),
 666     ( "by",     ITby,        bit transformComprehensionsBit),
 667     ( "using",  ITusing,     bit transformComprehensionsBit),
 668
 669         ( "foreign",    ITforeign,       bit ffiBit),
 670         ( "export",     ITexport,        bit ffiBit),
 671         ( "label",      ITlabel,         bit ffiBit),
 672         ( "dynamic",    ITdynamic,       bit ffiBit),
 673         ( "safe",       ITsafe,          bit ffiBit),
 674         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 675         ( "unsafe",     ITunsafe,        bit ffiBit),
 676         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 677         ( "ccall",      ITccallconv,     bit ffiBit),
 678         ( "dotnet",     ITdotnet,        bit ffiBit),
 679
 680         ( "rec",        ITrec,           bit arrowsBit),
 681         ( "proc",       ITproc,          bit arrowsBit)
 682      ]
 683
 684 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 685 reservedSymsFM = listToUFM $
 686     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 687       [ ("..",  ITdotdot,   always)
 688         -- (:) is a reserved op, meaning only list cons
 689        ,(":",   ITcolon,    always)
 690        ,("::",  ITdcolon,   always)
 691        ,("=",   ITequal,    always)
 692        ,("\\",  ITlam,      always)
 693        ,("|",   ITvbar,     always)
 694        ,("<-",  ITlarrow,   always)
 695        ,("->",  ITrarrow,   always)
 696        ,("@",   ITat,       always)
 697        ,("~",   ITtilde,    always)
 698        ,("=>",  ITdarrow,   always)
 699        ,("-",   ITminus,    always)
 700        ,("!",   ITbang,     always)
 701
 702         -- For data T (a::*) = MkT
 703        ,("*", ITstar, \i -> kindSigsEnabled i || tyFamEnabled i)
 704         -- For 'forall a . t'
 705        ,(".", ITdot, \i -> explicitForallEnabled i || inRulePrag i)
 706
 707        ,("-<",  ITlarrowtail, arrowsEnabled)
 708        ,(">-",  ITrarrowtail, arrowsEnabled)
 709        ,("-<<", ITLarrowtail, arrowsEnabled)
 710        ,(">>-", ITRarrowtail, arrowsEnabled)
 711
 712 #if __GLASGOW_HASKELL__ >= 605
 713        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 714        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 715        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 716                                 explicitForallEnabled i)
 717        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 718        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 719        ,("⋯",   ITdotdot, unicodeSyntaxEnabled)
 720         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 721         -- form part of a large operator.  This would let us have a better
 722         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 723 #endif
 724        ]
 725
 726 -- -----------------------------------------------------------------------------
 727 -- Lexer actions
 728
 729 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 730
 731 special :: Token -> Action
 732 special tok span _buf _len = return (L span tok)
 733
 734 token, layout_token :: Token -> Action
 735 token t span _buf _len = return (L span t)
 736 layout_token t span _buf _len = pushLexState layout >> return (L span t)
 737
 738 idtoken :: (StringBuffer -> Int -> Token) -> Action
 739 idtoken f span buf len = return (L span $! (f buf len))
 740
 741 skip_one_varid :: (FastString -> Token) -> Action
 742 skip_one_varid f span buf len
 743   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 744
 745 strtoken :: (String -> Token) -> Action
 746 strtoken f span buf len =
 747   return (L span $! (f $! lexemeToString buf len))
 748
 749 init_strtoken :: Int -> (String -> Token) -> Action
 750 -- like strtoken, but drops the last N character(s)
 751 init_strtoken drop f span buf len =
 752   return (L span $! (f $! lexemeToString buf (len-drop)))
 753
 754 begin :: Int -> Action
 755 begin code _span _str _len = do pushLexState code; lexToken
 756
 757 pop :: Action
 758 pop _span _buf _len = do popLexState; lexToken
 759
 760 pop_and :: Action -> Action
 761 pop_and act span buf len = do popLexState; act span buf len
 762
 763 {-# INLINE nextCharIs #-}
 764 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 765
 766 notFollowedBy char _ _ _ (AI _ _ buf)
 767   = nextCharIs buf (/=char)
 768
 769 notFollowedBySymbol _ _ _ (AI _ _ buf)
 770   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 771
 772 -- We must reject doc comments as being ordinary comments everywhere.
 773 -- In some cases the doc comment will be selected as the lexeme due to
 774 -- maximal munch, but not always, because the nested comment rule is
 775 -- valid in all states, but the doc-comment rules are only valid in
 776 -- the non-layout states.
 777 isNormalComment bits _ _ (AI _ _ buf)
 778   | haddockEnabled bits = notFollowedByDocOrPragma
 779   | otherwise           = nextCharIs buf (/='#')
 780   where
 781     notFollowedByDocOrPragma
 782        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 783
 784 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 785
 786 {-
 787 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 788   = if haddockEnabled bits then False else (p buf)
 789 -}
 790
 791 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 792
 793 ifExtension pred bits _ _ _ = pred bits
 794
 795 multiline_doc_comment :: Action
 796 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 797   where
 798     worker commentAcc input docType oneLine = case alexGetChar input of
 799       Just ('\n', input')
 800         | oneLine -> docCommentEnd input commentAcc docType buf span
 801         | otherwise -> case checkIfCommentLine input' of
 802           Just input -> worker ('\n':commentAcc) input docType False
 803           Nothing -> docCommentEnd input commentAcc docType buf span
 804       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 805       Nothing -> docCommentEnd input commentAcc docType buf span
 806
 807     checkIfCommentLine input = check (dropNonNewlineSpace input)
 808       where
 809         check input = case alexGetChar input of
 810           Just ('-', input) -> case alexGetChar input of
 811             Just ('-', input) -> case alexGetChar input of
 812               Just (c, _) | c /= '-' -> Just input
 813               _ -> Nothing
 814             _ -> Nothing
 815           _ -> Nothing
 816
 817         dropNonNewlineSpace input = case alexGetChar input of
 818           Just (c, input')
 819             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 820             | otherwise -> input
 821           Nothing -> input
 822
 823 lineCommentToken :: Action
 824 lineCommentToken span buf len = do
 825   b <- extension rawTokenStreamEnabled
 826   if b then strtoken ITlineComment span buf len else lexToken
 827
 828 {-
 829   nested comments require traversing by hand, they can't be parsed
 830   using regular expressions.
 831 -}
 832 nested_comment :: P (Located Token) -> Action
 833 nested_comment cont span _str _len = do
 834   input <- getInput
 835   go "" (1::Int) input
 836   where
 837     go commentAcc 0 input = do setInput input
 838                                b <- extension rawTokenStreamEnabled
 839                                if b
 840                                  then docCommentEnd input commentAcc ITblockComment _str span
 841                                  else cont
 842     go commentAcc n input = case alexGetChar input of
 843       Nothing -> errBrace input span
 844       Just ('-',input) -> case alexGetChar input of
 845         Nothing  -> errBrace input span
 846         Just ('\125',input) -> go commentAcc (n-1) input
 847         Just (_,_)          -> go ('-':commentAcc) n input
 848       Just ('\123',input) -> case alexGetChar input of
 849         Nothing  -> errBrace input span
 850         Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
 851         Just (_,_)       -> go ('\123':commentAcc) n input
 852       Just (c,input) -> go (c:commentAcc) n input
 853
 854 nested_doc_comment :: Action
 855 nested_doc_comment span buf _len = withLexedDocType (go "")
 856   where
 857     go commentAcc input docType _ = case alexGetChar input of
 858       Nothing -> errBrace input span
 859       Just ('-',input) -> case alexGetChar input of
 860         Nothing -> errBrace input span
 861         Just ('\125',input) ->
 862           docCommentEnd input commentAcc docType buf span
 863         Just (_,_) -> go ('-':commentAcc) input docType False
 864       Just ('\123', input) -> case alexGetChar input of
 865         Nothing  -> errBrace input span
 866         Just ('-',input) -> do
 867           setInput input
 868           let cont = do input <- getInput; go commentAcc input docType False
 869           nested_comment cont span buf _len
 870         Just (_,_) -> go ('\123':commentAcc) input docType False
 871       Just (c,input) -> go (c:commentAcc) input docType False
 872
 873 withLexedDocType lexDocComment = do
 874   input@(AI _ _ buf) <- getInput
 875   case prevChar buf ' ' of
 876     '|' -> lexDocComment input ITdocCommentNext False
 877     '^' -> lexDocComment input ITdocCommentPrev False
 878     '$' -> lexDocComment input ITdocCommentNamed False
 879     '*' -> lexDocSection 1 input
 880     '#' -> lexDocComment input ITdocOptionsOld False
 881  where
 882     lexDocSection n input = case alexGetChar input of
 883       Just ('*', input) -> lexDocSection (n+1) input
 884       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 885       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 886
 887 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 888 -- off again at the end of the pragma.
 889 rulePrag :: Action
 890 rulePrag span buf len = do
 891   setExts (.|. inRulePragBit)
 892   return (L span ITrules_prag)
 893
 894 endPrag :: Action
 895 endPrag span buf len = do
 896   setExts (.&. complement (bit inRulePragBit))
 897   return (L span ITclose_prag)
 898
 899 -- docCommentEnd
 900 -------------------------------------------------------------------------------
 901 -- This function is quite tricky. We can't just return a new token, we also
 902 -- need to update the state of the parser. Why? Because the token is longer
 903 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 904 -- it writes the wrong token length to the parser state. This function is
 905 -- called afterwards, so it can just update the state.
 906
 907 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 908 -- which is something that the original lexer didn't account for.
 909 -- I have added last_line_len in the parser state which represents the length
 910 -- of the part of the token that is on the last line. It is now used for layout
 911 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 912 -- was before, the full length of the token, and it is now only used for error
 913 -- messages. /Waern
 914
 915 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 916                  SrcSpan -> P (Located Token)
 917 docCommentEnd input commentAcc docType buf span = do
 918   setInput input
 919   let (AI loc last_offs nextBuf) = input
 920       comment = reverse commentAcc
 921       span' = mkSrcSpan (srcSpanStart span) loc
 922       last_len = byteDiff buf nextBuf
 923
 924       last_line_len = if (last_offs - last_len < 0)
 925         then last_offs
 926         else last_len
 927
 928   span `seq` setLastToken span' last_len last_line_len
 929   return (L span' (docType comment))
 930
 931 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 932
 933 open_brace, close_brace :: Action
 934 open_brace span _str _len = do
 935   ctx <- getContext
 936   setContext (NoLayout:ctx)
 937   return (L span ITocurly)
 938 close_brace span _str _len = do
 939   popContext
 940   return (L span ITccurly)
 941
 942 qvarid buf len = ITqvarid $! splitQualName buf len False
 943 qconid buf len = ITqconid $! splitQualName buf len False
 944
 945 splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
 946 -- takes a StringBuffer and a length, and returns the module name
 947 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 948 -- because of hierarchical module names.
 949 splitQualName orig_buf len parens = split orig_buf orig_buf
 950   where
 951     split buf dot_buf
 952         | orig_buf `byteDiff` buf >= len  = done dot_buf
 953         | c == '.'                        = found_dot buf'
 954         | otherwise                       = split buf' dot_buf
 955       where
 956        (c,buf') = nextChar buf
 957
 958     -- careful, we might get names like M....
 959     -- so, if the character after the dot is not upper-case, this is
 960     -- the end of the qualifier part.
 961     found_dot buf -- buf points after the '.'
 962         | isUpper c    = split buf' buf
 963         | otherwise    = done buf
 964       where
 965        (c,buf') = nextChar buf
 966
 967     done dot_buf =
 968         (lexemeToFastString orig_buf (qual_size - 1),
 969          if parens -- Prelude.(+)
 970             then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
 971             else lexemeToFastString dot_buf (len - qual_size))
 972       where
 973         qual_size = orig_buf `byteDiff` dot_buf
 974
 975 varid span buf len =
 976   fs `seq`
 977   case lookupUFM reservedWordsFM fs of
 978         Just (keyword,0)    -> do
 979                 maybe_layout keyword
 980                 return (L span keyword)
 981         Just (keyword,exts) -> do
 982                 b <- extension (\i -> exts .&. i /= 0)
 983                 if b then do maybe_layout keyword
 984                              return (L span keyword)
 985                      else return (L span (ITvarid fs))
 986         _other -> return (L span (ITvarid fs))
 987   where
 988         fs = lexemeToFastString buf len
 989
 990 conid buf len = ITconid fs
 991   where fs = lexemeToFastString buf len
 992
 993 qvarsym buf len = ITqvarsym $! splitQualName buf len False
 994 qconsym buf len = ITqconsym $! splitQualName buf len False
 995 prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
 996 prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
 997
 998 varsym = sym ITvarsym
 999 consym = sym ITconsym
1000
1001 sym con span buf len =
1002   case lookupUFM reservedSymsFM fs of
1003         Just (keyword,exts) -> do
1004                 b <- extension exts
1005                 if b then return (L span keyword)
1006                      else return (L span $! con fs)
1007         _other -> return (L span $! con fs)
1008   where
1009         fs = lexemeToFastString buf len
1010
1011 -- Variations on the integral numeric literal.
1012 tok_integral :: (Integer -> Token)
1013      -> (Integer -> Integer)
1014  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
1015      -> Int -> Int
1016      -> (Integer, (Char->Int)) -> Action
1017 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
1018   return $ L span $ itint $! transint $ parseUnsignedInteger
1019      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
1020
1021 -- some conveniences for use with tok_integral
1022 tok_num = tok_integral ITinteger
1023 tok_primint = tok_integral ITprimint
1024 tok_primword = tok_integral ITprimword positive
1025 positive = id
1026 negative = negate
1027 decimal = (10,octDecDigit)
1028 octal = (8,octDecDigit)
1029 hexadecimal = (16,hexDigit)
1030
1031 -- readRational can understand negative rationals, exponents, everything.
1032 tok_float        str = ITrational   $! readRational str
1033 tok_primfloat    str = ITprimfloat  $! readRational str
1034 tok_primdouble   str = ITprimdouble $! readRational str
1035
1036 -- -----------------------------------------------------------------------------
1037 -- Layout processing
1038
1039 -- we're at the first token on a line, insert layout tokens if necessary
1040 do_bol :: Action
1041 do_bol span _str _len = do
1042         pos <- getOffside
1043         case pos of
1044             LT -> do
1045                 --trace "layout: inserting '}'" $ do
1046                 popContext
1047                 -- do NOT pop the lex state, we might have a ';' to insert
1048                 return (L span ITvccurly)
1049             EQ -> do
1050                 --trace "layout: inserting ';'" $ do
1051                 popLexState
1052                 return (L span ITsemi)
1053             GT -> do
1054                 popLexState
1055                 lexToken
1056
1057 -- certain keywords put us in the "layout" state, where we might
1058 -- add an opening curly brace.
1059 maybe_layout ITdo       = pushLexState layout_do
1060 maybe_layout ITmdo      = pushLexState layout_do
1061 maybe_layout ITof       = pushLexState layout
1062 maybe_layout ITlet      = pushLexState layout
1063 maybe_layout ITwhere    = pushLexState layout
1064 maybe_layout ITrec      = pushLexState layout
1065 maybe_layout _          = return ()
1066
1067 -- Pushing a new implicit layout context.  If the indentation of the
1068 -- next token is not greater than the previous layout context, then
1069 -- Haskell 98 says that the new layout context should be empty; that is
1070 -- the lexer must generate {}.
1071 --
1072 -- We are slightly more lenient than this: when the new context is started
1073 -- by a 'do', then we allow the new context to be at the same indentation as
1074 -- the previous context.  This is what the 'strict' argument is for.
1075 --
1076 new_layout_context strict span _buf _len = do
1077     popLexState
1078     (AI _ offset _) <- getInput
1079     ctx <- getContext
1080     case ctx of
1081         Layout prev_off : _  |
1082            (strict     && prev_off >= offset  ||
1083             not strict && prev_off > offset) -> do
1084                 -- token is indented to the left of the previous context.
1085                 -- we must generate a {} sequence now.
1086                 pushLexState layout_left
1087                 return (L span ITvocurly)
1088         _ -> do
1089                 setContext (Layout offset : ctx)
1090                 return (L span ITvocurly)
1091
1092 do_layout_left span _buf _len = do
1093     popLexState
1094     pushLexState bol  -- we must be at the start of a line
1095     return (L span ITvccurly)
1096
1097 -- -----------------------------------------------------------------------------
1098 -- LINE pragmas
1099
1100 setLine :: Int -> Action
1101 setLine code span buf len = do
1102   let line = parseUnsignedInteger buf len 10 octDecDigit
1103   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
1104         -- subtract one: the line number refers to the *following* line
1105   popLexState
1106   pushLexState code
1107   lexToken
1108
1109 setFile :: Int -> Action
1110 setFile code span buf len = do
1111   let file = lexemeToFastString (stepOn buf) (len-2)
1112   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1113   popLexState
1114   pushLexState code
1115   lexToken
1116
1117
1118 -- -----------------------------------------------------------------------------
1119 -- Options, includes and language pragmas.
1120
1121 lex_string_prag :: (String -> Token) -> Action
1122 lex_string_prag mkTok span _buf _len
1123     = do input <- getInput
1124          start <- getSrcLoc
1125          tok <- go [] input
1126          end <- getSrcLoc
1127          return (L (mkSrcSpan start end) tok)
1128     where go acc input
1129               = if isString input "#-}"
1130                    then do setInput input
1131                            return (mkTok (reverse acc))
1132                    else case alexGetChar input of
1133                           Just (c,i) -> go (c:acc) i
1134                           Nothing -> err input
1135           isString _ [] = True
1136           isString i (x:xs)
1137               = case alexGetChar i of
1138                   Just (c,i') | c == x    -> isString i' xs
1139                   _other -> False
1140           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1141
1142
1143 -- -----------------------------------------------------------------------------
1144 -- Strings & Chars
1145
1146 -- This stuff is horrible.  I hates it.
1147
1148 lex_string_tok :: Action
1149 lex_string_tok span _buf _len = do
1150   tok <- lex_string ""
1151   end <- getSrcLoc
1152   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1153
1154 lex_string :: String -> P Token
1155 lex_string s = do
1156   i <- getInput
1157   case alexGetChar' i of
1158     Nothing -> lit_error
1159
1160     Just ('"',i)  -> do
1161         setInput i
1162         magicHash <- extension magicHashEnabled
1163         if magicHash
1164           then do
1165             i <- getInput
1166             case alexGetChar' i of
1167               Just ('#',i) -> do
1168                    setInput i
1169                    if any (> '\xFF') s
1170                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1171                     else let s' = mkZFastString (reverse s) in
1172                          return (ITprimstring s')
1173                         -- mkZFastString is a hack to avoid encoding the
1174                         -- string in UTF-8.  We just want the exact bytes.
1175               _other ->
1176                 return (ITstring (mkFastString (reverse s)))
1177           else
1178                 return (ITstring (mkFastString (reverse s)))
1179
1180     Just ('\\',i)
1181         | Just ('&',i) <- next -> do
1182                 setInput i; lex_string s
1183         | Just (c,i) <- next, is_space c -> do
1184                 setInput i; lex_stringgap s
1185         where next = alexGetChar' i
1186
1187     Just (c, i) -> do
1188         c' <- lex_char c i
1189         lex_string (c':s)
1190
1191 lex_stringgap s = do
1192   c <- getCharOrFail
1193   case c of
1194     '\\' -> lex_string s
1195     c | is_space c -> lex_stringgap s
1196     _other -> lit_error
1197
1198
1199 lex_char_tok :: Action
1200 -- Here we are basically parsing character literals, such as 'x' or '\n'
1201 -- but, when Template Haskell is on, we additionally spot
1202 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1203 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
1204 -- So we have to do two characters of lookahead: when we see 'x we need to
1205 -- see if there's a trailing quote
1206 lex_char_tok span _buf _len = do        -- We've seen '
1207    i1 <- getInput       -- Look ahead to first character
1208    let loc = srcSpanStart span
1209    case alexGetChar' i1 of
1210         Nothing -> lit_error
1211
1212         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
1213                   th_exts <- extension thEnabled
1214                   if th_exts then do
1215                         setInput i2
1216                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1217                    else lit_error
1218
1219         Just ('\\', i2@(AI _end2 _ _)) -> do    -- We've seen 'backslash
1220                   setInput i2
1221                   lit_ch <- lex_escape
1222                   mc <- getCharOrFail   -- Trailing quote
1223                   if mc == '\'' then finish_char_tok loc lit_ch
1224                                 else do setInput i2; lit_error
1225
1226         Just (c, i2@(AI _end2 _ _))
1227                 | not (isAny c) -> lit_error
1228                 | otherwise ->
1229
1230                 -- We've seen 'x, where x is a valid character
1231                 --  (i.e. not newline etc) but not a quote or backslash
1232            case alexGetChar' i2 of      -- Look ahead one more character
1233                 Nothing -> lit_error
1234                 Just ('\'', i3) -> do   -- We've seen 'x'
1235                         setInput i3
1236                         finish_char_tok loc c
1237                 _other -> do            -- We've seen 'x not followed by quote
1238                                         -- If TH is on, just parse the quote only
1239                         th_exts <- extension thEnabled
1240                         let (AI end _ _) = i1
1241                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1242                                    else do setInput i2; lit_error
1243
1244 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1245 finish_char_tok loc ch  -- We've already seen the closing quote
1246                         -- Just need to check for trailing #
1247   = do  magicHash <- extension magicHashEnabled
1248         i@(AI end _ _) <- getInput
1249         if magicHash then do
1250                 case alexGetChar' i of
1251                         Just ('#',i@(AI end _ _)) -> do
1252                                 setInput i
1253                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1254                         _other ->
1255                                 return (L (mkSrcSpan loc end) (ITchar ch))
1256                 else do
1257                    return (L (mkSrcSpan loc end) (ITchar ch))
1258
1259 lex_char :: Char -> AlexInput -> P Char
1260 lex_char c inp = do
1261   case c of
1262       '\\' -> do setInput inp; lex_escape
1263       c | isAny c -> do setInput inp; return c
1264       _other -> lit_error
1265
1266 isAny c | c > '\x7f' = isPrint c
1267         | otherwise  = is_any c
1268
1269 lex_escape :: P Char
1270 lex_escape = do
1271   c <- getCharOrFail
1272   case c of
1273         'a'   -> return '\a'
1274         'b'   -> return '\b'
1275         'f'   -> return '\f'
1276         'n'   -> return '\n'
1277         'r'   -> return '\r'
1278         't'   -> return '\t'
1279         'v'   -> return '\v'
1280         '\\'  -> return '\\'
1281         '"'   -> return '\"'
1282         '\''  -> return '\''
1283         '^'   -> do c <- getCharOrFail
1284                     if c >= '@' && c <= '_'
1285                         then return (chr (ord c - ord '@'))
1286                         else lit_error
1287
1288         'x'   -> readNum is_hexdigit 16 hexDigit
1289         'o'   -> readNum is_octdigit  8 octDecDigit
1290         x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1291
1292         c1 ->  do
1293            i <- getInput
1294            case alexGetChar' i of
1295             Nothing -> lit_error
1296             Just (c2,i2) ->
1297               case alexGetChar' i2 of
1298                 Nothing -> do setInput i2; lit_error
1299                 Just (c3,i3) ->
1300                    let str = [c1,c2,c3] in
1301                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1302                                      Just rest <- [maybePrefixMatch p str] ] of
1303                           (escape_char,[]):_ -> do
1304                                 setInput i3
1305                                 return escape_char
1306                           (escape_char,_:_):_ -> do
1307                                 setInput i2
1308                                 return escape_char
1309                           [] -> lit_error
1310
1311 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1312 readNum is_digit base conv = do
1313   i <- getInput
1314   c <- getCharOrFail
1315   if is_digit c
1316         then readNum2 is_digit base conv (conv c)
1317         else do setInput i; lit_error
1318
1319 readNum2 is_digit base conv i = do
1320   input <- getInput
1321   read i input
1322   where read i input = do
1323           case alexGetChar' input of
1324             Just (c,input') | is_digit c -> do
1325                 read (i*base + conv c) input'
1326             _other -> do
1327                 if i >= 0 && i <= 0x10FFFF
1328                    then do setInput input; return (chr i)
1329                    else lit_error
1330
1331 silly_escape_chars = [
1332         ("NUL", '\NUL'),
1333         ("SOH", '\SOH'),
1334         ("STX", '\STX'),
1335         ("ETX", '\ETX'),
1336         ("EOT", '\EOT'),
1337         ("ENQ", '\ENQ'),
1338         ("ACK", '\ACK'),
1339         ("BEL", '\BEL'),
1340         ("BS", '\BS'),
1341         ("HT", '\HT'),
1342         ("LF", '\LF'),
1343         ("VT", '\VT'),
1344         ("FF", '\FF'),
1345         ("CR", '\CR'),
1346         ("SO", '\SO'),
1347         ("SI", '\SI'),
1348         ("DLE", '\DLE'),
1349         ("DC1", '\DC1'),
1350         ("DC2", '\DC2'),
1351         ("DC3", '\DC3'),
1352         ("DC4", '\DC4'),
1353         ("NAK", '\NAK'),
1354         ("SYN", '\SYN'),
1355         ("ETB", '\ETB'),
1356         ("CAN", '\CAN'),
1357         ("EM", '\EM'),
1358         ("SUB", '\SUB'),
1359         ("ESC", '\ESC'),
1360         ("FS", '\FS'),
1361         ("GS", '\GS'),
1362         ("RS", '\RS'),
1363         ("US", '\US'),
1364         ("SP", '\SP'),
1365         ("DEL", '\DEL')
1366         ]
1367
1368 -- before calling lit_error, ensure that the current input is pointing to
1369 -- the position of the error in the buffer.  This is so that we can report
1370 -- a correct location to the user, but also so we can detect UTF-8 decoding
1371 -- errors if they occur.
1372 lit_error = lexError "lexical error in string/character literal"
1373
1374 getCharOrFail :: P Char
1375 getCharOrFail =  do
1376   i <- getInput
1377   case alexGetChar' i of
1378         Nothing -> lexError "unexpected end-of-file in string/character literal"
1379         Just (c,i)  -> do setInput i; return c
1380
1381 -- -----------------------------------------------------------------------------
1382 -- QuasiQuote
1383
1384 lex_quasiquote_tok :: Action
1385 lex_quasiquote_tok span buf len = do
1386   let quoter = reverse $ takeWhile (/= '$')
1387                $ reverse $ lexemeToString buf (len - 1)
1388   quoteStart <- getSrcLoc
1389   quote <- lex_quasiquote ""
1390   end <- getSrcLoc
1391   return (L (mkSrcSpan (srcSpanStart span) end)
1392            (ITquasiQuote (mkFastString quoter,
1393                           mkFastString (reverse quote),
1394                           mkSrcSpan quoteStart end)))
1395
1396 lex_quasiquote :: String -> P String
1397 lex_quasiquote s = do
1398   i <- getInput
1399   case alexGetChar' i of
1400     Nothing -> lit_error
1401
1402     Just ('\\',i)
1403         | Just ('|',i) <- next -> do
1404                 setInput i; lex_quasiquote ('|' : s)
1405         | Just (']',i) <- next -> do
1406                 setInput i; lex_quasiquote (']' : s)
1407         where next = alexGetChar' i
1408
1409     Just ('|',i)
1410         | Just (']',i) <- next -> do
1411                 setInput i; return s
1412         where next = alexGetChar' i
1413
1414     Just (c, i) -> do
1415          setInput i; lex_quasiquote (c : s)
1416
1417 -- -----------------------------------------------------------------------------
1418 -- Warnings
1419
1420 warn :: DynFlag -> SDoc -> Action
1421 warn option warning srcspan _buf _len = do
1422     addWarning option srcspan warning
1423     lexToken
1424
1425 warnThen :: DynFlag -> SDoc -> Action -> Action
1426 warnThen option warning action srcspan buf len = do
1427     addWarning option srcspan warning
1428     action srcspan buf len
1429
1430 -- -----------------------------------------------------------------------------
1431 -- The Parse Monad
1432
1433 data LayoutContext
1434   = NoLayout
1435   | Layout !Int
1436   deriving Show
1437
1438 data ParseResult a
1439   = POk PState a
1440   | PFailed
1441         SrcSpan         -- The start and end of the text span related to
1442                         -- the error.  Might be used in environments which can
1443                         -- show this span, e.g. by highlighting it.
1444         Message         -- The error message
1445
1446 data PState = PState {
1447         buffer     :: StringBuffer,
1448     dflags     :: DynFlags,
1449     messages   :: Messages,
1450         last_loc   :: SrcSpan,  -- pos of previous token
1451         last_offs  :: !Int,     -- offset of the previous token from the
1452                                 -- beginning of  the current line.
1453                                 -- \t is equal to 8 spaces.
1454         last_len   :: !Int,     -- len of previous token
1455   last_line_len :: !Int,
1456         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1457         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1458         context    :: [LayoutContext],
1459         lex_state  :: [Int]
1460      }
1461         -- last_loc and last_len are used when generating error messages,
1462         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1463         -- current token to happyError, we could at least get rid of last_len.
1464         -- Getting rid of last_loc would require finding another way to
1465         -- implement pushCurrentContext (which is only called from one place).
1466
1467 newtype P a = P { unP :: PState -> ParseResult a }
1468
1469 instance Monad P where
1470   return = returnP
1471   (>>=) = thenP
1472   fail = failP
1473
1474 returnP :: a -> P a
1475 returnP a = a `seq` (P $ \s -> POk s a)
1476
1477 thenP :: P a -> (a -> P b) -> P b
1478 (P m) `thenP` k = P $ \ s ->
1479         case m s of
1480                 POk s1 a         -> (unP (k a)) s1
1481                 PFailed span err -> PFailed span err
1482
1483 failP :: String -> P a
1484 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1485
1486 failMsgP :: String -> P a
1487 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1488
1489 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1490 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (mkSrcSpan loc1 loc2) (text str)
1491
1492 failSpanMsgP :: SrcSpan -> SDoc -> P a
1493 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1494
1495 extension :: (Int -> Bool) -> P Bool
1496 extension p = P $ \s -> POk s (p $! extsBitmap s)
1497
1498 getExts :: P Int
1499 getExts = P $ \s -> POk s (extsBitmap s)
1500
1501 setExts :: (Int -> Int) -> P ()
1502 setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
1503
1504 setSrcLoc :: SrcLoc -> P ()
1505 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1506
1507 getSrcLoc :: P SrcLoc
1508 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1509
1510 setLastToken :: SrcSpan -> Int -> Int -> P ()
1511 setLastToken loc len line_len = P $ \s -> POk s {
1512   last_loc=loc,
1513   last_len=len,
1514   last_line_len=line_len
1515 } ()
1516
1517 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1518
1519 alexInputPrevChar :: AlexInput -> Char
1520 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1521
1522 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1523 alexGetChar (AI loc ofs s)
1524   | atEnd s   = Nothing
1525   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1526                 --trace (show (ord c)) $
1527                 Just (adj_c, (AI loc' ofs' s'))
1528   where (c,s') = nextChar s
1529         loc'   = advanceSrcLoc loc c
1530         ofs'   = advanceOffs c ofs
1531
1532         non_graphic     = '\x0'
1533         upper           = '\x1'
1534         lower           = '\x2'
1535         digit           = '\x3'
1536         symbol          = '\x4'
1537         space           = '\x5'
1538         other_graphic   = '\x6'
1539
1540         adj_c
1541           | c <= '\x06' = non_graphic
1542           | c <= '\x7f' = c
1543           -- Alex doesn't handle Unicode, so when Unicode
1544           -- character is encoutered we output these values
1545           -- with the actual character value hidden in the state.
1546           | otherwise =
1547                 case generalCategory c of
1548                   UppercaseLetter       -> upper
1549                   LowercaseLetter       -> lower
1550                   TitlecaseLetter       -> upper
1551                   ModifierLetter        -> other_graphic
1552                   OtherLetter           -> lower -- see #1103
1553                   NonSpacingMark        -> other_graphic
1554                   SpacingCombiningMark  -> other_graphic
1555                   EnclosingMark         -> other_graphic
1556                   DecimalNumber         -> digit
1557                   LetterNumber          -> other_graphic
1558                   OtherNumber           -> other_graphic
1559                   ConnectorPunctuation  -> other_graphic
1560                   DashPunctuation       -> other_graphic
1561                   OpenPunctuation       -> other_graphic
1562                   ClosePunctuation      -> other_graphic
1563                   InitialQuote          -> other_graphic
1564                   FinalQuote            -> other_graphic
1565                   OtherPunctuation      -> other_graphic
1566                   MathSymbol            -> symbol
1567                   CurrencySymbol        -> symbol
1568                   ModifierSymbol        -> symbol
1569                   OtherSymbol           -> symbol
1570                   Space                 -> space
1571                   _other                -> non_graphic
1572
1573 -- This version does not squash unicode characters, it is used when
1574 -- lexing strings.
1575 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1576 alexGetChar' (AI loc ofs s)
1577   | atEnd s   = Nothing
1578   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1579                 --trace (show (ord c)) $
1580                 Just (c, (AI loc' ofs' s'))
1581   where (c,s') = nextChar s
1582         loc'   = advanceSrcLoc loc c
1583         ofs'   = advanceOffs c ofs
1584
1585 advanceOffs :: Char -> Int -> Int
1586 advanceOffs '\n' _    = 0
1587 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1588 advanceOffs _    offs = offs + 1
1589
1590 getInput :: P AlexInput
1591 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1592
1593 setInput :: AlexInput -> P ()
1594 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1595
1596 pushLexState :: Int -> P ()
1597 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1598
1599 popLexState :: P Int
1600 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1601
1602 getLexState :: P Int
1603 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1604
1605 -- for reasons of efficiency, flags indicating language extensions (eg,
1606 -- -fglasgow-exts or -XParr) are represented by a bitmap stored in an unboxed
1607 -- integer
1608
1609 genericsBit, ffiBit, parrBit :: Int
1610 genericsBit = 0 -- {| and |}
1611 ffiBit     = 1
1612 parrBit    = 2
1613 arrowsBit  = 4
1614 thBit      = 5
1615 ipBit      = 6
1616 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1617 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1618                 -- (doesn't affect the lexer)
1619 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1620 haddockBit = 10 -- Lex and parse Haddock comments
1621 magicHashBit = 11 -- "#" in both functions and operators
1622 kindSigsBit = 12 -- Kind signatures on type variables
1623 recursiveDoBit = 13 -- mdo
1624 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1625 unboxedTuplesBit = 15 -- (# and #)
1626 standaloneDerivingBit = 16 -- standalone instance deriving declarations
1627 transformComprehensionsBit = 17
1628 qqBit      = 18 -- enable quasiquoting
1629 inRulePragBit = 19
1630 rawTokenStreamBit = 20 -- producing a token stream with all comments included
1631 newQualOpsBit = 21 -- Haskell' qualified operator syntax, e.g. Prelude.(+)
1632
1633 genericsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1634 always           _     = True
1635 genericsEnabled  flags = testBit flags genericsBit
1636 ffiEnabled       flags = testBit flags ffiBit
1637 parrEnabled      flags = testBit flags parrBit
1638 arrowsEnabled    flags = testBit flags arrowsBit
1639 thEnabled        flags = testBit flags thBit
1640 ipEnabled        flags = testBit flags ipBit
1641 explicitForallEnabled flags = testBit flags explicitForallBit
1642 bangPatEnabled   flags = testBit flags bangPatBit
1643 tyFamEnabled     flags = testBit flags tyFamBit
1644 haddockEnabled   flags = testBit flags haddockBit
1645 magicHashEnabled flags = testBit flags magicHashBit
1646 kindSigsEnabled  flags = testBit flags kindSigsBit
1647 recursiveDoEnabled flags = testBit flags recursiveDoBit
1648 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1649 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1650 standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
1651 transformComprehensionsEnabled flags = testBit flags transformComprehensionsBit
1652 qqEnabled        flags = testBit flags qqBit
1653 inRulePrag       flags = testBit flags inRulePragBit
1654 rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
1655 newQualOps       flags = testBit flags newQualOpsBit
1656 oldQualOps flags = not (newQualOps flags)
1657
1658 -- PState for parsing options pragmas
1659 --
1660 pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1661 pragState dynflags buf loc =
1662   PState {
1663       buffer        = buf,
1664       messages      = emptyMessages,
1665       dflags        = dynflags,
1666       last_loc      = mkSrcSpan loc loc,
1667       last_offs     = 0,
1668       last_len      = 0,
1669       last_line_len = 0,
1670       loc           = loc,
1671       extsBitmap    = 0,
1672       context       = [],
1673       lex_state     = [bol, option_prags, 0]
1674     }
1675
1676
1677 -- create a parse state
1678 --
1679 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1680 mkPState buf loc flags  =
1681   PState {
1682       buffer          = buf,
1683       dflags        = flags,
1684       messages      = emptyMessages,
1685       last_loc      = mkSrcSpan loc loc,
1686       last_offs     = 0,
1687       last_len      = 0,
1688       last_line_len = 0,
1689       loc           = loc,
1690       extsBitmap    = fromIntegral bitmap,
1691       context       = [],
1692       lex_state     = [bol, 0]
1693         -- we begin in the layout state if toplev_layout is set
1694     }
1695     where
1696       bitmap = genericsBit `setBitIf` dopt Opt_Generics flags
1697                .|. ffiBit       `setBitIf` dopt Opt_ForeignFunctionInterface flags
1698                .|. parrBit      `setBitIf` dopt Opt_PArr         flags
1699                .|. arrowsBit    `setBitIf` dopt Opt_Arrows       flags
1700                .|. thBit        `setBitIf` dopt Opt_TemplateHaskell flags
1701                .|. qqBit        `setBitIf` dopt Opt_QuasiQuotes flags
1702                .|. ipBit        `setBitIf` dopt Opt_ImplicitParams flags
1703                .|. explicitForallBit `setBitIf` dopt Opt_ScopedTypeVariables flags
1704                .|. explicitForallBit `setBitIf` dopt Opt_LiberalTypeSynonyms flags
1705                .|. explicitForallBit `setBitIf` dopt Opt_PolymorphicComponents flags
1706                .|. explicitForallBit `setBitIf` dopt Opt_ExistentialQuantification flags
1707                .|. explicitForallBit `setBitIf` dopt Opt_Rank2Types flags
1708                .|. explicitForallBit `setBitIf` dopt Opt_RankNTypes flags
1709                .|. bangPatBit   `setBitIf` dopt Opt_BangPatterns flags
1710                .|. tyFamBit     `setBitIf` dopt Opt_TypeFamilies flags
1711                .|. haddockBit   `setBitIf` dopt Opt_Haddock      flags
1712                .|. magicHashBit `setBitIf` dopt Opt_MagicHash    flags
1713                .|. kindSigsBit  `setBitIf` dopt Opt_KindSignatures flags
1714                .|. recursiveDoBit `setBitIf` dopt Opt_RecursiveDo flags
1715                .|. unicodeSyntaxBit `setBitIf` dopt Opt_UnicodeSyntax flags
1716                .|. unboxedTuplesBit `setBitIf` dopt Opt_UnboxedTuples flags
1717                .|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags
1718                .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags
1719                .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
1720                .|. newQualOpsBit `setBitIf` dopt Opt_NewQualifiedOperators flags
1721       --
1722       setBitIf :: Int -> Bool -> Int
1723       b `setBitIf` cond | cond      = bit b
1724                         | otherwise = 0
1725
1726 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1727 addWarning option srcspan warning
1728  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1729        let warning' = mkWarnMsg srcspan alwaysQualify warning
1730            ws' = if dopt option d then ws `snocBag` warning' else ws
1731        in POk s{messages=(ws', es)} ()
1732
1733 getMessages :: PState -> Messages
1734 getMessages PState{messages=ms} = ms
1735
1736 getContext :: P [LayoutContext]
1737 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1738
1739 setContext :: [LayoutContext] -> P ()
1740 setContext ctx = P $ \s -> POk s{context=ctx} ()
1741
1742 popContext :: P ()
1743 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1744                               last_len = len, last_loc = last_loc }) ->
1745   case ctx of
1746         (_:tl) -> POk s{ context = tl } ()
1747         []     -> PFailed last_loc (srcParseErr buf len)
1748
1749 -- Push a new layout context at the indentation of the last token read.
1750 -- This is only used at the outer level of a module when the 'module'
1751 -- keyword is missing.
1752 pushCurrentContext :: P ()
1753 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1754     POk s{context = Layout (offs-len) : ctx} ()
1755 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1756
1757 getOffside :: P Ordering
1758 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1759                 let ord = case stk of
1760                         (Layout n:_) -> compare offs n
1761                         _            -> GT
1762                 in POk s ord
1763
1764 -- ---------------------------------------------------------------------------
1765 -- Construct a parse error
1766
1767 srcParseErr
1768   :: StringBuffer       -- current buffer (placed just after the last token)
1769   -> Int                -- length of the previous token
1770   -> Message
1771 srcParseErr buf len
1772   = hcat [ if null token
1773              then ptext (sLit "parse error (possibly incorrect indentation)")
1774              else hcat [ptext (sLit "parse error on input "),
1775                         char '`', text token, char '\'']
1776     ]
1777   where token = lexemeToString (offsetBytes (-len) buf) len
1778
1779 -- Report a parse failure, giving the span of the previous token as
1780 -- the location of the error.  This is the entry point for errors
1781 -- detected during parsing.
1782 srcParseFail :: P a
1783 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1784                             last_loc = last_loc } ->
1785     PFailed last_loc (srcParseErr buf len)
1786
1787 -- A lexical error is reported at a particular position in the source file,
1788 -- not over a token range.
1789 lexError :: String -> P a
1790 lexError str = do
1791   loc <- getSrcLoc
1792   (AI end _ buf) <- getInput
1793   reportLexError loc end buf str
1794
1795 -- -----------------------------------------------------------------------------
1796 -- This is the top-level function: called from the parser each time a
1797 -- new token is to be read from the input.
1798
1799 lexer :: (Located Token -> P a) -> P a
1800 lexer cont = do
1801   tok@(L _span _tok__) <- lexToken
1802 --  trace ("token: " ++ show tok__) $ do
1803   cont tok
1804
1805 lexToken :: P (Located Token)
1806 lexToken = do
1807   inp@(AI loc1 _ buf) <- getInput
1808   sc <- getLexState
1809   exts <- getExts
1810   case alexScanUser exts inp sc of
1811     AlexEOF -> do
1812         let span = mkSrcSpan loc1 loc1
1813         setLastToken span 0 0
1814         return (L span ITeof)
1815     AlexError (AI loc2 _ buf) ->
1816         reportLexError loc1 loc2 buf "lexical error"
1817     AlexSkip inp2 _ -> do
1818         setInput inp2
1819         lexToken
1820     AlexToken inp2@(AI end _ buf2) _ t -> do
1821         setInput inp2
1822         let span = mkSrcSpan loc1 end
1823         let bytes = byteDiff buf buf2
1824         span `seq` setLastToken span bytes bytes
1825         t span buf bytes
1826
1827 reportLexError loc1 loc2 buf str
1828   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1829   | otherwise =
1830   let
1831         c = fst (nextChar buf)
1832   in
1833   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1834     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1835     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1836
1837 lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token]
1838 lexTokenStream buf loc dflags = unP go initState
1839     where initState = mkPState buf loc (dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream)
1840           go = do
1841             ltok <- lexer return
1842             case ltok of
1843               L _ ITeof -> return []
1844               _ -> liftM (ltok:) go
1845 }