compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 --   qualified operator NOTES.
  24 --
  25 --   - If M.(+) is a single lexeme, then..
  26 --     - Probably (+) should be a single lexeme too, for consistency.
  27 --       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
  28 --     - But we have to rule out reserved operators, otherwise (..) becomes
  29 --       a different lexeme.
  30 --     - Should we therefore also rule out reserved operators in the qualified
  31 --       form?  This is quite difficult to achieve.  We don't do it for
  32 --       qualified varids.
  33
  34 {
  35 {-# OPTIONS -w #-}
  36 -- The above warning supression flag is a temporary kludge.
  37 -- While working on this module you are encouraged to remove it and fix
  38 -- any warnings in the module. See
  39 --     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
  40 -- for details
  41 --
  42 -- Note that Alex itself generates code with with some unused bindings and
  43 -- without type signatures, so removing the flag might not be possible.
  44
  45 {-# OPTIONS_GHC -funbox-strict-fields #-}
  46
  47 module Lexer (
  48    Token(..), lexer, pragState, mkPState, PState(..),
  49    P(..), ParseResult(..), getSrcLoc,
  50    failLocMsgP, failSpanMsgP, srcParseFail,
  51    getMessages,
  52    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  53    getLexState, popLexState, pushLexState,
  54    extension, standaloneDerivingEnabled, bangPatEnabled,
  55    addWarning,
  56    lexTokenStream
  57   ) where
  58
  59 import Bag
  60 import ErrUtils
  61 import Outputable
  62 import StringBuffer
  63 import FastString
  64 import FastTypes
  65 import SrcLoc
  66 import UniqFM
  67 import DynFlags
  68 import Ctype
  69 import Util             ( maybePrefixMatch, readRational )
  70
  71 import Control.Monad
  72 import Data.Bits
  73 import Data.Char        ( chr, ord, isSpace )
  74 import Data.Ratio
  75 import Debug.Trace
  76
  77 import Unicode ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  78 }
  79
  80 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  81 $whitechar   = [\ \n\r\f\v $unispace]
  82 $white_no_nl = $whitechar # \n
  83 $tab         = \t
  84
  85 $ascdigit  = 0-9
  86 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  87 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  88 $digit     = [$ascdigit $unidigit]
  89
  90 $special   = [\(\)\,\;\[\]\`\{\}]
  91 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  92 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  93 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  94
  95 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
  96 $asclarge  = [A-Z]
  97 $large     = [$asclarge $unilarge]
  98
  99 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
 100 $ascsmall  = [a-z]
 101 $small     = [$ascsmall $unismall \_]
 102
 103 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
 104 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
 105
 106 $octit     = 0-7
 107 $hexit     = [$decdigit A-F a-f]
 108 $symchar   = [$symbol \:]
 109 $nl        = [\n\r]
 110 $idchar    = [$small $large $digit \']
 111
 112 $docsym    = [\| \^ \* \$]
 113
 114 @varid     = $small $idchar*
 115 @conid     = $large $idchar*
 116
 117 @varsym    = $symbol $symchar*
 118 @consym    = \: $symchar*
 119
 120 @decimal     = $decdigit+
 121 @octal       = $octit+
 122 @hexadecimal = $hexit+
 123 @exponent    = [eE] [\-\+]? @decimal
 124
 125 -- we support the hierarchical module name extension:
 126 @qual = (@conid \.)+
 127
 128 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 129
 130 -- normal signed numerical literals can only be explicitly negative,
 131 -- not explicitly positive (contrast @exponent)
 132 @negative = \-
 133 @signed = @negative ?
 134
 135 haskell :-
 136
 137 -- everywhere: skip whitespace and comments
 138 $white_no_nl+                           ;
 139 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 140
 141 -- Everywhere: deal with nested comments.  We explicitly rule out
 142 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 143 -- (this can happen even though pragmas will normally take precedence due to
 144 -- longest-match, because pragmas aren't valid in every state, but comments
 145 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 146 -- set.
 147
 148 "{-" / { isNormalComment } { nested_comment lexToken }
 149
 150 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 151 -- more dashes followed by a symbol should be parsed as a varsym, so we
 152 -- have to exclude those.
 153
 154 -- Since Haddock comments aren't valid in every state, we need to rule them
 155 -- out here.
 156
 157 -- The following two rules match comments that begin with two dashes, but
 158 -- continue with a different character. The rules test that this character
 159 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 160 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 161 -- have a Haddock comment). The rules then munch the rest of the line.
 162
 163 "-- " ~[$docsym \#] .* { lineCommentToken }
 164 "--" [^$symbol : \ ] .* { lineCommentToken }
 165
 166 -- Next, match Haddock comments if no -haddock flag
 167
 168 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
 169
 170 -- Now, when we've matched comments that begin with 2 dashes and continue
 171 -- with a different character, we need to match comments that begin with three
 172 -- or more dashes (which clearly can't be Haddock comments). We only need to
 173 -- make sure that the first non-dash character isn't a symbol, and munch the
 174 -- rest of the line.
 175
 176 "---"\-* [^$symbol :] .* { lineCommentToken }
 177
 178 -- Since the previous rules all match dashes followed by at least one
 179 -- character, we also need to match a whole line filled with just dashes.
 180
 181 "--"\-* / { atEOL } { lineCommentToken }
 182
 183 -- We need this rule since none of the other single line comment rules
 184 -- actually match this case.
 185
 186 "-- " / { atEOL } { lineCommentToken }
 187
 188 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 189 -- blank lines) until we find a non-whitespace character, then do layout
 190 -- processing.
 191 --
 192 -- One slight wibble here: what if the line begins with {-#? In
 193 -- theory, we have to lex the pragma to see if it's one we recognise,
 194 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 195 -- as a nested comment.  We don't bother with this: if the line begins
 196 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 197 <bol> {
 198   \n                                    ;
 199   ^\# (line)?                           { begin line_prag1 }
 200   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 201   ^\# \! .* \n                          ; -- #!, for scripts
 202   ()                                    { do_bol }
 203 }
 204
 205 -- after a layout keyword (let, where, do, of), we begin a new layout
 206 -- context if the curly brace is missing.
 207 -- Careful! This stuff is quite delicate.
 208 <layout, layout_do> {
 209   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 210         -- we might encounter {-# here, but {- has been handled already
 211   \n                                    ;
 212   ^\# (line)?                           { begin line_prag1 }
 213 }
 214
 215 -- do is treated in a subtly different way, see new_layout_context
 216 <layout>    ()                          { new_layout_context True }
 217 <layout_do> ()                          { new_layout_context False }
 218
 219 -- after a new layout context which was found to be to the left of the
 220 -- previous context, we have generated a '{' token, and we now need to
 221 -- generate a matching '}' token.
 222 <layout_left>  ()                       { do_layout_left }
 223
 224 <0,option_prags> \n                             { begin bol }
 225
 226 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 227
 228 -- single-line line pragmas, of the form
 229 --    # <line> "<file>" <extra-stuff> \n
 230 <line_prag1> $decdigit+                 { setLine line_prag1a }
 231 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 232 <line_prag1b> .*                        { pop }
 233
 234 -- Haskell-style line pragmas, of the form
 235 --    {-# LINE <line> "<file>" #-}
 236 <line_prag2> $decdigit+                 { setLine line_prag2a }
 237 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 238 <line_prag2b> "#-}"|"-}"                { pop }
 239    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 240    -- with older versions of GHC which generated these.
 241
 242 <0,option_prags> {
 243   "{-#" $whitechar* (RULES|rules)       { rulePrag }
 244   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 245   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 246                                         { token (ITinline_prag False) }
 247   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 248                                         { token ITspec_prag }
 249   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 250         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 251   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 252         $whitechar* (NO(T?)INLINE|no(t?)inline)
 253                                         { token (ITspec_inline_prag False) }
 254   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 255   "{-#" $whitechar* (WARNING|warning)
 256                                         { token ITwarning_prag }
 257   "{-#" $whitechar* (DEPRECATED|deprecated)
 258                                         { token ITdeprecated_prag }
 259   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 260   "{-#" $whitechar* (GENERATED|generated)
 261                                         { token ITgenerated_prag }
 262   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 263   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 264   "{-#" $whitechar* (ANN|ann)           { token ITann_prag }
 265
 266   -- We ignore all these pragmas, but don't generate a warning for them
 267   -- CFILES is a hugs-only thing.
 268   "{-#" $whitechar* (OPTIONS_HUGS|options_hugs|OPTIONS_NHC98|options_nhc98|OPTIONS_JHC|options_jhc|CFILES|cfiles)
 269                     { nested_comment lexToken }
 270
 271   -- ToDo: should only be valid inside a pragma:
 272   "#-}"                                 { endPrag }
 273 }
 274
 275 <option_prags> {
 276   "{-#"  $whitechar* (OPTIONS|options)   { lex_string_prag IToptions_prag }
 277   "{-#"  $whitechar* (OPTIONS_GHC|options_ghc)
 278                                         { lex_string_prag IToptions_prag }
 279   "{-#"  $whitechar* (OPTIONS_HADDOCK|options_haddock)
 280                                          { lex_string_prag ITdocOptions }
 281   "-- #"                                 { multiline_doc_comment }
 282   "{-#"  $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
 283   "{-#"  $whitechar* (INCLUDE|include)   { lex_string_prag ITinclude_prag }
 284 }
 285
 286 <0> {
 287   -- In the "0" mode we ignore these pragmas
 288   "{-#"  $whitechar* (OPTIONS|options|OPTIONS_GHC|options_ghc|OPTIONS_HADDOCK|options_haddock|LANGUAGE|language|INCLUDE|include)
 289                      { nested_comment lexToken }
 290 }
 291
 292 <0> {
 293   "-- #" .* { lineCommentToken }
 294 }
 295
 296 <0,option_prags> {
 297   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 298                     (nested_comment lexToken) }
 299 }
 300
 301 -- '0' state: ordinary lexemes
 302
 303 -- Haddock comments
 304
 305 <0> {
 306   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 307   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 308 }
 309
 310 -- "special" symbols
 311
 312 <0> {
 313   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 314   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 315 }
 316
 317 <0> {
 318   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 319   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 320   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 321   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 322   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 323   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 324   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 325   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 326
 327   "[$" @varid "|"  / { ifExtension qqEnabled }
 328                      { lex_quasiquote_tok }
 329 }
 330
 331 <0> {
 332   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 333                                         { special IToparenbar }
 334   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 335 }
 336
 337 <0> {
 338   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 339 }
 340
 341 <0> {
 342   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 343          { token IToubxparen }
 344   "#)" / { ifExtension unboxedTuplesEnabled }
 345          { token ITcubxparen }
 346 }
 347
 348 <0> {
 349   "{|" / { ifExtension genericsEnabled } { token ITocurlybar }
 350   "|}" / { ifExtension genericsEnabled } { token ITccurlybar }
 351 }
 352
 353 <0,option_prags> {
 354   \(                                    { special IToparen }
 355   \)                                    { special ITcparen }
 356   \[                                    { special ITobrack }
 357   \]                                    { special ITcbrack }
 358   \,                                    { special ITcomma }
 359   \;                                    { special ITsemi }
 360   \`                                    { special ITbackquote }
 361
 362   \{                                    { open_brace }
 363   \}                                    { close_brace }
 364 }
 365
 366 <0,option_prags> {
 367   @qual @varid                  { idtoken qvarid }
 368   @qual @conid                  { idtoken qconid }
 369   @varid                        { varid }
 370   @conid                        { idtoken conid }
 371 }
 372
 373 <0> {
 374   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 375   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 376   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 377   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 378 }
 379
 380 -- ToDo: - move `var` and (sym) into lexical syntax?
 381 --       - remove backquote from $special?
 382 <0> {
 383   @qual @varsym       / { ifExtension oldQualOps } { idtoken qvarsym }
 384   @qual @consym       / { ifExtension oldQualOps } { idtoken qconsym }
 385   @qual \( @varsym \) / { ifExtension newQualOps } { idtoken prefixqvarsym }
 386   @qual \( @consym \) / { ifExtension newQualOps } { idtoken prefixqconsym }
 387   @varsym                                          { varsym }
 388   @consym                                          { consym }
 389 }
 390
 391 -- For the normal boxed literals we need to be careful
 392 -- when trying to be close to Haskell98
 393 <0> {
 394   -- Normal integral literals (:: Num a => a, from Integer)
 395   @decimal           { tok_num positive 0 0 decimal }
 396   0[oO] @octal       { tok_num positive 2 2 octal }
 397   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 398
 399   -- Normal rational literals (:: Fractional a => a, from Rational)
 400   @floating_point    { strtoken tok_float }
 401 }
 402
 403 <0> {
 404   -- Unboxed ints (:: Int#) and words (:: Word#)
 405   -- It's simpler (and faster?) to give separate cases to the negatives,
 406   -- especially considering octal/hexadecimal prefixes.
 407   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 408   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 409   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 410   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 411   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 412   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 413
 414   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 415   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 416   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 417
 418   -- Unboxed floats and doubles (:: Float#, :: Double#)
 419   -- prim_{float,double} work with signed literals
 420   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 421   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 422 }
 423
 424 -- Strings and chars are lexed by hand-written code.  The reason is
 425 -- that even if we recognise the string or char here in the regex
 426 -- lexer, we would still have to parse the string afterward in order
 427 -- to convert it to a String.
 428 <0> {
 429   \'                            { lex_char_tok }
 430   \"                            { lex_string_tok }
 431 }
 432
 433 {
 434 -- -----------------------------------------------------------------------------
 435 -- The token type
 436
 437 data Token
 438   = ITas                        -- Haskell keywords
 439   | ITcase
 440   | ITclass
 441   | ITdata
 442   | ITdefault
 443   | ITderiving
 444   | ITdo
 445   | ITelse
 446   | IThiding
 447   | ITif
 448   | ITimport
 449   | ITin
 450   | ITinfix
 451   | ITinfixl
 452   | ITinfixr
 453   | ITinstance
 454   | ITlet
 455   | ITmodule
 456   | ITnewtype
 457   | ITof
 458   | ITqualified
 459   | ITthen
 460   | ITtype
 461   | ITwhere
 462   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 463
 464   | ITforall                    -- GHC extension keywords
 465   | ITforeign
 466   | ITexport
 467   | ITlabel
 468   | ITdynamic
 469   | ITsafe
 470   | ITthreadsafe
 471   | ITunsafe
 472   | ITstdcallconv
 473   | ITccallconv
 474   | ITdotnet
 475   | ITmdo
 476   | ITfamily
 477   | ITgroup
 478   | ITby
 479   | ITusing
 480
 481         -- Pragmas
 482   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 483   | ITspec_prag                 -- SPECIALISE
 484   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 485   | ITsource_prag
 486   | ITrules_prag
 487   | ITwarning_prag
 488   | ITdeprecated_prag
 489   | ITline_prag
 490   | ITscc_prag
 491   | ITgenerated_prag
 492   | ITcore_prag                 -- hdaume: core annotations
 493   | ITunpack_prag
 494   | ITann_prag
 495   | ITclose_prag
 496   | IToptions_prag String
 497   | ITinclude_prag String
 498   | ITlanguage_prag
 499
 500   | ITdotdot                    -- reserved symbols
 501   | ITcolon
 502   | ITdcolon
 503   | ITequal
 504   | ITlam
 505   | ITvbar
 506   | ITlarrow
 507   | ITrarrow
 508   | ITat
 509   | ITtilde
 510   | ITdarrow
 511   | ITminus
 512   | ITbang
 513   | ITstar
 514   | ITdot
 515
 516   | ITbiglam                    -- GHC-extension symbols
 517
 518   | ITocurly                    -- special symbols
 519   | ITccurly
 520   | ITocurlybar                 -- {|, for type applications
 521   | ITccurlybar                 -- |}, for type applications
 522   | ITvocurly
 523   | ITvccurly
 524   | ITobrack
 525   | ITopabrack                  -- [:, for parallel arrays with -XParr
 526   | ITcpabrack                  -- :], for parallel arrays with -XParr
 527   | ITcbrack
 528   | IToparen
 529   | ITcparen
 530   | IToubxparen
 531   | ITcubxparen
 532   | ITsemi
 533   | ITcomma
 534   | ITunderscore
 535   | ITbackquote
 536
 537   | ITvarid   FastString        -- identifiers
 538   | ITconid   FastString
 539   | ITvarsym  FastString
 540   | ITconsym  FastString
 541   | ITqvarid  (FastString,FastString)
 542   | ITqconid  (FastString,FastString)
 543   | ITqvarsym (FastString,FastString)
 544   | ITqconsym (FastString,FastString)
 545   | ITprefixqvarsym (FastString,FastString)
 546   | ITprefixqconsym (FastString,FastString)
 547
 548   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 549
 550   | ITpragma StringBuffer
 551
 552   | ITchar       Char
 553   | ITstring     FastString
 554   | ITinteger    Integer
 555   | ITrational   Rational
 556
 557   | ITprimchar   Char
 558   | ITprimstring FastString
 559   | ITprimint    Integer
 560   | ITprimword   Integer
 561   | ITprimfloat  Rational
 562   | ITprimdouble Rational
 563
 564   -- MetaHaskell extension tokens
 565   | ITopenExpQuote              --  [| or [e|
 566   | ITopenPatQuote              --  [p|
 567   | ITopenDecQuote              --  [d|
 568   | ITopenTypQuote              --  [t|
 569   | ITcloseQuote                --  |]
 570   | ITidEscape   FastString     --  $x
 571   | ITparenEscape               --  $(
 572   | ITvarQuote                  --  '
 573   | ITtyQuote                   --  ''
 574   | ITquasiQuote (FastString,FastString,SrcSpan) --  [:...|...|]
 575
 576   -- Arrow notation extension
 577   | ITproc
 578   | ITrec
 579   | IToparenbar                 --  (|
 580   | ITcparenbar                 --  |)
 581   | ITlarrowtail                --  -<
 582   | ITrarrowtail                --  >-
 583   | ITLarrowtail                --  -<<
 584   | ITRarrowtail                --  >>-
 585
 586   | ITunknown String            -- Used when the lexer can't make sense of it
 587   | ITeof                       -- end of file token
 588
 589   -- Documentation annotations
 590   | ITdocCommentNext  String     -- something beginning '-- |'
 591   | ITdocCommentPrev  String     -- something beginning '-- ^'
 592   | ITdocCommentNamed String     -- something beginning '-- $'
 593   | ITdocSection      Int String -- a section heading
 594   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 595   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 596   | ITlineComment     String     -- comment starting by "--"
 597   | ITblockComment    String     -- comment in {- -}
 598
 599 #ifdef DEBUG
 600   deriving Show -- debugging
 601 #endif
 602
 603 {-
 604 isSpecial :: Token -> Bool
 605 -- If we see M.x, where x is a keyword, but
 606 -- is special, we treat is as just plain M.x,
 607 -- not as a keyword.
 608 isSpecial ITas          = True
 609 isSpecial IThiding      = True
 610 isSpecial ITqualified   = True
 611 isSpecial ITforall      = True
 612 isSpecial ITexport      = True
 613 isSpecial ITlabel       = True
 614 isSpecial ITdynamic     = True
 615 isSpecial ITsafe        = True
 616 isSpecial ITthreadsafe  = True
 617 isSpecial ITunsafe      = True
 618 isSpecial ITccallconv   = True
 619 isSpecial ITstdcallconv = True
 620 isSpecial ITmdo         = True
 621 isSpecial ITfamily      = True
 622 isSpecial ITgroup   = True
 623 isSpecial ITby      = True
 624 isSpecial ITusing   = True
 625 isSpecial _             = False
 626 -}
 627
 628 -- the bitmap provided as the third component indicates whether the
 629 -- corresponding extension keyword is valid under the extension options
 630 -- provided to the compiler; if the extension corresponding to *any* of the
 631 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 632 -- facilitates using a keyword in two different extensions that can be
 633 -- activated independently)
 634 --
 635 reservedWordsFM = listToUFM $
 636         map (\(x, y, z) -> (mkFastString x, (y, z)))
 637        [( "_",          ITunderscore,   0 ),
 638         ( "as",         ITas,           0 ),
 639         ( "case",       ITcase,         0 ),
 640         ( "class",      ITclass,        0 ),
 641         ( "data",       ITdata,         0 ),
 642         ( "default",    ITdefault,      0 ),
 643         ( "deriving",   ITderiving,     0 ),
 644         ( "do",         ITdo,           0 ),
 645         ( "else",       ITelse,         0 ),
 646         ( "hiding",     IThiding,       0 ),
 647         ( "if",         ITif,           0 ),
 648         ( "import",     ITimport,       0 ),
 649         ( "in",         ITin,           0 ),
 650         ( "infix",      ITinfix,        0 ),
 651         ( "infixl",     ITinfixl,       0 ),
 652         ( "infixr",     ITinfixr,       0 ),
 653         ( "instance",   ITinstance,     0 ),
 654         ( "let",        ITlet,          0 ),
 655         ( "module",     ITmodule,       0 ),
 656         ( "newtype",    ITnewtype,      0 ),
 657         ( "of",         ITof,           0 ),
 658         ( "qualified",  ITqualified,    0 ),
 659         ( "then",       ITthen,         0 ),
 660         ( "type",       ITtype,         0 ),
 661         ( "where",      ITwhere,        0 ),
 662         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 663
 664     ( "forall", ITforall,        bit explicitForallBit .|. bit inRulePragBit),
 665         ( "mdo",        ITmdo,           bit recursiveDoBit),
 666         ( "family",     ITfamily,        bit tyFamBit),
 667     ( "group",  ITgroup,     bit transformComprehensionsBit),
 668     ( "by",     ITby,        bit transformComprehensionsBit),
 669     ( "using",  ITusing,     bit transformComprehensionsBit),
 670
 671         ( "foreign",    ITforeign,       bit ffiBit),
 672         ( "export",     ITexport,        bit ffiBit),
 673         ( "label",      ITlabel,         bit ffiBit),
 674         ( "dynamic",    ITdynamic,       bit ffiBit),
 675         ( "safe",       ITsafe,          bit ffiBit),
 676         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 677         ( "unsafe",     ITunsafe,        bit ffiBit),
 678         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 679         ( "ccall",      ITccallconv,     bit ffiBit),
 680         ( "dotnet",     ITdotnet,        bit ffiBit),
 681
 682         ( "rec",        ITrec,           bit arrowsBit),
 683         ( "proc",       ITproc,          bit arrowsBit)
 684      ]
 685
 686 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 687 reservedSymsFM = listToUFM $
 688     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 689       [ ("..",  ITdotdot,   always)
 690         -- (:) is a reserved op, meaning only list cons
 691        ,(":",   ITcolon,    always)
 692        ,("::",  ITdcolon,   always)
 693        ,("=",   ITequal,    always)
 694        ,("\\",  ITlam,      always)
 695        ,("|",   ITvbar,     always)
 696        ,("<-",  ITlarrow,   always)
 697        ,("->",  ITrarrow,   always)
 698        ,("@",   ITat,       always)
 699        ,("~",   ITtilde,    always)
 700        ,("=>",  ITdarrow,   always)
 701        ,("-",   ITminus,    always)
 702        ,("!",   ITbang,     always)
 703
 704         -- For data T (a::*) = MkT
 705        ,("*", ITstar, \i -> kindSigsEnabled i || tyFamEnabled i)
 706         -- For 'forall a . t'
 707        ,(".", ITdot, \i -> explicitForallEnabled i || inRulePrag i)
 708
 709        ,("-<",  ITlarrowtail, arrowsEnabled)
 710        ,(">-",  ITrarrowtail, arrowsEnabled)
 711        ,("-<<", ITLarrowtail, arrowsEnabled)
 712        ,(">>-", ITRarrowtail, arrowsEnabled)
 713
 714 #if __GLASGOW_HASKELL__ >= 605
 715        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 716        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 717        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 718                                 explicitForallEnabled i)
 719        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 720        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 721        ,("⋯",   ITdotdot, unicodeSyntaxEnabled)
 722         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 723         -- form part of a large operator.  This would let us have a better
 724         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 725 #endif
 726        ]
 727
 728 -- -----------------------------------------------------------------------------
 729 -- Lexer actions
 730
 731 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 732
 733 special :: Token -> Action
 734 special tok span _buf _len = return (L span tok)
 735
 736 token, layout_token :: Token -> Action
 737 token t span _buf _len = return (L span t)
 738 layout_token t span _buf _len = pushLexState layout >> return (L span t)
 739
 740 idtoken :: (StringBuffer -> Int -> Token) -> Action
 741 idtoken f span buf len = return (L span $! (f buf len))
 742
 743 skip_one_varid :: (FastString -> Token) -> Action
 744 skip_one_varid f span buf len
 745   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 746
 747 strtoken :: (String -> Token) -> Action
 748 strtoken f span buf len =
 749   return (L span $! (f $! lexemeToString buf len))
 750
 751 init_strtoken :: Int -> (String -> Token) -> Action
 752 -- like strtoken, but drops the last N character(s)
 753 init_strtoken drop f span buf len =
 754   return (L span $! (f $! lexemeToString buf (len-drop)))
 755
 756 begin :: Int -> Action
 757 begin code _span _str _len = do pushLexState code; lexToken
 758
 759 pop :: Action
 760 pop _span _buf _len = do popLexState; lexToken
 761
 762 pop_and :: Action -> Action
 763 pop_and act span buf len = do popLexState; act span buf len
 764
 765 {-# INLINE nextCharIs #-}
 766 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 767
 768 notFollowedBy char _ _ _ (AI _ _ buf)
 769   = nextCharIs buf (/=char)
 770
 771 notFollowedBySymbol _ _ _ (AI _ _ buf)
 772   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 773
 774 -- We must reject doc comments as being ordinary comments everywhere.
 775 -- In some cases the doc comment will be selected as the lexeme due to
 776 -- maximal munch, but not always, because the nested comment rule is
 777 -- valid in all states, but the doc-comment rules are only valid in
 778 -- the non-layout states.
 779 isNormalComment bits _ _ (AI _ _ buf)
 780   | haddockEnabled bits = notFollowedByDocOrPragma
 781   | otherwise           = nextCharIs buf (/='#')
 782   where
 783     notFollowedByDocOrPragma
 784        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 785
 786 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 787
 788 {-
 789 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 790   = if haddockEnabled bits then False else (p buf)
 791 -}
 792
 793 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 794
 795 ifExtension pred bits _ _ _ = pred bits
 796
 797 multiline_doc_comment :: Action
 798 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 799   where
 800     worker commentAcc input docType oneLine = case alexGetChar input of
 801       Just ('\n', input')
 802         | oneLine -> docCommentEnd input commentAcc docType buf span
 803         | otherwise -> case checkIfCommentLine input' of
 804           Just input -> worker ('\n':commentAcc) input docType False
 805           Nothing -> docCommentEnd input commentAcc docType buf span
 806       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 807       Nothing -> docCommentEnd input commentAcc docType buf span
 808
 809     checkIfCommentLine input = check (dropNonNewlineSpace input)
 810       where
 811         check input = case alexGetChar input of
 812           Just ('-', input) -> case alexGetChar input of
 813             Just ('-', input) -> case alexGetChar input of
 814               Just (c, _) | c /= '-' -> Just input
 815               _ -> Nothing
 816             _ -> Nothing
 817           _ -> Nothing
 818
 819         dropNonNewlineSpace input = case alexGetChar input of
 820           Just (c, input')
 821             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 822             | otherwise -> input
 823           Nothing -> input
 824
 825 lineCommentToken :: Action
 826 lineCommentToken span buf len = do
 827   b <- extension rawTokenStreamEnabled
 828   if b then strtoken ITlineComment span buf len else lexToken
 829
 830 {-
 831   nested comments require traversing by hand, they can't be parsed
 832   using regular expressions.
 833 -}
 834 nested_comment :: P (Located Token) -> Action
 835 nested_comment cont span _str _len = do
 836   input <- getInput
 837   go "" (1::Int) input
 838   where
 839     go commentAcc 0 input = do setInput input
 840                                b <- extension rawTokenStreamEnabled
 841                                if b
 842                                  then docCommentEnd input commentAcc ITblockComment _str span
 843                                  else cont
 844     go commentAcc n input = case alexGetChar input of
 845       Nothing -> errBrace input span
 846       Just ('-',input) -> case alexGetChar input of
 847         Nothing  -> errBrace input span
 848         Just ('\125',input) -> go commentAcc (n-1) input
 849         Just (_,_)          -> go ('-':commentAcc) n input
 850       Just ('\123',input) -> case alexGetChar input of
 851         Nothing  -> errBrace input span
 852         Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
 853         Just (_,_)       -> go ('\123':commentAcc) n input
 854       Just (c,input) -> go (c:commentAcc) n input
 855
 856 nested_doc_comment :: Action
 857 nested_doc_comment span buf _len = withLexedDocType (go "")
 858   where
 859     go commentAcc input docType _ = case alexGetChar input of
 860       Nothing -> errBrace input span
 861       Just ('-',input) -> case alexGetChar input of
 862         Nothing -> errBrace input span
 863         Just ('\125',input) ->
 864           docCommentEnd input commentAcc docType buf span
 865         Just (_,_) -> go ('-':commentAcc) input docType False
 866       Just ('\123', input) -> case alexGetChar input of
 867         Nothing  -> errBrace input span
 868         Just ('-',input) -> do
 869           setInput input
 870           let cont = do input <- getInput; go commentAcc input docType False
 871           nested_comment cont span buf _len
 872         Just (_,_) -> go ('\123':commentAcc) input docType False
 873       Just (c,input) -> go (c:commentAcc) input docType False
 874
 875 withLexedDocType lexDocComment = do
 876   input@(AI _ _ buf) <- getInput
 877   case prevChar buf ' ' of
 878     '|' -> lexDocComment input ITdocCommentNext False
 879     '^' -> lexDocComment input ITdocCommentPrev False
 880     '$' -> lexDocComment input ITdocCommentNamed False
 881     '*' -> lexDocSection 1 input
 882     '#' -> lexDocComment input ITdocOptionsOld False
 883  where
 884     lexDocSection n input = case alexGetChar input of
 885       Just ('*', input) -> lexDocSection (n+1) input
 886       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 887       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 888
 889 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 890 -- off again at the end of the pragma.
 891 rulePrag :: Action
 892 rulePrag span buf len = do
 893   setExts (.|. bit inRulePragBit)
 894   return (L span ITrules_prag)
 895
 896 endPrag :: Action
 897 endPrag span buf len = do
 898   setExts (.&. complement (bit inRulePragBit))
 899   return (L span ITclose_prag)
 900
 901 -- docCommentEnd
 902 -------------------------------------------------------------------------------
 903 -- This function is quite tricky. We can't just return a new token, we also
 904 -- need to update the state of the parser. Why? Because the token is longer
 905 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 906 -- it writes the wrong token length to the parser state. This function is
 907 -- called afterwards, so it can just update the state.
 908
 909 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 910 -- which is something that the original lexer didn't account for.
 911 -- I have added last_line_len in the parser state which represents the length
 912 -- of the part of the token that is on the last line. It is now used for layout
 913 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 914 -- was before, the full length of the token, and it is now only used for error
 915 -- messages. /Waern
 916
 917 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 918                  SrcSpan -> P (Located Token)
 919 docCommentEnd input commentAcc docType buf span = do
 920   setInput input
 921   let (AI loc last_offs nextBuf) = input
 922       comment = reverse commentAcc
 923       span' = mkSrcSpan (srcSpanStart span) loc
 924       last_len = byteDiff buf nextBuf
 925
 926       last_line_len = if (last_offs - last_len < 0)
 927         then last_offs
 928         else last_len
 929
 930   span `seq` setLastToken span' last_len last_line_len
 931   return (L span' (docType comment))
 932
 933 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 934
 935 open_brace, close_brace :: Action
 936 open_brace span _str _len = do
 937   ctx <- getContext
 938   setContext (NoLayout:ctx)
 939   return (L span ITocurly)
 940 close_brace span _str _len = do
 941   popContext
 942   return (L span ITccurly)
 943
 944 qvarid buf len = ITqvarid $! splitQualName buf len False
 945 qconid buf len = ITqconid $! splitQualName buf len False
 946
 947 splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
 948 -- takes a StringBuffer and a length, and returns the module name
 949 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 950 -- because of hierarchical module names.
 951 splitQualName orig_buf len parens = split orig_buf orig_buf
 952   where
 953     split buf dot_buf
 954         | orig_buf `byteDiff` buf >= len  = done dot_buf
 955         | c == '.'                        = found_dot buf'
 956         | otherwise                       = split buf' dot_buf
 957       where
 958        (c,buf') = nextChar buf
 959
 960     -- careful, we might get names like M....
 961     -- so, if the character after the dot is not upper-case, this is
 962     -- the end of the qualifier part.
 963     found_dot buf -- buf points after the '.'
 964         | isUpper c    = split buf' buf
 965         | otherwise    = done buf
 966       where
 967        (c,buf') = nextChar buf
 968
 969     done dot_buf =
 970         (lexemeToFastString orig_buf (qual_size - 1),
 971          if parens -- Prelude.(+)
 972             then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
 973             else lexemeToFastString dot_buf (len - qual_size))
 974       where
 975         qual_size = orig_buf `byteDiff` dot_buf
 976
 977 varid span buf len =
 978   fs `seq`
 979   case lookupUFM reservedWordsFM fs of
 980         Just (keyword,0)    -> do
 981                 maybe_layout keyword
 982                 return (L span keyword)
 983         Just (keyword,exts) -> do
 984                 b <- extension (\i -> exts .&. i /= 0)
 985                 if b then do maybe_layout keyword
 986                              return (L span keyword)
 987                      else return (L span (ITvarid fs))
 988         _other -> return (L span (ITvarid fs))
 989   where
 990         fs = lexemeToFastString buf len
 991
 992 conid buf len = ITconid fs
 993   where fs = lexemeToFastString buf len
 994
 995 qvarsym buf len = ITqvarsym $! splitQualName buf len False
 996 qconsym buf len = ITqconsym $! splitQualName buf len False
 997 prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
 998 prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
 999
1000 varsym = sym ITvarsym
1001 consym = sym ITconsym
1002
1003 sym con span buf len =
1004   case lookupUFM reservedSymsFM fs of
1005         Just (keyword,exts) -> do
1006                 b <- extension exts
1007                 if b then return (L span keyword)
1008                      else return (L span $! con fs)
1009         _other -> return (L span $! con fs)
1010   where
1011         fs = lexemeToFastString buf len
1012
1013 -- Variations on the integral numeric literal.
1014 tok_integral :: (Integer -> Token)
1015      -> (Integer -> Integer)
1016  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
1017      -> Int -> Int
1018      -> (Integer, (Char->Int)) -> Action
1019 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
1020   return $ L span $ itint $! transint $ parseUnsignedInteger
1021      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
1022
1023 -- some conveniences for use with tok_integral
1024 tok_num = tok_integral ITinteger
1025 tok_primint = tok_integral ITprimint
1026 tok_primword = tok_integral ITprimword positive
1027 positive = id
1028 negative = negate
1029 decimal = (10,octDecDigit)
1030 octal = (8,octDecDigit)
1031 hexadecimal = (16,hexDigit)
1032
1033 -- readRational can understand negative rationals, exponents, everything.
1034 tok_float        str = ITrational   $! readRational str
1035 tok_primfloat    str = ITprimfloat  $! readRational str
1036 tok_primdouble   str = ITprimdouble $! readRational str
1037
1038 -- -----------------------------------------------------------------------------
1039 -- Layout processing
1040
1041 -- we're at the first token on a line, insert layout tokens if necessary
1042 do_bol :: Action
1043 do_bol span _str _len = do
1044         pos <- getOffside
1045         case pos of
1046             LT -> do
1047                 --trace "layout: inserting '}'" $ do
1048                 popContext
1049                 -- do NOT pop the lex state, we might have a ';' to insert
1050                 return (L span ITvccurly)
1051             EQ -> do
1052                 --trace "layout: inserting ';'" $ do
1053                 popLexState
1054                 return (L span ITsemi)
1055             GT -> do
1056                 popLexState
1057                 lexToken
1058
1059 -- certain keywords put us in the "layout" state, where we might
1060 -- add an opening curly brace.
1061 maybe_layout ITdo       = pushLexState layout_do
1062 maybe_layout ITmdo      = pushLexState layout_do
1063 maybe_layout ITof       = pushLexState layout
1064 maybe_layout ITlet      = pushLexState layout
1065 maybe_layout ITwhere    = pushLexState layout
1066 maybe_layout ITrec      = pushLexState layout
1067 maybe_layout _          = return ()
1068
1069 -- Pushing a new implicit layout context.  If the indentation of the
1070 -- next token is not greater than the previous layout context, then
1071 -- Haskell 98 says that the new layout context should be empty; that is
1072 -- the lexer must generate {}.
1073 --
1074 -- We are slightly more lenient than this: when the new context is started
1075 -- by a 'do', then we allow the new context to be at the same indentation as
1076 -- the previous context.  This is what the 'strict' argument is for.
1077 --
1078 new_layout_context strict span _buf _len = do
1079     popLexState
1080     (AI _ offset _) <- getInput
1081     ctx <- getContext
1082     case ctx of
1083         Layout prev_off : _  |
1084            (strict     && prev_off >= offset  ||
1085             not strict && prev_off > offset) -> do
1086                 -- token is indented to the left of the previous context.
1087                 -- we must generate a {} sequence now.
1088                 pushLexState layout_left
1089                 return (L span ITvocurly)
1090         _ -> do
1091                 setContext (Layout offset : ctx)
1092                 return (L span ITvocurly)
1093
1094 do_layout_left span _buf _len = do
1095     popLexState
1096     pushLexState bol  -- we must be at the start of a line
1097     return (L span ITvccurly)
1098
1099 -- -----------------------------------------------------------------------------
1100 -- LINE pragmas
1101
1102 setLine :: Int -> Action
1103 setLine code span buf len = do
1104   let line = parseUnsignedInteger buf len 10 octDecDigit
1105   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
1106         -- subtract one: the line number refers to the *following* line
1107   popLexState
1108   pushLexState code
1109   lexToken
1110
1111 setFile :: Int -> Action
1112 setFile code span buf len = do
1113   let file = lexemeToFastString (stepOn buf) (len-2)
1114   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1115   popLexState
1116   pushLexState code
1117   lexToken
1118
1119
1120 -- -----------------------------------------------------------------------------
1121 -- Options, includes and language pragmas.
1122
1123 lex_string_prag :: (String -> Token) -> Action
1124 lex_string_prag mkTok span _buf _len
1125     = do input <- getInput
1126          start <- getSrcLoc
1127          tok <- go [] input
1128          end <- getSrcLoc
1129          return (L (mkSrcSpan start end) tok)
1130     where go acc input
1131               = if isString input "#-}"
1132                    then do setInput input
1133                            return (mkTok (reverse acc))
1134                    else case alexGetChar input of
1135                           Just (c,i) -> go (c:acc) i
1136                           Nothing -> err input
1137           isString _ [] = True
1138           isString i (x:xs)
1139               = case alexGetChar i of
1140                   Just (c,i') | c == x    -> isString i' xs
1141                   _other -> False
1142           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1143
1144
1145 -- -----------------------------------------------------------------------------
1146 -- Strings & Chars
1147
1148 -- This stuff is horrible.  I hates it.
1149
1150 lex_string_tok :: Action
1151 lex_string_tok span _buf _len = do
1152   tok <- lex_string ""
1153   end <- getSrcLoc
1154   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1155
1156 lex_string :: String -> P Token
1157 lex_string s = do
1158   i <- getInput
1159   case alexGetChar' i of
1160     Nothing -> lit_error
1161
1162     Just ('"',i)  -> do
1163         setInput i
1164         magicHash <- extension magicHashEnabled
1165         if magicHash
1166           then do
1167             i <- getInput
1168             case alexGetChar' i of
1169               Just ('#',i) -> do
1170                    setInput i
1171                    if any (> '\xFF') s
1172                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1173                     else let s' = mkZFastString (reverse s) in
1174                          return (ITprimstring s')
1175                         -- mkZFastString is a hack to avoid encoding the
1176                         -- string in UTF-8.  We just want the exact bytes.
1177               _other ->
1178                 return (ITstring (mkFastString (reverse s)))
1179           else
1180                 return (ITstring (mkFastString (reverse s)))
1181
1182     Just ('\\',i)
1183         | Just ('&',i) <- next -> do
1184                 setInput i; lex_string s
1185         | Just (c,i) <- next, is_space c -> do
1186                 setInput i; lex_stringgap s
1187         where next = alexGetChar' i
1188
1189     Just (c, i) -> do
1190         c' <- lex_char c i
1191         lex_string (c':s)
1192
1193 lex_stringgap s = do
1194   c <- getCharOrFail
1195   case c of
1196     '\\' -> lex_string s
1197     c | is_space c -> lex_stringgap s
1198     _other -> lit_error
1199
1200
1201 lex_char_tok :: Action
1202 -- Here we are basically parsing character literals, such as 'x' or '\n'
1203 -- but, when Template Haskell is on, we additionally spot
1204 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1205 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
1206 -- So we have to do two characters of lookahead: when we see 'x we need to
1207 -- see if there's a trailing quote
1208 lex_char_tok span _buf _len = do        -- We've seen '
1209    i1 <- getInput       -- Look ahead to first character
1210    let loc = srcSpanStart span
1211    case alexGetChar' i1 of
1212         Nothing -> lit_error
1213
1214         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
1215                   th_exts <- extension thEnabled
1216                   if th_exts then do
1217                         setInput i2
1218                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1219                    else lit_error
1220
1221         Just ('\\', i2@(AI _end2 _ _)) -> do    -- We've seen 'backslash
1222                   setInput i2
1223                   lit_ch <- lex_escape
1224                   mc <- getCharOrFail   -- Trailing quote
1225                   if mc == '\'' then finish_char_tok loc lit_ch
1226                                 else do setInput i2; lit_error
1227
1228         Just (c, i2@(AI _end2 _ _))
1229                 | not (isAny c) -> lit_error
1230                 | otherwise ->
1231
1232                 -- We've seen 'x, where x is a valid character
1233                 --  (i.e. not newline etc) but not a quote or backslash
1234            case alexGetChar' i2 of      -- Look ahead one more character
1235                 Nothing -> lit_error
1236                 Just ('\'', i3) -> do   -- We've seen 'x'
1237                         setInput i3
1238                         finish_char_tok loc c
1239                 _other -> do            -- We've seen 'x not followed by quote
1240                                         -- If TH is on, just parse the quote only
1241                         th_exts <- extension thEnabled
1242                         let (AI end _ _) = i1
1243                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1244                                    else do setInput i2; lit_error
1245
1246 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1247 finish_char_tok loc ch  -- We've already seen the closing quote
1248                         -- Just need to check for trailing #
1249   = do  magicHash <- extension magicHashEnabled
1250         i@(AI end _ _) <- getInput
1251         if magicHash then do
1252                 case alexGetChar' i of
1253                         Just ('#',i@(AI end _ _)) -> do
1254                                 setInput i
1255                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1256                         _other ->
1257                                 return (L (mkSrcSpan loc end) (ITchar ch))
1258                 else do
1259                    return (L (mkSrcSpan loc end) (ITchar ch))
1260
1261 lex_char :: Char -> AlexInput -> P Char
1262 lex_char c inp = do
1263   case c of
1264       '\\' -> do setInput inp; lex_escape
1265       c | isAny c -> do setInput inp; return c
1266       _other -> lit_error
1267
1268 isAny c | c > '\x7f' = isPrint c
1269         | otherwise  = is_any c
1270
1271 lex_escape :: P Char
1272 lex_escape = do
1273   c <- getCharOrFail
1274   case c of
1275         'a'   -> return '\a'
1276         'b'   -> return '\b'
1277         'f'   -> return '\f'
1278         'n'   -> return '\n'
1279         'r'   -> return '\r'
1280         't'   -> return '\t'
1281         'v'   -> return '\v'
1282         '\\'  -> return '\\'
1283         '"'   -> return '\"'
1284         '\''  -> return '\''
1285         '^'   -> do c <- getCharOrFail
1286                     if c >= '@' && c <= '_'
1287                         then return (chr (ord c - ord '@'))
1288                         else lit_error
1289
1290         'x'   -> readNum is_hexdigit 16 hexDigit
1291         'o'   -> readNum is_octdigit  8 octDecDigit
1292         x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1293
1294         c1 ->  do
1295            i <- getInput
1296            case alexGetChar' i of
1297             Nothing -> lit_error
1298             Just (c2,i2) ->
1299               case alexGetChar' i2 of
1300                 Nothing -> do setInput i2; lit_error
1301                 Just (c3,i3) ->
1302                    let str = [c1,c2,c3] in
1303                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1304                                      Just rest <- [maybePrefixMatch p str] ] of
1305                           (escape_char,[]):_ -> do
1306                                 setInput i3
1307                                 return escape_char
1308                           (escape_char,_:_):_ -> do
1309                                 setInput i2
1310                                 return escape_char
1311                           [] -> lit_error
1312
1313 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1314 readNum is_digit base conv = do
1315   i <- getInput
1316   c <- getCharOrFail
1317   if is_digit c
1318         then readNum2 is_digit base conv (conv c)
1319         else do setInput i; lit_error
1320
1321 readNum2 is_digit base conv i = do
1322   input <- getInput
1323   read i input
1324   where read i input = do
1325           case alexGetChar' input of
1326             Just (c,input') | is_digit c -> do
1327                 read (i*base + conv c) input'
1328             _other -> do
1329                 if i >= 0 && i <= 0x10FFFF
1330                    then do setInput input; return (chr i)
1331                    else lit_error
1332
1333 silly_escape_chars = [
1334         ("NUL", '\NUL'),
1335         ("SOH", '\SOH'),
1336         ("STX", '\STX'),
1337         ("ETX", '\ETX'),
1338         ("EOT", '\EOT'),
1339         ("ENQ", '\ENQ'),
1340         ("ACK", '\ACK'),
1341         ("BEL", '\BEL'),
1342         ("BS", '\BS'),
1343         ("HT", '\HT'),
1344         ("LF", '\LF'),
1345         ("VT", '\VT'),
1346         ("FF", '\FF'),
1347         ("CR", '\CR'),
1348         ("SO", '\SO'),
1349         ("SI", '\SI'),
1350         ("DLE", '\DLE'),
1351         ("DC1", '\DC1'),
1352         ("DC2", '\DC2'),
1353         ("DC3", '\DC3'),
1354         ("DC4", '\DC4'),
1355         ("NAK", '\NAK'),
1356         ("SYN", '\SYN'),
1357         ("ETB", '\ETB'),
1358         ("CAN", '\CAN'),
1359         ("EM", '\EM'),
1360         ("SUB", '\SUB'),
1361         ("ESC", '\ESC'),
1362         ("FS", '\FS'),
1363         ("GS", '\GS'),
1364         ("RS", '\RS'),
1365         ("US", '\US'),
1366         ("SP", '\SP'),
1367         ("DEL", '\DEL')
1368         ]
1369
1370 -- before calling lit_error, ensure that the current input is pointing to
1371 -- the position of the error in the buffer.  This is so that we can report
1372 -- a correct location to the user, but also so we can detect UTF-8 decoding
1373 -- errors if they occur.
1374 lit_error = lexError "lexical error in string/character literal"
1375
1376 getCharOrFail :: P Char
1377 getCharOrFail =  do
1378   i <- getInput
1379   case alexGetChar' i of
1380         Nothing -> lexError "unexpected end-of-file in string/character literal"
1381         Just (c,i)  -> do setInput i; return c
1382
1383 -- -----------------------------------------------------------------------------
1384 -- QuasiQuote
1385
1386 lex_quasiquote_tok :: Action
1387 lex_quasiquote_tok span buf len = do
1388   let quoter = reverse $ takeWhile (/= '$')
1389                $ reverse $ lexemeToString buf (len - 1)
1390   quoteStart <- getSrcLoc
1391   quote <- lex_quasiquote ""
1392   end <- getSrcLoc
1393   return (L (mkSrcSpan (srcSpanStart span) end)
1394            (ITquasiQuote (mkFastString quoter,
1395                           mkFastString (reverse quote),
1396                           mkSrcSpan quoteStart end)))
1397
1398 lex_quasiquote :: String -> P String
1399 lex_quasiquote s = do
1400   i <- getInput
1401   case alexGetChar' i of
1402     Nothing -> lit_error
1403
1404     Just ('\\',i)
1405         | Just ('|',i) <- next -> do
1406                 setInput i; lex_quasiquote ('|' : s)
1407         | Just (']',i) <- next -> do
1408                 setInput i; lex_quasiquote (']' : s)
1409         where next = alexGetChar' i
1410
1411     Just ('|',i)
1412         | Just (']',i) <- next -> do
1413                 setInput i; return s
1414         where next = alexGetChar' i
1415
1416     Just (c, i) -> do
1417          setInput i; lex_quasiquote (c : s)
1418
1419 -- -----------------------------------------------------------------------------
1420 -- Warnings
1421
1422 warn :: DynFlag -> SDoc -> Action
1423 warn option warning srcspan _buf _len = do
1424     addWarning option srcspan warning
1425     lexToken
1426
1427 warnThen :: DynFlag -> SDoc -> Action -> Action
1428 warnThen option warning action srcspan buf len = do
1429     addWarning option srcspan warning
1430     action srcspan buf len
1431
1432 -- -----------------------------------------------------------------------------
1433 -- The Parse Monad
1434
1435 data LayoutContext
1436   = NoLayout
1437   | Layout !Int
1438   deriving Show
1439
1440 data ParseResult a
1441   = POk PState a
1442   | PFailed
1443         SrcSpan         -- The start and end of the text span related to
1444                         -- the error.  Might be used in environments which can
1445                         -- show this span, e.g. by highlighting it.
1446         Message         -- The error message
1447
1448 data PState = PState {
1449         buffer     :: StringBuffer,
1450     dflags     :: DynFlags,
1451     messages   :: Messages,
1452         last_loc   :: SrcSpan,  -- pos of previous token
1453         last_offs  :: !Int,     -- offset of the previous token from the
1454                                 -- beginning of  the current line.
1455                                 -- \t is equal to 8 spaces.
1456         last_len   :: !Int,     -- len of previous token
1457   last_line_len :: !Int,
1458         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1459         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1460         context    :: [LayoutContext],
1461         lex_state  :: [Int]
1462      }
1463         -- last_loc and last_len are used when generating error messages,
1464         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1465         -- current token to happyError, we could at least get rid of last_len.
1466         -- Getting rid of last_loc would require finding another way to
1467         -- implement pushCurrentContext (which is only called from one place).
1468
1469 newtype P a = P { unP :: PState -> ParseResult a }
1470
1471 instance Monad P where
1472   return = returnP
1473   (>>=) = thenP
1474   fail = failP
1475
1476 returnP :: a -> P a
1477 returnP a = a `seq` (P $ \s -> POk s a)
1478
1479 thenP :: P a -> (a -> P b) -> P b
1480 (P m) `thenP` k = P $ \ s ->
1481         case m s of
1482                 POk s1 a         -> (unP (k a)) s1
1483                 PFailed span err -> PFailed span err
1484
1485 failP :: String -> P a
1486 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1487
1488 failMsgP :: String -> P a
1489 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1490
1491 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1492 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (mkSrcSpan loc1 loc2) (text str)
1493
1494 failSpanMsgP :: SrcSpan -> SDoc -> P a
1495 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1496
1497 extension :: (Int -> Bool) -> P Bool
1498 extension p = P $ \s -> POk s (p $! extsBitmap s)
1499
1500 getExts :: P Int
1501 getExts = P $ \s -> POk s (extsBitmap s)
1502
1503 setExts :: (Int -> Int) -> P ()
1504 setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
1505
1506 setSrcLoc :: SrcLoc -> P ()
1507 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1508
1509 getSrcLoc :: P SrcLoc
1510 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1511
1512 setLastToken :: SrcSpan -> Int -> Int -> P ()
1513 setLastToken loc len line_len = P $ \s -> POk s {
1514   last_loc=loc,
1515   last_len=len,
1516   last_line_len=line_len
1517 } ()
1518
1519 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1520
1521 alexInputPrevChar :: AlexInput -> Char
1522 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1523
1524 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1525 alexGetChar (AI loc ofs s)
1526   | atEnd s   = Nothing
1527   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1528                 --trace (show (ord c)) $
1529                 Just (adj_c, (AI loc' ofs' s'))
1530   where (c,s') = nextChar s
1531         loc'   = advanceSrcLoc loc c
1532         ofs'   = advanceOffs c ofs
1533
1534         non_graphic     = '\x0'
1535         upper           = '\x1'
1536         lower           = '\x2'
1537         digit           = '\x3'
1538         symbol          = '\x4'
1539         space           = '\x5'
1540         other_graphic   = '\x6'
1541
1542         adj_c
1543           | c <= '\x06' = non_graphic
1544           | c <= '\x7f' = c
1545           -- Alex doesn't handle Unicode, so when Unicode
1546           -- character is encoutered we output these values
1547           -- with the actual character value hidden in the state.
1548           | otherwise =
1549                 case generalCategory c of
1550                   UppercaseLetter       -> upper
1551                   LowercaseLetter       -> lower
1552                   TitlecaseLetter       -> upper
1553                   ModifierLetter        -> other_graphic
1554                   OtherLetter           -> lower -- see #1103
1555                   NonSpacingMark        -> other_graphic
1556                   SpacingCombiningMark  -> other_graphic
1557                   EnclosingMark         -> other_graphic
1558                   DecimalNumber         -> digit
1559                   LetterNumber          -> other_graphic
1560                   OtherNumber           -> other_graphic
1561                   ConnectorPunctuation  -> symbol
1562                   DashPunctuation       -> symbol
1563                   OpenPunctuation       -> other_graphic
1564                   ClosePunctuation      -> other_graphic
1565                   InitialQuote          -> other_graphic
1566                   FinalQuote            -> other_graphic
1567                   OtherPunctuation      -> symbol
1568                   MathSymbol            -> symbol
1569                   CurrencySymbol        -> symbol
1570                   ModifierSymbol        -> symbol
1571                   OtherSymbol           -> symbol
1572                   Space                 -> space
1573                   _other                -> non_graphic
1574
1575 -- This version does not squash unicode characters, it is used when
1576 -- lexing strings.
1577 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1578 alexGetChar' (AI loc ofs s)
1579   | atEnd s   = Nothing
1580   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1581                 --trace (show (ord c)) $
1582                 Just (c, (AI loc' ofs' s'))
1583   where (c,s') = nextChar s
1584         loc'   = advanceSrcLoc loc c
1585         ofs'   = advanceOffs c ofs
1586
1587 advanceOffs :: Char -> Int -> Int
1588 advanceOffs '\n' _    = 0
1589 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1590 advanceOffs _    offs = offs + 1
1591
1592 getInput :: P AlexInput
1593 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1594
1595 setInput :: AlexInput -> P ()
1596 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1597
1598 pushLexState :: Int -> P ()
1599 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1600
1601 popLexState :: P Int
1602 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1603
1604 getLexState :: P Int
1605 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1606
1607 -- for reasons of efficiency, flags indicating language extensions (eg,
1608 -- -fglasgow-exts or -XParr) are represented by a bitmap stored in an unboxed
1609 -- integer
1610
1611 genericsBit, ffiBit, parrBit :: Int
1612 genericsBit = 0 -- {| and |}
1613 ffiBit     = 1
1614 parrBit    = 2
1615 arrowsBit  = 4
1616 thBit      = 5
1617 ipBit      = 6
1618 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1619 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1620                 -- (doesn't affect the lexer)
1621 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1622 haddockBit = 10 -- Lex and parse Haddock comments
1623 magicHashBit = 11 -- "#" in both functions and operators
1624 kindSigsBit = 12 -- Kind signatures on type variables
1625 recursiveDoBit = 13 -- mdo
1626 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1627 unboxedTuplesBit = 15 -- (# and #)
1628 standaloneDerivingBit = 16 -- standalone instance deriving declarations
1629 transformComprehensionsBit = 17
1630 qqBit      = 18 -- enable quasiquoting
1631 inRulePragBit = 19
1632 rawTokenStreamBit = 20 -- producing a token stream with all comments included
1633 newQualOpsBit = 21 -- Haskell' qualified operator syntax, e.g. Prelude.(+)
1634
1635 genericsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1636 always           _     = True
1637 genericsEnabled  flags = testBit flags genericsBit
1638 ffiEnabled       flags = testBit flags ffiBit
1639 parrEnabled      flags = testBit flags parrBit
1640 arrowsEnabled    flags = testBit flags arrowsBit
1641 thEnabled        flags = testBit flags thBit
1642 ipEnabled        flags = testBit flags ipBit
1643 explicitForallEnabled flags = testBit flags explicitForallBit
1644 bangPatEnabled   flags = testBit flags bangPatBit
1645 tyFamEnabled     flags = testBit flags tyFamBit
1646 haddockEnabled   flags = testBit flags haddockBit
1647 magicHashEnabled flags = testBit flags magicHashBit
1648 kindSigsEnabled  flags = testBit flags kindSigsBit
1649 recursiveDoEnabled flags = testBit flags recursiveDoBit
1650 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1651 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1652 standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
1653 transformComprehensionsEnabled flags = testBit flags transformComprehensionsBit
1654 qqEnabled        flags = testBit flags qqBit
1655 inRulePrag       flags = testBit flags inRulePragBit
1656 rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
1657 newQualOps       flags = testBit flags newQualOpsBit
1658 oldQualOps flags = not (newQualOps flags)
1659
1660 -- PState for parsing options pragmas
1661 --
1662 pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1663 pragState dynflags buf loc =
1664   PState {
1665       buffer        = buf,
1666       messages      = emptyMessages,
1667       dflags        = dynflags,
1668       last_loc      = mkSrcSpan loc loc,
1669       last_offs     = 0,
1670       last_len      = 0,
1671       last_line_len = 0,
1672       loc           = loc,
1673       extsBitmap    = 0,
1674       context       = [],
1675       lex_state     = [bol, option_prags, 0]
1676     }
1677
1678
1679 -- create a parse state
1680 --
1681 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1682 mkPState buf loc flags  =
1683   PState {
1684       buffer          = buf,
1685       dflags        = flags,
1686       messages      = emptyMessages,
1687       last_loc      = mkSrcSpan loc loc,
1688       last_offs     = 0,
1689       last_len      = 0,
1690       last_line_len = 0,
1691       loc           = loc,
1692       extsBitmap    = fromIntegral bitmap,
1693       context       = [],
1694       lex_state     = [bol, 0]
1695         -- we begin in the layout state if toplev_layout is set
1696     }
1697     where
1698       bitmap = genericsBit `setBitIf` dopt Opt_Generics flags
1699                .|. ffiBit       `setBitIf` dopt Opt_ForeignFunctionInterface flags
1700                .|. parrBit      `setBitIf` dopt Opt_PArr         flags
1701                .|. arrowsBit    `setBitIf` dopt Opt_Arrows       flags
1702                .|. thBit        `setBitIf` dopt Opt_TemplateHaskell flags
1703                .|. qqBit        `setBitIf` dopt Opt_QuasiQuotes flags
1704                .|. ipBit        `setBitIf` dopt Opt_ImplicitParams flags
1705                .|. explicitForallBit `setBitIf` dopt Opt_ScopedTypeVariables flags
1706                .|. explicitForallBit `setBitIf` dopt Opt_LiberalTypeSynonyms flags
1707                .|. explicitForallBit `setBitIf` dopt Opt_PolymorphicComponents flags
1708                .|. explicitForallBit `setBitIf` dopt Opt_ExistentialQuantification flags
1709                .|. explicitForallBit `setBitIf` dopt Opt_Rank2Types flags
1710                .|. explicitForallBit `setBitIf` dopt Opt_RankNTypes flags
1711                .|. bangPatBit   `setBitIf` dopt Opt_BangPatterns flags
1712                .|. tyFamBit     `setBitIf` dopt Opt_TypeFamilies flags
1713                .|. haddockBit   `setBitIf` dopt Opt_Haddock      flags
1714                .|. magicHashBit `setBitIf` dopt Opt_MagicHash    flags
1715                .|. kindSigsBit  `setBitIf` dopt Opt_KindSignatures flags
1716                .|. recursiveDoBit `setBitIf` dopt Opt_RecursiveDo flags
1717                .|. unicodeSyntaxBit `setBitIf` dopt Opt_UnicodeSyntax flags
1718                .|. unboxedTuplesBit `setBitIf` dopt Opt_UnboxedTuples flags
1719                .|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags
1720                .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags
1721                .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
1722                .|. newQualOpsBit `setBitIf` dopt Opt_NewQualifiedOperators flags
1723       --
1724       setBitIf :: Int -> Bool -> Int
1725       b `setBitIf` cond | cond      = bit b
1726                         | otherwise = 0
1727
1728 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1729 addWarning option srcspan warning
1730  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1731        let warning' = mkWarnMsg srcspan alwaysQualify warning
1732            ws' = if dopt option d then ws `snocBag` warning' else ws
1733        in POk s{messages=(ws', es)} ()
1734
1735 getMessages :: PState -> Messages
1736 getMessages PState{messages=ms} = ms
1737
1738 getContext :: P [LayoutContext]
1739 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1740
1741 setContext :: [LayoutContext] -> P ()
1742 setContext ctx = P $ \s -> POk s{context=ctx} ()
1743
1744 popContext :: P ()
1745 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1746                               last_len = len, last_loc = last_loc }) ->
1747   case ctx of
1748         (_:tl) -> POk s{ context = tl } ()
1749         []     -> PFailed last_loc (srcParseErr buf len)
1750
1751 -- Push a new layout context at the indentation of the last token read.
1752 -- This is only used at the outer level of a module when the 'module'
1753 -- keyword is missing.
1754 pushCurrentContext :: P ()
1755 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1756     POk s{context = Layout (offs-len) : ctx} ()
1757 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1758
1759 getOffside :: P Ordering
1760 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1761                 let ord = case stk of
1762                         (Layout n:_) -> compare offs n
1763                         _            -> GT
1764                 in POk s ord
1765
1766 -- ---------------------------------------------------------------------------
1767 -- Construct a parse error
1768
1769 srcParseErr
1770   :: StringBuffer       -- current buffer (placed just after the last token)
1771   -> Int                -- length of the previous token
1772   -> Message
1773 srcParseErr buf len
1774   = hcat [ if null token
1775              then ptext (sLit "parse error (possibly incorrect indentation)")
1776              else hcat [ptext (sLit "parse error on input "),
1777                         char '`', text token, char '\'']
1778     ]
1779   where token = lexemeToString (offsetBytes (-len) buf) len
1780
1781 -- Report a parse failure, giving the span of the previous token as
1782 -- the location of the error.  This is the entry point for errors
1783 -- detected during parsing.
1784 srcParseFail :: P a
1785 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1786                             last_loc = last_loc } ->
1787     PFailed last_loc (srcParseErr buf len)
1788
1789 -- A lexical error is reported at a particular position in the source file,
1790 -- not over a token range.
1791 lexError :: String -> P a
1792 lexError str = do
1793   loc <- getSrcLoc
1794   (AI end _ buf) <- getInput
1795   reportLexError loc end buf str
1796
1797 -- -----------------------------------------------------------------------------
1798 -- This is the top-level function: called from the parser each time a
1799 -- new token is to be read from the input.
1800
1801 lexer :: (Located Token -> P a) -> P a
1802 lexer cont = do
1803   tok@(L _span _tok__) <- lexToken
1804 --  trace ("token: " ++ show tok__) $ do
1805   cont tok
1806
1807 lexToken :: P (Located Token)
1808 lexToken = do
1809   inp@(AI loc1 _ buf) <- getInput
1810   sc <- getLexState
1811   exts <- getExts
1812   case alexScanUser exts inp sc of
1813     AlexEOF -> do
1814         let span = mkSrcSpan loc1 loc1
1815         setLastToken span 0 0
1816         return (L span ITeof)
1817     AlexError (AI loc2 _ buf) ->
1818         reportLexError loc1 loc2 buf "lexical error"
1819     AlexSkip inp2 _ -> do
1820         setInput inp2
1821         lexToken
1822     AlexToken inp2@(AI end _ buf2) _ t -> do
1823         setInput inp2
1824         let span = mkSrcSpan loc1 end
1825         let bytes = byteDiff buf buf2
1826         span `seq` setLastToken span bytes bytes
1827         t span buf bytes
1828
1829 reportLexError loc1 loc2 buf str
1830   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1831   | otherwise =
1832   let
1833         c = fst (nextChar buf)
1834   in
1835   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1836     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1837     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1838
1839 lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token]
1840 lexTokenStream buf loc dflags = unP go initState
1841     where initState = mkPState buf loc (dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream)
1842           go = do
1843             ltok <- lexer return
1844             case ltok of
1845               L _ ITeof -> return []
1846               _ -> liftM (ltok:) go
1847 }