ghc/compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2003
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), Token__(..), lexer, mkPState, showPFailed,
  26    P(..), ParseResult(..), setSrcLocFor, getSrcLoc,
  27    failLocMsgP, srcParseFail,
  28    popContext, pushCurrentContext,
  29   ) where
  30
  31 #include "HsVersions.h"
  32
  33 import ForeignCall      ( Safety(..) )
  34 import ErrUtils         ( Message )
  35 import Outputable
  36 import StringBuffer
  37 import FastString
  38 import FastTypes
  39 import SrcLoc
  40 import UniqFM
  41 import CmdLineOpts
  42 import Ctype
  43 import Util             ( maybePrefixMatch )
  44
  45 import DATA_BITS
  46 import Char
  47 import Ratio
  48 import TRACE
  49 }
  50
  51 $whitechar   = [\ \t\n\r\f\v\xa0]
  52 $white_no_nl = $whitechar # \n
  53
  54 $ascdigit  = 0-9
  55 $unidigit  = \x01
  56 $digit     = [$ascdigit $unidigit]
  57
  58 $special   = [\(\)\,\;\[\]\`\{\}]
  59 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  60 $unisymbol = \x02
  61 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  62
  63 $unilarge  = \x03
  64 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  65 $large     = [$asclarge $unilarge]
  66
  67 $unismall  = \x04
  68 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  69 $small     = [$ascsmall $unismall \_]
  70
  71 $graphic   = [$small $large $symbol $digit $special \:\"\']
  72
  73 $octit     = 0-7
  74 $hexit     = [$digit A-F a-f]
  75 $symchar   = [$symbol \:]
  76 $nl        = [\n\r]
  77 $idchar    = [$small $large $digit \']
  78
  79 @varid     = $small $idchar*
  80 @conid     = $large $idchar*
  81
  82 @varsym    = $symbol $symchar*
  83 @consym    = \: $symchar*
  84
  85 @decimal     = $digit+
  86 @octal       = $octit+
  87 @hexadecimal = $hexit+
  88 @exponent    = [eE] [\-\+]? @decimal
  89
  90 -- we support the hierarchical module name extension:
  91 @qual = (@conid \.)+
  92
  93 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
  94
  95 haskell :-
  96
  97 -- everywhere: skip whitespace and comments
  98 $white_no_nl+                           ;
  99
 100 -- Everywhere: deal with nested comments.  We explicitly rule out
 101 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 102 -- (this can happen even though pragmas will normally take precedence due to
 103 -- longest-match, because pragmas aren't valid in every state, but comments
 104 -- are).
 105 "{-" / { notFollowedBy '#' }            { nested_comment }
 106
 107 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 108 -- more dashes followed by a symbol should be parsed as a varsym, so we
 109 -- have to exclude those.
 110 -- The regex says: "munch all the characters after the dashes, as long as
 111 -- the first one is not a symbol".
 112 "--"\-* ([^$symbol] .*)?                ;
 113
 114 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 115 -- blank lines) until we find a non-whitespace character, then do layout
 116 -- processing.
 117 --
 118 -- One slight wibble here: what if the line begins with {-#? In
 119 -- theory, we have to lex the pragma to see if it's one we recognise,
 120 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 121 -- as a nested comment.  We don't bother with this: if the line begins
 122 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 123 <bol> {
 124   \n                                    ;
 125   ^\# (line)?                           { begin line_prag1 }
 126   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 127   ()                                    { do_bol }
 128 }
 129
 130 -- after a layout keyword (let, where, do, of), we begin a new layout
 131 -- context if the curly brace is missing.
 132 -- Careful! This stuff is quite delicate.
 133 <layout, layout_do> {
 134   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 135         -- we might encounter {-# here, but {- has been handled already
 136   \n                                    ;
 137   ^\# (line)?                           { begin line_prag1 }
 138 }
 139
 140 -- do is treated in a subtly different way, see new_layout_context
 141 <layout>    ()                          { new_layout_context True }
 142 <layout_do> ()                          { new_layout_context False }
 143
 144 -- after a new layout context which was found to be to the left of the
 145 -- previous context, we have generated a '{' token, and we now need to
 146 -- generate a matching '}' token.
 147 <layout_left>  ()                       { do_layout_left }
 148
 149 <0,glaexts> \n                          { begin bol }
 150
 151 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 152
 153 -- single-line line pragmas, of the form
 154 --    # <line> "<file>" <extra-stuff> \n
 155 <line_prag1> $digit+                    { set_line line_prag1a }
 156 <line_prag1a> \" [$graphic \ ]* \"      { set_file line_prag1b }
 157 <line_prag1b> .*                        { pop }
 158
 159 -- Haskell-style line pragmas, of the form
 160 --    {-# LINE <line> "<file>" #-}
 161 <line_prag2> $digit+                    { set_line line_prag2a }
 162 <line_prag2a> \" [$graphic \ ]* \"      { set_file line_prag2b }
 163 <line_prag2b> "#-}"                     { pop }
 164
 165 <0,glaexts> {
 166   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 167                                         { token ITspecialise_prag }
 168   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 169   "{-#" $whitechar* (INLINE|inline)     { token ITinline_prag }
 170   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 171                                         { token ITnoinline_prag }
 172   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 173   "{-#" $whitechar* (DEPRECATED|deprecated)
 174                                         { token ITdeprecated_prag }
 175   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 176   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 177   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 178
 179   "{-#"                                 { nested_comment }
 180
 181   -- ToDo: should only be valid inside a pragma:
 182   "#-}"                                 { token ITclose_prag}
 183 }
 184
 185
 186 -- '0' state: ordinary lexemes
 187 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 188
 189 -- "special" symbols
 190
 191 <0,glaexts> {
 192   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 193   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 194 }
 195
 196 <0,glaexts> {
 197   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 198   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 199   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 200   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 201   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 202   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 203   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 204   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 205 }
 206
 207 <0,glaexts> {
 208   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 209                                         { special IToparenbar }
 210   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 211 }
 212
 213 <0,glaexts> {
 214   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 215   \% @varid / { ifExtension ipEnabled } { skip_one_varid ITsplitipvarid }
 216 }
 217
 218 <glaexts> {
 219   "(#" / { notFollowedBySymbol }        { token IToubxparen }
 220   "#)"                                  { token ITcubxparen }
 221   "{|"                                  { token ITocurlybar }
 222   "|}"                                  { token ITccurlybar }
 223 }
 224
 225 <0,glaexts> {
 226   \(                                    { special IToparen }
 227   \)                                    { special ITcparen }
 228   \[                                    { special ITobrack }
 229   \]                                    { special ITcbrack }
 230   \,                                    { special ITcomma }
 231   \;                                    { special ITsemi }
 232   \`                                    { special ITbackquote }
 233
 234   \{                                    { open_brace }
 235   \}                                    { close_brace }
 236 }
 237
 238 <0,glaexts> {
 239   @qual @varid                  { check_qvarid }
 240   @qual @conid                  { idtoken qconid }
 241   @varid                        { varid }
 242   @conid                        { idtoken conid }
 243 }
 244
 245 -- after an illegal qvarid, such as 'M.let',
 246 -- we back up and try again in the bad_qvarid state:
 247 <bad_qvarid> {
 248   @conid                        { pop_and (idtoken conid) }
 249   @qual @conid                  { pop_and (idtoken qconid) }
 250 }
 251
 252 <glaexts> {
 253   @qual @varid "#"+             { idtoken qvarid }
 254   @qual @conid "#"+             { idtoken qconid }
 255   @varid "#"+                   { varid }
 256   @conid "#"+                   { idtoken conid }
 257 }
 258
 259 -- ToDo: M.(,,,)
 260
 261 <0,glaexts> {
 262   @qual @varsym                 { idtoken qvarsym }
 263   @qual @consym                 { idtoken qconsym }
 264   @varsym                       { varsym }
 265   @consym                       { consym }
 266 }
 267
 268 <0,glaexts> {
 269   @decimal                      { tok_decimal }
 270   0[oO] @octal                  { tok_octal }
 271   0[xX] @hexadecimal            { tok_hexadecimal }
 272 }
 273
 274 <glaexts> {
 275   @decimal \#                   { prim_decimal }
 276   0[oO] @octal \#               { prim_octal }
 277   0[xX] @hexadecimal \#         { prim_hexadecimal }
 278 }
 279
 280 <0,glaexts> @floating_point             { strtoken tok_float }
 281 <glaexts>   @floating_point \#          { init_strtoken 1 prim_float }
 282 <glaexts>   @floating_point \# \#       { init_strtoken 2 prim_double }
 283
 284 -- Strings and chars are lexed by hand-written code.  The reason is
 285 -- that even if we recognise the string or char here in the regex
 286 -- lexer, we would still have to parse the string afterward in order
 287 -- to convert it to a String.
 288 <0,glaexts> {
 289   \'                            { lex_char_tok }
 290   \"                            { lex_string_tok }
 291 }
 292
 293 {
 294 -- work around bug in Alex 2.0
 295 #if __GLASGOW_HASKELL__ < 503
 296 unsafeAt arr i = arr ! i
 297 #endif
 298
 299 -- -----------------------------------------------------------------------------
 300 -- The token type
 301
 302 data Token = T SrcLoc{-start-} SrcLoc{-end-} Token__
 303
 304 data Token__
 305   = ITas                        -- Haskell keywords
 306   | ITcase
 307   | ITclass
 308   | ITdata
 309   | ITdefault
 310   | ITderiving
 311   | ITdo
 312   | ITelse
 313   | IThiding
 314   | ITif
 315   | ITimport
 316   | ITin
 317   | ITinfix
 318   | ITinfixl
 319   | ITinfixr
 320   | ITinstance
 321   | ITlet
 322   | ITmodule
 323   | ITnewtype
 324   | ITof
 325   | ITqualified
 326   | ITthen
 327   | ITtype
 328   | ITwhere
 329   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 330
 331   | ITforall                    -- GHC extension keywords
 332   | ITforeign
 333   | ITexport
 334   | ITlabel
 335   | ITdynamic
 336   | ITsafe
 337   | ITthreadsafe
 338   | ITunsafe
 339   | ITstdcallconv
 340   | ITccallconv
 341   | ITdotnet
 342   | ITmdo
 343
 344   | ITspecialise_prag           -- Pragmas
 345   | ITsource_prag
 346   | ITinline_prag
 347   | ITnoinline_prag
 348   | ITrules_prag
 349   | ITdeprecated_prag
 350   | ITline_prag
 351   | ITscc_prag
 352   | ITcore_prag                 -- hdaume: core annotations
 353   | ITunpack_prag
 354   | ITclose_prag
 355
 356   | ITdotdot                    -- reserved symbols
 357   | ITcolon
 358   | ITdcolon
 359   | ITequal
 360   | ITlam
 361   | ITvbar
 362   | ITlarrow
 363   | ITrarrow
 364   | ITat
 365   | ITtilde
 366   | ITdarrow
 367   | ITminus
 368   | ITbang
 369   | ITstar
 370   | ITdot
 371
 372   | ITbiglam                    -- GHC-extension symbols
 373
 374   | ITocurly                    -- special symbols
 375   | ITccurly
 376   | ITocurlybar                 -- {|, for type applications
 377   | ITccurlybar                 -- |}, for type applications
 378   | ITvocurly
 379   | ITvccurly
 380   | ITobrack
 381   | ITopabrack                  -- [:, for parallel arrays with -fparr
 382   | ITcpabrack                  -- :], for parallel arrays with -fparr
 383   | ITcbrack
 384   | IToparen
 385   | ITcparen
 386   | IToubxparen
 387   | ITcubxparen
 388   | ITsemi
 389   | ITcomma
 390   | ITunderscore
 391   | ITbackquote
 392
 393   | ITvarid   FastString        -- identifiers
 394   | ITconid   FastString
 395   | ITvarsym  FastString
 396   | ITconsym  FastString
 397   | ITqvarid  (FastString,FastString)
 398   | ITqconid  (FastString,FastString)
 399   | ITqvarsym (FastString,FastString)
 400   | ITqconsym (FastString,FastString)
 401
 402   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 403   | ITsplitipvarid FastString   -- GHC extension: implicit param: %x
 404
 405   | ITpragma StringBuffer
 406
 407   | ITchar       Char
 408   | ITstring     FastString
 409   | ITinteger    Integer
 410   | ITrational   Rational
 411
 412   | ITprimchar   Char
 413   | ITprimstring FastString
 414   | ITprimint    Integer
 415   | ITprimfloat  Rational
 416   | ITprimdouble Rational
 417
 418   -- MetaHaskell extension tokens
 419   | ITopenExpQuote              -- [| or [e|
 420   | ITopenPatQuote              -- [p|
 421   | ITopenDecQuote              -- [d|
 422   | ITopenTypQuote              -- [t|
 423   | ITcloseQuote                -- |]
 424   | ITidEscape   FastString     -- $x
 425   | ITparenEscape               -- $(
 426   | ITvarQuote                  -- '
 427   | ITtyQuote                   -- ''
 428
 429   -- Arrow notation extension
 430   | ITproc
 431   | ITrec
 432   | IToparenbar                 -- (|
 433   | ITcparenbar                 -- |)
 434   | ITlarrowtail                -- -<
 435   | ITrarrowtail                -- >-
 436   | ITLarrowtail                -- -<<
 437   | ITRarrowtail                -- >>-
 438
 439   | ITunknown String            -- Used when the lexer can't make sense of it
 440   | ITeof                       -- end of file token
 441 #ifdef DEBUG
 442   deriving Show -- debugging
 443 #endif
 444
 445 isSpecial :: Token__ -> Bool
 446 -- If we see M.x, where x is a keyword, but
 447 -- is special, we treat is as just plain M.x,
 448 -- not as a keyword.
 449 isSpecial ITas          = True
 450 isSpecial IThiding      = True
 451 isSpecial ITqualified   = True
 452 isSpecial ITforall      = True
 453 isSpecial ITexport      = True
 454 isSpecial ITlabel       = True
 455 isSpecial ITdynamic     = True
 456 isSpecial ITsafe        = True
 457 isSpecial ITthreadsafe  = True
 458 isSpecial ITunsafe      = True
 459 isSpecial ITccallconv   = True
 460 isSpecial ITstdcallconv = True
 461 isSpecial ITmdo         = True
 462 isSpecial _             = False
 463
 464 -- the bitmap provided as the third component indicates whether the
 465 -- corresponding extension keyword is valid under the extension options
 466 -- provided to the compiler; if the extension corresponding to *any* of the
 467 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 468 -- facilitates using a keyword in two different extensions that can be
 469 -- activated independently)
 470 --
 471 reservedWordsFM = listToUFM $
 472         map (\(x, y, z) -> (mkFastString x, (y, z)))
 473        [( "_",          ITunderscore,   0 ),
 474         ( "as",         ITas,           0 ),
 475         ( "case",       ITcase,         0 ),
 476         ( "class",      ITclass,        0 ),
 477         ( "data",       ITdata,         0 ),
 478         ( "default",    ITdefault,      0 ),
 479         ( "deriving",   ITderiving,     0 ),
 480         ( "do",         ITdo,           0 ),
 481         ( "else",       ITelse,         0 ),
 482         ( "hiding",     IThiding,       0 ),
 483         ( "if",         ITif,           0 ),
 484         ( "import",     ITimport,       0 ),
 485         ( "in",         ITin,           0 ),
 486         ( "infix",      ITinfix,        0 ),
 487         ( "infixl",     ITinfixl,       0 ),
 488         ( "infixr",     ITinfixr,       0 ),
 489         ( "instance",   ITinstance,     0 ),
 490         ( "let",        ITlet,          0 ),
 491         ( "module",     ITmodule,       0 ),
 492         ( "newtype",    ITnewtype,      0 ),
 493         ( "of",         ITof,           0 ),
 494         ( "qualified",  ITqualified,    0 ),
 495         ( "then",       ITthen,         0 ),
 496         ( "type",       ITtype,         0 ),
 497         ( "where",      ITwhere,        0 ),
 498         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 499
 500         ( "forall",     ITforall,        bit glaExtsBit),
 501         ( "mdo",        ITmdo,           bit glaExtsBit),
 502
 503         ( "foreign",    ITforeign,       bit ffiBit),
 504         ( "export",     ITexport,        bit ffiBit),
 505         ( "label",      ITlabel,         bit ffiBit),
 506         ( "dynamic",    ITdynamic,       bit ffiBit),
 507         ( "safe",       ITsafe,          bit ffiBit),
 508         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 509         ( "unsafe",     ITunsafe,        bit ffiBit),
 510         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 511         ( "ccall",      ITccallconv,     bit ffiBit),
 512         ( "dotnet",     ITdotnet,        bit ffiBit),
 513
 514         ( "rec",        ITrec,           bit arrowsBit),
 515         ( "proc",       ITproc,          bit arrowsBit)
 516      ]
 517
 518 reservedSymsFM = listToUFM $
 519         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 520       [ ("..",  ITdotdot,       0)
 521        ,(":",   ITcolon,        0)      -- (:) is a reserved op,
 522                                                 -- meaning only list cons
 523        ,("::",  ITdcolon,       0)
 524        ,("=",   ITequal,        0)
 525        ,("\\",  ITlam,          0)
 526        ,("|",   ITvbar,         0)
 527        ,("<-",  ITlarrow,       0)
 528        ,("->",  ITrarrow,       0)
 529        ,("@",   ITat,           0)
 530        ,("~",   ITtilde,        0)
 531        ,("=>",  ITdarrow,       0)
 532        ,("-",   ITminus,        0)
 533        ,("!",   ITbang,         0)
 534
 535        ,("*",   ITstar,         bit glaExtsBit) -- For data T (a::*) = MkT
 536        ,(".",   ITdot,          bit glaExtsBit) -- For 'forall a . t'
 537
 538        ,("-<",  ITlarrowtail,   bit arrowsBit)
 539        ,(">-",  ITrarrowtail,   bit arrowsBit)
 540        ,("-<<", ITLarrowtail,   bit arrowsBit)
 541        ,(">>-", ITRarrowtail,   bit arrowsBit)
 542        ]
 543
 544 -- -----------------------------------------------------------------------------
 545 -- Lexer actions
 546
 547 type Action = SrcLoc -> SrcLoc -> StringBuffer -> Int -> P Token
 548
 549 special :: Token__ -> Action
 550 special tok loc end _buf len = return (T loc end tok)
 551
 552 token, layout_token :: Token__ -> Action
 553 token t loc end buf len = return (T loc end t)
 554 layout_token t loc end buf len = pushLexState layout >> return (T loc end t)
 555
 556 idtoken :: (StringBuffer -> Int -> Token__) -> Action
 557 idtoken f loc end buf len = return (T loc end $! (f buf len))
 558
 559 skip_one_varid :: (FastString -> Token__) -> Action
 560 skip_one_varid f loc end buf len
 561   = return (T loc end $! f (lexemeToFastString (stepOn buf) (len-1)))
 562
 563 strtoken :: (String -> Token__) -> Action
 564 strtoken f loc end buf len =
 565   return (T loc end $! (f $! lexemeToString buf len))
 566
 567 init_strtoken :: Int -> (String -> Token__) -> Action
 568 -- like strtoken, but drops the last N character(s)
 569 init_strtoken drop f loc end buf len =
 570   return (T loc end $! (f $! lexemeToString buf (len-drop)))
 571
 572 begin :: Int -> Action
 573 begin code _loc _end _str _len = do pushLexState code; lexToken
 574
 575 pop :: Action
 576 pop _loc _end _buf _len = do popLexState; lexToken
 577
 578 pop_and :: Action -> Action
 579 pop_and act loc end buf len = do popLexState; act loc end buf len
 580
 581 notFollowedBy char _ _ _ (_,buf) = atEnd buf || currentChar buf /= char
 582
 583 notFollowedBySymbol _ _ _ (_,buf)
 584   = atEnd buf || currentChar buf `notElem` "!#$%&*+./<=>?@\\^|-~"
 585
 586 ifExtension pred bits _ _ _ = pred bits
 587
 588 {-
 589   nested comments require traversing by hand, they can't be parsed
 590   using regular expressions.
 591 -}
 592 nested_comment :: Action
 593 nested_comment loc _end _str _len = do
 594   input <- getInput
 595   go 1 input
 596   where go 0 input = do setInput input; lexToken
 597         go n input = do
 598           case alexGetChar input of
 599             Nothing  -> err input
 600             Just (c,input) -> do
 601               case c of
 602                 '-' -> do
 603                   case alexGetChar input of
 604                     Nothing  -> err input
 605                     Just ('\125',input) -> go (n-1) input
 606                     Just (c,_)          -> go n input
 607                 '\123' -> do
 608                   case alexGetChar input of
 609                     Nothing  -> err input
 610                     Just ('-',input') -> go (n+1) input'
 611                     Just (c,input)    -> go n input
 612                 c -> go n input
 613
 614         err input = do failLocMsgP loc (fst input) "unterminated `{-'"
 615
 616 open_brace, close_brace :: Action
 617 open_brace  loc end _str _len = do
 618   ctx <- getContext
 619   setContext (NoLayout:ctx)
 620   return (T loc end ITocurly)
 621 close_brace loc end _str _len = do
 622   popContext
 623   return (T loc end ITccurly)
 624
 625 -- We have to be careful not to count M.<varid> as a qualified name
 626 -- when <varid> is a keyword.  We hack around this by catching
 627 -- the offending tokens afterward, and re-lexing in a different state.
 628 check_qvarid loc end buf len = do
 629   case lookupUFM reservedWordsFM var of
 630         Just (keyword,exts)
 631           | not (isSpecial keyword) ->
 632           if exts == 0
 633              then try_again
 634              else do
 635                 b <- extension (\i -> exts .&. i /= 0)
 636                 if b then try_again
 637                      else return token
 638         _other -> return token
 639   where
 640         (mod,var) = splitQualName buf len
 641         token     = T loc end (ITqvarid (mod,var))
 642
 643         try_again = do
 644                 setInput (loc,buf)
 645                 pushLexState bad_qvarid
 646                 lexToken
 647
 648 qvarid buf len = ITqvarid $! splitQualName buf len
 649 qconid buf len = ITqconid $! splitQualName buf len
 650
 651 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 652 -- takes a StringBuffer and a length, and returns the module name
 653 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 654 -- because of hierarchical module names.
 655 splitQualName orig_buf len = split orig_buf 0 0
 656   where
 657     split buf dot_off n
 658         | n == len                = done dot_off
 659         | lookAhead buf n == '.'  = split2 buf n (n+1)
 660         | otherwise               = split buf dot_off (n+1)
 661
 662     -- careful, we might get names like M....
 663     -- so, if the character after the dot is not upper-case, this is
 664     -- the end of the qualifier part.
 665     split2 buf dot_off n
 666         | isUpper (lookAhead buf n) = split buf dot_off (n+1)
 667         | otherwise                 = done dot_off
 668
 669     done dot_off =
 670         (lexemeToFastString orig_buf dot_off,
 671          lexemeToFastString (stepOnBy (dot_off+1) orig_buf) (len - dot_off -1))
 672
 673 varid loc end buf len =
 674   case lookupUFM reservedWordsFM fs of
 675         Just (keyword,0)    -> do
 676                 maybe_layout keyword
 677                 return (T loc end keyword)
 678         Just (keyword,exts) -> do
 679                 b <- extension (\i -> exts .&. i /= 0)
 680                 if b then do maybe_layout keyword
 681                              return (T loc end keyword)
 682                      else return (T loc end (ITvarid fs))
 683         _other -> return (T loc end (ITvarid fs))
 684   where
 685         fs = lexemeToFastString buf len
 686
 687 conid buf len = ITconid fs
 688   where fs = lexemeToFastString buf len
 689
 690 qvarsym buf len = ITqvarsym $! splitQualName buf len
 691 qconsym buf len = ITqconsym $! splitQualName buf len
 692
 693 varsym = sym ITvarsym
 694 consym = sym ITconsym
 695
 696 sym con loc end buf len =
 697   case lookupUFM reservedSymsFM fs of
 698         Just (keyword,0)    -> return (T loc end keyword)
 699         Just (keyword,exts) -> do
 700                 b <- extension (\i -> exts .&. i /= 0)
 701                 if b then return (T loc end keyword)
 702                      else return (T loc end $! con fs)
 703         _other -> return (T loc end $! con fs)
 704   where
 705         fs = lexemeToFastString buf len
 706
 707 tok_decimal loc end buf len
 708   = return (T loc end (ITinteger  $! parseInteger buf len 10 oct_or_dec))
 709
 710 tok_octal loc end buf len
 711   = return (T loc end (ITinteger  $! parseInteger (stepOnBy 2 buf) (len-2) 8 oct_or_dec))
 712
 713 tok_hexadecimal loc end buf len
 714   = return (T loc end (ITinteger  $! parseInteger (stepOnBy 2 buf) (len-2) 16 hex))
 715
 716 prim_decimal loc end buf len
 717   = return (T loc end (ITprimint  $! parseInteger buf (len-1) 10 oct_or_dec))
 718
 719 prim_octal loc end buf len
 720   = return (T loc end (ITprimint  $! parseInteger (stepOnBy 2 buf) (len-3) 8 oct_or_dec))
 721
 722 prim_hexadecimal loc end buf len
 723   = return (T loc end (ITprimint  $! parseInteger (stepOnBy 2 buf) (len-3) 16 hex))
 724
 725 tok_float        str = ITrational $! readRational__ str
 726 prim_float       str = ITprimfloat  $! readRational__ str
 727 prim_double      str = ITprimdouble $! readRational__ str
 728
 729 parseInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
 730 parseInteger buf len radix to_int
 731   = go 0 0
 732   where go i x | i == len  = x
 733                | otherwise = go (i+1) (x * radix + toInteger (to_int (lookAhead buf i)))
 734
 735 -- -----------------------------------------------------------------------------
 736 -- Layout processing
 737
 738 -- we're at the first token on a line, insert layout tokens if necessary
 739 do_bol :: Action
 740 do_bol loc end _str _len = do
 741         pos <- getOffside end
 742         case pos of
 743             LT -> do
 744                 --trace "layout: inserting '}'" $ do
 745                 popContext
 746                 -- do NOT pop the lex state, we might have a ';' to insert
 747                 return (T loc end ITvccurly)
 748             EQ -> do
 749                 --trace "layout: inserting ';'" $ do
 750                 popLexState
 751                 return (T loc end ITsemi)
 752             GT -> do
 753                 popLexState
 754                 lexToken
 755
 756 -- certain keywords put us in the "layout" state, where we might
 757 -- add an opening curly brace.
 758 maybe_layout ITdo       = pushLexState layout_do
 759 maybe_layout ITmdo      = pushLexState layout_do
 760 maybe_layout ITof       = pushLexState layout
 761 maybe_layout ITlet      = pushLexState layout
 762 maybe_layout ITwhere    = pushLexState layout
 763 maybe_layout ITrec      = pushLexState layout
 764 maybe_layout _          = return ()
 765
 766 -- Pushing a new implicit layout context.  If the indentation of the
 767 -- next token is not greater than the previous layout context, then
 768 -- Haskell 98 says that the new layout context should be empty; that is
 769 -- the lexer must generate {}.
 770 --
 771 -- We are slightly more lenient than this: when the new context is started
 772 -- by a 'do', then we allow the new context to be at the same indentation as
 773 -- the previous context.  This is what the 'strict' argument is for.
 774 --
 775 new_layout_context strict loc end _buf _len = do
 776     popLexState
 777     let offset = srcLocCol loc
 778     ctx <- getContext
 779     case ctx of
 780         Layout prev_off : _  |
 781            (strict     && prev_off >= offset  ||
 782             not strict && prev_off > offset) -> do
 783                 -- token is indented to the left of the previous context.
 784                 -- we must generate a {} sequence now.
 785                 pushLexState layout_left
 786                 return (T loc end ITvocurly)
 787         other -> do
 788                 setContext (Layout offset : ctx)
 789                 return (T loc end ITvocurly)
 790
 791 do_layout_left loc end _buf _len = do
 792     popLexState
 793     pushLexState bol  -- we must be at the start of a line
 794     return (T loc end ITvccurly)
 795
 796 -- -----------------------------------------------------------------------------
 797 -- LINE pragmas
 798
 799 set_line :: Int -> Action
 800 set_line code loc end buf len = do
 801   let line = parseInteger buf len 10 oct_or_dec
 802   setSrcLoc (mkSrcLoc (srcLocFile end) (fromIntegral line - 1) 0)
 803         -- subtract one: the line number refers to the *following* line
 804   popLexState
 805   pushLexState code
 806   lexToken
 807
 808 set_file :: Int -> Action
 809 set_file code loc end buf len = do
 810   let file = lexemeToFastString (stepOn buf) (len-2)
 811   setSrcLoc (mkSrcLoc file (srcLocLine end) (srcLocCol end))
 812   popLexState
 813   pushLexState code
 814   lexToken
 815
 816 -- -----------------------------------------------------------------------------
 817 -- Strings & Chars
 818
 819 -- This stuff is horrible.  I hates it.
 820
 821 lex_string_tok :: Action
 822 lex_string_tok loc end buf len = do
 823   tok <- lex_string ""
 824   end <- getSrcLoc
 825   return (T loc end tok)
 826
 827 lex_string :: String -> P Token__
 828 lex_string s = do
 829   i <- getInput
 830   case alexGetChar i of
 831     Nothing -> lit_error
 832
 833     Just ('"',i)  -> do
 834         setInput i
 835         glaexts <- extension glaExtsEnabled
 836         if glaexts
 837           then do
 838             i <- getInput
 839             case alexGetChar i of
 840               Just ('#',i) -> do
 841                    setInput i
 842                    if any (> '\xFF') s
 843                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
 844                     else let s' = mkFastStringNarrow (reverse s) in
 845                          -- always a narrow string/byte array
 846                          return (ITprimstring s')
 847               _other ->
 848                 return (ITstring (mkFastString (reverse s)))
 849           else
 850                 return (ITstring (mkFastString (reverse s)))
 851
 852     Just ('\\',i)
 853         | Just ('&',i) <- next -> do
 854                 setInput i; lex_string s
 855         | Just (c,i) <- next, is_space c -> do
 856                 setInput i; lex_stringgap s
 857         where next = alexGetChar i
 858
 859     Just _ -> do
 860         c <- lex_char
 861         lex_string (c:s)
 862
 863 lex_char :: P Char
 864 lex_char = do
 865   mc <- getCharOrFail
 866   case mc of
 867       '\\' -> lex_escape
 868       c | is_any c -> return c
 869       _other -> lit_error
 870
 871 lex_stringgap s = do
 872   c <- getCharOrFail
 873   case c of
 874     '\\' -> lex_string s
 875     c | is_space c -> lex_stringgap s
 876     _other -> lit_error
 877
 878
 879 lex_char_tok :: Action
 880 -- Here we are basically parsing character literals, such as 'x' or '\n'
 881 -- but, when Template Haskell is on, we additionally spot
 882 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
 883 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
 884 -- So we have to do two characters of lookahead: when we see 'x we need to
 885 -- see if there's a trailing quote
 886 lex_char_tok loc _end buf len = do      -- We've seen '
 887    i1 <- getInput       -- Look ahead to first character
 888    case alexGetChar i1 of
 889         Nothing -> lit_error
 890
 891         Just ('\'', i2@(end2,_)) -> do  -- We've seen ''
 892                   th_exts <- extension thEnabled
 893                   if th_exts then do
 894                         setInput i2
 895                         return (T loc end2 ITtyQuote)
 896                    else lit_error
 897
 898         Just ('\\', i2@(end2,_)) -> do  -- We've seen 'backslash
 899                   setInput i2
 900                   lit_ch <- lex_escape
 901                   mc <- getCharOrFail   -- Trailing quote
 902                   if mc == '\'' then finish_char_tok loc lit_ch
 903                                 else lit_error
 904
 905         Just (c, i2@(end2,_)) | not (is_any c) -> lit_error
 906                               | otherwise      ->
 907
 908                 -- We've seen 'x, where x is a valid character
 909                 --  (i.e. not newline etc) but not a quote or backslash
 910            case alexGetChar i2 of       -- Look ahead one more character
 911                 Nothing -> lit_error
 912                 Just ('\'', i3) -> do   -- We've seen 'x'
 913                         setInput i3
 914                         finish_char_tok loc c
 915                 _other -> do            -- We've seen 'x not followed by quote
 916                                         -- If TH is on, just parse the quote only
 917                         th_exts <- extension thEnabled
 918                         if th_exts then return (T loc (fst i1) ITvarQuote)
 919                                    else lit_error
 920
 921 finish_char_tok :: SrcLoc -> Char -> P Token
 922 finish_char_tok loc ch  -- We've already seen the closing quote
 923                         -- Just need to check for trailing #
 924   = do  glaexts <- extension glaExtsEnabled
 925         if glaexts then do
 926                 i@(end,_) <- getInput
 927                 case alexGetChar i of
 928                         Just ('#',i@(end,_)) -> do
 929                                 setInput i
 930                                 return (T loc end (ITprimchar ch))
 931                         _other ->
 932                                         return (T loc end (ITchar ch))
 933           else do end <- getSrcLoc
 934                   return (T loc end (ITchar ch))
 935
 936 lex_escape :: P Char
 937 lex_escape = do
 938   c <- getCharOrFail
 939   case c of
 940         'a'   -> return '\a'
 941         'b'   -> return '\b'
 942         'f'   -> return '\f'
 943         'n'   -> return '\n'
 944         'r'   -> return '\r'
 945         't'   -> return '\t'
 946         'v'   -> return '\v'
 947         '\\'  -> return '\\'
 948         '"'   -> return '\"'
 949         '\''  -> return '\''
 950         '^'   -> do c <- getCharOrFail
 951                     if c >= '@' && c <= '_'
 952                         then return (chr (ord c - ord '@'))
 953                         else lit_error
 954
 955         'x'   -> readNum is_hexdigit 16 hex
 956         'o'   -> readNum is_octdigit  8 oct_or_dec
 957         x | is_digit x -> readNum2 is_digit 10 oct_or_dec (oct_or_dec x)
 958
 959         c1 ->  do
 960            i <- getInput
 961            case alexGetChar i of
 962             Nothing -> lit_error
 963             Just (c2,i2) ->
 964               case alexGetChar i2 of
 965                 Nothing -> lit_error
 966                 Just (c3,i3) ->
 967                    let str = [c1,c2,c3] in
 968                    case [ (c,rest) | (p,c) <- silly_escape_chars,
 969                                      Just rest <- [maybePrefixMatch p str] ] of
 970                           (escape_char,[]):_ -> do
 971                                 setInput i3
 972                                 return escape_char
 973                           (escape_char,_:_):_ -> do
 974                                 setInput i2
 975                                 return escape_char
 976                           [] -> lit_error
 977
 978 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
 979 readNum is_digit base conv = do
 980   c <- getCharOrFail
 981   if is_digit c
 982         then readNum2 is_digit base conv (conv c)
 983         else lit_error
 984
 985 readNum2 is_digit base conv i = do
 986   input <- getInput
 987   read i input
 988   where read i input = do
 989           case alexGetChar input of
 990             Just (c,input') | is_digit c -> do
 991                 read (i*base + conv c) input'
 992             _other -> do
 993                 setInput input
 994                 if i >= 0 && i <= 0x10FFFF
 995                    then return (chr i)
 996                    else lit_error
 997
 998 is_hexdigit c
 999         =  is_digit c
1000         || (c >= 'a' && c <= 'f')
1001         || (c >= 'A' && c <= 'F')
1002
1003 hex c | is_digit c = ord c - ord '0'
1004       | otherwise  = ord (to_lower c) - ord 'a' + 10
1005
1006 oct_or_dec c = ord c - ord '0'
1007
1008 is_octdigit c = c >= '0' && c <= '7'
1009
1010 to_lower c
1011   | c >=  'A' && c <= 'Z' = chr (ord c - (ord 'A' - ord 'a'))
1012   | otherwise = c
1013
1014 silly_escape_chars = [
1015         ("NUL", '\NUL'),
1016         ("SOH", '\SOH'),
1017         ("STX", '\STX'),
1018         ("ETX", '\ETX'),
1019         ("EOT", '\EOT'),
1020         ("ENQ", '\ENQ'),
1021         ("ACK", '\ACK'),
1022         ("BEL", '\BEL'),
1023         ("BS", '\BS'),
1024         ("HT", '\HT'),
1025         ("LF", '\LF'),
1026         ("VT", '\VT'),
1027         ("FF", '\FF'),
1028         ("CR", '\CR'),
1029         ("SO", '\SO'),
1030         ("SI", '\SI'),
1031         ("DLE", '\DLE'),
1032         ("DC1", '\DC1'),
1033         ("DC2", '\DC2'),
1034         ("DC3", '\DC3'),
1035         ("DC4", '\DC4'),
1036         ("NAK", '\NAK'),
1037         ("SYN", '\SYN'),
1038         ("ETB", '\ETB'),
1039         ("CAN", '\CAN'),
1040         ("EM", '\EM'),
1041         ("SUB", '\SUB'),
1042         ("ESC", '\ESC'),
1043         ("FS", '\FS'),
1044         ("GS", '\GS'),
1045         ("RS", '\RS'),
1046         ("US", '\US'),
1047         ("SP", '\SP'),
1048         ("DEL", '\DEL')
1049         ]
1050
1051 lit_error = lexError "lexical error in string/character literal"
1052
1053 getCharOrFail :: P Char
1054 getCharOrFail =  do
1055   i <- getInput
1056   case alexGetChar i of
1057         Nothing -> lexError "unexpected end-of-file in string/character literal"
1058         Just (c,i)  -> do setInput i; return c
1059
1060 -- -----------------------------------------------------------------------------
1061 -- Floats
1062
1063 readRational :: ReadS Rational -- NB: doesn't handle leading "-"
1064 readRational r = do
1065      (n,d,s) <- readFix r
1066      (k,t)   <- readExp s
1067      return ((n%1)*10^^(k-d), t)
1068  where
1069      readFix r = do
1070         (ds,s)  <- lexDecDigits r
1071         (ds',t) <- lexDotDigits s
1072         return (read (ds++ds'), length ds', t)
1073
1074      readExp (e:s) | e `elem` "eE" = readExp' s
1075      readExp s                     = return (0,s)
1076
1077      readExp' ('+':s) = readDec s
1078      readExp' ('-':s) = do
1079                         (k,t) <- readDec s
1080                         return (-k,t)
1081      readExp' s       = readDec s
1082
1083      readDec s = do
1084         (ds,r) <- nonnull isDigit s
1085         return (foldl1 (\n d -> n * 10 + d) [ ord d - ord '0' | d <- ds ],
1086                 r)
1087
1088      lexDecDigits = nonnull isDigit
1089
1090      lexDotDigits ('.':s) = return (span isDigit s)
1091      lexDotDigits s       = return ("",s)
1092
1093      nonnull p s = do (cs@(_:_),t) <- return (span p s)
1094                       return (cs,t)
1095
1096 readRational__ :: String -> Rational -- NB: *does* handle a leading "-"
1097 readRational__ top_s
1098   = case top_s of
1099       '-' : xs -> - (read_me xs)
1100       xs       -> read_me xs
1101   where
1102     read_me s
1103       = case (do { (x,"") <- readRational s ; return x }) of
1104           [x] -> x
1105           []  -> error ("readRational__: no parse:"        ++ top_s)
1106           _   -> error ("readRational__: ambiguous parse:" ++ top_s)
1107
1108 -- -----------------------------------------------------------------------------
1109 -- The Parse Monad
1110
1111 data LayoutContext
1112   = NoLayout
1113   | Layout !Int
1114
1115 data ParseResult a
1116   = POk PState a
1117   | PFailed
1118         SrcLoc SrcLoc   -- The start and end of the text span related to
1119                         -- the error.  Might be used in environments which can
1120                         -- show this span, e.g. by highlighting it.
1121         Message         -- The error message
1122
1123 showPFailed loc1 loc2 err = hcat [ppr loc1, text ": ", err]
1124
1125 data PState = PState {
1126         buffer     :: StringBuffer,
1127         last_loc   :: SrcLoc,           -- pos of previous token
1128         last_len   :: !Int,             -- len of previous token
1129         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1130         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1131         context    :: [LayoutContext],
1132         lex_state  :: [Int]
1133      }
1134         -- last_loc and last_len are used when generating error messages,
1135         -- and in pushCurrentContext only.
1136
1137 newtype P a = P { unP :: PState -> ParseResult a }
1138
1139 instance Monad P where
1140   return = returnP
1141   (>>=) = thenP
1142   fail = failP
1143
1144 returnP :: a -> P a
1145 returnP a = P $ \s -> POk s a
1146
1147 thenP :: P a -> (a -> P b) -> P b
1148 (P m) `thenP` k = P $ \ s ->
1149         case m s of
1150                 POk s1 a          -> (unP (k a)) s1
1151                 PFailed l1 l2 err -> PFailed l1 l2 err
1152
1153 failP :: String -> P a
1154 failP msg = P $ \s -> PFailed (last_loc s) (loc s) (text msg)
1155
1156 failMsgP :: String -> P a
1157 failMsgP msg = P $ \s -> PFailed (last_loc s) (loc s) (text msg)
1158
1159 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1160 failLocMsgP loc1 loc2 str = P $ \s -> PFailed loc1 loc2 (text str)
1161
1162 extension :: (Int -> Bool) -> P Bool
1163 extension p = P $ \s -> POk s (p $! extsBitmap s)
1164
1165 getExts :: P Int
1166 getExts = P $ \s -> POk s (extsBitmap s)
1167
1168 setSrcLoc :: SrcLoc -> P ()
1169 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1170
1171 -- tmp, for supporting stuff in RdrHsSyn.  The scope better not include
1172 -- any calls to the lexer, because it assumes things about the SrcLoc.
1173 setSrcLocFor :: SrcLoc -> P a -> P a
1174 setSrcLocFor new_loc scope = P $ \s@PState{ loc = old_loc } ->
1175   case unP scope s{loc=new_loc} of
1176         PFailed l1 l2 msg -> PFailed l1 l2 msg
1177         POk _ r -> POk s r
1178
1179 getSrcLoc :: P SrcLoc
1180 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1181
1182 setLastToken :: SrcLoc -> Int -> P ()
1183 setLastToken loc len = P $ \s -> POk s{ last_loc=loc, last_len=len } ()
1184
1185 type AlexInput = (SrcLoc,StringBuffer)
1186
1187 alexInputPrevChar :: AlexInput -> Char
1188 alexInputPrevChar (_,s) = prevChar s '\n'
1189
1190 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1191 alexGetChar (loc,s)
1192   | atEnd s   = Nothing
1193   | otherwise = c `seq` loc' `seq` s' `seq` Just (c, (loc', s'))
1194   where c = currentChar s
1195         loc' = advanceSrcLoc loc c
1196         s'   = stepOn s
1197
1198 getInput :: P AlexInput
1199 getInput = P $ \s@PState{ loc=l, buffer=b } -> POk s (l,b)
1200
1201 setInput :: AlexInput -> P ()
1202 setInput (l,b) = P $ \s -> POk s{ loc=l, buffer=b } ()
1203
1204 pushLexState :: Int -> P ()
1205 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1206
1207 popLexState :: P Int
1208 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1209
1210 getLexState :: P Int
1211 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1212
1213 -- for reasons of efficiency, flags indicating language extensions (eg,
1214 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1215 -- integer
1216
1217 glaExtsBit, ffiBit, parrBit :: Int
1218 glaExtsBit = 0
1219 ffiBit     = 1
1220 parrBit    = 2
1221 arrowsBit  = 4
1222 thBit      = 5
1223 ipBit      = 6
1224
1225 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1226 glaExtsEnabled flags = testBit flags glaExtsBit
1227 ffiEnabled     flags = testBit flags ffiBit
1228 parrEnabled    flags = testBit flags parrBit
1229 arrowsEnabled  flags = testBit flags arrowsBit
1230 thEnabled      flags = testBit flags thBit
1231 ipEnabled      flags = testBit flags ipBit
1232
1233 -- create a parse state
1234 --
1235 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1236 mkPState buf loc flags  =
1237   PState {
1238       buffer     = buf,
1239       last_loc   = loc,
1240       last_len   = 0,
1241       loc        = loc,
1242       extsBitmap = fromIntegral bitmap,
1243       context    = [],
1244       lex_state  = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1245         -- we begin in the layout state if toplev_layout is set
1246     }
1247     where
1248       bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
1249                .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
1250                .|. parrBit    `setBitIf` dopt Opt_PArr        flags
1251                .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
1252                .|. thBit      `setBitIf` dopt Opt_TH          flags
1253                .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
1254       --
1255       setBitIf :: Int -> Bool -> Int
1256       b `setBitIf` cond | cond      = bit b
1257                         | otherwise = 0
1258
1259 getContext :: P [LayoutContext]
1260 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1261
1262 setContext :: [LayoutContext] -> P ()
1263 setContext ctx = P $ \s -> POk s{context=ctx} ()
1264
1265 popContext :: P ()
1266 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1267                            loc = loc, last_len = len, last_loc = last_loc }) ->
1268   case ctx of
1269         (_:tl) -> POk s{ context = tl } ()
1270         []     -> PFailed last_loc loc (srcParseErr buf len)
1271
1272 -- Push a new layout context at the indentation of the last token read.
1273 -- This is only used at the outer level of a module when the 'module'
1274 -- keyword is missing.
1275 pushCurrentContext :: P ()
1276 pushCurrentContext = P $ \ s@PState{ last_loc=loc, context=ctx } ->
1277   POk s{ context = Layout (srcLocCol loc) : ctx} ()
1278
1279 getOffside :: SrcLoc -> P Ordering
1280 getOffside loc = P $ \s@PState{context=stk} ->
1281                 let ord = case stk of
1282                         (Layout n:_) -> compare (srcLocCol loc) n
1283                         _            -> GT
1284                 in POk s ord
1285
1286 -- ---------------------------------------------------------------------------
1287 -- Construct a parse error
1288
1289 srcParseErr
1290   :: StringBuffer       -- current buffer (placed just after the last token)
1291   -> Int                -- length of the previous token
1292   -> Message
1293 srcParseErr buf len
1294   = hcat [ if null token
1295              then ptext SLIT("parse error (possibly incorrect indentation)")
1296              else hcat [ptext SLIT("parse error on input "),
1297                         char '`', text token, char '\'']
1298     ]
1299   where token = lexemeToString (stepOnBy (-len) buf) len
1300
1301 -- Report a parse failure, giving the span of the previous token as
1302 -- the location of the error.  This is the entry point for errors
1303 -- detected during parsing.
1304 srcParseFail :: P a
1305 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1306                                 last_loc = last_loc, loc = loc } ->
1307     PFailed last_loc loc (srcParseErr buf len)
1308
1309 -- A lexical error is reported at a particular position in the source file,
1310 -- not over a token range.  TODO: this is slightly wrong, because we record
1311 -- the error at the character position following the one which caused the
1312 -- error.  We should somehow back up by one character.
1313 lexError :: String -> P a
1314 lexError str = do
1315   loc <- getSrcLoc
1316   failLocMsgP loc loc str
1317
1318 -- -----------------------------------------------------------------------------
1319 -- This is the top-level function: called from the parser each time a
1320 -- new token is to be read from the input.
1321
1322 lexer :: (Token -> P a) -> P a
1323 lexer cont = do
1324   tok@(T _ _ tok__) <- lexToken
1325   --trace ("token: " ++ show tok__) $ do
1326   cont tok
1327
1328 lexToken :: P Token
1329 lexToken = do
1330   inp@(loc1,buf) <- getInput
1331   sc <- getLexState
1332   exts <- getExts
1333   case alexScanUser exts inp sc of
1334     AlexEOF -> do setLastToken loc1 0
1335                   return (T loc1 loc1 ITeof)
1336     AlexError (loc2,_) -> do failLocMsgP loc1 loc2 "lexical error"
1337     AlexSkip inp2 _ -> do
1338         setInput inp2
1339         lexToken
1340     AlexToken inp2@(end,buf2) len t -> do
1341         setInput inp2
1342         setLastToken loc1 len
1343         t loc1 end buf len
1344 }