ghc/compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2003
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), lexer, mkPState,
  26    P(..), ParseResult(..), getSrcLoc,
  27    failMsgP, failLocMsgP, failSpanMsgP, srcParseFail,
  28    popContext, pushCurrentContext,
  29   ) where
  30
  31 #include "HsVersions.h"
  32
  33 import ErrUtils         ( Message )
  34 import Outputable
  35 import StringBuffer
  36 import FastString
  37 import FastTypes
  38 import SrcLoc
  39 import UniqFM
  40 import CmdLineOpts
  41 import Ctype
  42 import Util             ( maybePrefixMatch )
  43
  44 import DATA_BITS
  45 import Char
  46 import Ratio
  47 --import TRACE
  48 }
  49
  50 $whitechar   = [\ \t\n\r\f\v\xa0]
  51 $white_no_nl = $whitechar # \n
  52
  53 $ascdigit  = 0-9
  54 $unidigit  = \x01
  55 $digit     = [$ascdigit $unidigit]
  56
  57 $special   = [\(\)\,\;\[\]\`\{\}]
  58 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  59 $unisymbol = \x02
  60 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  61
  62 $unilarge  = \x03
  63 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  64 $large     = [$asclarge $unilarge]
  65
  66 $unismall  = \x04
  67 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  68 $small     = [$ascsmall $unismall \_]
  69
  70 $graphic   = [$small $large $symbol $digit $special \:\"\']
  71
  72 $octit     = 0-7
  73 $hexit     = [$digit A-F a-f]
  74 $symchar   = [$symbol \:]
  75 $nl        = [\n\r]
  76 $idchar    = [$small $large $digit \']
  77
  78 @varid     = $small $idchar*
  79 @conid     = $large $idchar*
  80
  81 @varsym    = $symbol $symchar*
  82 @consym    = \: $symchar*
  83
  84 @decimal     = $digit+
  85 @octal       = $octit+
  86 @hexadecimal = $hexit+
  87 @exponent    = [eE] [\-\+]? @decimal
  88
  89 -- we support the hierarchical module name extension:
  90 @qual = (@conid \.)+
  91
  92 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
  93
  94 haskell :-
  95
  96 -- everywhere: skip whitespace and comments
  97 $white_no_nl+                           ;
  98
  99 -- Everywhere: deal with nested comments.  We explicitly rule out
 100 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 101 -- (this can happen even though pragmas will normally take precedence due to
 102 -- longest-match, because pragmas aren't valid in every state, but comments
 103 -- are).
 104 "{-" / { notFollowedBy '#' }            { nested_comment }
 105
 106 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 107 -- more dashes followed by a symbol should be parsed as a varsym, so we
 108 -- have to exclude those.
 109 -- The regex says: "munch all the characters after the dashes, as long as
 110 -- the first one is not a symbol".
 111 "--"\-* ([^$symbol] .*)?                ;
 112
 113 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 114 -- blank lines) until we find a non-whitespace character, then do layout
 115 -- processing.
 116 --
 117 -- One slight wibble here: what if the line begins with {-#? In
 118 -- theory, we have to lex the pragma to see if it's one we recognise,
 119 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 120 -- as a nested comment.  We don't bother with this: if the line begins
 121 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 122 <bol> {
 123   \n                                    ;
 124   ^\# (line)?                           { begin line_prag1 }
 125   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 126   ()                                    { do_bol }
 127 }
 128
 129 -- after a layout keyword (let, where, do, of), we begin a new layout
 130 -- context if the curly brace is missing.
 131 -- Careful! This stuff is quite delicate.
 132 <layout, layout_do> {
 133   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 134         -- we might encounter {-# here, but {- has been handled already
 135   \n                                    ;
 136   ^\# (line)?                           { begin line_prag1 }
 137 }
 138
 139 -- do is treated in a subtly different way, see new_layout_context
 140 <layout>    ()                          { new_layout_context True }
 141 <layout_do> ()                          { new_layout_context False }
 142
 143 -- after a new layout context which was found to be to the left of the
 144 -- previous context, we have generated a '{' token, and we now need to
 145 -- generate a matching '}' token.
 146 <layout_left>  ()                       { do_layout_left }
 147
 148 <0,glaexts> \n                          { begin bol }
 149
 150 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 151
 152 -- single-line line pragmas, of the form
 153 --    # <line> "<file>" <extra-stuff> \n
 154 <line_prag1> $digit+                    { set_line line_prag1a }
 155 <line_prag1a> \" [$graphic \ ]* \"      { set_file line_prag1b }
 156 <line_prag1b> .*                        { pop }
 157
 158 -- Haskell-style line pragmas, of the form
 159 --    {-# LINE <line> "<file>" #-}
 160 <line_prag2> $digit+                    { set_line line_prag2a }
 161 <line_prag2a> \" [$graphic \ ]* \"      { set_file line_prag2b }
 162 <line_prag2b> "#-}"|"-}"                { pop }
 163    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 164    -- with older versions of GHC which generated these.
 165
 166 <0,glaexts> {
 167   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 168                                         { token ITspecialise_prag }
 169   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 170   "{-#" $whitechar* (INLINE|inline)     { token ITinline_prag }
 171   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 172                                         { token ITnoinline_prag }
 173   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 174   "{-#" $whitechar* (DEPRECATED|deprecated)
 175                                         { token ITdeprecated_prag }
 176   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 177   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 178   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 179
 180   "{-#"                                 { nested_comment }
 181
 182   -- ToDo: should only be valid inside a pragma:
 183   "#-}"                                 { token ITclose_prag}
 184 }
 185
 186
 187 -- '0' state: ordinary lexemes
 188 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 189
 190 -- "special" symbols
 191
 192 <0,glaexts> {
 193   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 194   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 195 }
 196
 197 <0,glaexts> {
 198   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 199   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 200   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 201   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 202   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 203   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 204   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 205   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 206 }
 207
 208 <0,glaexts> {
 209   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 210                                         { special IToparenbar }
 211   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 212 }
 213
 214 <0,glaexts> {
 215   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 216   \% @varid / { ifExtension ipEnabled } { skip_one_varid ITsplitipvarid }
 217 }
 218
 219 <glaexts> {
 220   "(#" / { notFollowedBySymbol }        { token IToubxparen }
 221   "#)"                                  { token ITcubxparen }
 222   "{|"                                  { token ITocurlybar }
 223   "|}"                                  { token ITccurlybar }
 224 }
 225
 226 <0,glaexts> {
 227   \(                                    { special IToparen }
 228   \)                                    { special ITcparen }
 229   \[                                    { special ITobrack }
 230   \]                                    { special ITcbrack }
 231   \,                                    { special ITcomma }
 232   \;                                    { special ITsemi }
 233   \`                                    { special ITbackquote }
 234
 235   \{                                    { open_brace }
 236   \}                                    { close_brace }
 237 }
 238
 239 <0,glaexts> {
 240   @qual @varid                  { check_qvarid }
 241   @qual @conid                  { idtoken qconid }
 242   @varid                        { varid }
 243   @conid                        { idtoken conid }
 244 }
 245
 246 -- after an illegal qvarid, such as 'M.let',
 247 -- we back up and try again in the bad_qvarid state:
 248 <bad_qvarid> {
 249   @conid                        { pop_and (idtoken conid) }
 250   @qual @conid                  { pop_and (idtoken qconid) }
 251 }
 252
 253 <glaexts> {
 254   @qual @varid "#"+             { idtoken qvarid }
 255   @qual @conid "#"+             { idtoken qconid }
 256   @varid "#"+                   { varid }
 257   @conid "#"+                   { idtoken conid }
 258 }
 259
 260 -- ToDo: M.(,,,)
 261
 262 <0,glaexts> {
 263   @qual @varsym                 { idtoken qvarsym }
 264   @qual @consym                 { idtoken qconsym }
 265   @varsym                       { varsym }
 266   @consym                       { consym }
 267 }
 268
 269 <0,glaexts> {
 270   @decimal                      { tok_decimal }
 271   0[oO] @octal                  { tok_octal }
 272   0[xX] @hexadecimal            { tok_hexadecimal }
 273 }
 274
 275 <glaexts> {
 276   @decimal \#                   { prim_decimal }
 277   0[oO] @octal \#               { prim_octal }
 278   0[xX] @hexadecimal \#         { prim_hexadecimal }
 279 }
 280
 281 <0,glaexts> @floating_point             { strtoken tok_float }
 282 <glaexts>   @floating_point \#          { init_strtoken 1 prim_float }
 283 <glaexts>   @floating_point \# \#       { init_strtoken 2 prim_double }
 284
 285 -- Strings and chars are lexed by hand-written code.  The reason is
 286 -- that even if we recognise the string or char here in the regex
 287 -- lexer, we would still have to parse the string afterward in order
 288 -- to convert it to a String.
 289 <0,glaexts> {
 290   \'                            { lex_char_tok }
 291   \"                            { lex_string_tok }
 292 }
 293
 294 {
 295 -- work around bug in Alex 2.0
 296 #if __GLASGOW_HASKELL__ < 503
 297 unsafeAt arr i = arr ! i
 298 #endif
 299
 300 -- -----------------------------------------------------------------------------
 301 -- The token type
 302
 303 data Token
 304   = ITas                        -- Haskell keywords
 305   | ITcase
 306   | ITclass
 307   | ITdata
 308   | ITdefault
 309   | ITderiving
 310   | ITdo
 311   | ITelse
 312   | IThiding
 313   | ITif
 314   | ITimport
 315   | ITin
 316   | ITinfix
 317   | ITinfixl
 318   | ITinfixr
 319   | ITinstance
 320   | ITlet
 321   | ITmodule
 322   | ITnewtype
 323   | ITof
 324   | ITqualified
 325   | ITthen
 326   | ITtype
 327   | ITwhere
 328   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 329
 330   | ITforall                    -- GHC extension keywords
 331   | ITforeign
 332   | ITexport
 333   | ITlabel
 334   | ITdynamic
 335   | ITsafe
 336   | ITthreadsafe
 337   | ITunsafe
 338   | ITstdcallconv
 339   | ITccallconv
 340   | ITdotnet
 341   | ITmdo
 342
 343   | ITspecialise_prag           -- Pragmas
 344   | ITsource_prag
 345   | ITinline_prag
 346   | ITnoinline_prag
 347   | ITrules_prag
 348   | ITdeprecated_prag
 349   | ITline_prag
 350   | ITscc_prag
 351   | ITcore_prag                 -- hdaume: core annotations
 352   | ITunpack_prag
 353   | ITclose_prag
 354
 355   | ITdotdot                    -- reserved symbols
 356   | ITcolon
 357   | ITdcolon
 358   | ITequal
 359   | ITlam
 360   | ITvbar
 361   | ITlarrow
 362   | ITrarrow
 363   | ITat
 364   | ITtilde
 365   | ITdarrow
 366   | ITminus
 367   | ITbang
 368   | ITstar
 369   | ITdot
 370
 371   | ITbiglam                    -- GHC-extension symbols
 372
 373   | ITocurly                    -- special symbols
 374   | ITccurly
 375   | ITocurlybar                 -- {|, for type applications
 376   | ITccurlybar                 -- |}, for type applications
 377   | ITvocurly
 378   | ITvccurly
 379   | ITobrack
 380   | ITopabrack                  -- [:, for parallel arrays with -fparr
 381   | ITcpabrack                  -- :], for parallel arrays with -fparr
 382   | ITcbrack
 383   | IToparen
 384   | ITcparen
 385   | IToubxparen
 386   | ITcubxparen
 387   | ITsemi
 388   | ITcomma
 389   | ITunderscore
 390   | ITbackquote
 391
 392   | ITvarid   FastString        -- identifiers
 393   | ITconid   FastString
 394   | ITvarsym  FastString
 395   | ITconsym  FastString
 396   | ITqvarid  (FastString,FastString)
 397   | ITqconid  (FastString,FastString)
 398   | ITqvarsym (FastString,FastString)
 399   | ITqconsym (FastString,FastString)
 400
 401   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 402   | ITsplitipvarid FastString   -- GHC extension: implicit param: %x
 403
 404   | ITpragma StringBuffer
 405
 406   | ITchar       Char
 407   | ITstring     FastString
 408   | ITinteger    Integer
 409   | ITrational   Rational
 410
 411   | ITprimchar   Char
 412   | ITprimstring FastString
 413   | ITprimint    Integer
 414   | ITprimfloat  Rational
 415   | ITprimdouble Rational
 416
 417   -- MetaHaskell extension tokens
 418   | ITopenExpQuote              -- [| or [e|
 419   | ITopenPatQuote              -- [p|
 420   | ITopenDecQuote              -- [d|
 421   | ITopenTypQuote              -- [t|
 422   | ITcloseQuote                -- |]
 423   | ITidEscape   FastString     -- $x
 424   | ITparenEscape               -- $(
 425   | ITvarQuote                  -- '
 426   | ITtyQuote                   -- ''
 427
 428   -- Arrow notation extension
 429   | ITproc
 430   | ITrec
 431   | IToparenbar                 -- (|
 432   | ITcparenbar                 -- |)
 433   | ITlarrowtail                -- -<
 434   | ITrarrowtail                -- >-
 435   | ITLarrowtail                -- -<<
 436   | ITRarrowtail                -- >>-
 437
 438   | ITunknown String            -- Used when the lexer can't make sense of it
 439   | ITeof                       -- end of file token
 440 #ifdef DEBUG
 441   deriving Show -- debugging
 442 #endif
 443
 444 isSpecial :: Token -> Bool
 445 -- If we see M.x, where x is a keyword, but
 446 -- is special, we treat is as just plain M.x,
 447 -- not as a keyword.
 448 isSpecial ITas          = True
 449 isSpecial IThiding      = True
 450 isSpecial ITqualified   = True
 451 isSpecial ITforall      = True
 452 isSpecial ITexport      = True
 453 isSpecial ITlabel       = True
 454 isSpecial ITdynamic     = True
 455 isSpecial ITsafe        = True
 456 isSpecial ITthreadsafe  = True
 457 isSpecial ITunsafe      = True
 458 isSpecial ITccallconv   = True
 459 isSpecial ITstdcallconv = True
 460 isSpecial ITmdo         = True
 461 isSpecial _             = False
 462
 463 -- the bitmap provided as the third component indicates whether the
 464 -- corresponding extension keyword is valid under the extension options
 465 -- provided to the compiler; if the extension corresponding to *any* of the
 466 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 467 -- facilitates using a keyword in two different extensions that can be
 468 -- activated independently)
 469 --
 470 reservedWordsFM = listToUFM $
 471         map (\(x, y, z) -> (mkFastString x, (y, z)))
 472        [( "_",          ITunderscore,   0 ),
 473         ( "as",         ITas,           0 ),
 474         ( "case",       ITcase,         0 ),
 475         ( "class",      ITclass,        0 ),
 476         ( "data",       ITdata,         0 ),
 477         ( "default",    ITdefault,      0 ),
 478         ( "deriving",   ITderiving,     0 ),
 479         ( "do",         ITdo,           0 ),
 480         ( "else",       ITelse,         0 ),
 481         ( "hiding",     IThiding,       0 ),
 482         ( "if",         ITif,           0 ),
 483         ( "import",     ITimport,       0 ),
 484         ( "in",         ITin,           0 ),
 485         ( "infix",      ITinfix,        0 ),
 486         ( "infixl",     ITinfixl,       0 ),
 487         ( "infixr",     ITinfixr,       0 ),
 488         ( "instance",   ITinstance,     0 ),
 489         ( "let",        ITlet,          0 ),
 490         ( "module",     ITmodule,       0 ),
 491         ( "newtype",    ITnewtype,      0 ),
 492         ( "of",         ITof,           0 ),
 493         ( "qualified",  ITqualified,    0 ),
 494         ( "then",       ITthen,         0 ),
 495         ( "type",       ITtype,         0 ),
 496         ( "where",      ITwhere,        0 ),
 497         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 498
 499         ( "forall",     ITforall,        bit glaExtsBit),
 500         ( "mdo",        ITmdo,           bit glaExtsBit),
 501
 502         ( "foreign",    ITforeign,       bit ffiBit),
 503         ( "export",     ITexport,        bit ffiBit),
 504         ( "label",      ITlabel,         bit ffiBit),
 505         ( "dynamic",    ITdynamic,       bit ffiBit),
 506         ( "safe",       ITsafe,          bit ffiBit),
 507         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 508         ( "unsafe",     ITunsafe,        bit ffiBit),
 509         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 510         ( "ccall",      ITccallconv,     bit ffiBit),
 511         ( "dotnet",     ITdotnet,        bit ffiBit),
 512
 513         ( "rec",        ITrec,           bit arrowsBit),
 514         ( "proc",       ITproc,          bit arrowsBit)
 515      ]
 516
 517 reservedSymsFM = listToUFM $
 518         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 519       [ ("..",  ITdotdot,       0)
 520        ,(":",   ITcolon,        0)      -- (:) is a reserved op,
 521                                                 -- meaning only list cons
 522        ,("::",  ITdcolon,       0)
 523        ,("=",   ITequal,        0)
 524        ,("\\",  ITlam,          0)
 525        ,("|",   ITvbar,         0)
 526        ,("<-",  ITlarrow,       0)
 527        ,("->",  ITrarrow,       0)
 528        ,("@",   ITat,           0)
 529        ,("~",   ITtilde,        0)
 530        ,("=>",  ITdarrow,       0)
 531        ,("-",   ITminus,        0)
 532        ,("!",   ITbang,         0)
 533
 534        ,("*",   ITstar,         bit glaExtsBit) -- For data T (a::*) = MkT
 535        ,(".",   ITdot,          bit glaExtsBit) -- For 'forall a . t'
 536
 537        ,("-<",  ITlarrowtail,   bit arrowsBit)
 538        ,(">-",  ITrarrowtail,   bit arrowsBit)
 539        ,("-<<", ITLarrowtail,   bit arrowsBit)
 540        ,(">>-", ITRarrowtail,   bit arrowsBit)
 541        ]
 542
 543 -- -----------------------------------------------------------------------------
 544 -- Lexer actions
 545
 546 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 547
 548 special :: Token -> Action
 549 special tok span _buf len = return (L span tok)
 550
 551 token, layout_token :: Token -> Action
 552 token t span buf len = return (L span t)
 553 layout_token t span buf len = pushLexState layout >> return (L span t)
 554
 555 idtoken :: (StringBuffer -> Int -> Token) -> Action
 556 idtoken f span buf len = return (L span $! (f buf len))
 557
 558 skip_one_varid :: (FastString -> Token) -> Action
 559 skip_one_varid f span buf len
 560   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 561
 562 strtoken :: (String -> Token) -> Action
 563 strtoken f span buf len =
 564   return (L span $! (f $! lexemeToString buf len))
 565
 566 init_strtoken :: Int -> (String -> Token) -> Action
 567 -- like strtoken, but drops the last N character(s)
 568 init_strtoken drop f span buf len =
 569   return (L span $! (f $! lexemeToString buf (len-drop)))
 570
 571 begin :: Int -> Action
 572 begin code _span _str _len = do pushLexState code; lexToken
 573
 574 pop :: Action
 575 pop _span _buf _len = do popLexState; lexToken
 576
 577 pop_and :: Action -> Action
 578 pop_and act span buf len = do popLexState; act span buf len
 579
 580 notFollowedBy char _ _ _ (_,buf) = atEnd buf || currentChar buf /= char
 581
 582 notFollowedBySymbol _ _ _ (_,buf)
 583   = atEnd buf || currentChar buf `notElem` "!#$%&*+./<=>?@\\^|-~"
 584
 585 ifExtension pred bits _ _ _ = pred bits
 586
 587 {-
 588   nested comments require traversing by hand, they can't be parsed
 589   using regular expressions.
 590 -}
 591 nested_comment :: Action
 592 nested_comment span _str _len = do
 593   input <- getInput
 594   go 1 input
 595   where go 0 input = do setInput input; lexToken
 596         go n input = do
 597           case alexGetChar input of
 598             Nothing  -> err input
 599             Just (c,input) -> do
 600               case c of
 601                 '-' -> do
 602                   case alexGetChar input of
 603                     Nothing  -> err input
 604                     Just ('\125',input) -> go (n-1) input
 605                     Just (c,_)          -> go n input
 606                 '\123' -> do
 607                   case alexGetChar input of
 608                     Nothing  -> err input
 609                     Just ('-',input') -> go (n+1) input'
 610                     Just (c,input)    -> go n input
 611                 c -> go n input
 612
 613         err input = do failLocMsgP (srcSpanStart span) (fst input)
 614                         "unterminated `{-'"
 615
 616 open_brace, close_brace :: Action
 617 open_brace span _str _len = do
 618   ctx <- getContext
 619   setContext (NoLayout:ctx)
 620   return (L span ITocurly)
 621 close_brace span _str _len = do
 622   popContext
 623   return (L span ITccurly)
 624
 625 -- We have to be careful not to count M.<varid> as a qualified name
 626 -- when <varid> is a keyword.  We hack around this by catching
 627 -- the offending tokens afterward, and re-lexing in a different state.
 628 check_qvarid span buf len = do
 629   case lookupUFM reservedWordsFM var of
 630         Just (keyword,exts)
 631           | not (isSpecial keyword) ->
 632           if exts == 0
 633              then try_again
 634              else do
 635                 b <- extension (\i -> exts .&. i /= 0)
 636                 if b then try_again
 637                      else return token
 638         _other -> return token
 639   where
 640         (mod,var) = splitQualName buf len
 641         token     = L span (ITqvarid (mod,var))
 642
 643         try_again = do
 644                 setInput (srcSpanStart span,buf)
 645                 pushLexState bad_qvarid
 646                 lexToken
 647
 648 qvarid buf len = ITqvarid $! splitQualName buf len
 649 qconid buf len = ITqconid $! splitQualName buf len
 650
 651 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 652 -- takes a StringBuffer and a length, and returns the module name
 653 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 654 -- because of hierarchical module names.
 655 splitQualName orig_buf len = split orig_buf 0 0
 656   where
 657     split buf dot_off n
 658         | n == len                = done dot_off
 659         | lookAhead buf n == '.'  = split2 buf n (n+1)
 660         | otherwise               = split buf dot_off (n+1)
 661
 662     -- careful, we might get names like M....
 663     -- so, if the character after the dot is not upper-case, this is
 664     -- the end of the qualifier part.
 665     split2 buf dot_off n
 666         | isUpper (lookAhead buf n) = split buf dot_off (n+1)
 667         | otherwise                 = done dot_off
 668
 669     done dot_off =
 670         (lexemeToFastString orig_buf dot_off,
 671          lexemeToFastString (stepOnBy (dot_off+1) orig_buf) (len - dot_off -1))
 672
 673 varid span buf len =
 674   case lookupUFM reservedWordsFM fs of
 675         Just (keyword,0)    -> do
 676                 maybe_layout keyword
 677                 return (L span keyword)
 678         Just (keyword,exts) -> do
 679                 b <- extension (\i -> exts .&. i /= 0)
 680                 if b then do maybe_layout keyword
 681                              return (L span keyword)
 682                      else return (L span (ITvarid fs))
 683         _other -> return (L span (ITvarid fs))
 684   where
 685         fs = lexemeToFastString buf len
 686
 687 conid buf len = ITconid fs
 688   where fs = lexemeToFastString buf len
 689
 690 qvarsym buf len = ITqvarsym $! splitQualName buf len
 691 qconsym buf len = ITqconsym $! splitQualName buf len
 692
 693 varsym = sym ITvarsym
 694 consym = sym ITconsym
 695
 696 sym con span buf len =
 697   case lookupUFM reservedSymsFM fs of
 698         Just (keyword,0)    -> return (L span keyword)
 699         Just (keyword,exts) -> do
 700                 b <- extension (\i -> exts .&. i /= 0)
 701                 if b then return (L span keyword)
 702                      else return (L span $! con fs)
 703         _other -> return (L span $! con fs)
 704   where
 705         fs = lexemeToFastString buf len
 706
 707 tok_decimal span buf len
 708   = return (L span (ITinteger  $! parseInteger buf len 10 oct_or_dec))
 709
 710 tok_octal span buf len
 711   = return (L span (ITinteger  $! parseInteger (stepOnBy 2 buf) (len-2) 8 oct_or_dec))
 712
 713 tok_hexadecimal span buf len
 714   = return (L span (ITinteger  $! parseInteger (stepOnBy 2 buf) (len-2) 16 hex))
 715
 716 prim_decimal span buf len
 717   = return (L span (ITprimint  $! parseInteger buf (len-1) 10 oct_or_dec))
 718
 719 prim_octal span buf len
 720   = return (L span (ITprimint  $! parseInteger (stepOnBy 2 buf) (len-3) 8 oct_or_dec))
 721
 722 prim_hexadecimal span buf len
 723   = return (L span (ITprimint  $! parseInteger (stepOnBy 2 buf) (len-3) 16 hex))
 724
 725 tok_float        str = ITrational $! readRational__ str
 726 prim_float       str = ITprimfloat  $! readRational__ str
 727 prim_double      str = ITprimdouble $! readRational__ str
 728
 729 parseInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
 730 parseInteger buf len radix to_int
 731   = go 0 0
 732   where go i x | i == len  = x
 733                | otherwise = go (i+1) (x * radix + toInteger (to_int (lookAhead buf i)))
 734
 735 -- -----------------------------------------------------------------------------
 736 -- Layout processing
 737
 738 -- we're at the first token on a line, insert layout tokens if necessary
 739 do_bol :: Action
 740 do_bol span _str _len = do
 741         pos <- getOffside (srcSpanEnd span)
 742         case pos of
 743             LT -> do
 744                 --trace "layout: inserting '}'" $ do
 745                 popContext
 746                 -- do NOT pop the lex state, we might have a ';' to insert
 747                 return (L span ITvccurly)
 748             EQ -> do
 749                 --trace "layout: inserting ';'" $ do
 750                 popLexState
 751                 return (L span ITsemi)
 752             GT -> do
 753                 popLexState
 754                 lexToken
 755
 756 -- certain keywords put us in the "layout" state, where we might
 757 -- add an opening curly brace.
 758 maybe_layout ITdo       = pushLexState layout_do
 759 maybe_layout ITmdo      = pushLexState layout_do
 760 maybe_layout ITof       = pushLexState layout
 761 maybe_layout ITlet      = pushLexState layout
 762 maybe_layout ITwhere    = pushLexState layout
 763 maybe_layout ITrec      = pushLexState layout
 764 maybe_layout _          = return ()
 765
 766 -- Pushing a new implicit layout context.  If the indentation of the
 767 -- next token is not greater than the previous layout context, then
 768 -- Haskell 98 says that the new layout context should be empty; that is
 769 -- the lexer must generate {}.
 770 --
 771 -- We are slightly more lenient than this: when the new context is started
 772 -- by a 'do', then we allow the new context to be at the same indentation as
 773 -- the previous context.  This is what the 'strict' argument is for.
 774 --
 775 new_layout_context strict span _buf _len = do
 776     popLexState
 777     let offset = srcSpanStartCol span
 778     ctx <- getContext
 779     case ctx of
 780         Layout prev_off : _  |
 781            (strict     && prev_off >= offset  ||
 782             not strict && prev_off > offset) -> do
 783                 -- token is indented to the left of the previous context.
 784                 -- we must generate a {} sequence now.
 785                 pushLexState layout_left
 786                 return (L span ITvocurly)
 787         other -> do
 788                 setContext (Layout offset : ctx)
 789                 return (L span ITvocurly)
 790
 791 do_layout_left span _buf _len = do
 792     popLexState
 793     pushLexState bol  -- we must be at the start of a line
 794     return (L span ITvccurly)
 795
 796 -- -----------------------------------------------------------------------------
 797 -- LINE pragmas
 798
 799 set_line :: Int -> Action
 800 set_line code span buf len = do
 801   let line = parseInteger buf len 10 oct_or_dec
 802   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
 803         -- subtract one: the line number refers to the *following* line
 804   popLexState
 805   pushLexState code
 806   lexToken
 807
 808 set_file :: Int -> Action
 809 set_file code span buf len = do
 810   let file = lexemeToFastString (stepOn buf) (len-2)
 811   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
 812   popLexState
 813   pushLexState code
 814   lexToken
 815
 816 -- -----------------------------------------------------------------------------
 817 -- Strings & Chars
 818
 819 -- This stuff is horrible.  I hates it.
 820
 821 lex_string_tok :: Action
 822 lex_string_tok span buf len = do
 823   tok <- lex_string ""
 824   end <- getSrcLoc
 825   return (L (mkSrcSpan (srcSpanStart span) end) tok)
 826
 827 lex_string :: String -> P Token
 828 lex_string s = do
 829   i <- getInput
 830   case alexGetChar i of
 831     Nothing -> lit_error
 832
 833     Just ('"',i)  -> do
 834         setInput i
 835         glaexts <- extension glaExtsEnabled
 836         if glaexts
 837           then do
 838             i <- getInput
 839             case alexGetChar i of
 840               Just ('#',i) -> do
 841                    setInput i
 842                    if any (> '\xFF') s
 843                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
 844                     else let s' = mkFastStringNarrow (reverse s) in
 845                          -- always a narrow string/byte array
 846                          return (ITprimstring s')
 847               _other ->
 848                 return (ITstring (mkFastString (reverse s)))
 849           else
 850                 return (ITstring (mkFastString (reverse s)))
 851
 852     Just ('\\',i)
 853         | Just ('&',i) <- next -> do
 854                 setInput i; lex_string s
 855         | Just (c,i) <- next, is_space c -> do
 856                 setInput i; lex_stringgap s
 857         where next = alexGetChar i
 858
 859     Just _ -> do
 860         c <- lex_char
 861         lex_string (c:s)
 862
 863 lex_stringgap s = do
 864   c <- getCharOrFail
 865   case c of
 866     '\\' -> lex_string s
 867     c | is_space c -> lex_stringgap s
 868     _other -> lit_error
 869
 870
 871 lex_char_tok :: Action
 872 -- Here we are basically parsing character literals, such as 'x' or '\n'
 873 -- but, when Template Haskell is on, we additionally spot
 874 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
 875 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
 876 -- So we have to do two characters of lookahead: when we see 'x we need to
 877 -- see if there's a trailing quote
 878 lex_char_tok span buf len = do  -- We've seen '
 879    i1 <- getInput       -- Look ahead to first character
 880    let loc = srcSpanStart span
 881    case alexGetChar i1 of
 882         Nothing -> lit_error
 883
 884         Just ('\'', i2@(end2,_)) -> do  -- We've seen ''
 885                   th_exts <- extension thEnabled
 886                   if th_exts then do
 887                         setInput i2
 888                         return (L (mkSrcSpan loc end2)  ITtyQuote)
 889                    else lit_error
 890
 891         Just ('\\', i2@(end2,_)) -> do  -- We've seen 'backslash
 892                   setInput i2
 893                   lit_ch <- lex_escape
 894                   mc <- getCharOrFail   -- Trailing quote
 895                   if mc == '\'' then finish_char_tok loc lit_ch
 896                                 else lit_error
 897
 898         Just (c, i2@(end2,_)) | not (is_any c) -> lit_error
 899                               | otherwise      ->
 900
 901                 -- We've seen 'x, where x is a valid character
 902                 --  (i.e. not newline etc) but not a quote or backslash
 903            case alexGetChar i2 of       -- Look ahead one more character
 904                 Nothing -> lit_error
 905                 Just ('\'', i3) -> do   -- We've seen 'x'
 906                         setInput i3
 907                         finish_char_tok loc c
 908                 _other -> do            -- We've seen 'x not followed by quote
 909                                         -- If TH is on, just parse the quote only
 910                         th_exts <- extension thEnabled
 911                         if th_exts then return (L (mkSrcSpan loc (fst i1)) ITvarQuote)
 912                                    else lit_error
 913
 914 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
 915 finish_char_tok loc ch  -- We've already seen the closing quote
 916                         -- Just need to check for trailing #
 917   = do  glaexts <- extension glaExtsEnabled
 918         i@(end,_) <- getInput
 919         if glaexts then do
 920                 case alexGetChar i of
 921                         Just ('#',i@(end,_)) -> do
 922                                 setInput i
 923                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
 924                         _other ->
 925                                 return (L (mkSrcSpan loc end) (ITchar ch))
 926                 else do
 927                    return (L (mkSrcSpan loc end) (ITchar ch))
 928
 929 lex_char :: P Char
 930 lex_char = do
 931   mc <- getCharOrFail
 932   case mc of
 933       '\\' -> lex_escape
 934       c | is_any c -> return c
 935       _other -> lit_error
 936
 937 lex_escape :: P Char
 938 lex_escape = do
 939   c <- getCharOrFail
 940   case c of
 941         'a'   -> return '\a'
 942         'b'   -> return '\b'
 943         'f'   -> return '\f'
 944         'n'   -> return '\n'
 945         'r'   -> return '\r'
 946         't'   -> return '\t'
 947         'v'   -> return '\v'
 948         '\\'  -> return '\\'
 949         '"'   -> return '\"'
 950         '\''  -> return '\''
 951         '^'   -> do c <- getCharOrFail
 952                     if c >= '@' && c <= '_'
 953                         then return (chr (ord c - ord '@'))
 954                         else lit_error
 955
 956         'x'   -> readNum is_hexdigit 16 hex
 957         'o'   -> readNum is_octdigit  8 oct_or_dec
 958         x | is_digit x -> readNum2 is_digit 10 oct_or_dec (oct_or_dec x)
 959
 960         c1 ->  do
 961            i <- getInput
 962            case alexGetChar i of
 963             Nothing -> lit_error
 964             Just (c2,i2) ->
 965               case alexGetChar i2 of
 966                 Nothing -> lit_error
 967                 Just (c3,i3) ->
 968                    let str = [c1,c2,c3] in
 969                    case [ (c,rest) | (p,c) <- silly_escape_chars,
 970                                      Just rest <- [maybePrefixMatch p str] ] of
 971                           (escape_char,[]):_ -> do
 972                                 setInput i3
 973                                 return escape_char
 974                           (escape_char,_:_):_ -> do
 975                                 setInput i2
 976                                 return escape_char
 977                           [] -> lit_error
 978
 979 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
 980 readNum is_digit base conv = do
 981   c <- getCharOrFail
 982   if is_digit c
 983         then readNum2 is_digit base conv (conv c)
 984         else lit_error
 985
 986 readNum2 is_digit base conv i = do
 987   input <- getInput
 988   read i input
 989   where read i input = do
 990           case alexGetChar input of
 991             Just (c,input') | is_digit c -> do
 992                 read (i*base + conv c) input'
 993             _other -> do
 994                 setInput input
 995                 if i >= 0 && i <= 0x10FFFF
 996                    then return (chr i)
 997                    else lit_error
 998
 999 is_hexdigit c
1000         =  is_digit c
1001         || (c >= 'a' && c <= 'f')
1002         || (c >= 'A' && c <= 'F')
1003
1004 hex c | is_digit c = ord c - ord '0'
1005       | otherwise  = ord (to_lower c) - ord 'a' + 10
1006
1007 oct_or_dec c = ord c - ord '0'
1008
1009 is_octdigit c = c >= '0' && c <= '7'
1010
1011 to_lower c
1012   | c >=  'A' && c <= 'Z' = chr (ord c - (ord 'A' - ord 'a'))
1013   | otherwise = c
1014
1015 silly_escape_chars = [
1016         ("NUL", '\NUL'),
1017         ("SOH", '\SOH'),
1018         ("STX", '\STX'),
1019         ("ETX", '\ETX'),
1020         ("EOT", '\EOT'),
1021         ("ENQ", '\ENQ'),
1022         ("ACK", '\ACK'),
1023         ("BEL", '\BEL'),
1024         ("BS", '\BS'),
1025         ("HT", '\HT'),
1026         ("LF", '\LF'),
1027         ("VT", '\VT'),
1028         ("FF", '\FF'),
1029         ("CR", '\CR'),
1030         ("SO", '\SO'),
1031         ("SI", '\SI'),
1032         ("DLE", '\DLE'),
1033         ("DC1", '\DC1'),
1034         ("DC2", '\DC2'),
1035         ("DC3", '\DC3'),
1036         ("DC4", '\DC4'),
1037         ("NAK", '\NAK'),
1038         ("SYN", '\SYN'),
1039         ("ETB", '\ETB'),
1040         ("CAN", '\CAN'),
1041         ("EM", '\EM'),
1042         ("SUB", '\SUB'),
1043         ("ESC", '\ESC'),
1044         ("FS", '\FS'),
1045         ("GS", '\GS'),
1046         ("RS", '\RS'),
1047         ("US", '\US'),
1048         ("SP", '\SP'),
1049         ("DEL", '\DEL')
1050         ]
1051
1052 lit_error = lexError "lexical error in string/character literal"
1053
1054 getCharOrFail :: P Char
1055 getCharOrFail =  do
1056   i <- getInput
1057   case alexGetChar i of
1058         Nothing -> lexError "unexpected end-of-file in string/character literal"
1059         Just (c,i)  -> do setInput i; return c
1060
1061 -- -----------------------------------------------------------------------------
1062 -- Floats
1063
1064 readRational :: ReadS Rational -- NB: doesn't handle leading "-"
1065 readRational r = do
1066      (n,d,s) <- readFix r
1067      (k,t)   <- readExp s
1068      return ((n%1)*10^^(k-d), t)
1069  where
1070      readFix r = do
1071         (ds,s)  <- lexDecDigits r
1072         (ds',t) <- lexDotDigits s
1073         return (read (ds++ds'), length ds', t)
1074
1075      readExp (e:s) | e `elem` "eE" = readExp' s
1076      readExp s                     = return (0,s)
1077
1078      readExp' ('+':s) = readDec s
1079      readExp' ('-':s) = do
1080                         (k,t) <- readDec s
1081                         return (-k,t)
1082      readExp' s       = readDec s
1083
1084      readDec s = do
1085         (ds,r) <- nonnull isDigit s
1086         return (foldl1 (\n d -> n * 10 + d) [ ord d - ord '0' | d <- ds ],
1087                 r)
1088
1089      lexDecDigits = nonnull isDigit
1090
1091      lexDotDigits ('.':s) = return (span isDigit s)
1092      lexDotDigits s       = return ("",s)
1093
1094      nonnull p s = do (cs@(_:_),t) <- return (span p s)
1095                       return (cs,t)
1096
1097 readRational__ :: String -> Rational -- NB: *does* handle a leading "-"
1098 readRational__ top_s
1099   = case top_s of
1100       '-' : xs -> - (read_me xs)
1101       xs       -> read_me xs
1102   where
1103     read_me s
1104       = case (do { (x,"") <- readRational s ; return x }) of
1105           [x] -> x
1106           []  -> error ("readRational__: no parse:"        ++ top_s)
1107           _   -> error ("readRational__: ambiguous parse:" ++ top_s)
1108
1109 -- -----------------------------------------------------------------------------
1110 -- The Parse Monad
1111
1112 data LayoutContext
1113   = NoLayout
1114   | Layout !Int
1115
1116 data ParseResult a
1117   = POk PState a
1118   | PFailed
1119         SrcSpan         -- The start and end of the text span related to
1120                         -- the error.  Might be used in environments which can
1121                         -- show this span, e.g. by highlighting it.
1122         Message         -- The error message
1123
1124 data PState = PState {
1125         buffer     :: StringBuffer,
1126         last_loc   :: SrcSpan,  -- pos of previous token
1127         last_len   :: !Int,     -- len of previous token
1128         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1129         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1130         context    :: [LayoutContext],
1131         lex_state  :: [Int]
1132      }
1133         -- last_loc and last_len are used when generating error messages,
1134         -- and in pushCurrentContext only.
1135
1136 newtype P a = P { unP :: PState -> ParseResult a }
1137
1138 instance Monad P where
1139   return = returnP
1140   (>>=) = thenP
1141   fail = failP
1142
1143 returnP :: a -> P a
1144 returnP a = P $ \s -> POk s a
1145
1146 thenP :: P a -> (a -> P b) -> P b
1147 (P m) `thenP` k = P $ \ s ->
1148         case m s of
1149                 POk s1 a         -> (unP (k a)) s1
1150                 PFailed span err -> PFailed span err
1151
1152 failP :: String -> P a
1153 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1154
1155 failMsgP :: String -> P a
1156 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1157
1158 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1159 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1160
1161 failSpanMsgP :: SrcSpan -> String -> P a
1162 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1163
1164 extension :: (Int -> Bool) -> P Bool
1165 extension p = P $ \s -> POk s (p $! extsBitmap s)
1166
1167 getExts :: P Int
1168 getExts = P $ \s -> POk s (extsBitmap s)
1169
1170 setSrcLoc :: SrcLoc -> P ()
1171 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1172
1173 getSrcLoc :: P SrcLoc
1174 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1175
1176 setLastToken :: SrcSpan -> Int -> P ()
1177 setLastToken loc len = P $ \s -> POk s{ last_loc=loc, last_len=len } ()
1178
1179 type AlexInput = (SrcLoc,StringBuffer)
1180
1181 alexInputPrevChar :: AlexInput -> Char
1182 alexInputPrevChar (_,s) = prevChar s '\n'
1183
1184 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1185 alexGetChar (loc,s)
1186   | atEnd s   = Nothing
1187   | otherwise = c `seq` loc' `seq` s' `seq` Just (c, (loc', s'))
1188   where c = currentChar s
1189         loc' = advanceSrcLoc loc c
1190         s'   = stepOn s
1191
1192 getInput :: P AlexInput
1193 getInput = P $ \s@PState{ loc=l, buffer=b } -> POk s (l,b)
1194
1195 setInput :: AlexInput -> P ()
1196 setInput (l,b) = P $ \s -> POk s{ loc=l, buffer=b } ()
1197
1198 pushLexState :: Int -> P ()
1199 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1200
1201 popLexState :: P Int
1202 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1203
1204 getLexState :: P Int
1205 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1206
1207 -- for reasons of efficiency, flags indicating language extensions (eg,
1208 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1209 -- integer
1210
1211 glaExtsBit, ffiBit, parrBit :: Int
1212 glaExtsBit = 0
1213 ffiBit     = 1
1214 parrBit    = 2
1215 arrowsBit  = 4
1216 thBit      = 5
1217 ipBit      = 6
1218
1219 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1220 glaExtsEnabled flags = testBit flags glaExtsBit
1221 ffiEnabled     flags = testBit flags ffiBit
1222 parrEnabled    flags = testBit flags parrBit
1223 arrowsEnabled  flags = testBit flags arrowsBit
1224 thEnabled      flags = testBit flags thBit
1225 ipEnabled      flags = testBit flags ipBit
1226
1227 -- create a parse state
1228 --
1229 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1230 mkPState buf loc flags  =
1231   PState {
1232       buffer     = buf,
1233       last_loc   = mkSrcSpan loc loc,
1234       last_len   = 0,
1235       loc        = loc,
1236       extsBitmap = fromIntegral bitmap,
1237       context    = [],
1238       lex_state  = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1239         -- we begin in the layout state if toplev_layout is set
1240     }
1241     where
1242       bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
1243                .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
1244                .|. parrBit    `setBitIf` dopt Opt_PArr        flags
1245                .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
1246                .|. thBit      `setBitIf` dopt Opt_TH          flags
1247                .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
1248       --
1249       setBitIf :: Int -> Bool -> Int
1250       b `setBitIf` cond | cond      = bit b
1251                         | otherwise = 0
1252
1253 getContext :: P [LayoutContext]
1254 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1255
1256 setContext :: [LayoutContext] -> P ()
1257 setContext ctx = P $ \s -> POk s{context=ctx} ()
1258
1259 popContext :: P ()
1260 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1261                            loc = loc, last_len = len, last_loc = last_loc }) ->
1262   case ctx of
1263         (_:tl) -> POk s{ context = tl } ()
1264         []     -> PFailed last_loc (srcParseErr buf len)
1265
1266 -- Push a new layout context at the indentation of the last token read.
1267 -- This is only used at the outer level of a module when the 'module'
1268 -- keyword is missing.
1269 pushCurrentContext :: P ()
1270 pushCurrentContext = P $ \ s@PState{ last_loc=loc, context=ctx } ->
1271   POk s{ context = Layout (srcSpanStartCol loc) : ctx} ()
1272
1273 getOffside :: SrcLoc -> P Ordering
1274 getOffside loc = P $ \s@PState{context=stk} ->
1275                 let ord = case stk of
1276                         (Layout n:_) -> compare (srcLocCol loc) n
1277                         _            -> GT
1278                 in POk s ord
1279
1280 -- ---------------------------------------------------------------------------
1281 -- Construct a parse error
1282
1283 srcParseErr
1284   :: StringBuffer       -- current buffer (placed just after the last token)
1285   -> Int                -- length of the previous token
1286   -> Message
1287 srcParseErr buf len
1288   = hcat [ if null token
1289              then ptext SLIT("parse error (possibly incorrect indentation)")
1290              else hcat [ptext SLIT("parse error on input "),
1291                         char '`', text token, char '\'']
1292     ]
1293   where token = lexemeToString (stepOnBy (-len) buf) len
1294
1295 -- Report a parse failure, giving the span of the previous token as
1296 -- the location of the error.  This is the entry point for errors
1297 -- detected during parsing.
1298 srcParseFail :: P a
1299 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1300                                 last_loc = last_loc, loc = loc } ->
1301     PFailed last_loc (srcParseErr buf len)
1302
1303 -- A lexical error is reported at a particular position in the source file,
1304 -- not over a token range.  TODO: this is slightly wrong, because we record
1305 -- the error at the character position following the one which caused the
1306 -- error.  We should somehow back up by one character.
1307 lexError :: String -> P a
1308 lexError str = do
1309   loc <- getSrcLoc
1310   i@(end,_) <- getInput
1311   failLocMsgP loc end str
1312
1313 -- -----------------------------------------------------------------------------
1314 -- This is the top-level function: called from the parser each time a
1315 -- new token is to be read from the input.
1316
1317 lexer :: (Located Token -> P a) -> P a
1318 lexer cont = do
1319   tok@(L _ tok__) <- lexToken
1320   --trace ("token: " ++ show tok__) $ do
1321   cont tok
1322
1323 lexToken :: P (Located Token)
1324 lexToken = do
1325   inp@(loc1,buf) <- getInput
1326   sc <- getLexState
1327   exts <- getExts
1328   case alexScanUser exts inp sc of
1329     AlexEOF -> do let span = mkSrcSpan loc1 loc1
1330                   setLastToken span 0
1331                   return (L span ITeof)
1332     AlexError (loc2,_) -> do failLocMsgP loc1 loc2 "lexical error"
1333     AlexSkip inp2 _ -> do
1334         setInput inp2
1335         lexToken
1336     AlexToken inp2@(end,buf2) len t -> do
1337         setInput inp2
1338         let span = mkSrcSpan loc1 end
1339         span `seq` setLastToken span len
1340         t span buf len
1341 }