ghc/compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2003
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), Token__(..), lexer, ExtFlags(..), mkPState, showPFailed,
  26    P(..), ParseResult(..), setSrcLocFor, getSrcLoc,
  27    failMsgP, failLocMsgP, srcParseFail,
  28    popContext, pushCurrentContext,
  29   ) where
  30
  31 #include "HsVersions.h"
  32
  33 import ForeignCall      ( Safety(..) )
  34 import ErrUtils         ( Message )
  35 import Outputable
  36 import StringBuffer
  37 import FastString
  38 import FastTypes
  39 import SrcLoc
  40 import UniqFM
  41 import Ctype
  42 import Util             ( maybePrefixMatch )
  43
  44 import DATA_BITS
  45 import Char
  46 import Ratio
  47 import TRACE
  48 }
  49
  50 $whitechar   = [\ \t\n\r\f\v]
  51 $white_no_nl = $whitechar # \n
  52
  53 $ascdigit  = 0-9
  54 $unidigit  = \x01
  55 $digit     = [$ascdigit $unidigit]
  56
  57 $special   = [\(\)\,\;\[\]\`\{\}]
  58 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  59 $unisymbol = \x02
  60 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  61
  62 $unilarge  = \x03
  63 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  64 $large     = [$asclarge $unilarge]
  65
  66 $unismall  = \x04
  67 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  68 $small     = [$ascsmall $unismall \_]
  69
  70 $graphic   = [$small $large $symbol $digit $special \:\"\']
  71
  72 $octit     = 0-7
  73 $hexit     = [$digit A-F a-f]
  74 $symchar   = [$symbol \:]
  75 $nl        = [\n\r]
  76 $idchar    = [$small $large $digit \']
  77
  78 @varid     = $small $idchar*
  79 @conid     = $large $idchar*
  80
  81 @varsym    = $symbol $symchar*
  82 @consym    = \: $symchar*
  83
  84 @decimal     = $digit+
  85 @octal       = $octit+
  86 @hexadecimal = $hexit+
  87 @exponent    = [eE] [\-\+]? @decimal
  88
  89 -- we support the hierarchical module name extension:
  90 @qual = (@conid \.)+
  91
  92 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
  93
  94 haskell :-
  95
  96 -- everywhere: skip whitespace and comments
  97 $white_no_nl+                           ;
  98
  99 -- Everywhere: deal with nested comments.  We explicitly rule out
 100 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 101 -- (this can happen even though pragmas will normally take precedence due to
 102 -- longest-match, because pragmas aren't valid in every state, but comments
 103 -- are).
 104 "{-" / { notFollowedBy '#' }            { nested_comment }
 105
 106 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 107 -- more dashes followed by a symbol should be parsed as a varsym, so we
 108 -- have to exclude those.
 109 -- The regex says: "munch all the characters after the dashes, as long as
 110 -- the first one is not a symbol".
 111 "--"\-* ([^$symbol] .*)?                ;
 112
 113 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 114 -- blank lines) until we find a non-whitespace character, then do layout
 115 -- processing.
 116 --
 117 -- One slight wibble here: what if the line begins with {-#? In
 118 -- theory, we have to lex the pragma to see if it's one we recognise,
 119 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 120 -- as a nested comment.  We don't bother with this: if the line begins
 121 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 122 <bol> {
 123   \n                                    ;
 124   ^\# (line)?                           { begin line_prag1 }
 125   ()                                    { do_bol }
 126 }
 127
 128 -- after a layout keyword (let, where, do, of), we begin a new layout
 129 -- context if the curly brace is missing.
 130 -- Careful! This stuff is quite delicate.
 131 <layout, layout_do> {
 132   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 133         -- we might encounter {-# here, but {- has been handled already
 134   \n                                    ;
 135   ^\# (line)?                           { begin line_prag1 }
 136 }
 137
 138 -- do is treated in a subtly different way, see new_layout_context
 139 <layout>    ()                          { new_layout_context True }
 140 <layout_do> ()                          { new_layout_context False }
 141
 142 -- after a new layout context which was found to be to the left of the
 143 -- previous context, we have generated a '{' token, and we now need to
 144 -- generate a matching '}' token.
 145 <layout_left>  ()                       { do_layout_left }
 146
 147 <0,glaexts> \n                          { begin bol }
 148
 149 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 150
 151 -- single-line line pragmas, of the form
 152 --    # <line> "<file>" <extra-stuff> \n
 153 <line_prag1> $digit+                    { set_line line_prag1a }
 154 <line_prag1a> \" $graphic* \"           { set_file line_prag1b }
 155 <line_prag1b> .*                        { pop }
 156
 157 -- Haskell-style line pragmas, of the form
 158 --    {-# LINE <line> "<file>" #-}
 159 <line_prag2> $digit+                    { set_line line_prag2a }
 160 <line_prag2a> \" $graphic* \"           { set_file line_prag2b }
 161 <line_prag2b> "#-}"                     { pop }
 162
 163 <0,glaexts> {
 164   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 165                                         { token ITspecialise_prag }
 166   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 167   "{-#" $whitechar* (INLINE|inline)     { token ITinline_prag }
 168   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 169                                         { token ITnoinline_prag }
 170   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 171   "{-#" $whitechar* (DEPRECATED|deprecated)
 172                                         { token ITdeprecated_prag }
 173   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 174   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 175
 176   "{-#"                                 { nested_comment }
 177
 178   -- ToDo: should only be valid inside a pragma:
 179   "#-}"                                 { token ITclose_prag}
 180 }
 181
 182
 183 -- '0' state: ordinary lexemes
 184 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 185
 186 -- "special" symbols
 187
 188 <glaexts> {
 189   "(#"                                  { token IToubxparen }
 190   "#)"                                  { token ITcubxparen }
 191
 192   "[:"                                  { token ITopabrack }
 193   ":]"                                  { token ITcpabrack }
 194
 195   "{|"                                  { token ITocurlybar }
 196   "|}"                                  { token ITccurlybar }
 197
 198   "[|"                                  { token ITopenExpQuote }
 199   "[e|"                                 { token ITopenExpQuote }
 200   "[p|"                                 { token ITopenPatQuote }
 201   "[d|"                                 { token ITopenDecQuote }
 202   "[t|"                                 { token ITopenTypQuote }
 203   "|]"                                  { token ITcloseQuote }
 204 }
 205
 206 <0,glaexts> {
 207   "(|" / { \b _ _ _ -> arrowsEnabled b} { special IToparenbar }
 208   "|)" / { \b _ _ _ -> arrowsEnabled b} { special ITcparenbar }
 209   \(                                    { special IToparen }
 210   \)                                    { special ITcparen }
 211   \[                                    { special ITobrack }
 212   \]                                    { special ITcbrack }
 213   \,                                    { special ITcomma }
 214   \;                                    { special ITsemi }
 215   \`                                    { special ITbackquote }
 216
 217   \{                                    { open_brace }
 218   \}                                    { close_brace }
 219 }
 220
 221 <glaexts> {
 222   \? @varid                     { skip_one_varid ITdupipvarid }
 223   \% @varid                     { skip_one_varid ITsplitipvarid }
 224   \$ @varid                     { skip_one_varid ITidEscape }
 225   "$("                          { token ITparenEscape }
 226 }
 227
 228 <0,glaexts> {
 229   @qual @varid                  { check_qvarid }
 230   @qual @conid                  { idtoken qconid }
 231   @varid                        { varid }
 232   @conid                        { idtoken conid }
 233 }
 234
 235 -- after an illegal qvarid, such as 'M.let',
 236 -- we back up and try again in the bad_qvarid state:
 237 <bad_qvarid> {
 238   @conid                        { pop_and (idtoken conid) }
 239   @qual @conid                  { pop_and (idtoken qconid) }
 240 }
 241
 242 <glaexts> {
 243   @qual @varid "#"+             { idtoken qvarid }
 244   @qual @conid "#"+             { idtoken qconid }
 245   @varid "#"+                   { varid }
 246   @conid "#"+                   { idtoken conid }
 247 }
 248
 249 -- ToDo: M.(,,,)
 250
 251 <0,glaexts> {
 252   @qual @varsym                 { idtoken qvarsym }
 253   @qual @consym                 { idtoken qconsym }
 254   @varsym                       { varsym }
 255   @consym                       { consym }
 256 }
 257
 258 <0,glaexts> {
 259   @decimal                      { tok_decimal }
 260   0[oO] @octal                  { tok_octal }
 261   0[xX] @hexadecimal            { tok_hexadecimal }
 262 }
 263
 264 <glaexts> {
 265   @decimal \#                   { prim_decimal }
 266   0[oO] @octal \#               { prim_octal }
 267   0[xX] @hexadecimal \#         { prim_hexadecimal }
 268 }
 269
 270 <0,glaexts> @floating_point             { strtoken tok_float }
 271 <glaexts>   @floating_point \#          { init_strtoken 1 prim_float }
 272 <glaexts>   @floating_point \# \#       { init_strtoken 2 prim_double }
 273
 274 -- Strings and chars are lexed by hand-written code.  The reason is
 275 -- that even if we recognise the string or char here in the regex
 276 -- lexer, we would still have to parse the string afterward in order
 277 -- to convert it to a String.
 278 <0,glaexts> {
 279   \'                            { lex_char_tok }
 280   \"                            { lex_string_tok }
 281 }
 282
 283 <glaexts> "``" (([$graphic $whitechar] # \') | \' ([$graphic $whitechar] # \'))*
 284                 "''"            { clitlit }
 285
 286 {
 287 -- work around bug in Alex 2.0
 288 #if __GLASGOW_HASKELL__ < 503
 289 unsafeAt arr i = arr ! i
 290 #endif
 291
 292 -- -----------------------------------------------------------------------------
 293 -- The token type
 294
 295 data Token = T SrcLoc{-start-} SrcLoc{-end-} Token__
 296
 297 data Token__
 298   = ITas                        -- Haskell keywords
 299   | ITcase
 300   | ITclass
 301   | ITdata
 302   | ITdefault
 303   | ITderiving
 304   | ITdo
 305   | ITelse
 306   | IThiding
 307   | ITif
 308   | ITimport
 309   | ITin
 310   | ITinfix
 311   | ITinfixl
 312   | ITinfixr
 313   | ITinstance
 314   | ITlet
 315   | ITmodule
 316   | ITnewtype
 317   | ITof
 318   | ITqualified
 319   | ITthen
 320   | ITtype
 321   | ITwhere
 322   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 323
 324   | ITforall                    -- GHC extension keywords
 325   | ITforeign
 326   | ITexport
 327   | ITlabel
 328   | ITdynamic
 329   | ITsafe
 330   | ITthreadsafe
 331   | ITunsafe
 332   | ITwith
 333   | ITstdcallconv
 334   | ITccallconv
 335   | ITdotnet
 336   | ITccall (Bool,Bool,Safety)  -- (is_dyn, is_casm, may_gc)
 337   | ITmdo
 338
 339   | ITspecialise_prag           -- Pragmas
 340   | ITsource_prag
 341   | ITinline_prag
 342   | ITnoinline_prag
 343   | ITrules_prag
 344   | ITdeprecated_prag
 345   | ITline_prag
 346   | ITscc_prag
 347   | ITcore_prag                 -- hdaume: core annotations
 348   | ITclose_prag
 349
 350   | ITdotdot                    -- reserved symbols
 351   | ITcolon
 352   | ITdcolon
 353   | ITequal
 354   | ITlam
 355   | ITvbar
 356   | ITlarrow
 357   | ITrarrow
 358   | ITat
 359   | ITtilde
 360   | ITdarrow
 361   | ITminus
 362   | ITbang
 363   | ITstar
 364   | ITdot
 365
 366   | ITbiglam                    -- GHC-extension symbols
 367
 368   | ITocurly                    -- special symbols
 369   | ITccurly
 370   | ITocurlybar                 -- {|, for type applications
 371   | ITccurlybar                 -- |}, for type applications
 372   | ITvocurly
 373   | ITvccurly
 374   | ITobrack
 375   | ITopabrack                  -- [:, for parallel arrays with -fparr
 376   | ITcpabrack                  -- :], for parallel arrays with -fparr
 377   | ITcbrack
 378   | IToparen
 379   | ITcparen
 380   | IToubxparen
 381   | ITcubxparen
 382   | ITsemi
 383   | ITcomma
 384   | ITunderscore
 385   | ITbackquote
 386
 387   | ITvarid   FastString        -- identifiers
 388   | ITconid   FastString
 389   | ITvarsym  FastString
 390   | ITconsym  FastString
 391   | ITqvarid  (FastString,FastString)
 392   | ITqconid  (FastString,FastString)
 393   | ITqvarsym (FastString,FastString)
 394   | ITqconsym (FastString,FastString)
 395
 396   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 397   | ITsplitipvarid FastString   -- GHC extension: implicit param: %x
 398
 399   | ITpragma StringBuffer
 400
 401   | ITchar       Char
 402   | ITstring     FastString
 403   | ITinteger    Integer
 404   | ITrational   Rational
 405
 406   | ITprimchar   Char
 407   | ITprimstring FastString
 408   | ITprimint    Integer
 409   | ITprimfloat  Rational
 410   | ITprimdouble Rational
 411   | ITlitlit     FastString
 412
 413   -- MetaHaskell extension tokens
 414   | ITopenExpQuote              -- [| or [e|
 415   | ITopenPatQuote              -- [p|
 416   | ITopenDecQuote              -- [d|
 417   | ITopenTypQuote              -- [t|
 418   | ITcloseQuote                -- |]
 419   | ITidEscape   FastString     -- $x
 420   | ITparenEscape               -- $(
 421   | ITreifyType
 422   | ITreifyDecl
 423   | ITreifyFixity
 424
 425   -- Arrow notation extension
 426   | ITproc
 427   | ITrec
 428   | IToparenbar                 -- (|
 429   | ITcparenbar                 -- |)
 430   | ITlarrowtail                -- -<
 431   | ITrarrowtail                -- >-
 432   | ITLarrowtail                -- -<<
 433   | ITRarrowtail                -- >>-
 434
 435   | ITunknown String            -- Used when the lexer can't make sense of it
 436   | ITeof                       -- end of file token
 437 #ifdef DEBUG
 438   deriving Show -- debugging
 439 #endif
 440
 441 isSpecial :: Token__ -> Bool
 442 -- If we see M.x, where x is a keyword, but
 443 -- is special, we treat is as just plain M.x,
 444 -- not as a keyword.
 445 isSpecial ITas          = True
 446 isSpecial IThiding      = True
 447 isSpecial ITqualified   = True
 448 isSpecial ITforall      = True
 449 isSpecial ITexport      = True
 450 isSpecial ITlabel       = True
 451 isSpecial ITdynamic     = True
 452 isSpecial ITsafe        = True
 453 isSpecial ITthreadsafe  = True
 454 isSpecial ITunsafe      = True
 455 isSpecial ITwith        = True
 456 isSpecial ITccallconv   = True
 457 isSpecial ITstdcallconv = True
 458 isSpecial ITmdo         = True
 459 isSpecial _             = False
 460
 461 -- the bitmap provided as the third component indicates whether the
 462 -- corresponding extension keyword is valid under the extension options
 463 -- provided to the compiler; if the extension corresponding to *any* of the
 464 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 465 -- facilitates using a keyword in two different extensions that can be
 466 -- activated independently)
 467 --
 468 reservedWordsFM = listToUFM $
 469         map (\(x, y, z) -> (mkFastString x, (y, z)))
 470        [( "_",          ITunderscore,   0 ),
 471         ( "as",         ITas,           0 ),
 472         ( "case",       ITcase,         0 ),
 473         ( "class",      ITclass,        0 ),
 474         ( "data",       ITdata,         0 ),
 475         ( "default",    ITdefault,      0 ),
 476         ( "deriving",   ITderiving,     0 ),
 477         ( "do",         ITdo,           0 ),
 478         ( "else",       ITelse,         0 ),
 479         ( "hiding",     IThiding,       0 ),
 480         ( "if",         ITif,           0 ),
 481         ( "import",     ITimport,       0 ),
 482         ( "in",         ITin,           0 ),
 483         ( "infix",      ITinfix,        0 ),
 484         ( "infixl",     ITinfixl,       0 ),
 485         ( "infixr",     ITinfixr,       0 ),
 486         ( "instance",   ITinstance,     0 ),
 487         ( "let",        ITlet,          0 ),
 488         ( "module",     ITmodule,       0 ),
 489         ( "newtype",    ITnewtype,      0 ),
 490         ( "of",         ITof,           0 ),
 491         ( "qualified",  ITqualified,    0 ),
 492         ( "then",       ITthen,         0 ),
 493         ( "type",       ITtype,         0 ),
 494         ( "where",      ITwhere,        0 ),
 495         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 496
 497         ( "forall",     ITforall,        bit glaExtsBit),
 498         ( "mdo",        ITmdo,           bit glaExtsBit),
 499         ( "reifyDecl",  ITreifyDecl,     bit glaExtsBit),
 500         ( "reifyType",  ITreifyType,     bit glaExtsBit),
 501         ( "reifyFixity",ITreifyFixity,   bit glaExtsBit),
 502
 503         ( "foreign",    ITforeign,       bit ffiBit),
 504         ( "export",     ITexport,        bit ffiBit),
 505         ( "label",      ITlabel,         bit ffiBit),
 506         ( "dynamic",    ITdynamic,       bit ffiBit),
 507         ( "safe",       ITsafe,          bit ffiBit),
 508         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 509         ( "unsafe",     ITunsafe,        bit ffiBit),
 510         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 511         ( "ccall",      ITccallconv,     bit ffiBit),
 512         ( "dotnet",     ITdotnet,        bit ffiBit),
 513
 514         ( "with",       ITwith,          bit withBit),
 515
 516         ( "rec",        ITrec,           bit arrowsBit),
 517         ( "proc",       ITproc,          bit arrowsBit),
 518
 519         -- On death row
 520         ("_ccall_",     ITccall (False, False, PlayRisky),
 521                                          bit glaExtsBit),
 522         ("_ccall_GC_",  ITccall (False, False, PlaySafe False),
 523                                          bit glaExtsBit),
 524         ("_casm_",      ITccall (False, True,  PlayRisky),
 525                                          bit glaExtsBit),
 526         ("_casm_GC_",   ITccall (False, True,  PlaySafe False),
 527                                          bit glaExtsBit)
 528      ]
 529
 530 reservedSymsFM = listToUFM $
 531         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 532       [ ("..",  ITdotdot,       0)
 533        ,(":",   ITcolon,        0)      -- (:) is a reserved op,
 534                                                 -- meaning only list cons
 535        ,("::",  ITdcolon,       0)
 536        ,("=",   ITequal,        0)
 537        ,("\\",  ITlam,          0)
 538        ,("|",   ITvbar,         0)
 539        ,("<-",  ITlarrow,       0)
 540        ,("->",  ITrarrow,       0)
 541        ,("@",   ITat,           0)
 542        ,("~",   ITtilde,        0)
 543        ,("=>",  ITdarrow,       0)
 544        ,("-",   ITminus,        0)
 545        ,("!",   ITbang,         0)
 546
 547        ,("*",   ITstar,         bit glaExtsBit) -- For data T (a::*) = MkT
 548        ,(".",   ITdot,          bit glaExtsBit) -- For 'forall a . t'
 549
 550        ,("-<",  ITlarrowtail,   bit arrowsBit)
 551        ,(">-",  ITrarrowtail,   bit arrowsBit)
 552        ,("-<<", ITLarrowtail,   bit arrowsBit)
 553        ,(">>-", ITRarrowtail,   bit arrowsBit)
 554        ]
 555
 556 -- -----------------------------------------------------------------------------
 557 -- Lexer actions
 558
 559 type Action = SrcLoc -> SrcLoc -> StringBuffer -> Int -> P Token
 560
 561 special :: Token__ -> Action
 562 special tok loc end _buf len = return (T loc end tok)
 563
 564 token :: Token__ -> Action
 565 token t loc end buf len = return (T loc end t)
 566
 567 idtoken :: (StringBuffer -> Int -> Token__) -> Action
 568 idtoken f loc end buf len = return (T loc end $! (f buf len))
 569
 570 skip_one_varid :: (FastString -> Token__) -> Action
 571 skip_one_varid f loc end buf len
 572   = return (T loc end $! f (lexemeToFastString (stepOn buf) (len-1)))
 573
 574 strtoken :: (String -> Token__) -> Action
 575 strtoken f loc end buf len =
 576   return (T loc end $! (f $! lexemeToString buf len))
 577
 578 init_strtoken :: Int -> (String -> Token__) -> Action
 579 -- like strtoken, but drops the last N character(s)
 580 init_strtoken drop f loc end buf len =
 581   return (T loc end $! (f $! lexemeToString buf (len-drop)))
 582
 583 begin :: Int -> Action
 584 begin code _loc _end _str _len = do pushLexState code; lexToken
 585
 586 pop :: Action
 587 pop _loc _end _buf _len = do popLexState; lexToken
 588
 589 pop_and :: Action -> Action
 590 pop_and act loc end buf len = do popLexState; act loc end buf len
 591
 592 notFollowedBy char _ _ _ (_,buf) = atEnd buf || currentChar buf /= char
 593
 594 {-
 595   nested comments require traversing by hand, they can't be parsed
 596   using regular expressions.
 597 -}
 598 nested_comment :: Action
 599 nested_comment loc _end _str _len = do
 600   input <- getInput
 601   go 1 input
 602   where go 0 input = do setInput input; lexToken
 603         go n input = do
 604           case alexGetChar input of
 605             Nothing  -> err input
 606             Just (c,input) -> do
 607               case c of
 608                 '-' -> do
 609                   case alexGetChar input of
 610                     Nothing  -> err input
 611                     Just ('\125',input) -> go (n-1) input
 612                     Just (c,_)          -> go n input
 613                 '\123' -> do
 614                   case alexGetChar input of
 615                     Nothing  -> err input
 616                     Just ('-',input') -> go (n+1) input'
 617                     Just (c,input)    -> go n input
 618                 c -> go n input
 619
 620         err input = do failLocMsgP loc (fst input) "unterminated `{-'"
 621
 622 open_brace, close_brace :: Action
 623 open_brace  loc end _str _len = do
 624   ctx <- getContext
 625   setContext (NoLayout:ctx)
 626   return (T loc end ITocurly)
 627 close_brace loc end _str _len = do
 628   popContext
 629   return (T loc end ITccurly)
 630
 631 -- We have to be careful not to count M.<varid> as a qualified name
 632 -- when <varid> is a keyword.  We hack around this by catching
 633 -- the offending tokens afterward, and re-lexing in a different state.
 634 check_qvarid loc end buf len = do
 635   case lookupUFM reservedWordsFM var of
 636         Just (keyword,exts)
 637           | not (isSpecial keyword) ->
 638           if exts == 0
 639              then try_again
 640              else do
 641                 b <- extension (\i -> exts .&. i /= 0)
 642                 if b then try_again
 643                      else return token
 644         _other -> return token
 645   where
 646         (mod,var) = splitQualName buf len
 647         token     = T loc end (ITqvarid (mod,var))
 648
 649         try_again = do
 650                 setInput (loc,buf)
 651                 pushLexState bad_qvarid
 652                 lexToken
 653
 654 qvarid buf len = ITqvarid $! splitQualName buf len
 655 qconid buf len = ITqconid $! splitQualName buf len
 656
 657 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 658 -- takes a StringBuffer and a length, and returns the module name
 659 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 660 -- because of hierarchical module names.
 661 splitQualName orig_buf len = split orig_buf 0 0
 662   where
 663     split buf dot_off n
 664         | n == len                = done dot_off
 665         | lookAhead buf n == '.'  = split2 buf n (n+1)
 666         | otherwise               = split buf dot_off (n+1)
 667
 668     -- careful, we might get names like M....
 669     -- so, if the character after the dot is not upper-case, this is
 670     -- the end of the qualifier part.
 671     split2 buf dot_off n
 672         | isUpper (lookAhead buf n) = split buf dot_off (n+1)
 673         | otherwise                 = done dot_off
 674
 675     done dot_off =
 676         (lexemeToFastString orig_buf dot_off,
 677          lexemeToFastString (stepOnBy (dot_off+1) orig_buf) (len - dot_off -1))
 678
 679 varid loc end buf len =
 680   case lookupUFM reservedWordsFM fs of
 681         Just (keyword,0)    -> do
 682                 maybe_layout keyword
 683                 return (T loc end keyword)
 684         Just (keyword,exts) -> do
 685                 b <- extension (\i -> exts .&. i /= 0)
 686                 if b then do maybe_layout keyword
 687                              return (T loc end keyword)
 688                      else return (T loc end (ITvarid fs))
 689         _other -> return (T loc end (ITvarid fs))
 690   where
 691         fs = lexemeToFastString buf len
 692
 693 conid buf len = ITconid fs
 694   where fs = lexemeToFastString buf len
 695
 696 qvarsym buf len = ITqvarsym $! splitQualName buf len
 697 qconsym buf len = ITqconsym $! splitQualName buf len
 698
 699 varsym = sym ITvarsym
 700 consym = sym ITconsym
 701
 702 sym con loc end buf len =
 703   case lookupUFM reservedSymsFM fs of
 704         Just (keyword,0)    -> return (T loc end keyword)
 705         Just (keyword,exts) -> do
 706                 b <- extension (\i -> exts .&. i /= 0)
 707                 if b then return (T loc end keyword)
 708                      else return (T loc end $! con fs)
 709         _other -> return (T loc end $! con fs)
 710   where
 711         fs = lexemeToFastString buf len
 712
 713 tok_decimal loc end buf len
 714   = return (T loc end (ITinteger  $! parseInteger buf len 10 oct_or_dec))
 715
 716 tok_octal loc end buf len
 717   = return (T loc end (ITinteger  $! parseInteger (stepOnBy 2 buf) (len-2) 8 oct_or_dec))
 718
 719 tok_hexadecimal loc end buf len
 720   = return (T loc end (ITinteger  $! parseInteger (stepOnBy 2 buf) (len-2) 16 hex))
 721
 722 prim_decimal loc end buf len
 723   = return (T loc end (ITprimint  $! parseInteger buf (len-1) 10 oct_or_dec))
 724
 725 prim_octal loc end buf len
 726   = return (T loc end (ITprimint  $! parseInteger (stepOnBy 2 buf) (len-3) 8 oct_or_dec))
 727
 728 prim_hexadecimal loc end buf len
 729   = return (T loc end (ITprimint  $! parseInteger (stepOnBy 2 buf) (len-3) 16 hex))
 730
 731 tok_float        str = ITrational $! readRational__ str
 732 prim_float       str = ITprimfloat  $! readRational__ str
 733 prim_double      str = ITprimdouble $! readRational__ str
 734
 735 parseInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
 736 parseInteger buf len radix to_int
 737   = go 0 0
 738   where go i x | i == len  = x
 739                | otherwise = go (i+1) (x * radix + toInteger (to_int (lookAhead buf i)))
 740
 741 clitlit :: Action
 742 clitlit loc end buf len =
 743   return (T loc end (ITlitlit $! lexemeToFastString (stepOnBy 2 buf) (len-4)))
 744
 745 -- -----------------------------------------------------------------------------
 746 -- Layout processing
 747
 748 -- we're at the first token on a line, insert layout tokens if necessary
 749 do_bol :: Action
 750 do_bol loc end _str _len = do
 751         pos <- getOffside end
 752         case pos of
 753             LT -> do
 754                 --trace "layout: inserting '}'" $ do
 755                 popContext
 756                 -- do NOT pop the lex state, we might have a ';' to insert
 757                 return (T loc end ITvccurly)
 758             EQ -> do
 759                 --trace "layout: inserting ';'" $ do
 760                 popLexState
 761                 return (T loc end ITsemi)
 762             GT -> do
 763                 popLexState
 764                 lexToken
 765
 766 -- certain keywords put us in the "layout" state, where we might
 767 -- add an opening curly brace.
 768 maybe_layout ITdo       = pushLexState layout_do
 769 maybe_layout ITmdo      = pushLexState layout_do
 770 maybe_layout ITof       = pushLexState layout
 771 maybe_layout ITlet      = pushLexState layout
 772 maybe_layout ITwhere    = pushLexState layout
 773 maybe_layout _          = return ()
 774
 775 -- Pushing a new implicit layout context.  If the indentation of the
 776 -- next token is not greater than the previous layout context, then
 777 -- Haskell 98 says that the new layout context should be empty; that is
 778 -- the lexer must generate {}.
 779 --
 780 -- We are slightly more lenient than this: when the new context is started
 781 -- by a 'do', then we allow the new context to be at the same indentation as
 782 -- the previous context.  This is what the 'strict' argument is for.
 783 --
 784 new_layout_context strict loc end _buf _len = do
 785     popLexState
 786     let offset = srcLocCol loc
 787     ctx <- getContext
 788     case ctx of
 789         Layout prev_off : _  |
 790            (strict     && prev_off >= offset  ||
 791             not strict && prev_off > offset) -> do
 792                 -- token is indented to the left of the previous context.
 793                 -- we must generate a {} sequence now.
 794                 pushLexState layout_left
 795                 return (T loc end ITvocurly)
 796         other -> do
 797                 setContext (Layout offset : ctx)
 798                 return (T loc end ITvocurly)
 799
 800 do_layout_left loc end _buf _len = do
 801     popLexState
 802     pushLexState bol  -- we must be at the start of a line
 803     return (T loc end ITvccurly)
 804
 805 -- -----------------------------------------------------------------------------
 806 -- LINE pragmas
 807
 808 set_line :: Int -> Action
 809 set_line code loc end buf len = do
 810   let line = parseInteger buf len 10 oct_or_dec
 811   setSrcLoc (mkSrcLoc (srcLocFile end) (fromIntegral line - 1) 0)
 812         -- subtract one: the line number refers to the *following* line
 813   popLexState
 814   pushLexState code
 815   lexToken
 816
 817 set_file :: Int -> Action
 818 set_file code loc end buf len = do
 819   let file = lexemeToFastString (stepOn buf) (len-2)
 820   setSrcLoc (mkSrcLoc file (srcLocLine end) (srcLocCol end))
 821   popLexState
 822   pushLexState code
 823   lexToken
 824
 825 -- -----------------------------------------------------------------------------
 826 -- Strings & Chars
 827
 828 -- This stuff is horrible.  I hates it.
 829
 830 lex_string_tok :: Action
 831 lex_string_tok loc end buf len = do
 832   tok <- lex_string ""
 833   end <- getSrcLoc
 834   return (T loc end tok)
 835
 836 lex_string :: String -> P Token__
 837 lex_string s = do
 838   i <- getInput
 839   case alexGetChar i of
 840     Nothing -> lit_error
 841
 842     Just ('"',i)  -> do
 843         setInput i
 844         glaexts <- extension glaExtsEnabled
 845         if glaexts
 846           then do
 847             i <- getInput
 848             case alexGetChar i of
 849               Just ('#',i) -> do
 850                    setInput i
 851                    if any (> '\xFF') s
 852                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
 853                     else let s' = mkFastStringNarrow (reverse s) in
 854                          -- always a narrow string/byte array
 855                          return (ITprimstring s')
 856               _other ->
 857                 return (ITstring (mkFastString (reverse s)))
 858           else
 859                 return (ITstring (mkFastString (reverse s)))
 860
 861     Just ('\\',i)
 862         | Just ('&',i) <- next -> do
 863                 setInput i; lex_string s
 864         | Just (c,i) <- next, is_space c -> do
 865                 setInput i; lex_stringgap s
 866         where next = alexGetChar i
 867
 868     Just _ -> do
 869         c <- lex_char
 870         lex_string (c:s)
 871
 872
 873 lex_stringgap s = do
 874   c <- getCharOrFail
 875   case c of
 876     '\\' -> lex_string s
 877     c | is_space c -> lex_stringgap s
 878     _other -> lit_error
 879
 880
 881 lex_char_tok :: Action
 882 lex_char_tok loc _end buf len = do
 883    c <- lex_char
 884    mc <- getCharOrFail
 885    case mc of
 886         '\'' -> do
 887            glaexts <- extension glaExtsEnabled
 888            if glaexts
 889                 then do
 890                    i@(end,_) <- getInput
 891                    case alexGetChar i of
 892                         Just ('#',i@(end,_)) -> do
 893                                 setInput i
 894                                 return (T loc end (ITprimchar c))
 895                         _other ->
 896                                 return (T loc end (ITchar c))
 897                 else do
 898                    end <- getSrcLoc
 899                    return (T loc end (ITchar c))
 900
 901         _other -> lit_error
 902
 903 lex_char :: P Char
 904 lex_char = do
 905   mc <- getCharOrFail
 906   case mc of
 907       '\\' -> lex_escape
 908       c | is_any c -> return c
 909       _other -> lit_error
 910
 911 lex_escape :: P Char
 912 lex_escape = do
 913   c <- getCharOrFail
 914   case c of
 915         'a'   -> return '\a'
 916         'b'   -> return '\b'
 917         'f'   -> return '\f'
 918         'n'   -> return '\n'
 919         'r'   -> return '\r'
 920         't'   -> return '\t'
 921         'v'   -> return '\v'
 922         '\\'  -> return '\\'
 923         '"'   -> return '\"'
 924         '\''  -> return '\''
 925         '^'   -> do c <- getCharOrFail
 926                     if c >= '@' && c <= '_'
 927                         then return (chr (ord c - ord '@'))
 928                         else lit_error
 929
 930         'x'   -> readNum is_hexdigit 16 hex
 931         'o'   -> readNum is_octdigit  8 oct_or_dec
 932         x | is_digit x -> readNum2 is_digit 10 oct_or_dec (oct_or_dec x)
 933
 934         c1 ->  do
 935            i <- getInput
 936            case alexGetChar i of
 937             Nothing -> lit_error
 938             Just (c2,i2) ->
 939               case alexGetChar i2 of
 940                 Nothing -> lit_error
 941                 Just (c3,i3) ->
 942                    let str = [c1,c2,c3] in
 943                    case [ (c,rest) | (p,c) <- silly_escape_chars,
 944                                      Just rest <- [maybePrefixMatch p str] ] of
 945                           (escape_char,[]):_ -> do
 946                                 setInput i3
 947                                 return escape_char
 948                           (escape_char,_:_):_ -> do
 949                                 setInput i2
 950                                 return escape_char
 951                           [] -> lit_error
 952
 953 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
 954 readNum is_digit base conv = do
 955   c <- getCharOrFail
 956   if is_digit c
 957         then readNum2 is_digit base conv (conv c)
 958         else lit_error
 959
 960 readNum2 is_digit base conv i = do
 961   input <- getInput
 962   read i input
 963   where read i input = do
 964           case alexGetChar input of
 965             Just (c,input') | is_digit c -> do
 966                 read (i*base + conv c) input'
 967             _other -> do
 968                 setInput input
 969                 if i >= 0 && i <= 0x10FFFF
 970                    then return (chr i)
 971                    else lit_error
 972
 973 is_hexdigit c
 974         =  is_digit c
 975         || (c >= 'a' && c <= 'f')
 976         || (c >= 'A' && c <= 'F')
 977
 978 hex c | is_digit c = ord c - ord '0'
 979       | otherwise  = ord (to_lower c) - ord 'a' + 10
 980
 981 oct_or_dec c = ord c - ord '0'
 982
 983 is_octdigit c = c >= '0' && c <= '7'
 984
 985 to_lower c
 986   | c >=  'A' && c <= 'Z' = chr (ord c - (ord 'A' - ord 'a'))
 987   | otherwise = c
 988
 989 silly_escape_chars = [
 990         ("NUL", '\NUL'),
 991         ("SOH", '\SOH'),
 992         ("STX", '\STX'),
 993         ("ETX", '\ETX'),
 994         ("EOT", '\EOT'),
 995         ("ENQ", '\ENQ'),
 996         ("ACK", '\ACK'),
 997         ("BEL", '\BEL'),
 998         ("BS", '\BS'),
 999         ("HT", '\HT'),
1000         ("LF", '\LF'),
1001         ("VT", '\VT'),
1002         ("FF", '\FF'),
1003         ("CR", '\CR'),
1004         ("SO", '\SO'),
1005         ("SI", '\SI'),
1006         ("DLE", '\DLE'),
1007         ("DC1", '\DC1'),
1008         ("DC2", '\DC2'),
1009         ("DC3", '\DC3'),
1010         ("DC4", '\DC4'),
1011         ("NAK", '\NAK'),
1012         ("SYN", '\SYN'),
1013         ("ETB", '\ETB'),
1014         ("CAN", '\CAN'),
1015         ("EM", '\EM'),
1016         ("SUB", '\SUB'),
1017         ("ESC", '\ESC'),
1018         ("FS", '\FS'),
1019         ("GS", '\GS'),
1020         ("RS", '\RS'),
1021         ("US", '\US'),
1022         ("SP", '\SP'),
1023         ("DEL", '\DEL')
1024         ]
1025
1026 lit_error = lexError "lexical error in string/character literal"
1027
1028 getCharOrFail :: P Char
1029 getCharOrFail =  do
1030   i <- getInput
1031   case alexGetChar i of
1032         Nothing -> lexError "unexpected end-of-file in string/character literal"
1033         Just (c,i)  -> do setInput i; return c
1034
1035 -- -----------------------------------------------------------------------------
1036 -- Floats
1037
1038 readRational :: ReadS Rational -- NB: doesn't handle leading "-"
1039 readRational r = do
1040      (n,d,s) <- readFix r
1041      (k,t)   <- readExp s
1042      return ((n%1)*10^^(k-d), t)
1043  where
1044      readFix r = do
1045         (ds,s)  <- lexDecDigits r
1046         (ds',t) <- lexDotDigits s
1047         return (read (ds++ds'), length ds', t)
1048
1049      readExp (e:s) | e `elem` "eE" = readExp' s
1050      readExp s                     = return (0,s)
1051
1052      readExp' ('+':s) = readDec s
1053      readExp' ('-':s) = do
1054                         (k,t) <- readDec s
1055                         return (-k,t)
1056      readExp' s       = readDec s
1057
1058      readDec s = do
1059         (ds,r) <- nonnull isDigit s
1060         return (foldl1 (\n d -> n * 10 + d) [ ord d - ord '0' | d <- ds ],
1061                 r)
1062
1063      lexDecDigits = nonnull isDigit
1064
1065      lexDotDigits ('.':s) = return (span isDigit s)
1066      lexDotDigits s       = return ("",s)
1067
1068      nonnull p s = do (cs@(_:_),t) <- return (span p s)
1069                       return (cs,t)
1070
1071 readRational__ :: String -> Rational -- NB: *does* handle a leading "-"
1072 readRational__ top_s
1073   = case top_s of
1074       '-' : xs -> - (read_me xs)
1075       xs       -> read_me xs
1076   where
1077     read_me s
1078       = case (do { (x,"") <- readRational s ; return x }) of
1079           [x] -> x
1080           []  -> error ("readRational__: no parse:"        ++ top_s)
1081           _   -> error ("readRational__: ambiguous parse:" ++ top_s)
1082
1083 -- -----------------------------------------------------------------------------
1084 -- The Parse Monad
1085
1086 data LayoutContext
1087   = NoLayout
1088   | Layout !Int
1089
1090 data ParseResult a
1091   = POk PState a
1092   | PFailed
1093         SrcLoc SrcLoc   -- The start and end of the text span related to
1094                         -- the error.  Might be used in environments which can
1095                         -- show this span, e.g. by highlighting it.
1096         Message         -- The error message
1097
1098 showPFailed loc1 loc2 err
1099  = showSDoc (hcat [ppr loc1, text ": ", err])
1100
1101 data PState = PState {
1102         buffer     :: StringBuffer,
1103         last_loc   :: SrcLoc,           -- pos of previous token
1104         last_len   :: !Int,             -- len of previous token
1105         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1106         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1107         context    :: [LayoutContext],
1108         lex_state  :: [Int]
1109      }
1110         -- last_loc and last_len are used when generating error messages,
1111         -- and in pushCurrentContext only.
1112
1113 newtype P a = P { unP :: PState -> ParseResult a }
1114
1115 instance Monad P where
1116   return = returnP
1117   (>>=) = thenP
1118   fail = failP
1119
1120 returnP :: a -> P a
1121 returnP a = P $ \s -> POk s a
1122
1123 thenP :: P a -> (a -> P b) -> P b
1124 (P m) `thenP` k = P $ \ s ->
1125         case m s of
1126                 POk s1 a          -> (unP (k a)) s1
1127                 PFailed l1 l2 err -> PFailed l1 l2 err
1128
1129 failP :: String -> P a
1130 failP msg = P $ \s -> PFailed (last_loc s) (loc s) (text msg)
1131
1132 failMsgP :: String -> P a
1133 failMsgP msg = P $ \s -> PFailed (last_loc s) (loc s) (text msg)
1134
1135 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1136 failLocMsgP loc1 loc2 str = P $ \s -> PFailed loc1 loc2 (text str)
1137
1138 extension :: (Int -> Bool) -> P Bool
1139 extension p = P $ \s -> POk s (p $! extsBitmap s)
1140
1141 getExts :: P Int
1142 getExts = P $ \s -> POk s (extsBitmap s)
1143
1144 setSrcLoc :: SrcLoc -> P ()
1145 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1146
1147 -- tmp, for supporting stuff in RdrHsSyn.  The scope better not include
1148 -- any calls to the lexer, because it assumes things about the SrcLoc.
1149 setSrcLocFor :: SrcLoc -> P a -> P a
1150 setSrcLocFor new_loc scope = P $ \s@PState{ loc = old_loc } ->
1151   case unP scope s{loc=new_loc} of
1152         PFailed l1 l2 msg -> PFailed l1 l2 msg
1153         POk _ r -> POk s r
1154
1155 getSrcLoc :: P SrcLoc
1156 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1157
1158 setLastToken :: SrcLoc -> Int -> P ()
1159 setLastToken loc len = P $ \s -> POk s{ last_loc=loc, last_len=len } ()
1160
1161 type AlexInput = (SrcLoc,StringBuffer)
1162
1163 alexInputPrevChar :: AlexInput -> Char
1164 alexInputPrevChar (_,s) = prevChar s '\n'
1165
1166 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1167 alexGetChar (loc,s)
1168   | atEnd s   = Nothing
1169   | otherwise = c `seq` loc' `seq` s' `seq` Just (c, (loc', s'))
1170   where c = currentChar s
1171         loc' = advanceSrcLoc loc c
1172         s'   = stepOn s
1173
1174 getInput :: P AlexInput
1175 getInput = P $ \s@PState{ loc=l, buffer=b } -> POk s (l,b)
1176
1177 setInput :: AlexInput -> P ()
1178 setInput (l,b) = P $ \s -> POk s{ loc=l, buffer=b } ()
1179
1180 pushLexState :: Int -> P ()
1181 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1182
1183 popLexState :: P Int
1184 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1185
1186 getLexState :: P Int
1187 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1188
1189 -- for reasons of efficiency, flags indicating language extensions (eg,
1190 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1191 -- integer
1192
1193 glaExtsBit, ffiBit, parrBit :: Int
1194 glaExtsBit = 0
1195 ffiBit     = 1
1196 parrBit    = 2
1197 withBit    = 3
1198 arrowsBit  = 4
1199
1200 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1201 glaExtsEnabled flags = testBit flags glaExtsBit
1202 ffiEnabled     flags = testBit flags ffiBit
1203 withEnabled    flags = testBit flags withBit
1204 parrEnabled    flags = testBit flags parrBit
1205 arrowsEnabled  flags = testBit flags arrowsBit
1206
1207 -- convenient record-based bitmap for the interface to the rest of the world
1208 --
1209 -- NB: `glasgowExtsEF' implies `ffiEF' (see `mkPState' below)
1210 --
1211 data ExtFlags = ExtFlags {
1212                   glasgowExtsEF :: Bool,
1213                   ffiEF         :: Bool,
1214                   withEF        :: Bool,
1215                   parrEF        :: Bool,
1216                   arrowsEF      :: Bool
1217                 }
1218
1219 -- create a parse state
1220 --
1221 mkPState :: StringBuffer -> SrcLoc -> ExtFlags -> PState
1222 mkPState buf loc exts  =
1223   PState {
1224       buffer     = buf,
1225       last_loc   = loc,
1226       last_len   = 0,
1227       loc        = loc,
1228       extsBitmap = fromIntegral bitmap,
1229       context    = [],
1230       lex_state  = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1231         -- we begin in the layout state if toplev_layout is set
1232     }
1233     where
1234       bitmap =     glaExtsBit `setBitIf` glasgowExtsEF     exts
1235                .|. ffiBit     `setBitIf` (ffiEF            exts
1236                                           || glasgowExtsEF exts)
1237                .|. withBit    `setBitIf` withEF            exts
1238                .|. parrBit    `setBitIf` parrEF            exts
1239                .|. arrowsBit  `setBitIf` arrowsEF          exts
1240       --
1241       setBitIf :: Int -> Bool -> Int
1242       b `setBitIf` cond | cond      = bit b
1243                         | otherwise = 0
1244
1245 getContext :: P [LayoutContext]
1246 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1247
1248 setContext :: [LayoutContext] -> P ()
1249 setContext ctx = P $ \s -> POk s{context=ctx} ()
1250
1251 popContext :: P ()
1252 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1253                            loc = loc, last_len = len, last_loc = last_loc }) ->
1254   case ctx of
1255         (_:tl) -> POk s{ context = tl } ()
1256         []     -> PFailed last_loc loc (srcParseErr buf len)
1257
1258 -- Push a new layout context at the indentation of the last token read.
1259 -- This is only used at the outer level of a module when the 'module'
1260 -- keyword is missing.
1261 pushCurrentContext :: P ()
1262 pushCurrentContext = P $ \ s@PState{ last_loc=loc, context=ctx } ->
1263   POk s{ context = Layout (srcLocCol loc) : ctx} ()
1264
1265 getOffside :: SrcLoc -> P Ordering
1266 getOffside loc = P $ \s@PState{context=stk} ->
1267                 let ord = case stk of
1268                         (Layout n:_) -> compare (srcLocCol loc) n
1269                         _            -> GT
1270                 in POk s ord
1271
1272 -- ---------------------------------------------------------------------------
1273 -- Construct a parse error
1274
1275 srcParseErr
1276   :: StringBuffer       -- current buffer (placed just after the last token)
1277   -> Int                -- length of the previous token
1278   -> Message
1279 srcParseErr buf len
1280   = hcat [ if null token
1281              then ptext SLIT("parse error (possibly incorrect indentation)")
1282              else hcat [ptext SLIT("parse error on input "),
1283                         char '`', text token, char '\'']
1284     ]
1285   where token = lexemeToString (stepOnBy (-len) buf) len
1286
1287 -- Report a parse failure, giving the span of the previous token as
1288 -- the location of the error.  This is the entry point for errors
1289 -- detected during parsing.
1290 srcParseFail :: P a
1291 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1292                                 last_loc = last_loc, loc = loc } ->
1293     PFailed last_loc loc (srcParseErr buf len)
1294
1295 -- A lexical error is reported at a particular position in the source file,
1296 -- not over a token range.  TODO: this is slightly wrong, because we record
1297 -- the error at the character position following the one which caused the
1298 -- error.  We should somehow back up by one character.
1299 lexError :: String -> P a
1300 lexError str = do
1301   loc <- getSrcLoc
1302   failLocMsgP loc loc str
1303
1304 -- -----------------------------------------------------------------------------
1305 -- This is the top-level function: called from the parser each time a
1306 -- new token is to be read from the input.
1307
1308 lexer :: (Token -> P a) -> P a
1309 lexer cont = do
1310   tok@(T _ _ tok__) <- lexToken
1311   --trace ("token: " ++ show tok__) $ do
1312   cont tok
1313
1314 lexToken :: P Token
1315 lexToken = do
1316   inp@(loc1,buf) <- getInput
1317   sc <- getLexState
1318   exts <- getExts
1319   case alexScanUser exts inp sc of
1320     AlexEOF -> do setLastToken loc1 0
1321                   return (T loc1 loc1 ITeof)
1322     AlexError (loc2,_) -> do failLocMsgP loc1 loc2 "lexical error"
1323     AlexSkip inp2 _ -> do
1324         setInput inp2
1325         lexToken
1326     AlexToken inp2@(end,buf2) len t -> do
1327         setInput inp2
1328         setLastToken loc1 len
1329         t loc1 end buf len
1330 }