ghc/compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2003
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), Token__(..), lexer, mkPState, showPFailed,
  26    P(..), ParseResult(..), setSrcLocFor, getSrcLoc,
  27    failMsgP, failLocMsgP, srcParseFail,
  28    popContext, pushCurrentContext,
  29   ) where
  30
  31 #include "HsVersions.h"
  32
  33 import ForeignCall      ( Safety(..) )
  34 import ErrUtils         ( Message )
  35 import Outputable
  36 import StringBuffer
  37 import FastString
  38 import FastTypes
  39 import SrcLoc
  40 import UniqFM
  41 import CmdLineOpts
  42 import Ctype
  43 import Util             ( maybePrefixMatch )
  44
  45 import DATA_BITS
  46 import Char
  47 import Ratio
  48 import TRACE
  49 }
  50
  51 $whitechar   = [\ \t\n\r\f\v]
  52 $white_no_nl = $whitechar # \n
  53
  54 $ascdigit  = 0-9
  55 $unidigit  = \x01
  56 $digit     = [$ascdigit $unidigit]
  57
  58 $special   = [\(\)\,\;\[\]\`\{\}]
  59 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  60 $unisymbol = \x02
  61 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  62
  63 $unilarge  = \x03
  64 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  65 $large     = [$asclarge $unilarge]
  66
  67 $unismall  = \x04
  68 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  69 $small     = [$ascsmall $unismall \_]
  70
  71 $graphic   = [$small $large $symbol $digit $special \:\"\']
  72
  73 $octit     = 0-7
  74 $hexit     = [$digit A-F a-f]
  75 $symchar   = [$symbol \:]
  76 $nl        = [\n\r]
  77 $idchar    = [$small $large $digit \']
  78
  79 @varid     = $small $idchar*
  80 @conid     = $large $idchar*
  81
  82 @varsym    = $symbol $symchar*
  83 @consym    = \: $symchar*
  84
  85 @decimal     = $digit+
  86 @octal       = $octit+
  87 @hexadecimal = $hexit+
  88 @exponent    = [eE] [\-\+]? @decimal
  89
  90 -- we support the hierarchical module name extension:
  91 @qual = (@conid \.)+
  92
  93 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
  94
  95 haskell :-
  96
  97 -- everywhere: skip whitespace and comments
  98 $white_no_nl+                           ;
  99
 100 -- Everywhere: deal with nested comments.  We explicitly rule out
 101 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 102 -- (this can happen even though pragmas will normally take precedence due to
 103 -- longest-match, because pragmas aren't valid in every state, but comments
 104 -- are).
 105 "{-" / { notFollowedBy '#' }            { nested_comment }
 106
 107 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 108 -- more dashes followed by a symbol should be parsed as a varsym, so we
 109 -- have to exclude those.
 110 -- The regex says: "munch all the characters after the dashes, as long as
 111 -- the first one is not a symbol".
 112 "--"\-* ([^$symbol] .*)?                ;
 113
 114 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 115 -- blank lines) until we find a non-whitespace character, then do layout
 116 -- processing.
 117 --
 118 -- One slight wibble here: what if the line begins with {-#? In
 119 -- theory, we have to lex the pragma to see if it's one we recognise,
 120 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 121 -- as a nested comment.  We don't bother with this: if the line begins
 122 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 123 <bol> {
 124   \n                                    ;
 125   ^\# (line)?                           { begin line_prag1 }
 126   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 127   ()                                    { do_bol }
 128 }
 129
 130 -- after a layout keyword (let, where, do, of), we begin a new layout
 131 -- context if the curly brace is missing.
 132 -- Careful! This stuff is quite delicate.
 133 <layout, layout_do> {
 134   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 135         -- we might encounter {-# here, but {- has been handled already
 136   \n                                    ;
 137   ^\# (line)?                           { begin line_prag1 }
 138 }
 139
 140 -- do is treated in a subtly different way, see new_layout_context
 141 <layout>    ()                          { new_layout_context True }
 142 <layout_do> ()                          { new_layout_context False }
 143
 144 -- after a new layout context which was found to be to the left of the
 145 -- previous context, we have generated a '{' token, and we now need to
 146 -- generate a matching '}' token.
 147 <layout_left>  ()                       { do_layout_left }
 148
 149 <0,glaexts> \n                          { begin bol }
 150
 151 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 152
 153 -- single-line line pragmas, of the form
 154 --    # <line> "<file>" <extra-stuff> \n
 155 <line_prag1> $digit+                    { set_line line_prag1a }
 156 <line_prag1a> \" [$graphic \ ]* \"      { set_file line_prag1b }
 157 <line_prag1b> .*                        { pop }
 158
 159 -- Haskell-style line pragmas, of the form
 160 --    {-# LINE <line> "<file>" #-}
 161 <line_prag2> $digit+                    { set_line line_prag2a }
 162 <line_prag2a> \" [$graphic \ ]* \"      { set_file line_prag2b }
 163 <line_prag2b> "#-}"                     { pop }
 164
 165 <0,glaexts> {
 166   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 167                                         { token ITspecialise_prag }
 168   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 169   "{-#" $whitechar* (INLINE|inline)     { token ITinline_prag }
 170   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 171                                         { token ITnoinline_prag }
 172   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 173   "{-#" $whitechar* (DEPRECATED|deprecated)
 174                                         { token ITdeprecated_prag }
 175   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 176   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 177
 178   "{-#"                                 { nested_comment }
 179
 180   -- ToDo: should only be valid inside a pragma:
 181   "#-}"                                 { token ITclose_prag}
 182 }
 183
 184
 185 -- '0' state: ordinary lexemes
 186 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 187
 188 -- "special" symbols
 189
 190 <0,glaexts> {
 191   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 192   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 193 }
 194
 195 <0,glaexts> {
 196   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 197   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 198   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 199   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 200   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 201   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 202   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 203   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 204 }
 205
 206 <0,glaexts> {
 207   "(|" / { ifExtension arrowsEnabled }  { special IToparenbar }
 208   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 209 }
 210
 211 <0,glaexts> {
 212   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 213   \% @varid / { ifExtension ipEnabled } { skip_one_varid ITsplitipvarid }
 214 }
 215
 216 <glaexts> {
 217   "(#"                                  { token IToubxparen }
 218   "#)"                                  { token ITcubxparen }
 219   "{|"                                  { token ITocurlybar }
 220   "|}"                                  { token ITccurlybar }
 221 }
 222
 223 <0,glaexts> {
 224   \(                                    { special IToparen }
 225   \)                                    { special ITcparen }
 226   \[                                    { special ITobrack }
 227   \]                                    { special ITcbrack }
 228   \,                                    { special ITcomma }
 229   \;                                    { special ITsemi }
 230   \`                                    { special ITbackquote }
 231
 232   \{                                    { open_brace }
 233   \}                                    { close_brace }
 234 }
 235
 236 <0,glaexts> {
 237   @qual @varid                  { check_qvarid }
 238   @qual @conid                  { idtoken qconid }
 239   @varid                        { varid }
 240   @conid                        { idtoken conid }
 241 }
 242
 243 -- after an illegal qvarid, such as 'M.let',
 244 -- we back up and try again in the bad_qvarid state:
 245 <bad_qvarid> {
 246   @conid                        { pop_and (idtoken conid) }
 247   @qual @conid                  { pop_and (idtoken qconid) }
 248 }
 249
 250 <glaexts> {
 251   @qual @varid "#"+             { idtoken qvarid }
 252   @qual @conid "#"+             { idtoken qconid }
 253   @varid "#"+                   { varid }
 254   @conid "#"+                   { idtoken conid }
 255 }
 256
 257 -- ToDo: M.(,,,)
 258
 259 <0,glaexts> {
 260   @qual @varsym                 { idtoken qvarsym }
 261   @qual @consym                 { idtoken qconsym }
 262   @varsym                       { varsym }
 263   @consym                       { consym }
 264 }
 265
 266 <0,glaexts> {
 267   @decimal                      { tok_decimal }
 268   0[oO] @octal                  { tok_octal }
 269   0[xX] @hexadecimal            { tok_hexadecimal }
 270 }
 271
 272 <glaexts> {
 273   @decimal \#                   { prim_decimal }
 274   0[oO] @octal \#               { prim_octal }
 275   0[xX] @hexadecimal \#         { prim_hexadecimal }
 276 }
 277
 278 <0,glaexts> @floating_point             { strtoken tok_float }
 279 <glaexts>   @floating_point \#          { init_strtoken 1 prim_float }
 280 <glaexts>   @floating_point \# \#       { init_strtoken 2 prim_double }
 281
 282 -- Strings and chars are lexed by hand-written code.  The reason is
 283 -- that even if we recognise the string or char here in the regex
 284 -- lexer, we would still have to parse the string afterward in order
 285 -- to convert it to a String.
 286 <0,glaexts> {
 287   \'                            { lex_char_tok }
 288   \"                            { lex_string_tok }
 289 }
 290
 291 {
 292 -- work around bug in Alex 2.0
 293 #if __GLASGOW_HASKELL__ < 503
 294 unsafeAt arr i = arr ! i
 295 #endif
 296
 297 -- -----------------------------------------------------------------------------
 298 -- The token type
 299
 300 data Token = T SrcLoc{-start-} SrcLoc{-end-} Token__
 301
 302 data Token__
 303   = ITas                        -- Haskell keywords
 304   | ITcase
 305   | ITclass
 306   | ITdata
 307   | ITdefault
 308   | ITderiving
 309   | ITdo
 310   | ITelse
 311   | IThiding
 312   | ITif
 313   | ITimport
 314   | ITin
 315   | ITinfix
 316   | ITinfixl
 317   | ITinfixr
 318   | ITinstance
 319   | ITlet
 320   | ITmodule
 321   | ITnewtype
 322   | ITof
 323   | ITqualified
 324   | ITthen
 325   | ITtype
 326   | ITwhere
 327   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 328
 329   | ITforall                    -- GHC extension keywords
 330   | ITforeign
 331   | ITexport
 332   | ITlabel
 333   | ITdynamic
 334   | ITsafe
 335   | ITthreadsafe
 336   | ITunsafe
 337   | ITstdcallconv
 338   | ITccallconv
 339   | ITdotnet
 340   | ITmdo
 341
 342   | ITspecialise_prag           -- Pragmas
 343   | ITsource_prag
 344   | ITinline_prag
 345   | ITnoinline_prag
 346   | ITrules_prag
 347   | ITdeprecated_prag
 348   | ITline_prag
 349   | ITscc_prag
 350   | ITcore_prag                 -- hdaume: core annotations
 351   | ITclose_prag
 352
 353   | ITdotdot                    -- reserved symbols
 354   | ITcolon
 355   | ITdcolon
 356   | ITequal
 357   | ITlam
 358   | ITvbar
 359   | ITlarrow
 360   | ITrarrow
 361   | ITat
 362   | ITtilde
 363   | ITdarrow
 364   | ITminus
 365   | ITbang
 366   | ITstar
 367   | ITdot
 368
 369   | ITbiglam                    -- GHC-extension symbols
 370
 371   | ITocurly                    -- special symbols
 372   | ITccurly
 373   | ITocurlybar                 -- {|, for type applications
 374   | ITccurlybar                 -- |}, for type applications
 375   | ITvocurly
 376   | ITvccurly
 377   | ITobrack
 378   | ITopabrack                  -- [:, for parallel arrays with -fparr
 379   | ITcpabrack                  -- :], for parallel arrays with -fparr
 380   | ITcbrack
 381   | IToparen
 382   | ITcparen
 383   | IToubxparen
 384   | ITcubxparen
 385   | ITsemi
 386   | ITcomma
 387   | ITunderscore
 388   | ITbackquote
 389
 390   | ITvarid   FastString        -- identifiers
 391   | ITconid   FastString
 392   | ITvarsym  FastString
 393   | ITconsym  FastString
 394   | ITqvarid  (FastString,FastString)
 395   | ITqconid  (FastString,FastString)
 396   | ITqvarsym (FastString,FastString)
 397   | ITqconsym (FastString,FastString)
 398
 399   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 400   | ITsplitipvarid FastString   -- GHC extension: implicit param: %x
 401
 402   | ITpragma StringBuffer
 403
 404   | ITchar       Char
 405   | ITstring     FastString
 406   | ITinteger    Integer
 407   | ITrational   Rational
 408
 409   | ITprimchar   Char
 410   | ITprimstring FastString
 411   | ITprimint    Integer
 412   | ITprimfloat  Rational
 413   | ITprimdouble Rational
 414
 415   -- MetaHaskell extension tokens
 416   | ITopenExpQuote              -- [| or [e|
 417   | ITopenPatQuote              -- [p|
 418   | ITopenDecQuote              -- [d|
 419   | ITopenTypQuote              -- [t|
 420   | ITcloseQuote                -- |]
 421   | ITidEscape   FastString     -- $x
 422   | ITparenEscape               -- $(
 423   | ITreifyType
 424   | ITreifyDecl
 425   | ITreifyFixity
 426
 427   -- Arrow notation extension
 428   | ITproc
 429   | ITrec
 430   | IToparenbar                 -- (|
 431   | ITcparenbar                 -- |)
 432   | ITlarrowtail                -- -<
 433   | ITrarrowtail                -- >-
 434   | ITLarrowtail                -- -<<
 435   | ITRarrowtail                -- >>-
 436
 437   | ITunknown String            -- Used when the lexer can't make sense of it
 438   | ITeof                       -- end of file token
 439 #ifdef DEBUG
 440   deriving Show -- debugging
 441 #endif
 442
 443 isSpecial :: Token__ -> Bool
 444 -- If we see M.x, where x is a keyword, but
 445 -- is special, we treat is as just plain M.x,
 446 -- not as a keyword.
 447 isSpecial ITas          = True
 448 isSpecial IThiding      = True
 449 isSpecial ITqualified   = True
 450 isSpecial ITforall      = True
 451 isSpecial ITexport      = True
 452 isSpecial ITlabel       = True
 453 isSpecial ITdynamic     = True
 454 isSpecial ITsafe        = True
 455 isSpecial ITthreadsafe  = True
 456 isSpecial ITunsafe      = True
 457 isSpecial ITccallconv   = True
 458 isSpecial ITstdcallconv = True
 459 isSpecial ITmdo         = True
 460 isSpecial _             = False
 461
 462 -- the bitmap provided as the third component indicates whether the
 463 -- corresponding extension keyword is valid under the extension options
 464 -- provided to the compiler; if the extension corresponding to *any* of the
 465 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 466 -- facilitates using a keyword in two different extensions that can be
 467 -- activated independently)
 468 --
 469 reservedWordsFM = listToUFM $
 470         map (\(x, y, z) -> (mkFastString x, (y, z)))
 471        [( "_",          ITunderscore,   0 ),
 472         ( "as",         ITas,           0 ),
 473         ( "case",       ITcase,         0 ),
 474         ( "class",      ITclass,        0 ),
 475         ( "data",       ITdata,         0 ),
 476         ( "default",    ITdefault,      0 ),
 477         ( "deriving",   ITderiving,     0 ),
 478         ( "do",         ITdo,           0 ),
 479         ( "else",       ITelse,         0 ),
 480         ( "hiding",     IThiding,       0 ),
 481         ( "if",         ITif,           0 ),
 482         ( "import",     ITimport,       0 ),
 483         ( "in",         ITin,           0 ),
 484         ( "infix",      ITinfix,        0 ),
 485         ( "infixl",     ITinfixl,       0 ),
 486         ( "infixr",     ITinfixr,       0 ),
 487         ( "instance",   ITinstance,     0 ),
 488         ( "let",        ITlet,          0 ),
 489         ( "module",     ITmodule,       0 ),
 490         ( "newtype",    ITnewtype,      0 ),
 491         ( "of",         ITof,           0 ),
 492         ( "qualified",  ITqualified,    0 ),
 493         ( "then",       ITthen,         0 ),
 494         ( "type",       ITtype,         0 ),
 495         ( "where",      ITwhere,        0 ),
 496         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 497
 498         ( "forall",     ITforall,        bit glaExtsBit),
 499         ( "mdo",        ITmdo,           bit glaExtsBit),
 500         ( "reifyDecl",  ITreifyDecl,     bit thBit),
 501         ( "reifyType",  ITreifyType,     bit thBit),
 502         ( "reifyFixity",ITreifyFixity,   bit thBit),
 503
 504         ( "foreign",    ITforeign,       bit ffiBit),
 505         ( "export",     ITexport,        bit ffiBit),
 506         ( "label",      ITlabel,         bit ffiBit),
 507         ( "dynamic",    ITdynamic,       bit ffiBit),
 508         ( "safe",       ITsafe,          bit ffiBit),
 509         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 510         ( "unsafe",     ITunsafe,        bit ffiBit),
 511         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 512         ( "ccall",      ITccallconv,     bit ffiBit),
 513         ( "dotnet",     ITdotnet,        bit ffiBit),
 514
 515         ( "rec",        ITrec,           bit arrowsBit),
 516         ( "proc",       ITproc,          bit arrowsBit)
 517      ]
 518
 519 reservedSymsFM = listToUFM $
 520         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 521       [ ("..",  ITdotdot,       0)
 522        ,(":",   ITcolon,        0)      -- (:) is a reserved op,
 523                                                 -- meaning only list cons
 524        ,("::",  ITdcolon,       0)
 525        ,("=",   ITequal,        0)
 526        ,("\\",  ITlam,          0)
 527        ,("|",   ITvbar,         0)
 528        ,("<-",  ITlarrow,       0)
 529        ,("->",  ITrarrow,       0)
 530        ,("@",   ITat,           0)
 531        ,("~",   ITtilde,        0)
 532        ,("=>",  ITdarrow,       0)
 533        ,("-",   ITminus,        0)
 534        ,("!",   ITbang,         0)
 535
 536        ,("*",   ITstar,         bit glaExtsBit) -- For data T (a::*) = MkT
 537        ,(".",   ITdot,          bit glaExtsBit) -- For 'forall a . t'
 538
 539        ,("-<",  ITlarrowtail,   bit arrowsBit)
 540        ,(">-",  ITrarrowtail,   bit arrowsBit)
 541        ,("-<<", ITLarrowtail,   bit arrowsBit)
 542        ,(">>-", ITRarrowtail,   bit arrowsBit)
 543        ]
 544
 545 -- -----------------------------------------------------------------------------
 546 -- Lexer actions
 547
 548 type Action = SrcLoc -> SrcLoc -> StringBuffer -> Int -> P Token
 549
 550 special :: Token__ -> Action
 551 special tok loc end _buf len = return (T loc end tok)
 552
 553 token, layout_token :: Token__ -> Action
 554 token t loc end buf len = return (T loc end t)
 555 layout_token t loc end buf len = pushLexState layout >> return (T loc end t)
 556
 557 idtoken :: (StringBuffer -> Int -> Token__) -> Action
 558 idtoken f loc end buf len = return (T loc end $! (f buf len))
 559
 560 skip_one_varid :: (FastString -> Token__) -> Action
 561 skip_one_varid f loc end buf len
 562   = return (T loc end $! f (lexemeToFastString (stepOn buf) (len-1)))
 563
 564 strtoken :: (String -> Token__) -> Action
 565 strtoken f loc end buf len =
 566   return (T loc end $! (f $! lexemeToString buf len))
 567
 568 init_strtoken :: Int -> (String -> Token__) -> Action
 569 -- like strtoken, but drops the last N character(s)
 570 init_strtoken drop f loc end buf len =
 571   return (T loc end $! (f $! lexemeToString buf (len-drop)))
 572
 573 begin :: Int -> Action
 574 begin code _loc _end _str _len = do pushLexState code; lexToken
 575
 576 pop :: Action
 577 pop _loc _end _buf _len = do popLexState; lexToken
 578
 579 pop_and :: Action -> Action
 580 pop_and act loc end buf len = do popLexState; act loc end buf len
 581
 582 notFollowedBy char _ _ _ (_,buf) = atEnd buf || currentChar buf /= char
 583
 584 ifExtension pred bits _ _ _ = pred bits
 585
 586 {-
 587   nested comments require traversing by hand, they can't be parsed
 588   using regular expressions.
 589 -}
 590 nested_comment :: Action
 591 nested_comment loc _end _str _len = do
 592   input <- getInput
 593   go 1 input
 594   where go 0 input = do setInput input; lexToken
 595         go n input = do
 596           case alexGetChar input of
 597             Nothing  -> err input
 598             Just (c,input) -> do
 599               case c of
 600                 '-' -> do
 601                   case alexGetChar input of
 602                     Nothing  -> err input
 603                     Just ('\125',input) -> go (n-1) input
 604                     Just (c,_)          -> go n input
 605                 '\123' -> do
 606                   case alexGetChar input of
 607                     Nothing  -> err input
 608                     Just ('-',input') -> go (n+1) input'
 609                     Just (c,input)    -> go n input
 610                 c -> go n input
 611
 612         err input = do failLocMsgP loc (fst input) "unterminated `{-'"
 613
 614 open_brace, close_brace :: Action
 615 open_brace  loc end _str _len = do
 616   ctx <- getContext
 617   setContext (NoLayout:ctx)
 618   return (T loc end ITocurly)
 619 close_brace loc end _str _len = do
 620   popContext
 621   return (T loc end ITccurly)
 622
 623 -- We have to be careful not to count M.<varid> as a qualified name
 624 -- when <varid> is a keyword.  We hack around this by catching
 625 -- the offending tokens afterward, and re-lexing in a different state.
 626 check_qvarid loc end buf len = do
 627   case lookupUFM reservedWordsFM var of
 628         Just (keyword,exts)
 629           | not (isSpecial keyword) ->
 630           if exts == 0
 631              then try_again
 632              else do
 633                 b <- extension (\i -> exts .&. i /= 0)
 634                 if b then try_again
 635                      else return token
 636         _other -> return token
 637   where
 638         (mod,var) = splitQualName buf len
 639         token     = T loc end (ITqvarid (mod,var))
 640
 641         try_again = do
 642                 setInput (loc,buf)
 643                 pushLexState bad_qvarid
 644                 lexToken
 645
 646 qvarid buf len = ITqvarid $! splitQualName buf len
 647 qconid buf len = ITqconid $! splitQualName buf len
 648
 649 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 650 -- takes a StringBuffer and a length, and returns the module name
 651 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 652 -- because of hierarchical module names.
 653 splitQualName orig_buf len = split orig_buf 0 0
 654   where
 655     split buf dot_off n
 656         | n == len                = done dot_off
 657         | lookAhead buf n == '.'  = split2 buf n (n+1)
 658         | otherwise               = split buf dot_off (n+1)
 659
 660     -- careful, we might get names like M....
 661     -- so, if the character after the dot is not upper-case, this is
 662     -- the end of the qualifier part.
 663     split2 buf dot_off n
 664         | isUpper (lookAhead buf n) = split buf dot_off (n+1)
 665         | otherwise                 = done dot_off
 666
 667     done dot_off =
 668         (lexemeToFastString orig_buf dot_off,
 669          lexemeToFastString (stepOnBy (dot_off+1) orig_buf) (len - dot_off -1))
 670
 671 varid loc end buf len =
 672   case lookupUFM reservedWordsFM fs of
 673         Just (keyword,0)    -> do
 674                 maybe_layout keyword
 675                 return (T loc end keyword)
 676         Just (keyword,exts) -> do
 677                 b <- extension (\i -> exts .&. i /= 0)
 678                 if b then do maybe_layout keyword
 679                              return (T loc end keyword)
 680                      else return (T loc end (ITvarid fs))
 681         _other -> return (T loc end (ITvarid fs))
 682   where
 683         fs = lexemeToFastString buf len
 684
 685 conid buf len = ITconid fs
 686   where fs = lexemeToFastString buf len
 687
 688 qvarsym buf len = ITqvarsym $! splitQualName buf len
 689 qconsym buf len = ITqconsym $! splitQualName buf len
 690
 691 varsym = sym ITvarsym
 692 consym = sym ITconsym
 693
 694 sym con loc end buf len =
 695   case lookupUFM reservedSymsFM fs of
 696         Just (keyword,0)    -> return (T loc end keyword)
 697         Just (keyword,exts) -> do
 698                 b <- extension (\i -> exts .&. i /= 0)
 699                 if b then return (T loc end keyword)
 700                      else return (T loc end $! con fs)
 701         _other -> return (T loc end $! con fs)
 702   where
 703         fs = lexemeToFastString buf len
 704
 705 tok_decimal loc end buf len
 706   = return (T loc end (ITinteger  $! parseInteger buf len 10 oct_or_dec))
 707
 708 tok_octal loc end buf len
 709   = return (T loc end (ITinteger  $! parseInteger (stepOnBy 2 buf) (len-2) 8 oct_or_dec))
 710
 711 tok_hexadecimal loc end buf len
 712   = return (T loc end (ITinteger  $! parseInteger (stepOnBy 2 buf) (len-2) 16 hex))
 713
 714 prim_decimal loc end buf len
 715   = return (T loc end (ITprimint  $! parseInteger buf (len-1) 10 oct_or_dec))
 716
 717 prim_octal loc end buf len
 718   = return (T loc end (ITprimint  $! parseInteger (stepOnBy 2 buf) (len-3) 8 oct_or_dec))
 719
 720 prim_hexadecimal loc end buf len
 721   = return (T loc end (ITprimint  $! parseInteger (stepOnBy 2 buf) (len-3) 16 hex))
 722
 723 tok_float        str = ITrational $! readRational__ str
 724 prim_float       str = ITprimfloat  $! readRational__ str
 725 prim_double      str = ITprimdouble $! readRational__ str
 726
 727 parseInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
 728 parseInteger buf len radix to_int
 729   = go 0 0
 730   where go i x | i == len  = x
 731                | otherwise = go (i+1) (x * radix + toInteger (to_int (lookAhead buf i)))
 732
 733 -- -----------------------------------------------------------------------------
 734 -- Layout processing
 735
 736 -- we're at the first token on a line, insert layout tokens if necessary
 737 do_bol :: Action
 738 do_bol loc end _str _len = do
 739         pos <- getOffside end
 740         case pos of
 741             LT -> do
 742                 --trace "layout: inserting '}'" $ do
 743                 popContext
 744                 -- do NOT pop the lex state, we might have a ';' to insert
 745                 return (T loc end ITvccurly)
 746             EQ -> do
 747                 --trace "layout: inserting ';'" $ do
 748                 popLexState
 749                 return (T loc end ITsemi)
 750             GT -> do
 751                 popLexState
 752                 lexToken
 753
 754 -- certain keywords put us in the "layout" state, where we might
 755 -- add an opening curly brace.
 756 maybe_layout ITdo       = pushLexState layout_do
 757 maybe_layout ITmdo      = pushLexState layout_do
 758 maybe_layout ITof       = pushLexState layout
 759 maybe_layout ITlet      = pushLexState layout
 760 maybe_layout ITwhere    = pushLexState layout
 761 maybe_layout ITrec      = pushLexState layout
 762 maybe_layout _          = return ()
 763
 764 -- Pushing a new implicit layout context.  If the indentation of the
 765 -- next token is not greater than the previous layout context, then
 766 -- Haskell 98 says that the new layout context should be empty; that is
 767 -- the lexer must generate {}.
 768 --
 769 -- We are slightly more lenient than this: when the new context is started
 770 -- by a 'do', then we allow the new context to be at the same indentation as
 771 -- the previous context.  This is what the 'strict' argument is for.
 772 --
 773 new_layout_context strict loc end _buf _len = do
 774     popLexState
 775     let offset = srcLocCol loc
 776     ctx <- getContext
 777     case ctx of
 778         Layout prev_off : _  |
 779            (strict     && prev_off >= offset  ||
 780             not strict && prev_off > offset) -> do
 781                 -- token is indented to the left of the previous context.
 782                 -- we must generate a {} sequence now.
 783                 pushLexState layout_left
 784                 return (T loc end ITvocurly)
 785         other -> do
 786                 setContext (Layout offset : ctx)
 787                 return (T loc end ITvocurly)
 788
 789 do_layout_left loc end _buf _len = do
 790     popLexState
 791     pushLexState bol  -- we must be at the start of a line
 792     return (T loc end ITvccurly)
 793
 794 -- -----------------------------------------------------------------------------
 795 -- LINE pragmas
 796
 797 set_line :: Int -> Action
 798 set_line code loc end buf len = do
 799   let line = parseInteger buf len 10 oct_or_dec
 800   setSrcLoc (mkSrcLoc (srcLocFile end) (fromIntegral line - 1) 0)
 801         -- subtract one: the line number refers to the *following* line
 802   popLexState
 803   pushLexState code
 804   lexToken
 805
 806 set_file :: Int -> Action
 807 set_file code loc end buf len = do
 808   let file = lexemeToFastString (stepOn buf) (len-2)
 809   setSrcLoc (mkSrcLoc file (srcLocLine end) (srcLocCol end))
 810   popLexState
 811   pushLexState code
 812   lexToken
 813
 814 -- -----------------------------------------------------------------------------
 815 -- Strings & Chars
 816
 817 -- This stuff is horrible.  I hates it.
 818
 819 lex_string_tok :: Action
 820 lex_string_tok loc end buf len = do
 821   tok <- lex_string ""
 822   end <- getSrcLoc
 823   return (T loc end tok)
 824
 825 lex_string :: String -> P Token__
 826 lex_string s = do
 827   i <- getInput
 828   case alexGetChar i of
 829     Nothing -> lit_error
 830
 831     Just ('"',i)  -> do
 832         setInput i
 833         glaexts <- extension glaExtsEnabled
 834         if glaexts
 835           then do
 836             i <- getInput
 837             case alexGetChar i of
 838               Just ('#',i) -> do
 839                    setInput i
 840                    if any (> '\xFF') s
 841                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
 842                     else let s' = mkFastStringNarrow (reverse s) in
 843                          -- always a narrow string/byte array
 844                          return (ITprimstring s')
 845               _other ->
 846                 return (ITstring (mkFastString (reverse s)))
 847           else
 848                 return (ITstring (mkFastString (reverse s)))
 849
 850     Just ('\\',i)
 851         | Just ('&',i) <- next -> do
 852                 setInput i; lex_string s
 853         | Just (c,i) <- next, is_space c -> do
 854                 setInput i; lex_stringgap s
 855         where next = alexGetChar i
 856
 857     Just _ -> do
 858         c <- lex_char
 859         lex_string (c:s)
 860
 861
 862 lex_stringgap s = do
 863   c <- getCharOrFail
 864   case c of
 865     '\\' -> lex_string s
 866     c | is_space c -> lex_stringgap s
 867     _other -> lit_error
 868
 869
 870 lex_char_tok :: Action
 871 lex_char_tok loc _end buf len = do
 872    c <- lex_char
 873    mc <- getCharOrFail
 874    case mc of
 875         '\'' -> do
 876            glaexts <- extension glaExtsEnabled
 877            if glaexts
 878                 then do
 879                    i@(end,_) <- getInput
 880                    case alexGetChar i of
 881                         Just ('#',i@(end,_)) -> do
 882                                 setInput i
 883                                 return (T loc end (ITprimchar c))
 884                         _other ->
 885                                 return (T loc end (ITchar c))
 886                 else do
 887                    end <- getSrcLoc
 888                    return (T loc end (ITchar c))
 889
 890         _other -> lit_error
 891
 892 lex_char :: P Char
 893 lex_char = do
 894   mc <- getCharOrFail
 895   case mc of
 896       '\\' -> lex_escape
 897       c | is_any c -> return c
 898       _other -> lit_error
 899
 900 lex_escape :: P Char
 901 lex_escape = do
 902   c <- getCharOrFail
 903   case c of
 904         'a'   -> return '\a'
 905         'b'   -> return '\b'
 906         'f'   -> return '\f'
 907         'n'   -> return '\n'
 908         'r'   -> return '\r'
 909         't'   -> return '\t'
 910         'v'   -> return '\v'
 911         '\\'  -> return '\\'
 912         '"'   -> return '\"'
 913         '\''  -> return '\''
 914         '^'   -> do c <- getCharOrFail
 915                     if c >= '@' && c <= '_'
 916                         then return (chr (ord c - ord '@'))
 917                         else lit_error
 918
 919         'x'   -> readNum is_hexdigit 16 hex
 920         'o'   -> readNum is_octdigit  8 oct_or_dec
 921         x | is_digit x -> readNum2 is_digit 10 oct_or_dec (oct_or_dec x)
 922
 923         c1 ->  do
 924            i <- getInput
 925            case alexGetChar i of
 926             Nothing -> lit_error
 927             Just (c2,i2) ->
 928               case alexGetChar i2 of
 929                 Nothing -> lit_error
 930                 Just (c3,i3) ->
 931                    let str = [c1,c2,c3] in
 932                    case [ (c,rest) | (p,c) <- silly_escape_chars,
 933                                      Just rest <- [maybePrefixMatch p str] ] of
 934                           (escape_char,[]):_ -> do
 935                                 setInput i3
 936                                 return escape_char
 937                           (escape_char,_:_):_ -> do
 938                                 setInput i2
 939                                 return escape_char
 940                           [] -> lit_error
 941
 942 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
 943 readNum is_digit base conv = do
 944   c <- getCharOrFail
 945   if is_digit c
 946         then readNum2 is_digit base conv (conv c)
 947         else lit_error
 948
 949 readNum2 is_digit base conv i = do
 950   input <- getInput
 951   read i input
 952   where read i input = do
 953           case alexGetChar input of
 954             Just (c,input') | is_digit c -> do
 955                 read (i*base + conv c) input'
 956             _other -> do
 957                 setInput input
 958                 if i >= 0 && i <= 0x10FFFF
 959                    then return (chr i)
 960                    else lit_error
 961
 962 is_hexdigit c
 963         =  is_digit c
 964         || (c >= 'a' && c <= 'f')
 965         || (c >= 'A' && c <= 'F')
 966
 967 hex c | is_digit c = ord c - ord '0'
 968       | otherwise  = ord (to_lower c) - ord 'a' + 10
 969
 970 oct_or_dec c = ord c - ord '0'
 971
 972 is_octdigit c = c >= '0' && c <= '7'
 973
 974 to_lower c
 975   | c >=  'A' && c <= 'Z' = chr (ord c - (ord 'A' - ord 'a'))
 976   | otherwise = c
 977
 978 silly_escape_chars = [
 979         ("NUL", '\NUL'),
 980         ("SOH", '\SOH'),
 981         ("STX", '\STX'),
 982         ("ETX", '\ETX'),
 983         ("EOT", '\EOT'),
 984         ("ENQ", '\ENQ'),
 985         ("ACK", '\ACK'),
 986         ("BEL", '\BEL'),
 987         ("BS", '\BS'),
 988         ("HT", '\HT'),
 989         ("LF", '\LF'),
 990         ("VT", '\VT'),
 991         ("FF", '\FF'),
 992         ("CR", '\CR'),
 993         ("SO", '\SO'),
 994         ("SI", '\SI'),
 995         ("DLE", '\DLE'),
 996         ("DC1", '\DC1'),
 997         ("DC2", '\DC2'),
 998         ("DC3", '\DC3'),
 999         ("DC4", '\DC4'),
1000         ("NAK", '\NAK'),
1001         ("SYN", '\SYN'),
1002         ("ETB", '\ETB'),
1003         ("CAN", '\CAN'),
1004         ("EM", '\EM'),
1005         ("SUB", '\SUB'),
1006         ("ESC", '\ESC'),
1007         ("FS", '\FS'),
1008         ("GS", '\GS'),
1009         ("RS", '\RS'),
1010         ("US", '\US'),
1011         ("SP", '\SP'),
1012         ("DEL", '\DEL')
1013         ]
1014
1015 lit_error = lexError "lexical error in string/character literal"
1016
1017 getCharOrFail :: P Char
1018 getCharOrFail =  do
1019   i <- getInput
1020   case alexGetChar i of
1021         Nothing -> lexError "unexpected end-of-file in string/character literal"
1022         Just (c,i)  -> do setInput i; return c
1023
1024 -- -----------------------------------------------------------------------------
1025 -- Floats
1026
1027 readRational :: ReadS Rational -- NB: doesn't handle leading "-"
1028 readRational r = do
1029      (n,d,s) <- readFix r
1030      (k,t)   <- readExp s
1031      return ((n%1)*10^^(k-d), t)
1032  where
1033      readFix r = do
1034         (ds,s)  <- lexDecDigits r
1035         (ds',t) <- lexDotDigits s
1036         return (read (ds++ds'), length ds', t)
1037
1038      readExp (e:s) | e `elem` "eE" = readExp' s
1039      readExp s                     = return (0,s)
1040
1041      readExp' ('+':s) = readDec s
1042      readExp' ('-':s) = do
1043                         (k,t) <- readDec s
1044                         return (-k,t)
1045      readExp' s       = readDec s
1046
1047      readDec s = do
1048         (ds,r) <- nonnull isDigit s
1049         return (foldl1 (\n d -> n * 10 + d) [ ord d - ord '0' | d <- ds ],
1050                 r)
1051
1052      lexDecDigits = nonnull isDigit
1053
1054      lexDotDigits ('.':s) = return (span isDigit s)
1055      lexDotDigits s       = return ("",s)
1056
1057      nonnull p s = do (cs@(_:_),t) <- return (span p s)
1058                       return (cs,t)
1059
1060 readRational__ :: String -> Rational -- NB: *does* handle a leading "-"
1061 readRational__ top_s
1062   = case top_s of
1063       '-' : xs -> - (read_me xs)
1064       xs       -> read_me xs
1065   where
1066     read_me s
1067       = case (do { (x,"") <- readRational s ; return x }) of
1068           [x] -> x
1069           []  -> error ("readRational__: no parse:"        ++ top_s)
1070           _   -> error ("readRational__: ambiguous parse:" ++ top_s)
1071
1072 -- -----------------------------------------------------------------------------
1073 -- The Parse Monad
1074
1075 data LayoutContext
1076   = NoLayout
1077   | Layout !Int
1078
1079 data ParseResult a
1080   = POk PState a
1081   | PFailed
1082         SrcLoc SrcLoc   -- The start and end of the text span related to
1083                         -- the error.  Might be used in environments which can
1084                         -- show this span, e.g. by highlighting it.
1085         Message         -- The error message
1086
1087 showPFailed loc1 loc2 err = hcat [ppr loc1, text ": ", err]
1088
1089 data PState = PState {
1090         buffer     :: StringBuffer,
1091         last_loc   :: SrcLoc,           -- pos of previous token
1092         last_len   :: !Int,             -- len of previous token
1093         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1094         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1095         context    :: [LayoutContext],
1096         lex_state  :: [Int]
1097      }
1098         -- last_loc and last_len are used when generating error messages,
1099         -- and in pushCurrentContext only.
1100
1101 newtype P a = P { unP :: PState -> ParseResult a }
1102
1103 instance Monad P where
1104   return = returnP
1105   (>>=) = thenP
1106   fail = failP
1107
1108 returnP :: a -> P a
1109 returnP a = P $ \s -> POk s a
1110
1111 thenP :: P a -> (a -> P b) -> P b
1112 (P m) `thenP` k = P $ \ s ->
1113         case m s of
1114                 POk s1 a          -> (unP (k a)) s1
1115                 PFailed l1 l2 err -> PFailed l1 l2 err
1116
1117 failP :: String -> P a
1118 failP msg = P $ \s -> PFailed (last_loc s) (loc s) (text msg)
1119
1120 failMsgP :: String -> P a
1121 failMsgP msg = P $ \s -> PFailed (last_loc s) (loc s) (text msg)
1122
1123 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1124 failLocMsgP loc1 loc2 str = P $ \s -> PFailed loc1 loc2 (text str)
1125
1126 extension :: (Int -> Bool) -> P Bool
1127 extension p = P $ \s -> POk s (p $! extsBitmap s)
1128
1129 getExts :: P Int
1130 getExts = P $ \s -> POk s (extsBitmap s)
1131
1132 setSrcLoc :: SrcLoc -> P ()
1133 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1134
1135 -- tmp, for supporting stuff in RdrHsSyn.  The scope better not include
1136 -- any calls to the lexer, because it assumes things about the SrcLoc.
1137 setSrcLocFor :: SrcLoc -> P a -> P a
1138 setSrcLocFor new_loc scope = P $ \s@PState{ loc = old_loc } ->
1139   case unP scope s{loc=new_loc} of
1140         PFailed l1 l2 msg -> PFailed l1 l2 msg
1141         POk _ r -> POk s r
1142
1143 getSrcLoc :: P SrcLoc
1144 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1145
1146 setLastToken :: SrcLoc -> Int -> P ()
1147 setLastToken loc len = P $ \s -> POk s{ last_loc=loc, last_len=len } ()
1148
1149 type AlexInput = (SrcLoc,StringBuffer)
1150
1151 alexInputPrevChar :: AlexInput -> Char
1152 alexInputPrevChar (_,s) = prevChar s '\n'
1153
1154 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1155 alexGetChar (loc,s)
1156   | atEnd s   = Nothing
1157   | otherwise = c `seq` loc' `seq` s' `seq` Just (c, (loc', s'))
1158   where c = currentChar s
1159         loc' = advanceSrcLoc loc c
1160         s'   = stepOn s
1161
1162 getInput :: P AlexInput
1163 getInput = P $ \s@PState{ loc=l, buffer=b } -> POk s (l,b)
1164
1165 setInput :: AlexInput -> P ()
1166 setInput (l,b) = P $ \s -> POk s{ loc=l, buffer=b } ()
1167
1168 pushLexState :: Int -> P ()
1169 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1170
1171 popLexState :: P Int
1172 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1173
1174 getLexState :: P Int
1175 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1176
1177 -- for reasons of efficiency, flags indicating language extensions (eg,
1178 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1179 -- integer
1180
1181 glaExtsBit, ffiBit, parrBit :: Int
1182 glaExtsBit = 0
1183 ffiBit     = 1
1184 parrBit    = 2
1185 arrowsBit  = 4
1186 thBit      = 5
1187 ipBit      = 6
1188
1189 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1190 glaExtsEnabled flags = testBit flags glaExtsBit
1191 ffiEnabled     flags = testBit flags ffiBit
1192 parrEnabled    flags = testBit flags parrBit
1193 arrowsEnabled  flags = testBit flags arrowsBit
1194 thEnabled      flags = testBit flags thBit
1195 ipEnabled      flags = testBit flags ipBit
1196
1197 -- create a parse state
1198 --
1199 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1200 mkPState buf loc flags  =
1201   PState {
1202       buffer     = buf,
1203       last_loc   = loc,
1204       last_len   = 0,
1205       loc        = loc,
1206       extsBitmap = fromIntegral bitmap,
1207       context    = [],
1208       lex_state  = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1209         -- we begin in the layout state if toplev_layout is set
1210     }
1211     where
1212       bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
1213                .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
1214                .|. parrBit    `setBitIf` dopt Opt_PArr        flags
1215                .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
1216                .|. thBit      `setBitIf` dopt Opt_TH          flags
1217                .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
1218       --
1219       setBitIf :: Int -> Bool -> Int
1220       b `setBitIf` cond | cond      = bit b
1221                         | otherwise = 0
1222
1223 getContext :: P [LayoutContext]
1224 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1225
1226 setContext :: [LayoutContext] -> P ()
1227 setContext ctx = P $ \s -> POk s{context=ctx} ()
1228
1229 popContext :: P ()
1230 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1231                            loc = loc, last_len = len, last_loc = last_loc }) ->
1232   case ctx of
1233         (_:tl) -> POk s{ context = tl } ()
1234         []     -> PFailed last_loc loc (srcParseErr buf len)
1235
1236 -- Push a new layout context at the indentation of the last token read.
1237 -- This is only used at the outer level of a module when the 'module'
1238 -- keyword is missing.
1239 pushCurrentContext :: P ()
1240 pushCurrentContext = P $ \ s@PState{ last_loc=loc, context=ctx } ->
1241   POk s{ context = Layout (srcLocCol loc) : ctx} ()
1242
1243 getOffside :: SrcLoc -> P Ordering
1244 getOffside loc = P $ \s@PState{context=stk} ->
1245                 let ord = case stk of
1246                         (Layout n:_) -> compare (srcLocCol loc) n
1247                         _            -> GT
1248                 in POk s ord
1249
1250 -- ---------------------------------------------------------------------------
1251 -- Construct a parse error
1252
1253 srcParseErr
1254   :: StringBuffer       -- current buffer (placed just after the last token)
1255   -> Int                -- length of the previous token
1256   -> Message
1257 srcParseErr buf len
1258   = hcat [ if null token
1259              then ptext SLIT("parse error (possibly incorrect indentation)")
1260              else hcat [ptext SLIT("parse error on input "),
1261                         char '`', text token, char '\'']
1262     ]
1263   where token = lexemeToString (stepOnBy (-len) buf) len
1264
1265 -- Report a parse failure, giving the span of the previous token as
1266 -- the location of the error.  This is the entry point for errors
1267 -- detected during parsing.
1268 srcParseFail :: P a
1269 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1270                                 last_loc = last_loc, loc = loc } ->
1271     PFailed last_loc loc (srcParseErr buf len)
1272
1273 -- A lexical error is reported at a particular position in the source file,
1274 -- not over a token range.  TODO: this is slightly wrong, because we record
1275 -- the error at the character position following the one which caused the
1276 -- error.  We should somehow back up by one character.
1277 lexError :: String -> P a
1278 lexError str = do
1279   loc <- getSrcLoc
1280   failLocMsgP loc loc str
1281
1282 -- -----------------------------------------------------------------------------
1283 -- This is the top-level function: called from the parser each time a
1284 -- new token is to be read from the input.
1285
1286 lexer :: (Token -> P a) -> P a
1287 lexer cont = do
1288   tok@(T _ _ tok__) <- lexToken
1289   --trace ("token: " ++ show tok__) $ do
1290   cont tok
1291
1292 lexToken :: P Token
1293 lexToken = do
1294   inp@(loc1,buf) <- getInput
1295   sc <- getLexState
1296   exts <- getExts
1297   case alexScanUser exts inp sc of
1298     AlexEOF -> do setLastToken loc1 0
1299                   return (T loc1 loc1 ITeof)
1300     AlexError (loc2,_) -> do failLocMsgP loc1 loc2 "lexical error"
1301     AlexSkip inp2 _ -> do
1302         setInput inp2
1303         lexToken
1304     AlexToken inp2@(end,buf2) len t -> do
1305         setInput inp2
1306         setLastToken loc1 len
1307         t loc1 end buf len
1308 }