ghc/compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2003
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), lexer, mkPState, PState(..),
  26    P(..), ParseResult(..), getSrcLoc,
  27    failLocMsgP, failSpanMsgP, srcParseFail,
  28    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  29    getLexState, popLexState, pushLexState
  30   ) where
  31
  32 #include "HsVersions.h"
  33
  34 import ErrUtils         ( Message )
  35 import Outputable
  36 import StringBuffer
  37 import FastString
  38 import FastTypes
  39 import SrcLoc
  40 import UniqFM
  41 import DynFlags
  42 import Ctype
  43 import Util             ( maybePrefixMatch, readRational )
  44
  45 import DATA_BITS
  46 import Char
  47 import Ratio
  48 --import TRACE
  49 }
  50
  51 $whitechar   = [\ \t\n\r\f\v\xa0]
  52 $white_no_nl = $whitechar # \n
  53
  54 $ascdigit  = 0-9
  55 $unidigit  = \x01
  56 $digit     = [$ascdigit $unidigit]
  57
  58 $special   = [\(\)\,\;\[\]\`\{\}]
  59 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  60 $unisymbol = \x02
  61 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  62
  63 $unilarge  = \x03
  64 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  65 $large     = [$asclarge $unilarge]
  66
  67 $unismall  = \x04
  68 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  69 $small     = [$ascsmall $unismall \_]
  70
  71 $graphic   = [$small $large $symbol $digit $special \:\"\']
  72
  73 $octit     = 0-7
  74 $hexit     = [$digit A-F a-f]
  75 $symchar   = [$symbol \:]
  76 $nl        = [\n\r]
  77 $idchar    = [$small $large $digit \']
  78
  79 @varid     = $small $idchar*
  80 @conid     = $large $idchar*
  81
  82 @varsym    = $symbol $symchar*
  83 @consym    = \: $symchar*
  84
  85 @decimal     = $digit+
  86 @octal       = $octit+
  87 @hexadecimal = $hexit+
  88 @exponent    = [eE] [\-\+]? @decimal
  89
  90 -- we support the hierarchical module name extension:
  91 @qual = (@conid \.)+
  92
  93 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
  94
  95 haskell :-
  96
  97 -- everywhere: skip whitespace and comments
  98 $white_no_nl+                           ;
  99
 100 -- Everywhere: deal with nested comments.  We explicitly rule out
 101 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 102 -- (this can happen even though pragmas will normally take precedence due to
 103 -- longest-match, because pragmas aren't valid in every state, but comments
 104 -- are).
 105 "{-" / { notFollowedBy '#' }            { nested_comment }
 106
 107 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 108 -- more dashes followed by a symbol should be parsed as a varsym, so we
 109 -- have to exclude those.
 110 -- The regex says: "munch all the characters after the dashes, as long as
 111 -- the first one is not a symbol".
 112 "--"\-* [^$symbol] .*                   ;
 113 "--"\-* / { atEOL }                     ;
 114
 115 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 116 -- blank lines) until we find a non-whitespace character, then do layout
 117 -- processing.
 118 --
 119 -- One slight wibble here: what if the line begins with {-#? In
 120 -- theory, we have to lex the pragma to see if it's one we recognise,
 121 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 122 -- as a nested comment.  We don't bother with this: if the line begins
 123 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 124 <bol> {
 125   \n                                    ;
 126   ^\# (line)?                           { begin line_prag1 }
 127   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 128   ^\# \! .* \n                          ; -- #!, for scripts
 129   ()                                    { do_bol }
 130 }
 131
 132 -- after a layout keyword (let, where, do, of), we begin a new layout
 133 -- context if the curly brace is missing.
 134 -- Careful! This stuff is quite delicate.
 135 <layout, layout_do> {
 136   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 137         -- we might encounter {-# here, but {- has been handled already
 138   \n                                    ;
 139   ^\# (line)?                           { begin line_prag1 }
 140 }
 141
 142 -- do is treated in a subtly different way, see new_layout_context
 143 <layout>    ()                          { new_layout_context True }
 144 <layout_do> ()                          { new_layout_context False }
 145
 146 -- after a new layout context which was found to be to the left of the
 147 -- previous context, we have generated a '{' token, and we now need to
 148 -- generate a matching '}' token.
 149 <layout_left>  ()                       { do_layout_left }
 150
 151 <0,glaexts> \n                          { begin bol }
 152
 153 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 154
 155 -- single-line line pragmas, of the form
 156 --    # <line> "<file>" <extra-stuff> \n
 157 <line_prag1> $digit+                    { setLine line_prag1a }
 158 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 159 <line_prag1b> .*                        { pop }
 160
 161 -- Haskell-style line pragmas, of the form
 162 --    {-# LINE <line> "<file>" #-}
 163 <line_prag2> $digit+                    { setLine line_prag2a }
 164 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 165 <line_prag2b> "#-}"|"-}"                { pop }
 166    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 167    -- with older versions of GHC which generated these.
 168
 169 -- We only want RULES pragmas to be picked up when -fglasgow-exts
 170 -- is on, because the contents of the pragma is always written using
 171 -- glasgow-exts syntax (using forall etc.), so if glasgow exts are not
 172 -- enabled, we're sure to get a parse error.
 173 -- (ToDo: we should really emit a warning when ignoring pragmas)
 174 <glaexts>
 175   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 176
 177 <0,glaexts> {
 178   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 179                                         { token ITspecialise_prag }
 180   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 181   "{-#" $whitechar* (INLINE|inline)     { token ITinline_prag }
 182   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 183                                         { token ITnoinline_prag }
 184   "{-#" $whitechar* (DEPRECATED|deprecated)
 185                                         { token ITdeprecated_prag }
 186   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 187   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 188   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 189
 190   "{-#"                                 { nested_comment }
 191
 192   -- ToDo: should only be valid inside a pragma:
 193   "#-}"                                 { token ITclose_prag}
 194 }
 195
 196
 197 -- '0' state: ordinary lexemes
 198 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 199
 200 -- "special" symbols
 201
 202 <0,glaexts> {
 203   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 204   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 205 }
 206
 207 <0,glaexts> {
 208   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 209   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 210   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 211   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 212   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 213   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 214   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 215   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 216 }
 217
 218 <0,glaexts> {
 219   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 220                                         { special IToparenbar }
 221   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 222 }
 223
 224 <0,glaexts> {
 225   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 226   \% @varid / { ifExtension ipEnabled } { skip_one_varid ITsplitipvarid }
 227 }
 228
 229 <glaexts> {
 230   "(#" / { notFollowedBySymbol }        { token IToubxparen }
 231   "#)"                                  { token ITcubxparen }
 232   "{|"                                  { token ITocurlybar }
 233   "|}"                                  { token ITccurlybar }
 234 }
 235
 236 <0,glaexts> {
 237   \(                                    { special IToparen }
 238   \)                                    { special ITcparen }
 239   \[                                    { special ITobrack }
 240   \]                                    { special ITcbrack }
 241   \,                                    { special ITcomma }
 242   \;                                    { special ITsemi }
 243   \`                                    { special ITbackquote }
 244
 245   \{                                    { open_brace }
 246   \}                                    { close_brace }
 247 }
 248
 249 <0,glaexts> {
 250   @qual @varid                  { check_qvarid }
 251   @qual @conid                  { idtoken qconid }
 252   @varid                        { varid }
 253   @conid                        { idtoken conid }
 254 }
 255
 256 -- after an illegal qvarid, such as 'M.let',
 257 -- we back up and try again in the bad_qvarid state:
 258 <bad_qvarid> {
 259   @conid                        { pop_and (idtoken conid) }
 260   @qual @conid                  { pop_and (idtoken qconid) }
 261 }
 262
 263 <glaexts> {
 264   @qual @varid "#"+             { idtoken qvarid }
 265   @qual @conid "#"+             { idtoken qconid }
 266   @varid "#"+                   { varid }
 267   @conid "#"+                   { idtoken conid }
 268 }
 269
 270 -- ToDo: M.(,,,)
 271
 272 <0,glaexts> {
 273   @qual @varsym                 { idtoken qvarsym }
 274   @qual @consym                 { idtoken qconsym }
 275   @varsym                       { varsym }
 276   @consym                       { consym }
 277 }
 278
 279 <0,glaexts> {
 280   @decimal                      { tok_decimal }
 281   0[oO] @octal                  { tok_octal }
 282   0[xX] @hexadecimal            { tok_hexadecimal }
 283 }
 284
 285 <glaexts> {
 286   @decimal \#                   { prim_decimal }
 287   0[oO] @octal \#               { prim_octal }
 288   0[xX] @hexadecimal \#         { prim_hexadecimal }
 289 }
 290
 291 <0,glaexts> @floating_point             { strtoken tok_float }
 292 <glaexts>   @floating_point \#          { init_strtoken 1 prim_float }
 293 <glaexts>   @floating_point \# \#       { init_strtoken 2 prim_double }
 294
 295 -- Strings and chars are lexed by hand-written code.  The reason is
 296 -- that even if we recognise the string or char here in the regex
 297 -- lexer, we would still have to parse the string afterward in order
 298 -- to convert it to a String.
 299 <0,glaexts> {
 300   \'                            { lex_char_tok }
 301   \"                            { lex_string_tok }
 302 }
 303
 304 {
 305 -- work around bug in Alex 2.0
 306 #if __GLASGOW_HASKELL__ < 503
 307 unsafeAt arr i = arr ! i
 308 #endif
 309
 310 -- -----------------------------------------------------------------------------
 311 -- The token type
 312
 313 data Token
 314   = ITas                        -- Haskell keywords
 315   | ITcase
 316   | ITclass
 317   | ITdata
 318   | ITdefault
 319   | ITderiving
 320   | ITdo
 321   | ITelse
 322   | IThiding
 323   | ITif
 324   | ITimport
 325   | ITin
 326   | ITinfix
 327   | ITinfixl
 328   | ITinfixr
 329   | ITinstance
 330   | ITlet
 331   | ITmodule
 332   | ITnewtype
 333   | ITof
 334   | ITqualified
 335   | ITthen
 336   | ITtype
 337   | ITwhere
 338   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 339
 340   | ITforall                    -- GHC extension keywords
 341   | ITforeign
 342   | ITexport
 343   | ITlabel
 344   | ITdynamic
 345   | ITsafe
 346   | ITthreadsafe
 347   | ITunsafe
 348   | ITstdcallconv
 349   | ITccallconv
 350   | ITdotnet
 351   | ITmdo
 352
 353   | ITspecialise_prag           -- Pragmas
 354   | ITsource_prag
 355   | ITinline_prag
 356   | ITnoinline_prag
 357   | ITrules_prag
 358   | ITdeprecated_prag
 359   | ITline_prag
 360   | ITscc_prag
 361   | ITcore_prag                 -- hdaume: core annotations
 362   | ITunpack_prag
 363   | ITclose_prag
 364
 365   | ITdotdot                    -- reserved symbols
 366   | ITcolon
 367   | ITdcolon
 368   | ITequal
 369   | ITlam
 370   | ITvbar
 371   | ITlarrow
 372   | ITrarrow
 373   | ITat
 374   | ITtilde
 375   | ITdarrow
 376   | ITminus
 377   | ITbang
 378   | ITstar
 379   | ITdot
 380
 381   | ITbiglam                    -- GHC-extension symbols
 382
 383   | ITocurly                    -- special symbols
 384   | ITccurly
 385   | ITocurlybar                 -- {|, for type applications
 386   | ITccurlybar                 -- |}, for type applications
 387   | ITvocurly
 388   | ITvccurly
 389   | ITobrack
 390   | ITopabrack                  -- [:, for parallel arrays with -fparr
 391   | ITcpabrack                  -- :], for parallel arrays with -fparr
 392   | ITcbrack
 393   | IToparen
 394   | ITcparen
 395   | IToubxparen
 396   | ITcubxparen
 397   | ITsemi
 398   | ITcomma
 399   | ITunderscore
 400   | ITbackquote
 401
 402   | ITvarid   FastString        -- identifiers
 403   | ITconid   FastString
 404   | ITvarsym  FastString
 405   | ITconsym  FastString
 406   | ITqvarid  (FastString,FastString)
 407   | ITqconid  (FastString,FastString)
 408   | ITqvarsym (FastString,FastString)
 409   | ITqconsym (FastString,FastString)
 410
 411   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 412   | ITsplitipvarid FastString   -- GHC extension: implicit param: %x
 413
 414   | ITpragma StringBuffer
 415
 416   | ITchar       Char
 417   | ITstring     FastString
 418   | ITinteger    Integer
 419   | ITrational   Rational
 420
 421   | ITprimchar   Char
 422   | ITprimstring FastString
 423   | ITprimint    Integer
 424   | ITprimfloat  Rational
 425   | ITprimdouble Rational
 426
 427   -- MetaHaskell extension tokens
 428   | ITopenExpQuote              --  [| or [e|
 429   | ITopenPatQuote              --  [p|
 430   | ITopenDecQuote              --  [d|
 431   | ITopenTypQuote              --  [t|
 432   | ITcloseQuote                --  |]
 433   | ITidEscape   FastString     --  $x
 434   | ITparenEscape               --  $(
 435   | ITvarQuote                  --  '
 436   | ITtyQuote                   --  ''
 437
 438   -- Arrow notation extension
 439   | ITproc
 440   | ITrec
 441   | IToparenbar                 --  (|
 442   | ITcparenbar                 --  |)
 443   | ITlarrowtail                --  -<
 444   | ITrarrowtail                --  >-
 445   | ITLarrowtail                --  -<<
 446   | ITRarrowtail                --  >>-
 447
 448   | ITunknown String            -- Used when the lexer can't make sense of it
 449   | ITeof                       -- end of file token
 450 #ifdef DEBUG
 451   deriving Show -- debugging
 452 #endif
 453
 454 isSpecial :: Token -> Bool
 455 -- If we see M.x, where x is a keyword, but
 456 -- is special, we treat is as just plain M.x,
 457 -- not as a keyword.
 458 isSpecial ITas          = True
 459 isSpecial IThiding      = True
 460 isSpecial ITqualified   = True
 461 isSpecial ITforall      = True
 462 isSpecial ITexport      = True
 463 isSpecial ITlabel       = True
 464 isSpecial ITdynamic     = True
 465 isSpecial ITsafe        = True
 466 isSpecial ITthreadsafe  = True
 467 isSpecial ITunsafe      = True
 468 isSpecial ITccallconv   = True
 469 isSpecial ITstdcallconv = True
 470 isSpecial ITmdo         = True
 471 isSpecial _             = False
 472
 473 -- the bitmap provided as the third component indicates whether the
 474 -- corresponding extension keyword is valid under the extension options
 475 -- provided to the compiler; if the extension corresponding to *any* of the
 476 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 477 -- facilitates using a keyword in two different extensions that can be
 478 -- activated independently)
 479 --
 480 reservedWordsFM = listToUFM $
 481         map (\(x, y, z) -> (mkFastString x, (y, z)))
 482        [( "_",          ITunderscore,   0 ),
 483         ( "as",         ITas,           0 ),
 484         ( "case",       ITcase,         0 ),
 485         ( "class",      ITclass,        0 ),
 486         ( "data",       ITdata,         0 ),
 487         ( "default",    ITdefault,      0 ),
 488         ( "deriving",   ITderiving,     0 ),
 489         ( "do",         ITdo,           0 ),
 490         ( "else",       ITelse,         0 ),
 491         ( "hiding",     IThiding,       0 ),
 492         ( "if",         ITif,           0 ),
 493         ( "import",     ITimport,       0 ),
 494         ( "in",         ITin,           0 ),
 495         ( "infix",      ITinfix,        0 ),
 496         ( "infixl",     ITinfixl,       0 ),
 497         ( "infixr",     ITinfixr,       0 ),
 498         ( "instance",   ITinstance,     0 ),
 499         ( "let",        ITlet,          0 ),
 500         ( "module",     ITmodule,       0 ),
 501         ( "newtype",    ITnewtype,      0 ),
 502         ( "of",         ITof,           0 ),
 503         ( "qualified",  ITqualified,    0 ),
 504         ( "then",       ITthen,         0 ),
 505         ( "type",       ITtype,         0 ),
 506         ( "where",      ITwhere,        0 ),
 507         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 508
 509         ( "forall",     ITforall,        bit tvBit),
 510         ( "mdo",        ITmdo,           bit glaExtsBit),
 511
 512         ( "foreign",    ITforeign,       bit ffiBit),
 513         ( "export",     ITexport,        bit ffiBit),
 514         ( "label",      ITlabel,         bit ffiBit),
 515         ( "dynamic",    ITdynamic,       bit ffiBit),
 516         ( "safe",       ITsafe,          bit ffiBit),
 517         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 518         ( "unsafe",     ITunsafe,        bit ffiBit),
 519         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 520         ( "ccall",      ITccallconv,     bit ffiBit),
 521         ( "dotnet",     ITdotnet,        bit ffiBit),
 522
 523         ( "rec",        ITrec,           bit arrowsBit),
 524         ( "proc",       ITproc,          bit arrowsBit)
 525      ]
 526
 527 reservedSymsFM = listToUFM $
 528         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 529       [ ("..",  ITdotdot,       0)
 530        ,(":",   ITcolon,        0)      -- (:) is a reserved op,
 531                                                 -- meaning only list cons
 532        ,("::",  ITdcolon,       0)
 533        ,("=",   ITequal,        0)
 534        ,("\\",  ITlam,          0)
 535        ,("|",   ITvbar,         0)
 536        ,("<-",  ITlarrow,       0)
 537        ,("->",  ITrarrow,       0)
 538        ,("@",   ITat,           0)
 539        ,("~",   ITtilde,        0)
 540        ,("=>",  ITdarrow,       0)
 541        ,("-",   ITminus,        0)
 542        ,("!",   ITbang,         0)
 543
 544        ,("*",   ITstar,         bit glaExtsBit) -- For data T (a::*) = MkT
 545        ,(".",   ITdot,          bit tvBit)      -- For 'forall a . t'
 546
 547        ,("-<",  ITlarrowtail,   bit arrowsBit)
 548        ,(">-",  ITrarrowtail,   bit arrowsBit)
 549        ,("-<<", ITLarrowtail,   bit arrowsBit)
 550        ,(">>-", ITRarrowtail,   bit arrowsBit)
 551        ]
 552
 553 -- -----------------------------------------------------------------------------
 554 -- Lexer actions
 555
 556 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 557
 558 special :: Token -> Action
 559 special tok span _buf len = return (L span tok)
 560
 561 token, layout_token :: Token -> Action
 562 token t span buf len = return (L span t)
 563 layout_token t span buf len = pushLexState layout >> return (L span t)
 564
 565 idtoken :: (StringBuffer -> Int -> Token) -> Action
 566 idtoken f span buf len = return (L span $! (f buf len))
 567
 568 skip_one_varid :: (FastString -> Token) -> Action
 569 skip_one_varid f span buf len
 570   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 571
 572 strtoken :: (String -> Token) -> Action
 573 strtoken f span buf len =
 574   return (L span $! (f $! lexemeToString buf len))
 575
 576 init_strtoken :: Int -> (String -> Token) -> Action
 577 -- like strtoken, but drops the last N character(s)
 578 init_strtoken drop f span buf len =
 579   return (L span $! (f $! lexemeToString buf (len-drop)))
 580
 581 begin :: Int -> Action
 582 begin code _span _str _len = do pushLexState code; lexToken
 583
 584 pop :: Action
 585 pop _span _buf _len = do popLexState; lexToken
 586
 587 pop_and :: Action -> Action
 588 pop_and act span buf len = do popLexState; act span buf len
 589
 590 notFollowedBy char _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf /= char
 591
 592 notFollowedBySymbol _ _ _ (AI _ _ buf)
 593   = atEnd buf || currentChar buf `notElem` "!#$%&*+./<=>?@\\^|-~"
 594
 595 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 596
 597 ifExtension pred bits _ _ _ = pred bits
 598
 599 {-
 600   nested comments require traversing by hand, they can't be parsed
 601   using regular expressions.
 602 -}
 603 nested_comment :: Action
 604 nested_comment span _str _len = do
 605   input <- getInput
 606   go 1 input
 607   where go 0 input = do setInput input; lexToken
 608         go n input = do
 609           case alexGetChar input of
 610             Nothing  -> err input
 611             Just (c,input) -> do
 612               case c of
 613                 '-' -> do
 614                   case alexGetChar input of
 615                     Nothing  -> err input
 616                     Just ('\125',input) -> go (n-1) input
 617                     Just (c,_)          -> go n input
 618                 '\123' -> do
 619                   case alexGetChar input of
 620                     Nothing  -> err input
 621                     Just ('-',input') -> go (n+1) input'
 622                     Just (c,input)    -> go n input
 623                 c -> go n input
 624
 625         err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 626
 627 open_brace, close_brace :: Action
 628 open_brace span _str _len = do
 629   ctx <- getContext
 630   setContext (NoLayout:ctx)
 631   return (L span ITocurly)
 632 close_brace span _str _len = do
 633   popContext
 634   return (L span ITccurly)
 635
 636 -- We have to be careful not to count M.<varid> as a qualified name
 637 -- when <varid> is a keyword.  We hack around this by catching
 638 -- the offending tokens afterward, and re-lexing in a different state.
 639 check_qvarid span buf len = do
 640   case lookupUFM reservedWordsFM var of
 641         Just (keyword,exts)
 642           | not (isSpecial keyword) ->
 643           if exts == 0
 644              then try_again
 645              else do
 646                 b <- extension (\i -> exts .&. i /= 0)
 647                 if b then try_again
 648                      else return token
 649         _other -> return token
 650   where
 651         (mod,var) = splitQualName buf len
 652         token     = L span (ITqvarid (mod,var))
 653
 654         try_again = do
 655                 (AI _ offs _) <- getInput
 656                 setInput (AI (srcSpanStart span) (offs-len) buf)
 657                 pushLexState bad_qvarid
 658                 lexToken
 659
 660 qvarid buf len = ITqvarid $! splitQualName buf len
 661 qconid buf len = ITqconid $! splitQualName buf len
 662
 663 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 664 -- takes a StringBuffer and a length, and returns the module name
 665 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 666 -- because of hierarchical module names.
 667 splitQualName orig_buf len = split orig_buf 0 0
 668   where
 669     split buf dot_off n
 670         | n == len                = done dot_off
 671         | lookAhead buf n == '.'  = split2 buf n (n+1)
 672         | otherwise               = split buf dot_off (n+1)
 673
 674     -- careful, we might get names like M....
 675     -- so, if the character after the dot is not upper-case, this is
 676     -- the end of the qualifier part.
 677     split2 buf dot_off n
 678         | isUpper (lookAhead buf n) = split buf dot_off (n+1)
 679         | otherwise                 = done dot_off
 680
 681     done dot_off =
 682         (lexemeToFastString orig_buf dot_off,
 683          lexemeToFastString (stepOnBy (dot_off+1) orig_buf) (len - dot_off -1))
 684
 685 varid span buf len =
 686   case lookupUFM reservedWordsFM fs of
 687         Just (keyword,0)    -> do
 688                 maybe_layout keyword
 689                 return (L span keyword)
 690         Just (keyword,exts) -> do
 691                 b <- extension (\i -> exts .&. i /= 0)
 692                 if b then do maybe_layout keyword
 693                              return (L span keyword)
 694                      else return (L span (ITvarid fs))
 695         _other -> return (L span (ITvarid fs))
 696   where
 697         fs = lexemeToFastString buf len
 698
 699 conid buf len = ITconid fs
 700   where fs = lexemeToFastString buf len
 701
 702 qvarsym buf len = ITqvarsym $! splitQualName buf len
 703 qconsym buf len = ITqconsym $! splitQualName buf len
 704
 705 varsym = sym ITvarsym
 706 consym = sym ITconsym
 707
 708 sym con span buf len =
 709   case lookupUFM reservedSymsFM fs of
 710         Just (keyword,0)    -> return (L span keyword)
 711         Just (keyword,exts) -> do
 712                 b <- extension (\i -> exts .&. i /= 0)
 713                 if b then return (L span keyword)
 714                      else return (L span $! con fs)
 715         _other -> return (L span $! con fs)
 716   where
 717         fs = lexemeToFastString buf len
 718
 719 tok_decimal span buf len
 720   = return (L span (ITinteger  $! parseInteger buf len 10 octDecDigit))
 721
 722 tok_octal span buf len
 723   = return (L span (ITinteger  $! parseInteger (stepOnBy 2 buf) (len-2) 8 octDecDigit))
 724
 725 tok_hexadecimal span buf len
 726   = return (L span (ITinteger  $! parseInteger (stepOnBy 2 buf) (len-2) 16 hexDigit))
 727
 728 prim_decimal span buf len
 729   = return (L span (ITprimint  $! parseInteger buf (len-1) 10 octDecDigit))
 730
 731 prim_octal span buf len
 732   = return (L span (ITprimint  $! parseInteger (stepOnBy 2 buf) (len-3) 8 octDecDigit))
 733
 734 prim_hexadecimal span buf len
 735   = return (L span (ITprimint  $! parseInteger (stepOnBy 2 buf) (len-3) 16 hexDigit))
 736
 737 tok_float        str = ITrational   $! readRational str
 738 prim_float       str = ITprimfloat  $! readRational str
 739 prim_double      str = ITprimdouble $! readRational str
 740
 741 -- -----------------------------------------------------------------------------
 742 -- Layout processing
 743
 744 -- we're at the first token on a line, insert layout tokens if necessary
 745 do_bol :: Action
 746 do_bol span _str _len = do
 747         pos <- getOffside
 748         case pos of
 749             LT -> do
 750                 --trace "layout: inserting '}'" $ do
 751                 popContext
 752                 -- do NOT pop the lex state, we might have a ';' to insert
 753                 return (L span ITvccurly)
 754             EQ -> do
 755                 --trace "layout: inserting ';'" $ do
 756                 popLexState
 757                 return (L span ITsemi)
 758             GT -> do
 759                 popLexState
 760                 lexToken
 761
 762 -- certain keywords put us in the "layout" state, where we might
 763 -- add an opening curly brace.
 764 maybe_layout ITdo       = pushLexState layout_do
 765 maybe_layout ITmdo      = pushLexState layout_do
 766 maybe_layout ITof       = pushLexState layout
 767 maybe_layout ITlet      = pushLexState layout
 768 maybe_layout ITwhere    = pushLexState layout
 769 maybe_layout ITrec      = pushLexState layout
 770 maybe_layout _          = return ()
 771
 772 -- Pushing a new implicit layout context.  If the indentation of the
 773 -- next token is not greater than the previous layout context, then
 774 -- Haskell 98 says that the new layout context should be empty; that is
 775 -- the lexer must generate {}.
 776 --
 777 -- We are slightly more lenient than this: when the new context is started
 778 -- by a 'do', then we allow the new context to be at the same indentation as
 779 -- the previous context.  This is what the 'strict' argument is for.
 780 --
 781 new_layout_context strict span _buf _len = do
 782     popLexState
 783     (AI _ offset _) <- getInput
 784     ctx <- getContext
 785     case ctx of
 786         Layout prev_off : _  |
 787            (strict     && prev_off >= offset  ||
 788             not strict && prev_off > offset) -> do
 789                 -- token is indented to the left of the previous context.
 790                 -- we must generate a {} sequence now.
 791                 pushLexState layout_left
 792                 return (L span ITvocurly)
 793         other -> do
 794                 setContext (Layout offset : ctx)
 795                 return (L span ITvocurly)
 796
 797 do_layout_left span _buf _len = do
 798     popLexState
 799     pushLexState bol  -- we must be at the start of a line
 800     return (L span ITvccurly)
 801
 802 -- -----------------------------------------------------------------------------
 803 -- LINE pragmas
 804
 805 setLine :: Int -> Action
 806 setLine code span buf len = do
 807   let line = parseInteger buf len 10 octDecDigit
 808   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
 809         -- subtract one: the line number refers to the *following* line
 810   popLexState
 811   pushLexState code
 812   lexToken
 813
 814 setFile :: Int -> Action
 815 setFile code span buf len = do
 816   let file = lexemeToFastString (stepOn buf) (len-2)
 817   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
 818   popLexState
 819   pushLexState code
 820   lexToken
 821
 822 -- -----------------------------------------------------------------------------
 823 -- Strings & Chars
 824
 825 -- This stuff is horrible.  I hates it.
 826
 827 lex_string_tok :: Action
 828 lex_string_tok span buf len = do
 829   tok <- lex_string ""
 830   end <- getSrcLoc
 831   return (L (mkSrcSpan (srcSpanStart span) end) tok)
 832
 833 lex_string :: String -> P Token
 834 lex_string s = do
 835   i <- getInput
 836   case alexGetChar i of
 837     Nothing -> lit_error
 838
 839     Just ('"',i)  -> do
 840         setInput i
 841         glaexts <- extension glaExtsEnabled
 842         if glaexts
 843           then do
 844             i <- getInput
 845             case alexGetChar i of
 846               Just ('#',i) -> do
 847                    setInput i
 848                    if any (> '\xFF') s
 849                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
 850                     else let s' = mkFastStringNarrow (reverse s) in
 851                          -- always a narrow string/byte array
 852                          return (ITprimstring s')
 853               _other ->
 854                 return (ITstring (mkFastString (reverse s)))
 855           else
 856                 return (ITstring (mkFastString (reverse s)))
 857
 858     Just ('\\',i)
 859         | Just ('&',i) <- next -> do
 860                 setInput i; lex_string s
 861         | Just (c,i) <- next, is_space c -> do
 862                 setInput i; lex_stringgap s
 863         where next = alexGetChar i
 864
 865     Just _ -> do
 866         c <- lex_char
 867         lex_string (c:s)
 868
 869 lex_stringgap s = do
 870   c <- getCharOrFail
 871   case c of
 872     '\\' -> lex_string s
 873     c | is_space c -> lex_stringgap s
 874     _other -> lit_error
 875
 876
 877 lex_char_tok :: Action
 878 -- Here we are basically parsing character literals, such as 'x' or '\n'
 879 -- but, when Template Haskell is on, we additionally spot
 880 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
 881 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
 882 -- So we have to do two characters of lookahead: when we see 'x we need to
 883 -- see if there's a trailing quote
 884 lex_char_tok span buf len = do  -- We've seen '
 885    i1 <- getInput       -- Look ahead to first character
 886    let loc = srcSpanStart span
 887    case alexGetChar i1 of
 888         Nothing -> lit_error
 889
 890         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
 891                   th_exts <- extension thEnabled
 892                   if th_exts then do
 893                         setInput i2
 894                         return (L (mkSrcSpan loc end2)  ITtyQuote)
 895                    else lit_error
 896
 897         Just ('\\', i2@(AI end2 _ _)) -> do     -- We've seen 'backslash
 898                   setInput i2
 899                   lit_ch <- lex_escape
 900                   mc <- getCharOrFail   -- Trailing quote
 901                   if mc == '\'' then finish_char_tok loc lit_ch
 902                                 else lit_error
 903
 904         Just (c, i2@(AI end2 _ _)) | not (is_any c) -> lit_error
 905                               | otherwise      ->
 906
 907                 -- We've seen 'x, where x is a valid character
 908                 --  (i.e. not newline etc) but not a quote or backslash
 909            case alexGetChar i2 of       -- Look ahead one more character
 910                 Nothing -> lit_error
 911                 Just ('\'', i3) -> do   -- We've seen 'x'
 912                         setInput i3
 913                         finish_char_tok loc c
 914                 _other -> do            -- We've seen 'x not followed by quote
 915                                         -- If TH is on, just parse the quote only
 916                         th_exts <- extension thEnabled
 917                         let (AI end _ _) = i1
 918                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
 919                                    else lit_error
 920
 921 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
 922 finish_char_tok loc ch  -- We've already seen the closing quote
 923                         -- Just need to check for trailing #
 924   = do  glaexts <- extension glaExtsEnabled
 925         i@(AI end _ _) <- getInput
 926         if glaexts then do
 927                 case alexGetChar i of
 928                         Just ('#',i@(AI end _ _)) -> do
 929                                 setInput i
 930                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
 931                         _other ->
 932                                 return (L (mkSrcSpan loc end) (ITchar ch))
 933                 else do
 934                    return (L (mkSrcSpan loc end) (ITchar ch))
 935
 936 lex_char :: P Char
 937 lex_char = do
 938   mc <- getCharOrFail
 939   case mc of
 940       '\\' -> lex_escape
 941       c | is_any c -> return c
 942       _other -> lit_error
 943
 944 lex_escape :: P Char
 945 lex_escape = do
 946   c <- getCharOrFail
 947   case c of
 948         'a'   -> return '\a'
 949         'b'   -> return '\b'
 950         'f'   -> return '\f'
 951         'n'   -> return '\n'
 952         'r'   -> return '\r'
 953         't'   -> return '\t'
 954         'v'   -> return '\v'
 955         '\\'  -> return '\\'
 956         '"'   -> return '\"'
 957         '\''  -> return '\''
 958         '^'   -> do c <- getCharOrFail
 959                     if c >= '@' && c <= '_'
 960                         then return (chr (ord c - ord '@'))
 961                         else lit_error
 962
 963         'x'   -> readNum is_hexdigit 16 hexDigit
 964         'o'   -> readNum is_octdigit  8 octDecDigit
 965         x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
 966
 967         c1 ->  do
 968            i <- getInput
 969            case alexGetChar i of
 970             Nothing -> lit_error
 971             Just (c2,i2) ->
 972               case alexGetChar i2 of
 973                 Nothing -> lit_error
 974                 Just (c3,i3) ->
 975                    let str = [c1,c2,c3] in
 976                    case [ (c,rest) | (p,c) <- silly_escape_chars,
 977                                      Just rest <- [maybePrefixMatch p str] ] of
 978                           (escape_char,[]):_ -> do
 979                                 setInput i3
 980                                 return escape_char
 981                           (escape_char,_:_):_ -> do
 982                                 setInput i2
 983                                 return escape_char
 984                           [] -> lit_error
 985
 986 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
 987 readNum is_digit base conv = do
 988   c <- getCharOrFail
 989   if is_digit c
 990         then readNum2 is_digit base conv (conv c)
 991         else lit_error
 992
 993 readNum2 is_digit base conv i = do
 994   input <- getInput
 995   read i input
 996   where read i input = do
 997           case alexGetChar input of
 998             Just (c,input') | is_digit c -> do
 999                 read (i*base + conv c) input'
1000             _other -> do
1001                 setInput input
1002                 if i >= 0 && i <= 0x10FFFF
1003                    then return (chr i)
1004                    else lit_error
1005
1006 silly_escape_chars = [
1007         ("NUL", '\NUL'),
1008         ("SOH", '\SOH'),
1009         ("STX", '\STX'),
1010         ("ETX", '\ETX'),
1011         ("EOT", '\EOT'),
1012         ("ENQ", '\ENQ'),
1013         ("ACK", '\ACK'),
1014         ("BEL", '\BEL'),
1015         ("BS", '\BS'),
1016         ("HT", '\HT'),
1017         ("LF", '\LF'),
1018         ("VT", '\VT'),
1019         ("FF", '\FF'),
1020         ("CR", '\CR'),
1021         ("SO", '\SO'),
1022         ("SI", '\SI'),
1023         ("DLE", '\DLE'),
1024         ("DC1", '\DC1'),
1025         ("DC2", '\DC2'),
1026         ("DC3", '\DC3'),
1027         ("DC4", '\DC4'),
1028         ("NAK", '\NAK'),
1029         ("SYN", '\SYN'),
1030         ("ETB", '\ETB'),
1031         ("CAN", '\CAN'),
1032         ("EM", '\EM'),
1033         ("SUB", '\SUB'),
1034         ("ESC", '\ESC'),
1035         ("FS", '\FS'),
1036         ("GS", '\GS'),
1037         ("RS", '\RS'),
1038         ("US", '\US'),
1039         ("SP", '\SP'),
1040         ("DEL", '\DEL')
1041         ]
1042
1043 lit_error = lexError "lexical error in string/character literal"
1044
1045 getCharOrFail :: P Char
1046 getCharOrFail =  do
1047   i <- getInput
1048   case alexGetChar i of
1049         Nothing -> lexError "unexpected end-of-file in string/character literal"
1050         Just (c,i)  -> do setInput i; return c
1051
1052 -- -----------------------------------------------------------------------------
1053 -- The Parse Monad
1054
1055 data LayoutContext
1056   = NoLayout
1057   | Layout !Int
1058
1059 data ParseResult a
1060   = POk PState a
1061   | PFailed
1062         SrcSpan         -- The start and end of the text span related to
1063                         -- the error.  Might be used in environments which can
1064                         -- show this span, e.g. by highlighting it.
1065         Message         -- The error message
1066
1067 data PState = PState {
1068         buffer     :: StringBuffer,
1069         last_loc   :: SrcSpan,  -- pos of previous token
1070         last_offs  :: !Int,     -- offset of the previous token from the
1071                                 -- beginning of  the current line.
1072                                 -- \t is equal to 8 spaces.
1073         last_len   :: !Int,     -- len of previous token
1074         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1075         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1076         context    :: [LayoutContext],
1077         lex_state  :: [Int]
1078      }
1079         -- last_loc and last_len are used when generating error messages,
1080         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1081         -- current token to happyError, we could at least get rid of last_len.
1082         -- Getting rid of last_loc would require finding another way to
1083         -- implement pushCurrentContext (which is only called from one place).
1084
1085 newtype P a = P { unP :: PState -> ParseResult a }
1086
1087 instance Monad P where
1088   return = returnP
1089   (>>=) = thenP
1090   fail = failP
1091
1092 returnP :: a -> P a
1093 returnP a = P $ \s -> POk s a
1094
1095 thenP :: P a -> (a -> P b) -> P b
1096 (P m) `thenP` k = P $ \ s ->
1097         case m s of
1098                 POk s1 a         -> (unP (k a)) s1
1099                 PFailed span err -> PFailed span err
1100
1101 failP :: String -> P a
1102 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1103
1104 failMsgP :: String -> P a
1105 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1106
1107 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1108 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1109
1110 failSpanMsgP :: SrcSpan -> String -> P a
1111 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1112
1113 extension :: (Int -> Bool) -> P Bool
1114 extension p = P $ \s -> POk s (p $! extsBitmap s)
1115
1116 getExts :: P Int
1117 getExts = P $ \s -> POk s (extsBitmap s)
1118
1119 setSrcLoc :: SrcLoc -> P ()
1120 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1121
1122 getSrcLoc :: P SrcLoc
1123 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1124
1125 setLastToken :: SrcSpan -> Int -> P ()
1126 setLastToken loc len = P $ \s -> POk s{ last_loc=loc, last_len=len } ()
1127
1128 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1129
1130 alexInputPrevChar :: AlexInput -> Char
1131 alexInputPrevChar (AI _ _ s) = prevChar s '\n'
1132
1133 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1134 alexGetChar (AI loc ofs s)
1135   | atEnd s   = Nothing
1136   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq` Just (c, (AI loc' ofs' s'))
1137   where c    = currentChar s
1138         loc' = advanceSrcLoc loc c
1139         ofs' = advanceOffs c ofs
1140         s'   = stepOn s
1141
1142         advanceOffs :: Char -> Int -> Int
1143         advanceOffs '\n' offs = 0
1144         advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1145         advanceOffs _    offs = offs + 1
1146
1147 getInput :: P AlexInput
1148 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1149
1150 setInput :: AlexInput -> P ()
1151 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1152
1153 pushLexState :: Int -> P ()
1154 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1155
1156 popLexState :: P Int
1157 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1158
1159 getLexState :: P Int
1160 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1161
1162 -- for reasons of efficiency, flags indicating language extensions (eg,
1163 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1164 -- integer
1165
1166 glaExtsBit, ffiBit, parrBit :: Int
1167 glaExtsBit = 0
1168 ffiBit     = 1
1169 parrBit    = 2
1170 arrowsBit  = 4
1171 thBit      = 5
1172 ipBit      = 6
1173 tvBit      = 7  -- Scoped type variables enables 'forall' keyword
1174
1175 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1176 glaExtsEnabled flags = testBit flags glaExtsBit
1177 ffiEnabled     flags = testBit flags ffiBit
1178 parrEnabled    flags = testBit flags parrBit
1179 arrowsEnabled  flags = testBit flags arrowsBit
1180 thEnabled      flags = testBit flags thBit
1181 ipEnabled      flags = testBit flags ipBit
1182 tvEnabled      flags = testBit flags tvBit
1183
1184 -- create a parse state
1185 --
1186 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1187 mkPState buf loc flags  =
1188   PState {
1189       buffer     = buf,
1190       last_loc   = mkSrcSpan loc loc,
1191       last_offs  = 0,
1192       last_len   = 0,
1193       loc        = loc,
1194       extsBitmap = fromIntegral bitmap,
1195       context    = [],
1196       lex_state  = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1197         -- we begin in the layout state if toplev_layout is set
1198     }
1199     where
1200       bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
1201                .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
1202                .|. parrBit    `setBitIf` dopt Opt_PArr        flags
1203                .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
1204                .|. thBit      `setBitIf` dopt Opt_TH          flags
1205                .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
1206                .|. tvBit      `setBitIf` dopt Opt_ScopedTypeVariables flags
1207       --
1208       setBitIf :: Int -> Bool -> Int
1209       b `setBitIf` cond | cond      = bit b
1210                         | otherwise = 0
1211
1212 getContext :: P [LayoutContext]
1213 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1214
1215 setContext :: [LayoutContext] -> P ()
1216 setContext ctx = P $ \s -> POk s{context=ctx} ()
1217
1218 popContext :: P ()
1219 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1220                            loc = loc, last_len = len, last_loc = last_loc }) ->
1221   case ctx of
1222         (_:tl) -> POk s{ context = tl } ()
1223         []     -> PFailed last_loc (srcParseErr buf len)
1224
1225 -- Push a new layout context at the indentation of the last token read.
1226 -- This is only used at the outer level of a module when the 'module'
1227 -- keyword is missing.
1228 pushCurrentContext :: P ()
1229 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_len=len, context=ctx } ->
1230   POk s{context = Layout (offs-len) : ctx} ()
1231
1232 getOffside :: P Ordering
1233 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1234                 let ord = case stk of
1235                         (Layout n:_) -> compare offs n
1236                         _            -> GT
1237                 in POk s ord
1238
1239 -- ---------------------------------------------------------------------------
1240 -- Construct a parse error
1241
1242 srcParseErr
1243   :: StringBuffer       -- current buffer (placed just after the last token)
1244   -> Int                -- length of the previous token
1245   -> Message
1246 srcParseErr buf len
1247   = hcat [ if null token
1248              then ptext SLIT("parse error (possibly incorrect indentation)")
1249              else hcat [ptext SLIT("parse error on input "),
1250                         char '`', text token, char '\'']
1251     ]
1252   where token = lexemeToString (stepOnBy (-len) buf) len
1253
1254 -- Report a parse failure, giving the span of the previous token as
1255 -- the location of the error.  This is the entry point for errors
1256 -- detected during parsing.
1257 srcParseFail :: P a
1258 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1259                             last_loc = last_loc } ->
1260     PFailed last_loc (srcParseErr buf len)
1261
1262 -- A lexical error is reported at a particular position in the source file,
1263 -- not over a token range.  TODO: this is slightly wrong, because we record
1264 -- the error at the character position following the one which caused the
1265 -- error.  We should somehow back up by one character.
1266 lexError :: String -> P a
1267 lexError str = do
1268   loc <- getSrcLoc
1269   i@(AI end _ _) <- getInput
1270   failLocMsgP loc end str
1271
1272 -- -----------------------------------------------------------------------------
1273 -- This is the top-level function: called from the parser each time a
1274 -- new token is to be read from the input.
1275
1276 lexer :: (Located Token -> P a) -> P a
1277 lexer cont = do
1278   tok@(L _ tok__) <- lexToken
1279   -- trace ("token: " ++ show tok__) $ do
1280   cont tok
1281
1282 lexToken :: P (Located Token)
1283 lexToken = do
1284   inp@(AI loc1 _ buf) <- getInput
1285   sc <- getLexState
1286   exts <- getExts
1287   case alexScanUser exts inp sc of
1288     AlexEOF -> do let span = mkSrcSpan loc1 loc1
1289                   setLastToken span 0
1290                   return (L span ITeof)
1291     AlexError (AI loc2 _ _) -> do failLocMsgP loc1 loc2 "lexical error"
1292     AlexSkip inp2 _ -> do
1293         setInput inp2
1294         lexToken
1295     AlexToken inp2@(AI end _ buf2) len t -> do
1296         setInput inp2
1297         let span = mkSrcSpan loc1 end
1298         span `seq` setLastToken span len
1299         t span buf len
1300 }