ghc/compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), lexer, mkPState, PState(..),
  26    P(..), ParseResult(..), getSrcLoc,
  27    failLocMsgP, failSpanMsgP, srcParseFail,
  28    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  29    getLexState, popLexState, pushLexState,
  30    extension, bangPatEnabled
  31   ) where
  32
  33 #include "HsVersions.h"
  34
  35 import ErrUtils         ( Message )
  36 import Outputable
  37 import StringBuffer
  38 import FastString
  39 import FastTypes
  40 import SrcLoc
  41 import UniqFM
  42 import DynFlags
  43 import Ctype
  44 import Util             ( maybePrefixMatch, readRational )
  45
  46 import DATA_BITS
  47 import Data.Char
  48 import Ratio
  49 --import TRACE
  50 }
  51
  52 $unispace    = \x05
  53 $whitechar   = [\ \t\n\r\f\v\xa0 $unispace]
  54 $white_no_nl = $whitechar # \n
  55
  56 $ascdigit  = 0-9
  57 $unidigit  = \x03
  58 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  59 $digit     = [$ascdigit $unidigit]
  60
  61 $special   = [\(\)\,\;\[\]\`\{\}]
  62 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  63 $unisymbol = \x04
  64 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  65
  66 $unilarge  = \x01
  67 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  68 $large     = [$asclarge $unilarge]
  69
  70 $unismall  = \x02
  71 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  72 $small     = [$ascsmall $unismall \_]
  73
  74 $unigraphic = \x06
  75 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  76
  77 $octit     = 0-7
  78 $hexit     = [$decdigit A-F a-f]
  79 $symchar   = [$symbol \:]
  80 $nl        = [\n\r]
  81 $idchar    = [$small $large $digit \']
  82
  83 @varid     = $small $idchar*
  84 @conid     = $large $idchar*
  85
  86 @varsym    = $symbol $symchar*
  87 @consym    = \: $symchar*
  88
  89 @decimal     = $decdigit+
  90 @octal       = $octit+
  91 @hexadecimal = $hexit+
  92 @exponent    = [eE] [\-\+]? @decimal
  93
  94 -- we support the hierarchical module name extension:
  95 @qual = (@conid \.)+
  96
  97 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
  98
  99 haskell :-
 100
 101 -- everywhere: skip whitespace and comments
 102 $white_no_nl+                           ;
 103
 104 -- Everywhere: deal with nested comments.  We explicitly rule out
 105 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 106 -- (this can happen even though pragmas will normally take precedence due to
 107 -- longest-match, because pragmas aren't valid in every state, but comments
 108 -- are).
 109 "{-" / { notFollowedBy '#' }            { nested_comment }
 110
 111 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 112 -- more dashes followed by a symbol should be parsed as a varsym, so we
 113 -- have to exclude those.
 114 -- The regex says: "munch all the characters after the dashes, as long as
 115 -- the first one is not a symbol".
 116 "--"\-* [^$symbol :] .*                 ;
 117 "--"\-* / { atEOL }                     ;
 118
 119 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 120 -- blank lines) until we find a non-whitespace character, then do layout
 121 -- processing.
 122 --
 123 -- One slight wibble here: what if the line begins with {-#? In
 124 -- theory, we have to lex the pragma to see if it's one we recognise,
 125 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 126 -- as a nested comment.  We don't bother with this: if the line begins
 127 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 128 <bol> {
 129   \n                                    ;
 130   ^\# (line)?                           { begin line_prag1 }
 131   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 132   ^\# \! .* \n                          ; -- #!, for scripts
 133   ()                                    { do_bol }
 134 }
 135
 136 -- after a layout keyword (let, where, do, of), we begin a new layout
 137 -- context if the curly brace is missing.
 138 -- Careful! This stuff is quite delicate.
 139 <layout, layout_do> {
 140   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 141         -- we might encounter {-# here, but {- has been handled already
 142   \n                                    ;
 143   ^\# (line)?                           { begin line_prag1 }
 144 }
 145
 146 -- do is treated in a subtly different way, see new_layout_context
 147 <layout>    ()                          { new_layout_context True }
 148 <layout_do> ()                          { new_layout_context False }
 149
 150 -- after a new layout context which was found to be to the left of the
 151 -- previous context, we have generated a '{' token, and we now need to
 152 -- generate a matching '}' token.
 153 <layout_left>  ()                       { do_layout_left }
 154
 155 <0,glaexts> \n                          { begin bol }
 156
 157 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 158
 159 -- single-line line pragmas, of the form
 160 --    # <line> "<file>" <extra-stuff> \n
 161 <line_prag1> $decdigit+                 { setLine line_prag1a }
 162 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 163 <line_prag1b> .*                        { pop }
 164
 165 -- Haskell-style line pragmas, of the form
 166 --    {-# LINE <line> "<file>" #-}
 167 <line_prag2> $decdigit+                 { setLine line_prag2a }
 168 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 169 <line_prag2b> "#-}"|"-}"                { pop }
 170    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 171    -- with older versions of GHC which generated these.
 172
 173 -- We only want RULES pragmas to be picked up when -fglasgow-exts
 174 -- is on, because the contents of the pragma is always written using
 175 -- glasgow-exts syntax (using forall etc.), so if glasgow exts are not
 176 -- enabled, we're sure to get a parse error.
 177 -- (ToDo: we should really emit a warning when ignoring pragmas)
 178 <glaexts>
 179   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 180
 181 <0,glaexts> {
 182   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 183   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 184                                         { token (ITinline_prag False) }
 185   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 186                                         { token ITspec_prag }
 187   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 188         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 189   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 190         $whitechar* (NO(T?)INLINE|no(t?)inline)
 191                                         { token (ITspec_inline_prag False) }
 192   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 193   "{-#" $whitechar* (DEPRECATED|deprecated)
 194                                         { token ITdeprecated_prag }
 195   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 196   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 197   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 198
 199   "{-#"                                 { nested_comment }
 200
 201   -- ToDo: should only be valid inside a pragma:
 202   "#-}"                                 { token ITclose_prag}
 203 }
 204
 205
 206 -- '0' state: ordinary lexemes
 207 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 208
 209 -- "special" symbols
 210
 211 <0,glaexts> {
 212   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 213   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 214 }
 215
 216 <0,glaexts> {
 217   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 218   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 219   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 220   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 221   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 222   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 223   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 224   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 225 }
 226
 227 <0,glaexts> {
 228   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 229                                         { special IToparenbar }
 230   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 231 }
 232
 233 <0,glaexts> {
 234   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 235   \% @varid / { ifExtension ipEnabled } { skip_one_varid ITsplitipvarid }
 236 }
 237
 238 <glaexts> {
 239   "(#" / { notFollowedBySymbol }        { token IToubxparen }
 240   "#)"                                  { token ITcubxparen }
 241   "{|"                                  { token ITocurlybar }
 242   "|}"                                  { token ITccurlybar }
 243 }
 244
 245 <0,glaexts> {
 246   \(                                    { special IToparen }
 247   \)                                    { special ITcparen }
 248   \[                                    { special ITobrack }
 249   \]                                    { special ITcbrack }
 250   \,                                    { special ITcomma }
 251   \;                                    { special ITsemi }
 252   \`                                    { special ITbackquote }
 253
 254   \{                                    { open_brace }
 255   \}                                    { close_brace }
 256 }
 257
 258 <0,glaexts> {
 259   @qual @varid                  { check_qvarid }
 260   @qual @conid                  { idtoken qconid }
 261   @varid                        { varid }
 262   @conid                        { idtoken conid }
 263 }
 264
 265 -- after an illegal qvarid, such as 'M.let',
 266 -- we back up and try again in the bad_qvarid state:
 267 <bad_qvarid> {
 268   @conid                        { pop_and (idtoken conid) }
 269   @qual @conid                  { pop_and (idtoken qconid) }
 270 }
 271
 272 <glaexts> {
 273   @qual @varid "#"+             { idtoken qvarid }
 274   @qual @conid "#"+             { idtoken qconid }
 275   @varid "#"+                   { varid }
 276   @conid "#"+                   { idtoken conid }
 277 }
 278
 279 -- ToDo: M.(,,,)
 280
 281 <0,glaexts> {
 282   @qual @varsym                 { idtoken qvarsym }
 283   @qual @consym                 { idtoken qconsym }
 284   @varsym                       { varsym }
 285   @consym                       { consym }
 286 }
 287
 288 <0,glaexts> {
 289   @decimal                      { tok_decimal }
 290   0[oO] @octal                  { tok_octal }
 291   0[xX] @hexadecimal            { tok_hexadecimal }
 292 }
 293
 294 <glaexts> {
 295   @decimal \#                   { prim_decimal }
 296   0[oO] @octal \#               { prim_octal }
 297   0[xX] @hexadecimal \#         { prim_hexadecimal }
 298 }
 299
 300 <0,glaexts> @floating_point             { strtoken tok_float }
 301 <glaexts>   @floating_point \#          { init_strtoken 1 prim_float }
 302 <glaexts>   @floating_point \# \#       { init_strtoken 2 prim_double }
 303
 304 -- Strings and chars are lexed by hand-written code.  The reason is
 305 -- that even if we recognise the string or char here in the regex
 306 -- lexer, we would still have to parse the string afterward in order
 307 -- to convert it to a String.
 308 <0,glaexts> {
 309   \'                            { lex_char_tok }
 310   \"                            { lex_string_tok }
 311 }
 312
 313 {
 314 -- work around bug in Alex 2.0
 315 #if __GLASGOW_HASKELL__ < 503
 316 unsafeAt arr i = arr ! i
 317 #endif
 318
 319 -- -----------------------------------------------------------------------------
 320 -- The token type
 321
 322 data Token
 323   = ITas                        -- Haskell keywords
 324   | ITcase
 325   | ITclass
 326   | ITdata
 327   | ITdefault
 328   | ITderiving
 329   | ITdo
 330   | ITelse
 331   | IThiding
 332   | ITif
 333   | ITimport
 334   | ITin
 335   | ITinfix
 336   | ITinfixl
 337   | ITinfixr
 338   | ITinstance
 339   | ITlet
 340   | ITmodule
 341   | ITnewtype
 342   | ITof
 343   | ITqualified
 344   | ITthen
 345   | ITtype
 346   | ITwhere
 347   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 348
 349   | ITforall                    -- GHC extension keywords
 350   | ITforeign
 351   | ITexport
 352   | ITlabel
 353   | ITdynamic
 354   | ITsafe
 355   | ITthreadsafe
 356   | ITunsafe
 357   | ITstdcallconv
 358   | ITccallconv
 359   | ITdotnet
 360   | ITmdo
 361
 362         -- Pragmas
 363   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 364   | ITspec_prag                 -- SPECIALISE
 365   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 366   | ITsource_prag
 367   | ITrules_prag
 368   | ITdeprecated_prag
 369   | ITline_prag
 370   | ITscc_prag
 371   | ITcore_prag                 -- hdaume: core annotations
 372   | ITunpack_prag
 373   | ITclose_prag
 374
 375   | ITdotdot                    -- reserved symbols
 376   | ITcolon
 377   | ITdcolon
 378   | ITequal
 379   | ITlam
 380   | ITvbar
 381   | ITlarrow
 382   | ITrarrow
 383   | ITat
 384   | ITtilde
 385   | ITdarrow
 386   | ITminus
 387   | ITbang
 388   | ITstar
 389   | ITdot
 390
 391   | ITbiglam                    -- GHC-extension symbols
 392
 393   | ITocurly                    -- special symbols
 394   | ITccurly
 395   | ITocurlybar                 -- {|, for type applications
 396   | ITccurlybar                 -- |}, for type applications
 397   | ITvocurly
 398   | ITvccurly
 399   | ITobrack
 400   | ITopabrack                  -- [:, for parallel arrays with -fparr
 401   | ITcpabrack                  -- :], for parallel arrays with -fparr
 402   | ITcbrack
 403   | IToparen
 404   | ITcparen
 405   | IToubxparen
 406   | ITcubxparen
 407   | ITsemi
 408   | ITcomma
 409   | ITunderscore
 410   | ITbackquote
 411
 412   | ITvarid   FastString        -- identifiers
 413   | ITconid   FastString
 414   | ITvarsym  FastString
 415   | ITconsym  FastString
 416   | ITqvarid  (FastString,FastString)
 417   | ITqconid  (FastString,FastString)
 418   | ITqvarsym (FastString,FastString)
 419   | ITqconsym (FastString,FastString)
 420
 421   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 422   | ITsplitipvarid FastString   -- GHC extension: implicit param: %x
 423
 424   | ITpragma StringBuffer
 425
 426   | ITchar       Char
 427   | ITstring     FastString
 428   | ITinteger    Integer
 429   | ITrational   Rational
 430
 431   | ITprimchar   Char
 432   | ITprimstring FastString
 433   | ITprimint    Integer
 434   | ITprimfloat  Rational
 435   | ITprimdouble Rational
 436
 437   -- MetaHaskell extension tokens
 438   | ITopenExpQuote              --  [| or [e|
 439   | ITopenPatQuote              --  [p|
 440   | ITopenDecQuote              --  [d|
 441   | ITopenTypQuote              --  [t|
 442   | ITcloseQuote                --  |]
 443   | ITidEscape   FastString     --  $x
 444   | ITparenEscape               --  $(
 445   | ITvarQuote                  --  '
 446   | ITtyQuote                   --  ''
 447
 448   -- Arrow notation extension
 449   | ITproc
 450   | ITrec
 451   | IToparenbar                 --  (|
 452   | ITcparenbar                 --  |)
 453   | ITlarrowtail                --  -<
 454   | ITrarrowtail                --  >-
 455   | ITLarrowtail                --  -<<
 456   | ITRarrowtail                --  >>-
 457
 458   | ITunknown String            -- Used when the lexer can't make sense of it
 459   | ITeof                       -- end of file token
 460 #ifdef DEBUG
 461   deriving Show -- debugging
 462 #endif
 463
 464 isSpecial :: Token -> Bool
 465 -- If we see M.x, where x is a keyword, but
 466 -- is special, we treat is as just plain M.x,
 467 -- not as a keyword.
 468 isSpecial ITas          = True
 469 isSpecial IThiding      = True
 470 isSpecial ITqualified   = True
 471 isSpecial ITforall      = True
 472 isSpecial ITexport      = True
 473 isSpecial ITlabel       = True
 474 isSpecial ITdynamic     = True
 475 isSpecial ITsafe        = True
 476 isSpecial ITthreadsafe  = True
 477 isSpecial ITunsafe      = True
 478 isSpecial ITccallconv   = True
 479 isSpecial ITstdcallconv = True
 480 isSpecial ITmdo         = True
 481 isSpecial _             = False
 482
 483 -- the bitmap provided as the third component indicates whether the
 484 -- corresponding extension keyword is valid under the extension options
 485 -- provided to the compiler; if the extension corresponding to *any* of the
 486 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 487 -- facilitates using a keyword in two different extensions that can be
 488 -- activated independently)
 489 --
 490 reservedWordsFM = listToUFM $
 491         map (\(x, y, z) -> (mkFastString x, (y, z)))
 492        [( "_",          ITunderscore,   0 ),
 493         ( "as",         ITas,           0 ),
 494         ( "case",       ITcase,         0 ),
 495         ( "class",      ITclass,        0 ),
 496         ( "data",       ITdata,         0 ),
 497         ( "default",    ITdefault,      0 ),
 498         ( "deriving",   ITderiving,     0 ),
 499         ( "do",         ITdo,           0 ),
 500         ( "else",       ITelse,         0 ),
 501         ( "hiding",     IThiding,       0 ),
 502         ( "if",         ITif,           0 ),
 503         ( "import",     ITimport,       0 ),
 504         ( "in",         ITin,           0 ),
 505         ( "infix",      ITinfix,        0 ),
 506         ( "infixl",     ITinfixl,       0 ),
 507         ( "infixr",     ITinfixr,       0 ),
 508         ( "instance",   ITinstance,     0 ),
 509         ( "let",        ITlet,          0 ),
 510         ( "module",     ITmodule,       0 ),
 511         ( "newtype",    ITnewtype,      0 ),
 512         ( "of",         ITof,           0 ),
 513         ( "qualified",  ITqualified,    0 ),
 514         ( "then",       ITthen,         0 ),
 515         ( "type",       ITtype,         0 ),
 516         ( "where",      ITwhere,        0 ),
 517         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 518
 519         ( "forall",     ITforall,        bit tvBit),
 520         ( "mdo",        ITmdo,           bit glaExtsBit),
 521
 522         ( "foreign",    ITforeign,       bit ffiBit),
 523         ( "export",     ITexport,        bit ffiBit),
 524         ( "label",      ITlabel,         bit ffiBit),
 525         ( "dynamic",    ITdynamic,       bit ffiBit),
 526         ( "safe",       ITsafe,          bit ffiBit),
 527         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 528         ( "unsafe",     ITunsafe,        bit ffiBit),
 529         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 530         ( "ccall",      ITccallconv,     bit ffiBit),
 531         ( "dotnet",     ITdotnet,        bit ffiBit),
 532
 533         ( "rec",        ITrec,           bit arrowsBit),
 534         ( "proc",       ITproc,          bit arrowsBit)
 535      ]
 536
 537 reservedSymsFM = listToUFM $
 538         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 539       [ ("..",  ITdotdot,       0)
 540        ,(":",   ITcolon,        0)      -- (:) is a reserved op,
 541                                                 -- meaning only list cons
 542        ,("::",  ITdcolon,       0)
 543        ,("=",   ITequal,        0)
 544        ,("\\",  ITlam,          0)
 545        ,("|",   ITvbar,         0)
 546        ,("<-",  ITlarrow,       0)
 547        ,("->",  ITrarrow,       0)
 548        ,("@",   ITat,           0)
 549        ,("~",   ITtilde,        0)
 550        ,("=>",  ITdarrow,       0)
 551        ,("-",   ITminus,        0)
 552        ,("!",   ITbang,         0)
 553
 554        ,("*",   ITstar,         bit glaExtsBit) -- For data T (a::*) = MkT
 555        ,(".",   ITdot,          bit tvBit)      -- For 'forall a . t'
 556
 557        ,("-<",  ITlarrowtail,   bit arrowsBit)
 558        ,(">-",  ITrarrowtail,   bit arrowsBit)
 559        ,("-<<", ITLarrowtail,   bit arrowsBit)
 560        ,(">>-", ITRarrowtail,   bit arrowsBit)
 561
 562 #if __GLASGOW_HASKELL__ >= 605
 563        ,("λ",  ITlam,          bit glaExtsBit)
 564        ,("∷",   ITdcolon,       bit glaExtsBit)
 565        ,("⇒",   ITdarrow,     bit glaExtsBit)
 566        ,("∀", ITforall,       bit glaExtsBit)
 567        ,("→",   ITrarrow,     bit glaExtsBit)
 568        ,("←",   ITlarrow,     bit glaExtsBit)
 569        ,("⋯",         ITdotdot,       bit glaExtsBit)
 570 #endif
 571        ]
 572
 573 -- -----------------------------------------------------------------------------
 574 -- Lexer actions
 575
 576 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 577
 578 special :: Token -> Action
 579 special tok span _buf len = return (L span tok)
 580
 581 token, layout_token :: Token -> Action
 582 token t span buf len = return (L span t)
 583 layout_token t span buf len = pushLexState layout >> return (L span t)
 584
 585 idtoken :: (StringBuffer -> Int -> Token) -> Action
 586 idtoken f span buf len = return (L span $! (f buf len))
 587
 588 skip_one_varid :: (FastString -> Token) -> Action
 589 skip_one_varid f span buf len
 590   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 591
 592 strtoken :: (String -> Token) -> Action
 593 strtoken f span buf len =
 594   return (L span $! (f $! lexemeToString buf len))
 595
 596 init_strtoken :: Int -> (String -> Token) -> Action
 597 -- like strtoken, but drops the last N character(s)
 598 init_strtoken drop f span buf len =
 599   return (L span $! (f $! lexemeToString buf (len-drop)))
 600
 601 begin :: Int -> Action
 602 begin code _span _str _len = do pushLexState code; lexToken
 603
 604 pop :: Action
 605 pop _span _buf _len = do popLexState; lexToken
 606
 607 pop_and :: Action -> Action
 608 pop_and act span buf len = do popLexState; act span buf len
 609
 610 notFollowedBy char _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf /= char
 611
 612 notFollowedBySymbol _ _ _ (AI _ _ buf)
 613   = atEnd buf || currentChar buf `notElem` "!#$%&*+./<=>?@\\^|-~"
 614
 615 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 616
 617 ifExtension pred bits _ _ _ = pred bits
 618
 619 {-
 620   nested comments require traversing by hand, they can't be parsed
 621   using regular expressions.
 622 -}
 623 nested_comment :: Action
 624 nested_comment span _str _len = do
 625   input <- getInput
 626   go 1 input
 627   where go 0 input = do setInput input; lexToken
 628         go n input = do
 629           case alexGetChar input of
 630             Nothing  -> err input
 631             Just (c,input) -> do
 632               case c of
 633                 '-' -> do
 634                   case alexGetChar input of
 635                     Nothing  -> err input
 636                     Just ('\125',input) -> go (n-1) input
 637                     Just (c,_)          -> go n input
 638                 '\123' -> do
 639                   case alexGetChar input of
 640                     Nothing  -> err input
 641                     Just ('-',input') -> go (n+1) input'
 642                     Just (c,input)    -> go n input
 643                 c -> go n input
 644
 645         err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 646
 647 open_brace, close_brace :: Action
 648 open_brace span _str _len = do
 649   ctx <- getContext
 650   setContext (NoLayout:ctx)
 651   return (L span ITocurly)
 652 close_brace span _str _len = do
 653   popContext
 654   return (L span ITccurly)
 655
 656 -- We have to be careful not to count M.<varid> as a qualified name
 657 -- when <varid> is a keyword.  We hack around this by catching
 658 -- the offending tokens afterward, and re-lexing in a different state.
 659 check_qvarid span buf len = do
 660   case lookupUFM reservedWordsFM var of
 661         Just (keyword,exts)
 662           | not (isSpecial keyword) ->
 663           if exts == 0
 664              then try_again
 665              else do
 666                 b <- extension (\i -> exts .&. i /= 0)
 667                 if b then try_again
 668                      else return token
 669         _other -> return token
 670   where
 671         (mod,var) = splitQualName buf len
 672         token     = L span (ITqvarid (mod,var))
 673
 674         try_again = do
 675                 (AI _ offs _) <- getInput
 676                 setInput (AI (srcSpanStart span) (offs-len) buf)
 677                 pushLexState bad_qvarid
 678                 lexToken
 679
 680 qvarid buf len = ITqvarid $! splitQualName buf len
 681 qconid buf len = ITqconid $! splitQualName buf len
 682
 683 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 684 -- takes a StringBuffer and a length, and returns the module name
 685 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 686 -- because of hierarchical module names.
 687 splitQualName orig_buf len = split orig_buf orig_buf
 688   where
 689     split buf dot_buf
 690         | orig_buf `byteDiff` buf >= len  = done dot_buf
 691         | c == '.'                        = found_dot buf'
 692         | otherwise                       = split buf' dot_buf
 693       where
 694        (c,buf') = nextChar buf
 695
 696     -- careful, we might get names like M....
 697     -- so, if the character after the dot is not upper-case, this is
 698     -- the end of the qualifier part.
 699     found_dot buf -- buf points after the '.'
 700         | isUpper c    = split buf' buf
 701         | otherwise    = done buf
 702       where
 703        (c,buf') = nextChar buf
 704
 705     done dot_buf =
 706         (lexemeToFastString orig_buf (qual_size - 1),
 707          lexemeToFastString dot_buf (len - qual_size))
 708       where
 709         qual_size = orig_buf `byteDiff` dot_buf
 710
 711 varid span buf len =
 712   case lookupUFM reservedWordsFM fs of
 713         Just (keyword,0)    -> do
 714                 maybe_layout keyword
 715                 return (L span keyword)
 716         Just (keyword,exts) -> do
 717                 b <- extension (\i -> exts .&. i /= 0)
 718                 if b then do maybe_layout keyword
 719                              return (L span keyword)
 720                      else return (L span (ITvarid fs))
 721         _other -> return (L span (ITvarid fs))
 722   where
 723         fs = lexemeToFastString buf len
 724
 725 conid buf len = ITconid fs
 726   where fs = lexemeToFastString buf len
 727
 728 qvarsym buf len = ITqvarsym $! splitQualName buf len
 729 qconsym buf len = ITqconsym $! splitQualName buf len
 730
 731 varsym = sym ITvarsym
 732 consym = sym ITconsym
 733
 734 sym con span buf len =
 735   case lookupUFM reservedSymsFM fs of
 736         Just (keyword,0)    -> return (L span keyword)
 737         Just (keyword,exts) -> do
 738                 b <- extension (\i -> exts .&. i /= 0)
 739                 if b then return (L span keyword)
 740                      else return (L span $! con fs)
 741         _other -> return (L span $! con fs)
 742   where
 743         fs = lexemeToFastString buf len
 744
 745 tok_decimal span buf len
 746   = return (L span (ITinteger  $! parseInteger buf len 10 octDecDigit))
 747
 748 tok_octal span buf len
 749   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 8 octDecDigit))
 750
 751 tok_hexadecimal span buf len
 752   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 16 hexDigit))
 753
 754 prim_decimal span buf len
 755   = return (L span (ITprimint  $! parseInteger buf (len-1) 10 octDecDigit))
 756
 757 prim_octal span buf len
 758   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 8 octDecDigit))
 759
 760 prim_hexadecimal span buf len
 761   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 16 hexDigit))
 762
 763 tok_float        str = ITrational   $! readRational str
 764 prim_float       str = ITprimfloat  $! readRational str
 765 prim_double      str = ITprimdouble $! readRational str
 766
 767 -- -----------------------------------------------------------------------------
 768 -- Layout processing
 769
 770 -- we're at the first token on a line, insert layout tokens if necessary
 771 do_bol :: Action
 772 do_bol span _str _len = do
 773         pos <- getOffside
 774         case pos of
 775             LT -> do
 776                 --trace "layout: inserting '}'" $ do
 777                 popContext
 778                 -- do NOT pop the lex state, we might have a ';' to insert
 779                 return (L span ITvccurly)
 780             EQ -> do
 781                 --trace "layout: inserting ';'" $ do
 782                 popLexState
 783                 return (L span ITsemi)
 784             GT -> do
 785                 popLexState
 786                 lexToken
 787
 788 -- certain keywords put us in the "layout" state, where we might
 789 -- add an opening curly brace.
 790 maybe_layout ITdo       = pushLexState layout_do
 791 maybe_layout ITmdo      = pushLexState layout_do
 792 maybe_layout ITof       = pushLexState layout
 793 maybe_layout ITlet      = pushLexState layout
 794 maybe_layout ITwhere    = pushLexState layout
 795 maybe_layout ITrec      = pushLexState layout
 796 maybe_layout _          = return ()
 797
 798 -- Pushing a new implicit layout context.  If the indentation of the
 799 -- next token is not greater than the previous layout context, then
 800 -- Haskell 98 says that the new layout context should be empty; that is
 801 -- the lexer must generate {}.
 802 --
 803 -- We are slightly more lenient than this: when the new context is started
 804 -- by a 'do', then we allow the new context to be at the same indentation as
 805 -- the previous context.  This is what the 'strict' argument is for.
 806 --
 807 new_layout_context strict span _buf _len = do
 808     popLexState
 809     (AI _ offset _) <- getInput
 810     ctx <- getContext
 811     case ctx of
 812         Layout prev_off : _  |
 813            (strict     && prev_off >= offset  ||
 814             not strict && prev_off > offset) -> do
 815                 -- token is indented to the left of the previous context.
 816                 -- we must generate a {} sequence now.
 817                 pushLexState layout_left
 818                 return (L span ITvocurly)
 819         other -> do
 820                 setContext (Layout offset : ctx)
 821                 return (L span ITvocurly)
 822
 823 do_layout_left span _buf _len = do
 824     popLexState
 825     pushLexState bol  -- we must be at the start of a line
 826     return (L span ITvccurly)
 827
 828 -- -----------------------------------------------------------------------------
 829 -- LINE pragmas
 830
 831 setLine :: Int -> Action
 832 setLine code span buf len = do
 833   let line = parseInteger buf len 10 octDecDigit
 834   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
 835         -- subtract one: the line number refers to the *following* line
 836   popLexState
 837   pushLexState code
 838   lexToken
 839
 840 setFile :: Int -> Action
 841 setFile code span buf len = do
 842   let file = lexemeToFastString (stepOn buf) (len-2)
 843   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
 844   popLexState
 845   pushLexState code
 846   lexToken
 847
 848 -- -----------------------------------------------------------------------------
 849 -- Strings & Chars
 850
 851 -- This stuff is horrible.  I hates it.
 852
 853 lex_string_tok :: Action
 854 lex_string_tok span buf len = do
 855   tok <- lex_string ""
 856   end <- getSrcLoc
 857   return (L (mkSrcSpan (srcSpanStart span) end) tok)
 858
 859 lex_string :: String -> P Token
 860 lex_string s = do
 861   i <- getInput
 862   case alexGetChar' i of
 863     Nothing -> lit_error
 864
 865     Just ('"',i)  -> do
 866         setInput i
 867         glaexts <- extension glaExtsEnabled
 868         if glaexts
 869           then do
 870             i <- getInput
 871             case alexGetChar' i of
 872               Just ('#',i) -> do
 873                    setInput i
 874                    if any (> '\xFF') s
 875                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
 876                     else let s' = mkZFastString (reverse s) in
 877                          return (ITprimstring s')
 878                         -- mkZFastString is a hack to avoid encoding the
 879                         -- string in UTF-8.  We just want the exact bytes.
 880               _other ->
 881                 return (ITstring (mkFastString (reverse s)))
 882           else
 883                 return (ITstring (mkFastString (reverse s)))
 884
 885     Just ('\\',i)
 886         | Just ('&',i) <- next -> do
 887                 setInput i; lex_string s
 888         | Just (c,i) <- next, is_space c -> do
 889                 setInput i; lex_stringgap s
 890         where next = alexGetChar' i
 891
 892     Just (c, i) -> do
 893         c' <- lex_char c i
 894         lex_string (c':s)
 895
 896 lex_stringgap s = do
 897   c <- getCharOrFail
 898   case c of
 899     '\\' -> lex_string s
 900     c | is_space c -> lex_stringgap s
 901     _other -> lit_error
 902
 903
 904 lex_char_tok :: Action
 905 -- Here we are basically parsing character literals, such as 'x' or '\n'
 906 -- but, when Template Haskell is on, we additionally spot
 907 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
 908 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
 909 -- So we have to do two characters of lookahead: when we see 'x we need to
 910 -- see if there's a trailing quote
 911 lex_char_tok span buf len = do  -- We've seen '
 912    i1 <- getInput       -- Look ahead to first character
 913    let loc = srcSpanStart span
 914    case alexGetChar' i1 of
 915         Nothing -> lit_error
 916
 917         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
 918                   th_exts <- extension thEnabled
 919                   if th_exts then do
 920                         setInput i2
 921                         return (L (mkSrcSpan loc end2)  ITtyQuote)
 922                    else lit_error
 923
 924         Just ('\\', i2@(AI end2 _ _)) -> do     -- We've seen 'backslash
 925                   setInput i2
 926                   lit_ch <- lex_escape
 927                   mc <- getCharOrFail   -- Trailing quote
 928                   if mc == '\'' then finish_char_tok loc lit_ch
 929                                 else do setInput i2; lit_error
 930
 931         Just (c, i2@(AI end2 _ _))
 932                 | not (isAny c) -> lit_error
 933                 | otherwise ->
 934
 935                 -- We've seen 'x, where x is a valid character
 936                 --  (i.e. not newline etc) but not a quote or backslash
 937            case alexGetChar' i2 of      -- Look ahead one more character
 938                 Nothing -> lit_error
 939                 Just ('\'', i3) -> do   -- We've seen 'x'
 940                         setInput i3
 941                         finish_char_tok loc c
 942                 _other -> do            -- We've seen 'x not followed by quote
 943                                         -- If TH is on, just parse the quote only
 944                         th_exts <- extension thEnabled
 945                         let (AI end _ _) = i1
 946                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
 947                                    else do setInput i2; lit_error
 948
 949 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
 950 finish_char_tok loc ch  -- We've already seen the closing quote
 951                         -- Just need to check for trailing #
 952   = do  glaexts <- extension glaExtsEnabled
 953         i@(AI end _ _) <- getInput
 954         if glaexts then do
 955                 case alexGetChar' i of
 956                         Just ('#',i@(AI end _ _)) -> do
 957                                 setInput i
 958                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
 959                         _other ->
 960                                 return (L (mkSrcSpan loc end) (ITchar ch))
 961                 else do
 962                    return (L (mkSrcSpan loc end) (ITchar ch))
 963
 964 lex_char :: Char -> AlexInput -> P Char
 965 lex_char c inp = do
 966   case c of
 967       '\\' -> do setInput inp; lex_escape
 968       c | isAny c -> do setInput inp; return c
 969       _other -> lit_error
 970
 971 isAny c | c > '\xff' = isPrint c
 972         | otherwise  = is_any c
 973
 974 lex_escape :: P Char
 975 lex_escape = do
 976   c <- getCharOrFail
 977   case c of
 978         'a'   -> return '\a'
 979         'b'   -> return '\b'
 980         'f'   -> return '\f'
 981         'n'   -> return '\n'
 982         'r'   -> return '\r'
 983         't'   -> return '\t'
 984         'v'   -> return '\v'
 985         '\\'  -> return '\\'
 986         '"'   -> return '\"'
 987         '\''  -> return '\''
 988         '^'   -> do c <- getCharOrFail
 989                     if c >= '@' && c <= '_'
 990                         then return (chr (ord c - ord '@'))
 991                         else lit_error
 992
 993         'x'   -> readNum is_hexdigit 16 hexDigit
 994         'o'   -> readNum is_octdigit  8 octDecDigit
 995         x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
 996
 997         c1 ->  do
 998            i <- getInput
 999            case alexGetChar' i of
1000             Nothing -> lit_error
1001             Just (c2,i2) ->
1002               case alexGetChar' i2 of
1003                 Nothing -> do setInput i2; lit_error
1004                 Just (c3,i3) ->
1005                    let str = [c1,c2,c3] in
1006                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1007                                      Just rest <- [maybePrefixMatch p str] ] of
1008                           (escape_char,[]):_ -> do
1009                                 setInput i3
1010                                 return escape_char
1011                           (escape_char,_:_):_ -> do
1012                                 setInput i2
1013                                 return escape_char
1014                           [] -> lit_error
1015
1016 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1017 readNum is_digit base conv = do
1018   i <- getInput
1019   c <- getCharOrFail
1020   if is_digit c
1021         then readNum2 is_digit base conv (conv c)
1022         else do setInput i; lit_error
1023
1024 readNum2 is_digit base conv i = do
1025   input <- getInput
1026   read i input
1027   where read i input = do
1028           case alexGetChar' input of
1029             Just (c,input') | is_digit c -> do
1030                 read (i*base + conv c) input'
1031             _other -> do
1032                 if i >= 0 && i <= 0x10FFFF
1033                    then do setInput input; return (chr i)
1034                    else lit_error
1035
1036 silly_escape_chars = [
1037         ("NUL", '\NUL'),
1038         ("SOH", '\SOH'),
1039         ("STX", '\STX'),
1040         ("ETX", '\ETX'),
1041         ("EOT", '\EOT'),
1042         ("ENQ", '\ENQ'),
1043         ("ACK", '\ACK'),
1044         ("BEL", '\BEL'),
1045         ("BS", '\BS'),
1046         ("HT", '\HT'),
1047         ("LF", '\LF'),
1048         ("VT", '\VT'),
1049         ("FF", '\FF'),
1050         ("CR", '\CR'),
1051         ("SO", '\SO'),
1052         ("SI", '\SI'),
1053         ("DLE", '\DLE'),
1054         ("DC1", '\DC1'),
1055         ("DC2", '\DC2'),
1056         ("DC3", '\DC3'),
1057         ("DC4", '\DC4'),
1058         ("NAK", '\NAK'),
1059         ("SYN", '\SYN'),
1060         ("ETB", '\ETB'),
1061         ("CAN", '\CAN'),
1062         ("EM", '\EM'),
1063         ("SUB", '\SUB'),
1064         ("ESC", '\ESC'),
1065         ("FS", '\FS'),
1066         ("GS", '\GS'),
1067         ("RS", '\RS'),
1068         ("US", '\US'),
1069         ("SP", '\SP'),
1070         ("DEL", '\DEL')
1071         ]
1072
1073 -- before calling lit_error, ensure that the current input is pointing to
1074 -- the position of the error in the buffer.  This is so that we can report
1075 -- a correct location to the user, but also so we can detect UTF-8 decoding
1076 -- errors if they occur.
1077 lit_error = lexError "lexical error in string/character literal"
1078
1079 getCharOrFail :: P Char
1080 getCharOrFail =  do
1081   i <- getInput
1082   case alexGetChar' i of
1083         Nothing -> lexError "unexpected end-of-file in string/character literal"
1084         Just (c,i)  -> do setInput i; return c
1085
1086 -- -----------------------------------------------------------------------------
1087 -- The Parse Monad
1088
1089 data LayoutContext
1090   = NoLayout
1091   | Layout !Int
1092
1093 data ParseResult a
1094   = POk PState a
1095   | PFailed
1096         SrcSpan         -- The start and end of the text span related to
1097                         -- the error.  Might be used in environments which can
1098                         -- show this span, e.g. by highlighting it.
1099         Message         -- The error message
1100
1101 data PState = PState {
1102         buffer     :: StringBuffer,
1103         last_loc   :: SrcSpan,  -- pos of previous token
1104         last_offs  :: !Int,     -- offset of the previous token from the
1105                                 -- beginning of  the current line.
1106                                 -- \t is equal to 8 spaces.
1107         last_len   :: !Int,     -- len of previous token
1108         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1109         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1110         context    :: [LayoutContext],
1111         lex_state  :: [Int]
1112      }
1113         -- last_loc and last_len are used when generating error messages,
1114         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1115         -- current token to happyError, we could at least get rid of last_len.
1116         -- Getting rid of last_loc would require finding another way to
1117         -- implement pushCurrentContext (which is only called from one place).
1118
1119 newtype P a = P { unP :: PState -> ParseResult a }
1120
1121 instance Monad P where
1122   return = returnP
1123   (>>=) = thenP
1124   fail = failP
1125
1126 returnP :: a -> P a
1127 returnP a = P $ \s -> POk s a
1128
1129 thenP :: P a -> (a -> P b) -> P b
1130 (P m) `thenP` k = P $ \ s ->
1131         case m s of
1132                 POk s1 a         -> (unP (k a)) s1
1133                 PFailed span err -> PFailed span err
1134
1135 failP :: String -> P a
1136 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1137
1138 failMsgP :: String -> P a
1139 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1140
1141 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1142 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1143
1144 failSpanMsgP :: SrcSpan -> String -> P a
1145 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1146
1147 extension :: (Int -> Bool) -> P Bool
1148 extension p = P $ \s -> POk s (p $! extsBitmap s)
1149
1150 getExts :: P Int
1151 getExts = P $ \s -> POk s (extsBitmap s)
1152
1153 setSrcLoc :: SrcLoc -> P ()
1154 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1155
1156 getSrcLoc :: P SrcLoc
1157 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1158
1159 setLastToken :: SrcSpan -> Int -> P ()
1160 setLastToken loc len = P $ \s -> POk s{ last_loc=loc, last_len=len } ()
1161
1162 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1163
1164 alexInputPrevChar :: AlexInput -> Char
1165 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1166
1167 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1168 alexGetChar (AI loc ofs s)
1169   | atEnd s   = Nothing
1170   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1171                 Just (adj_c, (AI loc' ofs' s'))
1172   where (c,s') = nextChar s
1173         loc'   = advanceSrcLoc loc c
1174         ofs'   = advanceOffs c ofs
1175
1176         non_graphic     = '\x0'
1177         upper           = '\x1'
1178         lower           = '\x2'
1179         digit           = '\x3'
1180         symbol          = '\x4'
1181         space           = '\x5'
1182         other_graphic   = '\x6'
1183
1184         adj_c
1185 #if __GLASGOW_HASKELL__ < 605
1186           = c  -- no Unicode support
1187 #else
1188           | c <= '\x06' = non_graphic
1189           | c <= '\xff' = c
1190           | otherwise =
1191                 case generalCategory c of
1192                   UppercaseLetter       -> upper
1193                   LowercaseLetter       -> lower
1194                   TitlecaseLetter       -> upper
1195                   ModifierLetter        -> other_graphic
1196                   OtherLetter           -> other_graphic
1197                   NonSpacingMark        -> other_graphic
1198                   SpacingCombiningMark  -> other_graphic
1199                   EnclosingMark         -> other_graphic
1200                   DecimalNumber         -> digit
1201                   LetterNumber          -> other_graphic
1202                   OtherNumber           -> other_graphic
1203                   ConnectorPunctuation  -> other_graphic
1204                   DashPunctuation       -> other_graphic
1205                   OpenPunctuation       -> other_graphic
1206                   ClosePunctuation      -> other_graphic
1207                   InitialQuote          -> other_graphic
1208                   FinalQuote            -> other_graphic
1209                   OtherPunctuation      -> other_graphic
1210                   MathSymbol            -> symbol
1211                   CurrencySymbol        -> symbol
1212                   ModifierSymbol        -> symbol
1213                   OtherSymbol           -> symbol
1214                   Space                 -> space
1215                   _other                -> non_graphic
1216 #endif
1217
1218 -- This version does not squash unicode characters, it is used when
1219 -- lexing strings.
1220 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1221 alexGetChar' (AI loc ofs s)
1222   | atEnd s   = Nothing
1223   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1224                 Just (c, (AI loc' ofs' s'))
1225   where (c,s') = nextChar s
1226         loc'   = advanceSrcLoc loc c
1227         ofs'   = advanceOffs c ofs
1228
1229 advanceOffs :: Char -> Int -> Int
1230 advanceOffs '\n' offs = 0
1231 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1232 advanceOffs _    offs = offs + 1
1233
1234 getInput :: P AlexInput
1235 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1236
1237 setInput :: AlexInput -> P ()
1238 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1239
1240 pushLexState :: Int -> P ()
1241 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1242
1243 popLexState :: P Int
1244 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1245
1246 getLexState :: P Int
1247 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1248
1249 -- for reasons of efficiency, flags indicating language extensions (eg,
1250 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1251 -- integer
1252
1253 glaExtsBit, ffiBit, parrBit :: Int
1254 glaExtsBit = 0
1255 ffiBit     = 1
1256 parrBit    = 2
1257 arrowsBit  = 4
1258 thBit      = 5
1259 ipBit      = 6
1260 tvBit      = 7  -- Scoped type variables enables 'forall' keyword
1261 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1262                 -- (doesn't affect the lexer)
1263
1264 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1265 glaExtsEnabled flags = testBit flags glaExtsBit
1266 ffiEnabled     flags = testBit flags ffiBit
1267 parrEnabled    flags = testBit flags parrBit
1268 arrowsEnabled  flags = testBit flags arrowsBit
1269 thEnabled      flags = testBit flags thBit
1270 ipEnabled      flags = testBit flags ipBit
1271 tvEnabled      flags = testBit flags tvBit
1272 bangPatEnabled flags = testBit flags bangPatBit
1273
1274 -- create a parse state
1275 --
1276 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1277 mkPState buf loc flags  =
1278   PState {
1279       buffer     = buf,
1280       last_loc   = mkSrcSpan loc loc,
1281       last_offs  = 0,
1282       last_len   = 0,
1283       loc        = loc,
1284       extsBitmap = fromIntegral bitmap,
1285       context    = [],
1286       lex_state  = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1287         -- we begin in the layout state if toplev_layout is set
1288     }
1289     where
1290       bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
1291                .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
1292                .|. parrBit    `setBitIf` dopt Opt_PArr        flags
1293                .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
1294                .|. thBit      `setBitIf` dopt Opt_TH          flags
1295                .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
1296                .|. tvBit      `setBitIf` dopt Opt_ScopedTypeVariables flags
1297                .|. bangPatBit `setBitIf` dopt Opt_BangPatterns flags
1298       --
1299       setBitIf :: Int -> Bool -> Int
1300       b `setBitIf` cond | cond      = bit b
1301                         | otherwise = 0
1302
1303 getContext :: P [LayoutContext]
1304 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1305
1306 setContext :: [LayoutContext] -> P ()
1307 setContext ctx = P $ \s -> POk s{context=ctx} ()
1308
1309 popContext :: P ()
1310 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1311                            loc = loc, last_len = len, last_loc = last_loc }) ->
1312   case ctx of
1313         (_:tl) -> POk s{ context = tl } ()
1314         []     -> PFailed last_loc (srcParseErr buf len)
1315
1316 -- Push a new layout context at the indentation of the last token read.
1317 -- This is only used at the outer level of a module when the 'module'
1318 -- keyword is missing.
1319 pushCurrentContext :: P ()
1320 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_len=len, context=ctx } ->
1321   POk s{context = Layout (offs-len) : ctx} ()
1322
1323 getOffside :: P Ordering
1324 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1325                 let ord = case stk of
1326                         (Layout n:_) -> compare offs n
1327                         _            -> GT
1328                 in POk s ord
1329
1330 -- ---------------------------------------------------------------------------
1331 -- Construct a parse error
1332
1333 srcParseErr
1334   :: StringBuffer       -- current buffer (placed just after the last token)
1335   -> Int                -- length of the previous token
1336   -> Message
1337 srcParseErr buf len
1338   = hcat [ if null token
1339              then ptext SLIT("parse error (possibly incorrect indentation)")
1340              else hcat [ptext SLIT("parse error on input "),
1341                         char '`', text token, char '\'']
1342     ]
1343   where token = lexemeToString (offsetBytes (-len) buf) len
1344
1345 -- Report a parse failure, giving the span of the previous token as
1346 -- the location of the error.  This is the entry point for errors
1347 -- detected during parsing.
1348 srcParseFail :: P a
1349 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1350                             last_loc = last_loc } ->
1351     PFailed last_loc (srcParseErr buf len)
1352
1353 -- A lexical error is reported at a particular position in the source file,
1354 -- not over a token range.
1355 lexError :: String -> P a
1356 lexError str = do
1357   loc <- getSrcLoc
1358   i@(AI end _ buf) <- getInput
1359   reportLexError loc end buf str
1360
1361 -- -----------------------------------------------------------------------------
1362 -- This is the top-level function: called from the parser each time a
1363 -- new token is to be read from the input.
1364
1365 lexer :: (Located Token -> P a) -> P a
1366 lexer cont = do
1367   tok@(L _ tok__) <- lexToken
1368   --trace ("token: " ++ show tok__) $ do
1369   cont tok
1370
1371 lexToken :: P (Located Token)
1372 lexToken = do
1373   inp@(AI loc1 _ buf) <- getInput
1374   sc <- getLexState
1375   exts <- getExts
1376   case alexScanUser exts inp sc of
1377     AlexEOF -> do let span = mkSrcSpan loc1 loc1
1378                   setLastToken span 0
1379                   return (L span ITeof)
1380     AlexError (AI loc2 _ buf) -> do
1381         reportLexError loc1 loc2 buf "lexical error"
1382     AlexSkip inp2 _ -> do
1383         setInput inp2
1384         lexToken
1385     AlexToken inp2@(AI end _ buf2) len t -> do
1386         setInput inp2
1387         let span = mkSrcSpan loc1 end
1388         let bytes = byteDiff buf buf2
1389         span `seq` setLastToken span bytes
1390         t span buf bytes
1391
1392 -- ToDo: Alex reports the buffer at the start of the erroneous lexeme,
1393 -- but it would be more informative to report the location where the
1394 -- error was actually discovered, especially if this is a decoding
1395 -- error.
1396 reportLexError loc1 loc2 buf str =
1397   let
1398         c = fst (nextChar buf)
1399   in
1400   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1401     then failLocMsgP loc2 loc2 "UTF-8 decoding error"
1402     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1403 }