ghc/compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), lexer, mkPState, PState(..),
  26    P(..), ParseResult(..), getSrcLoc,
  27    failLocMsgP, failSpanMsgP, srcParseFail,
  28    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  29    getLexState, popLexState, pushLexState
  30   ) where
  31
  32 #include "HsVersions.h"
  33
  34 import ErrUtils         ( Message )
  35 import Outputable
  36 import StringBuffer
  37 import FastString
  38 import FastTypes
  39 import SrcLoc
  40 import UniqFM
  41 import DynFlags
  42 import Ctype
  43 import Util             ( maybePrefixMatch, readRational )
  44
  45 import DATA_BITS
  46 import Data.Char
  47 import Ratio
  48 --import TRACE
  49 }
  50
  51 $unispace    = \x05
  52 $whitechar   = [\ \t\n\r\f\v\xa0 $unispace]
  53 $white_no_nl = $whitechar # \n
  54
  55 $ascdigit  = 0-9
  56 $unidigit  = \x03
  57 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  58 $digit     = [$ascdigit $unidigit]
  59
  60 $special   = [\(\)\,\;\[\]\`\{\}]
  61 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  62 $unisymbol = \x04
  63 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  64
  65 $unilarge  = \x01
  66 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  67 $large     = [$asclarge $unilarge]
  68
  69 $unismall  = \x02
  70 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  71 $small     = [$ascsmall $unismall \_]
  72
  73 $unigraphic = \x06
  74 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  75
  76 $octit     = 0-7
  77 $hexit     = [$decdigit A-F a-f]
  78 $symchar   = [$symbol \:]
  79 $nl        = [\n\r]
  80 $idchar    = [$small $large $digit \']
  81
  82 @varid     = $small $idchar*
  83 @conid     = $large $idchar*
  84
  85 @varsym    = $symbol $symchar*
  86 @consym    = \: $symchar*
  87
  88 @decimal     = $decdigit+
  89 @octal       = $octit+
  90 @hexadecimal = $hexit+
  91 @exponent    = [eE] [\-\+]? @decimal
  92
  93 -- we support the hierarchical module name extension:
  94 @qual = (@conid \.)+
  95
  96 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
  97
  98 haskell :-
  99
 100 -- everywhere: skip whitespace and comments
 101 $white_no_nl+                           ;
 102
 103 -- Everywhere: deal with nested comments.  We explicitly rule out
 104 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 105 -- (this can happen even though pragmas will normally take precedence due to
 106 -- longest-match, because pragmas aren't valid in every state, but comments
 107 -- are).
 108 "{-" / { notFollowedBy '#' }            { nested_comment }
 109
 110 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 111 -- more dashes followed by a symbol should be parsed as a varsym, so we
 112 -- have to exclude those.
 113 -- The regex says: "munch all the characters after the dashes, as long as
 114 -- the first one is not a symbol".
 115 "--"\-* [^$symbol :] .*                 ;
 116 "--"\-* / { atEOL }                     ;
 117
 118 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 119 -- blank lines) until we find a non-whitespace character, then do layout
 120 -- processing.
 121 --
 122 -- One slight wibble here: what if the line begins with {-#? In
 123 -- theory, we have to lex the pragma to see if it's one we recognise,
 124 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 125 -- as a nested comment.  We don't bother with this: if the line begins
 126 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 127 <bol> {
 128   \n                                    ;
 129   ^\# (line)?                           { begin line_prag1 }
 130   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 131   ^\# \! .* \n                          ; -- #!, for scripts
 132   ()                                    { do_bol }
 133 }
 134
 135 -- after a layout keyword (let, where, do, of), we begin a new layout
 136 -- context if the curly brace is missing.
 137 -- Careful! This stuff is quite delicate.
 138 <layout, layout_do> {
 139   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 140         -- we might encounter {-# here, but {- has been handled already
 141   \n                                    ;
 142   ^\# (line)?                           { begin line_prag1 }
 143 }
 144
 145 -- do is treated in a subtly different way, see new_layout_context
 146 <layout>    ()                          { new_layout_context True }
 147 <layout_do> ()                          { new_layout_context False }
 148
 149 -- after a new layout context which was found to be to the left of the
 150 -- previous context, we have generated a '{' token, and we now need to
 151 -- generate a matching '}' token.
 152 <layout_left>  ()                       { do_layout_left }
 153
 154 <0,glaexts> \n                          { begin bol }
 155
 156 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 157
 158 -- single-line line pragmas, of the form
 159 --    # <line> "<file>" <extra-stuff> \n
 160 <line_prag1> $decdigit+                 { setLine line_prag1a }
 161 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 162 <line_prag1b> .*                        { pop }
 163
 164 -- Haskell-style line pragmas, of the form
 165 --    {-# LINE <line> "<file>" #-}
 166 <line_prag2> $decdigit+                 { setLine line_prag2a }
 167 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 168 <line_prag2b> "#-}"|"-}"                { pop }
 169    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 170    -- with older versions of GHC which generated these.
 171
 172 -- We only want RULES pragmas to be picked up when -fglasgow-exts
 173 -- is on, because the contents of the pragma is always written using
 174 -- glasgow-exts syntax (using forall etc.), so if glasgow exts are not
 175 -- enabled, we're sure to get a parse error.
 176 -- (ToDo: we should really emit a warning when ignoring pragmas)
 177 <glaexts>
 178   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 179
 180 <0,glaexts> {
 181   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 182   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 183                                         { token (ITinline_prag False) }
 184   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 185                                         { token ITspec_prag }
 186   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 187         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 188   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 189         $whitechar* (NO(T?)INLINE|no(t?)inline)
 190                                         { token (ITspec_inline_prag False) }
 191   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 192   "{-#" $whitechar* (DEPRECATED|deprecated)
 193                                         { token ITdeprecated_prag }
 194   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 195   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 196   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 197
 198   "{-#"                                 { nested_comment }
 199
 200   -- ToDo: should only be valid inside a pragma:
 201   "#-}"                                 { token ITclose_prag}
 202 }
 203
 204
 205 -- '0' state: ordinary lexemes
 206 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 207
 208 -- "special" symbols
 209
 210 <0,glaexts> {
 211   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 212   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 213 }
 214
 215 <0,glaexts> {
 216   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 217   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 218   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 219   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 220   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 221   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 222   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 223   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 224 }
 225
 226 <0,glaexts> {
 227   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 228                                         { special IToparenbar }
 229   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 230 }
 231
 232 <0,glaexts> {
 233   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 234   \% @varid / { ifExtension ipEnabled } { skip_one_varid ITsplitipvarid }
 235 }
 236
 237 <glaexts> {
 238   "(#" / { notFollowedBySymbol }        { token IToubxparen }
 239   "#)"                                  { token ITcubxparen }
 240   "{|"                                  { token ITocurlybar }
 241   "|}"                                  { token ITccurlybar }
 242 }
 243
 244 <0,glaexts> {
 245   \(                                    { special IToparen }
 246   \)                                    { special ITcparen }
 247   \[                                    { special ITobrack }
 248   \]                                    { special ITcbrack }
 249   \,                                    { special ITcomma }
 250   \;                                    { special ITsemi }
 251   \`                                    { special ITbackquote }
 252
 253   \{                                    { open_brace }
 254   \}                                    { close_brace }
 255 }
 256
 257 <0,glaexts> {
 258   @qual @varid                  { check_qvarid }
 259   @qual @conid                  { idtoken qconid }
 260   @varid                        { varid }
 261   @conid                        { idtoken conid }
 262 }
 263
 264 -- after an illegal qvarid, such as 'M.let',
 265 -- we back up and try again in the bad_qvarid state:
 266 <bad_qvarid> {
 267   @conid                        { pop_and (idtoken conid) }
 268   @qual @conid                  { pop_and (idtoken qconid) }
 269 }
 270
 271 <glaexts> {
 272   @qual @varid "#"+             { idtoken qvarid }
 273   @qual @conid "#"+             { idtoken qconid }
 274   @varid "#"+                   { varid }
 275   @conid "#"+                   { idtoken conid }
 276 }
 277
 278 -- ToDo: M.(,,,)
 279
 280 <0,glaexts> {
 281   @qual @varsym                 { idtoken qvarsym }
 282   @qual @consym                 { idtoken qconsym }
 283   @varsym                       { varsym }
 284   @consym                       { consym }
 285 }
 286
 287 <0,glaexts> {
 288   @decimal                      { tok_decimal }
 289   0[oO] @octal                  { tok_octal }
 290   0[xX] @hexadecimal            { tok_hexadecimal }
 291 }
 292
 293 <glaexts> {
 294   @decimal \#                   { prim_decimal }
 295   0[oO] @octal \#               { prim_octal }
 296   0[xX] @hexadecimal \#         { prim_hexadecimal }
 297 }
 298
 299 <0,glaexts> @floating_point             { strtoken tok_float }
 300 <glaexts>   @floating_point \#          { init_strtoken 1 prim_float }
 301 <glaexts>   @floating_point \# \#       { init_strtoken 2 prim_double }
 302
 303 -- Strings and chars are lexed by hand-written code.  The reason is
 304 -- that even if we recognise the string or char here in the regex
 305 -- lexer, we would still have to parse the string afterward in order
 306 -- to convert it to a String.
 307 <0,glaexts> {
 308   \'                            { lex_char_tok }
 309   \"                            { lex_string_tok }
 310 }
 311
 312 {
 313 -- work around bug in Alex 2.0
 314 #if __GLASGOW_HASKELL__ < 503
 315 unsafeAt arr i = arr ! i
 316 #endif
 317
 318 -- -----------------------------------------------------------------------------
 319 -- The token type
 320
 321 data Token
 322   = ITas                        -- Haskell keywords
 323   | ITcase
 324   | ITclass
 325   | ITdata
 326   | ITdefault
 327   | ITderiving
 328   | ITdo
 329   | ITelse
 330   | IThiding
 331   | ITif
 332   | ITimport
 333   | ITin
 334   | ITinfix
 335   | ITinfixl
 336   | ITinfixr
 337   | ITinstance
 338   | ITlet
 339   | ITmodule
 340   | ITnewtype
 341   | ITof
 342   | ITqualified
 343   | ITthen
 344   | ITtype
 345   | ITwhere
 346   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 347
 348   | ITforall                    -- GHC extension keywords
 349   | ITforeign
 350   | ITexport
 351   | ITlabel
 352   | ITdynamic
 353   | ITsafe
 354   | ITthreadsafe
 355   | ITunsafe
 356   | ITstdcallconv
 357   | ITccallconv
 358   | ITdotnet
 359   | ITmdo
 360
 361         -- Pragmas
 362   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 363   | ITspec_prag                 -- SPECIALISE
 364   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 365   | ITsource_prag
 366   | ITrules_prag
 367   | ITdeprecated_prag
 368   | ITline_prag
 369   | ITscc_prag
 370   | ITcore_prag                 -- hdaume: core annotations
 371   | ITunpack_prag
 372   | ITclose_prag
 373
 374   | ITdotdot                    -- reserved symbols
 375   | ITcolon
 376   | ITdcolon
 377   | ITequal
 378   | ITlam
 379   | ITvbar
 380   | ITlarrow
 381   | ITrarrow
 382   | ITat
 383   | ITtilde
 384   | ITdarrow
 385   | ITminus
 386   | ITbang
 387   | ITstar
 388   | ITdot
 389
 390   | ITbiglam                    -- GHC-extension symbols
 391
 392   | ITocurly                    -- special symbols
 393   | ITccurly
 394   | ITocurlybar                 -- {|, for type applications
 395   | ITccurlybar                 -- |}, for type applications
 396   | ITvocurly
 397   | ITvccurly
 398   | ITobrack
 399   | ITopabrack                  -- [:, for parallel arrays with -fparr
 400   | ITcpabrack                  -- :], for parallel arrays with -fparr
 401   | ITcbrack
 402   | IToparen
 403   | ITcparen
 404   | IToubxparen
 405   | ITcubxparen
 406   | ITsemi
 407   | ITcomma
 408   | ITunderscore
 409   | ITbackquote
 410
 411   | ITvarid   FastString        -- identifiers
 412   | ITconid   FastString
 413   | ITvarsym  FastString
 414   | ITconsym  FastString
 415   | ITqvarid  (FastString,FastString)
 416   | ITqconid  (FastString,FastString)
 417   | ITqvarsym (FastString,FastString)
 418   | ITqconsym (FastString,FastString)
 419
 420   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 421   | ITsplitipvarid FastString   -- GHC extension: implicit param: %x
 422
 423   | ITpragma StringBuffer
 424
 425   | ITchar       Char
 426   | ITstring     FastString
 427   | ITinteger    Integer
 428   | ITrational   Rational
 429
 430   | ITprimchar   Char
 431   | ITprimstring FastString
 432   | ITprimint    Integer
 433   | ITprimfloat  Rational
 434   | ITprimdouble Rational
 435
 436   -- MetaHaskell extension tokens
 437   | ITopenExpQuote              --  [| or [e|
 438   | ITopenPatQuote              --  [p|
 439   | ITopenDecQuote              --  [d|
 440   | ITopenTypQuote              --  [t|
 441   | ITcloseQuote                --  |]
 442   | ITidEscape   FastString     --  $x
 443   | ITparenEscape               --  $(
 444   | ITvarQuote                  --  '
 445   | ITtyQuote                   --  ''
 446
 447   -- Arrow notation extension
 448   | ITproc
 449   | ITrec
 450   | IToparenbar                 --  (|
 451   | ITcparenbar                 --  |)
 452   | ITlarrowtail                --  -<
 453   | ITrarrowtail                --  >-
 454   | ITLarrowtail                --  -<<
 455   | ITRarrowtail                --  >>-
 456
 457   | ITunknown String            -- Used when the lexer can't make sense of it
 458   | ITeof                       -- end of file token
 459 #ifdef DEBUG
 460   deriving Show -- debugging
 461 #endif
 462
 463 isSpecial :: Token -> Bool
 464 -- If we see M.x, where x is a keyword, but
 465 -- is special, we treat is as just plain M.x,
 466 -- not as a keyword.
 467 isSpecial ITas          = True
 468 isSpecial IThiding      = True
 469 isSpecial ITqualified   = True
 470 isSpecial ITforall      = True
 471 isSpecial ITexport      = True
 472 isSpecial ITlabel       = True
 473 isSpecial ITdynamic     = True
 474 isSpecial ITsafe        = True
 475 isSpecial ITthreadsafe  = True
 476 isSpecial ITunsafe      = True
 477 isSpecial ITccallconv   = True
 478 isSpecial ITstdcallconv = True
 479 isSpecial ITmdo         = True
 480 isSpecial _             = False
 481
 482 -- the bitmap provided as the third component indicates whether the
 483 -- corresponding extension keyword is valid under the extension options
 484 -- provided to the compiler; if the extension corresponding to *any* of the
 485 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 486 -- facilitates using a keyword in two different extensions that can be
 487 -- activated independently)
 488 --
 489 reservedWordsFM = listToUFM $
 490         map (\(x, y, z) -> (mkFastString x, (y, z)))
 491        [( "_",          ITunderscore,   0 ),
 492         ( "as",         ITas,           0 ),
 493         ( "case",       ITcase,         0 ),
 494         ( "class",      ITclass,        0 ),
 495         ( "data",       ITdata,         0 ),
 496         ( "default",    ITdefault,      0 ),
 497         ( "deriving",   ITderiving,     0 ),
 498         ( "do",         ITdo,           0 ),
 499         ( "else",       ITelse,         0 ),
 500         ( "hiding",     IThiding,       0 ),
 501         ( "if",         ITif,           0 ),
 502         ( "import",     ITimport,       0 ),
 503         ( "in",         ITin,           0 ),
 504         ( "infix",      ITinfix,        0 ),
 505         ( "infixl",     ITinfixl,       0 ),
 506         ( "infixr",     ITinfixr,       0 ),
 507         ( "instance",   ITinstance,     0 ),
 508         ( "let",        ITlet,          0 ),
 509         ( "module",     ITmodule,       0 ),
 510         ( "newtype",    ITnewtype,      0 ),
 511         ( "of",         ITof,           0 ),
 512         ( "qualified",  ITqualified,    0 ),
 513         ( "then",       ITthen,         0 ),
 514         ( "type",       ITtype,         0 ),
 515         ( "where",      ITwhere,        0 ),
 516         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 517
 518         ( "forall",     ITforall,        bit tvBit),
 519         ( "mdo",        ITmdo,           bit glaExtsBit),
 520
 521         ( "foreign",    ITforeign,       bit ffiBit),
 522         ( "export",     ITexport,        bit ffiBit),
 523         ( "label",      ITlabel,         bit ffiBit),
 524         ( "dynamic",    ITdynamic,       bit ffiBit),
 525         ( "safe",       ITsafe,          bit ffiBit),
 526         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 527         ( "unsafe",     ITunsafe,        bit ffiBit),
 528         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 529         ( "ccall",      ITccallconv,     bit ffiBit),
 530         ( "dotnet",     ITdotnet,        bit ffiBit),
 531
 532         ( "rec",        ITrec,           bit arrowsBit),
 533         ( "proc",       ITproc,          bit arrowsBit)
 534      ]
 535
 536 reservedSymsFM = listToUFM $
 537         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 538       [ ("..",  ITdotdot,       0)
 539        ,(":",   ITcolon,        0)      -- (:) is a reserved op,
 540                                                 -- meaning only list cons
 541        ,("::",  ITdcolon,       0)
 542        ,("=",   ITequal,        0)
 543        ,("\\",  ITlam,          0)
 544        ,("|",   ITvbar,         0)
 545        ,("<-",  ITlarrow,       0)
 546        ,("->",  ITrarrow,       0)
 547        ,("@",   ITat,           0)
 548        ,("~",   ITtilde,        0)
 549        ,("=>",  ITdarrow,       0)
 550        ,("-",   ITminus,        0)
 551        ,("!",   ITbang,         0)
 552
 553        ,("*",   ITstar,         bit glaExtsBit) -- For data T (a::*) = MkT
 554        ,(".",   ITdot,          bit tvBit)      -- For 'forall a . t'
 555
 556        ,("-<",  ITlarrowtail,   bit arrowsBit)
 557        ,(">-",  ITrarrowtail,   bit arrowsBit)
 558        ,("-<<", ITLarrowtail,   bit arrowsBit)
 559        ,(">>-", ITRarrowtail,   bit arrowsBit)
 560
 561 #if __GLASGOW_HASKELL__ >= 605
 562        ,("λ",  ITlam,          bit glaExtsBit)
 563        ,("∀", ITforall,       bit glaExtsBit)
 564        ,("→",   ITrarrow,     bit glaExtsBit)
 565        ,("←",   ITlarrow,     bit glaExtsBit)
 566        ,("⋯",         ITdotdot,       bit glaExtsBit)
 567 #endif
 568        ]
 569
 570 -- -----------------------------------------------------------------------------
 571 -- Lexer actions
 572
 573 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 574
 575 special :: Token -> Action
 576 special tok span _buf len = return (L span tok)
 577
 578 token, layout_token :: Token -> Action
 579 token t span buf len = return (L span t)
 580 layout_token t span buf len = pushLexState layout >> return (L span t)
 581
 582 idtoken :: (StringBuffer -> Int -> Token) -> Action
 583 idtoken f span buf len = return (L span $! (f buf len))
 584
 585 skip_one_varid :: (FastString -> Token) -> Action
 586 skip_one_varid f span buf len
 587   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 588
 589 strtoken :: (String -> Token) -> Action
 590 strtoken f span buf len =
 591   return (L span $! (f $! lexemeToString buf len))
 592
 593 init_strtoken :: Int -> (String -> Token) -> Action
 594 -- like strtoken, but drops the last N character(s)
 595 init_strtoken drop f span buf len =
 596   return (L span $! (f $! lexemeToString buf (len-drop)))
 597
 598 begin :: Int -> Action
 599 begin code _span _str _len = do pushLexState code; lexToken
 600
 601 pop :: Action
 602 pop _span _buf _len = do popLexState; lexToken
 603
 604 pop_and :: Action -> Action
 605 pop_and act span buf len = do popLexState; act span buf len
 606
 607 notFollowedBy char _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf /= char
 608
 609 notFollowedBySymbol _ _ _ (AI _ _ buf)
 610   = atEnd buf || currentChar buf `notElem` "!#$%&*+./<=>?@\\^|-~"
 611
 612 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 613
 614 ifExtension pred bits _ _ _ = pred bits
 615
 616 {-
 617   nested comments require traversing by hand, they can't be parsed
 618   using regular expressions.
 619 -}
 620 nested_comment :: Action
 621 nested_comment span _str _len = do
 622   input <- getInput
 623   go 1 input
 624   where go 0 input = do setInput input; lexToken
 625         go n input = do
 626           case alexGetChar input of
 627             Nothing  -> err input
 628             Just (c,input) -> do
 629               case c of
 630                 '-' -> do
 631                   case alexGetChar input of
 632                     Nothing  -> err input
 633                     Just ('\125',input) -> go (n-1) input
 634                     Just (c,_)          -> go n input
 635                 '\123' -> do
 636                   case alexGetChar input of
 637                     Nothing  -> err input
 638                     Just ('-',input') -> go (n+1) input'
 639                     Just (c,input)    -> go n input
 640                 c -> go n input
 641
 642         err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 643
 644 open_brace, close_brace :: Action
 645 open_brace span _str _len = do
 646   ctx <- getContext
 647   setContext (NoLayout:ctx)
 648   return (L span ITocurly)
 649 close_brace span _str _len = do
 650   popContext
 651   return (L span ITccurly)
 652
 653 -- We have to be careful not to count M.<varid> as a qualified name
 654 -- when <varid> is a keyword.  We hack around this by catching
 655 -- the offending tokens afterward, and re-lexing in a different state.
 656 check_qvarid span buf len = do
 657   case lookupUFM reservedWordsFM var of
 658         Just (keyword,exts)
 659           | not (isSpecial keyword) ->
 660           if exts == 0
 661              then try_again
 662              else do
 663                 b <- extension (\i -> exts .&. i /= 0)
 664                 if b then try_again
 665                      else return token
 666         _other -> return token
 667   where
 668         (mod,var) = splitQualName buf len
 669         token     = L span (ITqvarid (mod,var))
 670
 671         try_again = do
 672                 (AI _ offs _) <- getInput
 673                 setInput (AI (srcSpanStart span) (offs-len) buf)
 674                 pushLexState bad_qvarid
 675                 lexToken
 676
 677 qvarid buf len = ITqvarid $! splitQualName buf len
 678 qconid buf len = ITqconid $! splitQualName buf len
 679
 680 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 681 -- takes a StringBuffer and a length, and returns the module name
 682 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 683 -- because of hierarchical module names.
 684 splitQualName orig_buf len = split orig_buf orig_buf
 685   where
 686     split buf dot_buf
 687         | orig_buf `byteDiff` buf >= len  = done dot_buf
 688         | c == '.'                        = found_dot buf'
 689         | otherwise                       = split buf' dot_buf
 690       where
 691        (c,buf') = nextChar buf
 692
 693     -- careful, we might get names like M....
 694     -- so, if the character after the dot is not upper-case, this is
 695     -- the end of the qualifier part.
 696     found_dot buf -- buf points after the '.'
 697         | isUpper c    = split buf' buf
 698         | otherwise    = done buf
 699       where
 700        (c,buf') = nextChar buf
 701
 702     done dot_buf =
 703         (lexemeToFastString orig_buf (qual_size - 1),
 704          lexemeToFastString dot_buf (len - qual_size))
 705       where
 706         qual_size = orig_buf `byteDiff` dot_buf
 707
 708 varid span buf len =
 709   case lookupUFM reservedWordsFM fs of
 710         Just (keyword,0)    -> do
 711                 maybe_layout keyword
 712                 return (L span keyword)
 713         Just (keyword,exts) -> do
 714                 b <- extension (\i -> exts .&. i /= 0)
 715                 if b then do maybe_layout keyword
 716                              return (L span keyword)
 717                      else return (L span (ITvarid fs))
 718         _other -> return (L span (ITvarid fs))
 719   where
 720         fs = lexemeToFastString buf len
 721
 722 conid buf len = ITconid fs
 723   where fs = lexemeToFastString buf len
 724
 725 qvarsym buf len = ITqvarsym $! splitQualName buf len
 726 qconsym buf len = ITqconsym $! splitQualName buf len
 727
 728 varsym = sym ITvarsym
 729 consym = sym ITconsym
 730
 731 sym con span buf len =
 732   case lookupUFM reservedSymsFM fs of
 733         Just (keyword,0)    -> return (L span keyword)
 734         Just (keyword,exts) -> do
 735                 b <- extension (\i -> exts .&. i /= 0)
 736                 if b then return (L span keyword)
 737                      else return (L span $! con fs)
 738         _other -> return (L span $! con fs)
 739   where
 740         fs = lexemeToFastString buf len
 741
 742 tok_decimal span buf len
 743   = return (L span (ITinteger  $! parseInteger buf len 10 octDecDigit))
 744
 745 tok_octal span buf len
 746   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 8 octDecDigit))
 747
 748 tok_hexadecimal span buf len
 749   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 16 hexDigit))
 750
 751 prim_decimal span buf len
 752   = return (L span (ITprimint  $! parseInteger buf (len-1) 10 octDecDigit))
 753
 754 prim_octal span buf len
 755   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 8 octDecDigit))
 756
 757 prim_hexadecimal span buf len
 758   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 16 hexDigit))
 759
 760 tok_float        str = ITrational   $! readRational str
 761 prim_float       str = ITprimfloat  $! readRational str
 762 prim_double      str = ITprimdouble $! readRational str
 763
 764 -- -----------------------------------------------------------------------------
 765 -- Layout processing
 766
 767 -- we're at the first token on a line, insert layout tokens if necessary
 768 do_bol :: Action
 769 do_bol span _str _len = do
 770         pos <- getOffside
 771         case pos of
 772             LT -> do
 773                 --trace "layout: inserting '}'" $ do
 774                 popContext
 775                 -- do NOT pop the lex state, we might have a ';' to insert
 776                 return (L span ITvccurly)
 777             EQ -> do
 778                 --trace "layout: inserting ';'" $ do
 779                 popLexState
 780                 return (L span ITsemi)
 781             GT -> do
 782                 popLexState
 783                 lexToken
 784
 785 -- certain keywords put us in the "layout" state, where we might
 786 -- add an opening curly brace.
 787 maybe_layout ITdo       = pushLexState layout_do
 788 maybe_layout ITmdo      = pushLexState layout_do
 789 maybe_layout ITof       = pushLexState layout
 790 maybe_layout ITlet      = pushLexState layout
 791 maybe_layout ITwhere    = pushLexState layout
 792 maybe_layout ITrec      = pushLexState layout
 793 maybe_layout _          = return ()
 794
 795 -- Pushing a new implicit layout context.  If the indentation of the
 796 -- next token is not greater than the previous layout context, then
 797 -- Haskell 98 says that the new layout context should be empty; that is
 798 -- the lexer must generate {}.
 799 --
 800 -- We are slightly more lenient than this: when the new context is started
 801 -- by a 'do', then we allow the new context to be at the same indentation as
 802 -- the previous context.  This is what the 'strict' argument is for.
 803 --
 804 new_layout_context strict span _buf _len = do
 805     popLexState
 806     (AI _ offset _) <- getInput
 807     ctx <- getContext
 808     case ctx of
 809         Layout prev_off : _  |
 810            (strict     && prev_off >= offset  ||
 811             not strict && prev_off > offset) -> do
 812                 -- token is indented to the left of the previous context.
 813                 -- we must generate a {} sequence now.
 814                 pushLexState layout_left
 815                 return (L span ITvocurly)
 816         other -> do
 817                 setContext (Layout offset : ctx)
 818                 return (L span ITvocurly)
 819
 820 do_layout_left span _buf _len = do
 821     popLexState
 822     pushLexState bol  -- we must be at the start of a line
 823     return (L span ITvccurly)
 824
 825 -- -----------------------------------------------------------------------------
 826 -- LINE pragmas
 827
 828 setLine :: Int -> Action
 829 setLine code span buf len = do
 830   let line = parseInteger buf len 10 octDecDigit
 831   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
 832         -- subtract one: the line number refers to the *following* line
 833   popLexState
 834   pushLexState code
 835   lexToken
 836
 837 setFile :: Int -> Action
 838 setFile code span buf len = do
 839   let file = lexemeToFastString (stepOn buf) (len-2)
 840   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
 841   popLexState
 842   pushLexState code
 843   lexToken
 844
 845 -- -----------------------------------------------------------------------------
 846 -- Strings & Chars
 847
 848 -- This stuff is horrible.  I hates it.
 849
 850 lex_string_tok :: Action
 851 lex_string_tok span buf len = do
 852   tok <- lex_string ""
 853   end <- getSrcLoc
 854   return (L (mkSrcSpan (srcSpanStart span) end) tok)
 855
 856 lex_string :: String -> P Token
 857 lex_string s = do
 858   i <- getInput
 859   case alexGetChar' i of
 860     Nothing -> lit_error
 861
 862     Just ('"',i)  -> do
 863         setInput i
 864         glaexts <- extension glaExtsEnabled
 865         if glaexts
 866           then do
 867             i <- getInput
 868             case alexGetChar' i of
 869               Just ('#',i) -> do
 870                    setInput i
 871                    if any (> '\xFF') s
 872                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
 873                     else let s' = mkZFastString (reverse s) in
 874                          return (ITprimstring s')
 875                         -- mkZFastString is a hack to avoid encoding the
 876                         -- string in UTF-8.  We just want the exact bytes.
 877               _other ->
 878                 return (ITstring (mkFastString (reverse s)))
 879           else
 880                 return (ITstring (mkFastString (reverse s)))
 881
 882     Just ('\\',i)
 883         | Just ('&',i) <- next -> do
 884                 setInput i; lex_string s
 885         | Just (c,i) <- next, is_space c -> do
 886                 setInput i; lex_stringgap s
 887         where next = alexGetChar' i
 888
 889     Just (c, i) -> do
 890         c' <- lex_char c i
 891         lex_string (c':s)
 892
 893 lex_stringgap s = do
 894   c <- getCharOrFail
 895   case c of
 896     '\\' -> lex_string s
 897     c | is_space c -> lex_stringgap s
 898     _other -> lit_error
 899
 900
 901 lex_char_tok :: Action
 902 -- Here we are basically parsing character literals, such as 'x' or '\n'
 903 -- but, when Template Haskell is on, we additionally spot
 904 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
 905 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
 906 -- So we have to do two characters of lookahead: when we see 'x we need to
 907 -- see if there's a trailing quote
 908 lex_char_tok span buf len = do  -- We've seen '
 909    i1 <- getInput       -- Look ahead to first character
 910    let loc = srcSpanStart span
 911    case alexGetChar' i1 of
 912         Nothing -> lit_error
 913
 914         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
 915                   th_exts <- extension thEnabled
 916                   if th_exts then do
 917                         setInput i2
 918                         return (L (mkSrcSpan loc end2)  ITtyQuote)
 919                    else lit_error
 920
 921         Just ('\\', i2@(AI end2 _ _)) -> do     -- We've seen 'backslash
 922                   setInput i2
 923                   lit_ch <- lex_escape
 924                   mc <- getCharOrFail   -- Trailing quote
 925                   if mc == '\'' then finish_char_tok loc lit_ch
 926                                 else do setInput i2; lit_error
 927
 928         Just (c, i2@(AI end2 _ _))
 929                 | not (isAny c) -> lit_error
 930                 | otherwise ->
 931
 932                 -- We've seen 'x, where x is a valid character
 933                 --  (i.e. not newline etc) but not a quote or backslash
 934            case alexGetChar' i2 of      -- Look ahead one more character
 935                 Nothing -> lit_error
 936                 Just ('\'', i3) -> do   -- We've seen 'x'
 937                         setInput i3
 938                         finish_char_tok loc c
 939                 _other -> do            -- We've seen 'x not followed by quote
 940                                         -- If TH is on, just parse the quote only
 941                         th_exts <- extension thEnabled
 942                         let (AI end _ _) = i1
 943                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
 944                                    else do setInput i2; lit_error
 945
 946 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
 947 finish_char_tok loc ch  -- We've already seen the closing quote
 948                         -- Just need to check for trailing #
 949   = do  glaexts <- extension glaExtsEnabled
 950         i@(AI end _ _) <- getInput
 951         if glaexts then do
 952                 case alexGetChar' i of
 953                         Just ('#',i@(AI end _ _)) -> do
 954                                 setInput i
 955                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
 956                         _other ->
 957                                 return (L (mkSrcSpan loc end) (ITchar ch))
 958                 else do
 959                    return (L (mkSrcSpan loc end) (ITchar ch))
 960
 961 lex_char :: Char -> AlexInput -> P Char
 962 lex_char c inp = do
 963   case c of
 964       '\\' -> do setInput inp; lex_escape
 965       c | isAny c -> do setInput inp; return c
 966       _other -> lit_error
 967
 968 isAny c | c > '\xff' = isPrint c
 969         | otherwise  = is_any c
 970
 971 lex_escape :: P Char
 972 lex_escape = do
 973   c <- getCharOrFail
 974   case c of
 975         'a'   -> return '\a'
 976         'b'   -> return '\b'
 977         'f'   -> return '\f'
 978         'n'   -> return '\n'
 979         'r'   -> return '\r'
 980         't'   -> return '\t'
 981         'v'   -> return '\v'
 982         '\\'  -> return '\\'
 983         '"'   -> return '\"'
 984         '\''  -> return '\''
 985         '^'   -> do c <- getCharOrFail
 986                     if c >= '@' && c <= '_'
 987                         then return (chr (ord c - ord '@'))
 988                         else lit_error
 989
 990         'x'   -> readNum is_hexdigit 16 hexDigit
 991         'o'   -> readNum is_octdigit  8 octDecDigit
 992         x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
 993
 994         c1 ->  do
 995            i <- getInput
 996            case alexGetChar' i of
 997             Nothing -> lit_error
 998             Just (c2,i2) ->
 999               case alexGetChar' i2 of
1000                 Nothing -> do setInput i2; lit_error
1001                 Just (c3,i3) ->
1002                    let str = [c1,c2,c3] in
1003                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1004                                      Just rest <- [maybePrefixMatch p str] ] of
1005                           (escape_char,[]):_ -> do
1006                                 setInput i3
1007                                 return escape_char
1008                           (escape_char,_:_):_ -> do
1009                                 setInput i2
1010                                 return escape_char
1011                           [] -> lit_error
1012
1013 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1014 readNum is_digit base conv = do
1015   i <- getInput
1016   c <- getCharOrFail
1017   if is_digit c
1018         then readNum2 is_digit base conv (conv c)
1019         else do setInput i; lit_error
1020
1021 readNum2 is_digit base conv i = do
1022   input <- getInput
1023   read i input
1024   where read i input = do
1025           case alexGetChar' input of
1026             Just (c,input') | is_digit c -> do
1027                 read (i*base + conv c) input'
1028             _other -> do
1029                 if i >= 0 && i <= 0x10FFFF
1030                    then do setInput input; return (chr i)
1031                    else lit_error
1032
1033 silly_escape_chars = [
1034         ("NUL", '\NUL'),
1035         ("SOH", '\SOH'),
1036         ("STX", '\STX'),
1037         ("ETX", '\ETX'),
1038         ("EOT", '\EOT'),
1039         ("ENQ", '\ENQ'),
1040         ("ACK", '\ACK'),
1041         ("BEL", '\BEL'),
1042         ("BS", '\BS'),
1043         ("HT", '\HT'),
1044         ("LF", '\LF'),
1045         ("VT", '\VT'),
1046         ("FF", '\FF'),
1047         ("CR", '\CR'),
1048         ("SO", '\SO'),
1049         ("SI", '\SI'),
1050         ("DLE", '\DLE'),
1051         ("DC1", '\DC1'),
1052         ("DC2", '\DC2'),
1053         ("DC3", '\DC3'),
1054         ("DC4", '\DC4'),
1055         ("NAK", '\NAK'),
1056         ("SYN", '\SYN'),
1057         ("ETB", '\ETB'),
1058         ("CAN", '\CAN'),
1059         ("EM", '\EM'),
1060         ("SUB", '\SUB'),
1061         ("ESC", '\ESC'),
1062         ("FS", '\FS'),
1063         ("GS", '\GS'),
1064         ("RS", '\RS'),
1065         ("US", '\US'),
1066         ("SP", '\SP'),
1067         ("DEL", '\DEL')
1068         ]
1069
1070 -- before calling lit_error, ensure that the current input is pointing to
1071 -- the position of the error in the buffer.  This is so that we can report
1072 -- a correct location to the user, but also so we can detect UTF-8 decoding
1073 -- errors if they occur.
1074 lit_error = lexError "lexical error in string/character literal"
1075
1076 getCharOrFail :: P Char
1077 getCharOrFail =  do
1078   i <- getInput
1079   case alexGetChar' i of
1080         Nothing -> lexError "unexpected end-of-file in string/character literal"
1081         Just (c,i)  -> do setInput i; return c
1082
1083 -- -----------------------------------------------------------------------------
1084 -- The Parse Monad
1085
1086 data LayoutContext
1087   = NoLayout
1088   | Layout !Int
1089
1090 data ParseResult a
1091   = POk PState a
1092   | PFailed
1093         SrcSpan         -- The start and end of the text span related to
1094                         -- the error.  Might be used in environments which can
1095                         -- show this span, e.g. by highlighting it.
1096         Message         -- The error message
1097
1098 data PState = PState {
1099         buffer     :: StringBuffer,
1100         last_loc   :: SrcSpan,  -- pos of previous token
1101         last_offs  :: !Int,     -- offset of the previous token from the
1102                                 -- beginning of  the current line.
1103                                 -- \t is equal to 8 spaces.
1104         last_len   :: !Int,     -- len of previous token
1105         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1106         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1107         context    :: [LayoutContext],
1108         lex_state  :: [Int]
1109      }
1110         -- last_loc and last_len are used when generating error messages,
1111         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1112         -- current token to happyError, we could at least get rid of last_len.
1113         -- Getting rid of last_loc would require finding another way to
1114         -- implement pushCurrentContext (which is only called from one place).
1115
1116 newtype P a = P { unP :: PState -> ParseResult a }
1117
1118 instance Monad P where
1119   return = returnP
1120   (>>=) = thenP
1121   fail = failP
1122
1123 returnP :: a -> P a
1124 returnP a = P $ \s -> POk s a
1125
1126 thenP :: P a -> (a -> P b) -> P b
1127 (P m) `thenP` k = P $ \ s ->
1128         case m s of
1129                 POk s1 a         -> (unP (k a)) s1
1130                 PFailed span err -> PFailed span err
1131
1132 failP :: String -> P a
1133 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1134
1135 failMsgP :: String -> P a
1136 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1137
1138 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1139 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1140
1141 failSpanMsgP :: SrcSpan -> String -> P a
1142 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1143
1144 extension :: (Int -> Bool) -> P Bool
1145 extension p = P $ \s -> POk s (p $! extsBitmap s)
1146
1147 getExts :: P Int
1148 getExts = P $ \s -> POk s (extsBitmap s)
1149
1150 setSrcLoc :: SrcLoc -> P ()
1151 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1152
1153 getSrcLoc :: P SrcLoc
1154 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1155
1156 setLastToken :: SrcSpan -> Int -> P ()
1157 setLastToken loc len = P $ \s -> POk s{ last_loc=loc, last_len=len } ()
1158
1159 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1160
1161 alexInputPrevChar :: AlexInput -> Char
1162 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1163
1164 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1165 alexGetChar (AI loc ofs s)
1166   | atEnd s   = Nothing
1167   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1168                 Just (adj_c, (AI loc' ofs' s'))
1169   where (c,s') = nextChar s
1170         loc'   = advanceSrcLoc loc c
1171         ofs'   = advanceOffs c ofs
1172
1173         non_graphic     = '\x0'
1174         upper           = '\x1'
1175         lower           = '\x2'
1176         digit           = '\x3'
1177         symbol          = '\x4'
1178         space           = '\x5'
1179         other_graphic   = '\x6'
1180
1181         adj_c
1182 #if __GLASGOW_HASKELL__ < 605
1183           = c  -- no Unicode support
1184 #else
1185           | c <= '\x06' = non_graphic
1186           | c <= '\xff' = c
1187           | otherwise =
1188                 case generalCategory c of
1189                   UppercaseLetter       -> upper
1190                   LowercaseLetter       -> lower
1191                   TitlecaseLetter       -> upper
1192                   ModifierLetter        -> other_graphic
1193                   OtherLetter           -> other_graphic
1194                   NonSpacingMark        -> other_graphic
1195                   SpacingCombiningMark  -> other_graphic
1196                   EnclosingMark         -> other_graphic
1197                   DecimalNumber         -> digit
1198                   LetterNumber          -> other_graphic
1199                   OtherNumber           -> other_graphic
1200                   ConnectorPunctuation  -> other_graphic
1201                   DashPunctuation       -> other_graphic
1202                   OpenPunctuation       -> other_graphic
1203                   ClosePunctuation      -> other_graphic
1204                   InitialQuote          -> other_graphic
1205                   FinalQuote            -> other_graphic
1206                   OtherPunctuation      -> other_graphic
1207                   MathSymbol            -> symbol
1208                   CurrencySymbol        -> symbol
1209                   ModifierSymbol        -> symbol
1210                   OtherSymbol           -> symbol
1211                   Space                 -> space
1212                   _other                -> non_graphic
1213 #endif
1214
1215 -- This version does not squash unicode characters, it is used when
1216 -- lexing strings.
1217 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1218 alexGetChar' (AI loc ofs s)
1219   | atEnd s   = Nothing
1220   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1221                 Just (c, (AI loc' ofs' s'))
1222   where (c,s') = nextChar s
1223         loc'   = advanceSrcLoc loc c
1224         ofs'   = advanceOffs c ofs
1225
1226 advanceOffs :: Char -> Int -> Int
1227 advanceOffs '\n' offs = 0
1228 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1229 advanceOffs _    offs = offs + 1
1230
1231 getInput :: P AlexInput
1232 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1233
1234 setInput :: AlexInput -> P ()
1235 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1236
1237 pushLexState :: Int -> P ()
1238 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1239
1240 popLexState :: P Int
1241 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1242
1243 getLexState :: P Int
1244 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1245
1246 -- for reasons of efficiency, flags indicating language extensions (eg,
1247 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1248 -- integer
1249
1250 glaExtsBit, ffiBit, parrBit :: Int
1251 glaExtsBit = 0
1252 ffiBit     = 1
1253 parrBit    = 2
1254 arrowsBit  = 4
1255 thBit      = 5
1256 ipBit      = 6
1257 tvBit      = 7  -- Scoped type variables enables 'forall' keyword
1258
1259 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1260 glaExtsEnabled flags = testBit flags glaExtsBit
1261 ffiEnabled     flags = testBit flags ffiBit
1262 parrEnabled    flags = testBit flags parrBit
1263 arrowsEnabled  flags = testBit flags arrowsBit
1264 thEnabled      flags = testBit flags thBit
1265 ipEnabled      flags = testBit flags ipBit
1266 tvEnabled      flags = testBit flags tvBit
1267
1268 -- create a parse state
1269 --
1270 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1271 mkPState buf loc flags  =
1272   PState {
1273       buffer     = buf,
1274       last_loc   = mkSrcSpan loc loc,
1275       last_offs  = 0,
1276       last_len   = 0,
1277       loc        = loc,
1278       extsBitmap = fromIntegral bitmap,
1279       context    = [],
1280       lex_state  = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1281         -- we begin in the layout state if toplev_layout is set
1282     }
1283     where
1284       bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
1285                .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
1286                .|. parrBit    `setBitIf` dopt Opt_PArr        flags
1287                .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
1288                .|. thBit      `setBitIf` dopt Opt_TH          flags
1289                .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
1290                .|. tvBit      `setBitIf` dopt Opt_ScopedTypeVariables flags
1291       --
1292       setBitIf :: Int -> Bool -> Int
1293       b `setBitIf` cond | cond      = bit b
1294                         | otherwise = 0
1295
1296 getContext :: P [LayoutContext]
1297 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1298
1299 setContext :: [LayoutContext] -> P ()
1300 setContext ctx = P $ \s -> POk s{context=ctx} ()
1301
1302 popContext :: P ()
1303 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1304                            loc = loc, last_len = len, last_loc = last_loc }) ->
1305   case ctx of
1306         (_:tl) -> POk s{ context = tl } ()
1307         []     -> PFailed last_loc (srcParseErr buf len)
1308
1309 -- Push a new layout context at the indentation of the last token read.
1310 -- This is only used at the outer level of a module when the 'module'
1311 -- keyword is missing.
1312 pushCurrentContext :: P ()
1313 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_len=len, context=ctx } ->
1314   POk s{context = Layout (offs-len) : ctx} ()
1315
1316 getOffside :: P Ordering
1317 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1318                 let ord = case stk of
1319                         (Layout n:_) -> compare offs n
1320                         _            -> GT
1321                 in POk s ord
1322
1323 -- ---------------------------------------------------------------------------
1324 -- Construct a parse error
1325
1326 srcParseErr
1327   :: StringBuffer       -- current buffer (placed just after the last token)
1328   -> Int                -- length of the previous token
1329   -> Message
1330 srcParseErr buf len
1331   = hcat [ if null token
1332              then ptext SLIT("parse error (possibly incorrect indentation)")
1333              else hcat [ptext SLIT("parse error on input "),
1334                         char '`', text token, char '\'']
1335     ]
1336   where token = lexemeToString (offsetBytes (-len) buf) len
1337
1338 -- Report a parse failure, giving the span of the previous token as
1339 -- the location of the error.  This is the entry point for errors
1340 -- detected during parsing.
1341 srcParseFail :: P a
1342 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1343                             last_loc = last_loc } ->
1344     PFailed last_loc (srcParseErr buf len)
1345
1346 -- A lexical error is reported at a particular position in the source file,
1347 -- not over a token range.
1348 lexError :: String -> P a
1349 lexError str = do
1350   loc <- getSrcLoc
1351   i@(AI end _ buf) <- getInput
1352   reportLexError loc end buf str
1353
1354 -- -----------------------------------------------------------------------------
1355 -- This is the top-level function: called from the parser each time a
1356 -- new token is to be read from the input.
1357
1358 lexer :: (Located Token -> P a) -> P a
1359 lexer cont = do
1360   tok@(L _ tok__) <- lexToken
1361   --trace ("token: " ++ show tok__) $ do
1362   cont tok
1363
1364 lexToken :: P (Located Token)
1365 lexToken = do
1366   inp@(AI loc1 _ buf) <- getInput
1367   sc <- getLexState
1368   exts <- getExts
1369   case alexScanUser exts inp sc of
1370     AlexEOF -> do let span = mkSrcSpan loc1 loc1
1371                   setLastToken span 0
1372                   return (L span ITeof)
1373     AlexError (AI loc2 _ buf) -> do
1374         reportLexError loc1 loc2 buf "lexical error"
1375     AlexSkip inp2 _ -> do
1376         setInput inp2
1377         lexToken
1378     AlexToken inp2@(AI end _ buf2) len t -> do
1379         setInput inp2
1380         let span = mkSrcSpan loc1 end
1381         let bytes = byteDiff buf buf2
1382         span `seq` setLastToken span bytes
1383         t span buf bytes
1384
1385 -- ToDo: Alex reports the buffer at the start of the erroneous lexeme,
1386 -- but it would be more informative to report the location where the
1387 -- error was actually discovered, especially if this is a decoding
1388 -- error.
1389 reportLexError loc1 loc2 buf str =
1390   let
1391         c = fst (nextChar buf)
1392   in
1393   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1394     then failLocMsgP loc2 loc2 "UTF-8 decoding error"
1395     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1396 }