ghc/compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), lexer, mkPState, PState(..),
  26    P(..), ParseResult(..), getSrcLoc,
  27    failLocMsgP, failSpanMsgP, srcParseFail,
  28    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  29    getLexState, popLexState, pushLexState
  30   ) where
  31
  32 #include "HsVersions.h"
  33
  34 import ErrUtils         ( Message )
  35 import Outputable
  36 import StringBuffer
  37 import FastString
  38 import FastTypes
  39 import SrcLoc
  40 import UniqFM
  41 import DynFlags
  42 import Ctype
  43 import Util             ( maybePrefixMatch, readRational )
  44
  45 import DATA_BITS
  46 import Data.Char
  47 import Ratio
  48 --import TRACE
  49 }
  50
  51 $unispace    = \x05
  52 $whitechar   = [\ \t\n\r\f\v\xa0 $unispace]
  53 $white_no_nl = $whitechar # \n
  54
  55 $ascdigit  = 0-9
  56 $unidigit  = \x03
  57 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  58 $digit     = [$ascdigit $unidigit]
  59
  60 $special   = [\(\)\,\;\[\]\`\{\}]
  61 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  62 $unisymbol = \x04
  63 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  64
  65 $unilarge  = \x01
  66 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  67 $large     = [$asclarge $unilarge]
  68
  69 $unismall  = \x02
  70 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  71 $small     = [$ascsmall $unismall \_]
  72
  73 $unigraphic = \x06
  74 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  75
  76 $octit     = 0-7
  77 $hexit     = [$decdigit A-F a-f]
  78 $symchar   = [$symbol \:]
  79 $nl        = [\n\r]
  80 $idchar    = [$small $large $digit \']
  81
  82 @varid     = $small $idchar*
  83 @conid     = $large $idchar*
  84
  85 @varsym    = $symbol $symchar*
  86 @consym    = \: $symchar*
  87
  88 @decimal     = $decdigit+
  89 @octal       = $octit+
  90 @hexadecimal = $hexit+
  91 @exponent    = [eE] [\-\+]? @decimal
  92
  93 -- we support the hierarchical module name extension:
  94 @qual = (@conid \.)+
  95
  96 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
  97
  98 haskell :-
  99
 100 -- everywhere: skip whitespace and comments
 101 $white_no_nl+                           ;
 102
 103 -- Everywhere: deal with nested comments.  We explicitly rule out
 104 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 105 -- (this can happen even though pragmas will normally take precedence due to
 106 -- longest-match, because pragmas aren't valid in every state, but comments
 107 -- are).
 108 "{-" / { notFollowedBy '#' }            { nested_comment }
 109
 110 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 111 -- more dashes followed by a symbol should be parsed as a varsym, so we
 112 -- have to exclude those.
 113 -- The regex says: "munch all the characters after the dashes, as long as
 114 -- the first one is not a symbol".
 115 "--"\-* [^$symbol :] .*                 ;
 116 "--"\-* / { atEOL }                     ;
 117
 118 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 119 -- blank lines) until we find a non-whitespace character, then do layout
 120 -- processing.
 121 --
 122 -- One slight wibble here: what if the line begins with {-#? In
 123 -- theory, we have to lex the pragma to see if it's one we recognise,
 124 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 125 -- as a nested comment.  We don't bother with this: if the line begins
 126 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 127 <bol> {
 128   \n                                    ;
 129   ^\# (line)?                           { begin line_prag1 }
 130   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 131   ^\# \! .* \n                          ; -- #!, for scripts
 132   ()                                    { do_bol }
 133 }
 134
 135 -- after a layout keyword (let, where, do, of), we begin a new layout
 136 -- context if the curly brace is missing.
 137 -- Careful! This stuff is quite delicate.
 138 <layout, layout_do> {
 139   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 140         -- we might encounter {-# here, but {- has been handled already
 141   \n                                    ;
 142   ^\# (line)?                           { begin line_prag1 }
 143 }
 144
 145 -- do is treated in a subtly different way, see new_layout_context
 146 <layout>    ()                          { new_layout_context True }
 147 <layout_do> ()                          { new_layout_context False }
 148
 149 -- after a new layout context which was found to be to the left of the
 150 -- previous context, we have generated a '{' token, and we now need to
 151 -- generate a matching '}' token.
 152 <layout_left>  ()                       { do_layout_left }
 153
 154 <0,glaexts> \n                          { begin bol }
 155
 156 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 157
 158 -- single-line line pragmas, of the form
 159 --    # <line> "<file>" <extra-stuff> \n
 160 <line_prag1> $decdigit+                 { setLine line_prag1a }
 161 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 162 <line_prag1b> .*                        { pop }
 163
 164 -- Haskell-style line pragmas, of the form
 165 --    {-# LINE <line> "<file>" #-}
 166 <line_prag2> $decdigit+                 { setLine line_prag2a }
 167 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 168 <line_prag2b> "#-}"|"-}"                { pop }
 169    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 170    -- with older versions of GHC which generated these.
 171
 172 -- We only want RULES pragmas to be picked up when -fglasgow-exts
 173 -- is on, because the contents of the pragma is always written using
 174 -- glasgow-exts syntax (using forall etc.), so if glasgow exts are not
 175 -- enabled, we're sure to get a parse error.
 176 -- (ToDo: we should really emit a warning when ignoring pragmas)
 177 <glaexts>
 178   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 179
 180 <0,glaexts> {
 181   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 182   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 183                                         { token (ITinline_prag False) }
 184   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 185                                         { token ITspec_prag }
 186   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 187         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 188   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 189         $whitechar* (NO(T?)INLINE|no(t?)inline)
 190                                         { token (ITspec_inline_prag False) }
 191   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 192   "{-#" $whitechar* (DEPRECATED|deprecated)
 193                                         { token ITdeprecated_prag }
 194   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 195   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 196   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 197
 198   "{-#"                                 { nested_comment }
 199
 200   -- ToDo: should only be valid inside a pragma:
 201   "#-}"                                 { token ITclose_prag}
 202 }
 203
 204
 205 -- '0' state: ordinary lexemes
 206 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 207
 208 -- "special" symbols
 209
 210 <0,glaexts> {
 211   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 212   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 213 }
 214
 215 <0,glaexts> {
 216   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 217   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 218   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 219   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 220   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 221   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 222   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 223   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 224 }
 225
 226 <0,glaexts> {
 227   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 228                                         { special IToparenbar }
 229   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 230 }
 231
 232 <0,glaexts> {
 233   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 234   \% @varid / { ifExtension ipEnabled } { skip_one_varid ITsplitipvarid }
 235 }
 236
 237 <glaexts> {
 238   "(#" / { notFollowedBySymbol }        { token IToubxparen }
 239   "#)"                                  { token ITcubxparen }
 240   "{|"                                  { token ITocurlybar }
 241   "|}"                                  { token ITccurlybar }
 242 }
 243
 244 <0,glaexts> {
 245   \(                                    { special IToparen }
 246   \)                                    { special ITcparen }
 247   \[                                    { special ITobrack }
 248   \]                                    { special ITcbrack }
 249   \,                                    { special ITcomma }
 250   \;                                    { special ITsemi }
 251   \`                                    { special ITbackquote }
 252
 253   \{                                    { open_brace }
 254   \}                                    { close_brace }
 255 }
 256
 257 <0,glaexts> {
 258   @qual @varid                  { check_qvarid }
 259   @qual @conid                  { idtoken qconid }
 260   @varid                        { varid }
 261   @conid                        { idtoken conid }
 262 }
 263
 264 -- after an illegal qvarid, such as 'M.let',
 265 -- we back up and try again in the bad_qvarid state:
 266 <bad_qvarid> {
 267   @conid                        { pop_and (idtoken conid) }
 268   @qual @conid                  { pop_and (idtoken qconid) }
 269 }
 270
 271 <glaexts> {
 272   @qual @varid "#"+             { idtoken qvarid }
 273   @qual @conid "#"+             { idtoken qconid }
 274   @varid "#"+                   { varid }
 275   @conid "#"+                   { idtoken conid }
 276 }
 277
 278 -- ToDo: M.(,,,)
 279
 280 <0,glaexts> {
 281   @qual @varsym                 { idtoken qvarsym }
 282   @qual @consym                 { idtoken qconsym }
 283   @varsym                       { varsym }
 284   @consym                       { consym }
 285 }
 286
 287 <0,glaexts> {
 288   @decimal                      { tok_decimal }
 289   0[oO] @octal                  { tok_octal }
 290   0[xX] @hexadecimal            { tok_hexadecimal }
 291 }
 292
 293 <glaexts> {
 294   @decimal \#                   { prim_decimal }
 295   0[oO] @octal \#               { prim_octal }
 296   0[xX] @hexadecimal \#         { prim_hexadecimal }
 297 }
 298
 299 <0,glaexts> @floating_point             { strtoken tok_float }
 300 <glaexts>   @floating_point \#          { init_strtoken 1 prim_float }
 301 <glaexts>   @floating_point \# \#       { init_strtoken 2 prim_double }
 302
 303 -- Strings and chars are lexed by hand-written code.  The reason is
 304 -- that even if we recognise the string or char here in the regex
 305 -- lexer, we would still have to parse the string afterward in order
 306 -- to convert it to a String.
 307 <0,glaexts> {
 308   \'                            { lex_char_tok }
 309   \"                            { lex_string_tok }
 310 }
 311
 312 {
 313 -- work around bug in Alex 2.0
 314 #if __GLASGOW_HASKELL__ < 503
 315 unsafeAt arr i = arr ! i
 316 #endif
 317
 318 -- -----------------------------------------------------------------------------
 319 -- The token type
 320
 321 data Token
 322   = ITas                        -- Haskell keywords
 323   | ITcase
 324   | ITclass
 325   | ITdata
 326   | ITdefault
 327   | ITderiving
 328   | ITdo
 329   | ITelse
 330   | IThiding
 331   | ITif
 332   | ITimport
 333   | ITin
 334   | ITinfix
 335   | ITinfixl
 336   | ITinfixr
 337   | ITinstance
 338   | ITlet
 339   | ITmodule
 340   | ITnewtype
 341   | ITof
 342   | ITqualified
 343   | ITthen
 344   | ITtype
 345   | ITwhere
 346   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 347
 348   | ITforall                    -- GHC extension keywords
 349   | ITforeign
 350   | ITexport
 351   | ITlabel
 352   | ITdynamic
 353   | ITsafe
 354   | ITthreadsafe
 355   | ITunsafe
 356   | ITstdcallconv
 357   | ITccallconv
 358   | ITdotnet
 359   | ITmdo
 360
 361         -- Pragmas
 362   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 363   | ITspec_prag                 -- SPECIALISE
 364   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 365   | ITsource_prag
 366   | ITrules_prag
 367   | ITdeprecated_prag
 368   | ITline_prag
 369   | ITscc_prag
 370   | ITcore_prag                 -- hdaume: core annotations
 371   | ITunpack_prag
 372   | ITclose_prag
 373
 374   | ITdotdot                    -- reserved symbols
 375   | ITcolon
 376   | ITdcolon
 377   | ITequal
 378   | ITlam
 379   | ITvbar
 380   | ITlarrow
 381   | ITrarrow
 382   | ITat
 383   | ITtilde
 384   | ITdarrow
 385   | ITminus
 386   | ITbang
 387   | ITstar
 388   | ITdot
 389
 390   | ITbiglam                    -- GHC-extension symbols
 391
 392   | ITocurly                    -- special symbols
 393   | ITccurly
 394   | ITocurlybar                 -- {|, for type applications
 395   | ITccurlybar                 -- |}, for type applications
 396   | ITvocurly
 397   | ITvccurly
 398   | ITobrack
 399   | ITopabrack                  -- [:, for parallel arrays with -fparr
 400   | ITcpabrack                  -- :], for parallel arrays with -fparr
 401   | ITcbrack
 402   | IToparen
 403   | ITcparen
 404   | IToubxparen
 405   | ITcubxparen
 406   | ITsemi
 407   | ITcomma
 408   | ITunderscore
 409   | ITbackquote
 410
 411   | ITvarid   FastString        -- identifiers
 412   | ITconid   FastString
 413   | ITvarsym  FastString
 414   | ITconsym  FastString
 415   | ITqvarid  (FastString,FastString)
 416   | ITqconid  (FastString,FastString)
 417   | ITqvarsym (FastString,FastString)
 418   | ITqconsym (FastString,FastString)
 419
 420   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 421   | ITsplitipvarid FastString   -- GHC extension: implicit param: %x
 422
 423   | ITpragma StringBuffer
 424
 425   | ITchar       Char
 426   | ITstring     FastString
 427   | ITinteger    Integer
 428   | ITrational   Rational
 429
 430   | ITprimchar   Char
 431   | ITprimstring FastString
 432   | ITprimint    Integer
 433   | ITprimfloat  Rational
 434   | ITprimdouble Rational
 435
 436   -- MetaHaskell extension tokens
 437   | ITopenExpQuote              --  [| or [e|
 438   | ITopenPatQuote              --  [p|
 439   | ITopenDecQuote              --  [d|
 440   | ITopenTypQuote              --  [t|
 441   | ITcloseQuote                --  |]
 442   | ITidEscape   FastString     --  $x
 443   | ITparenEscape               --  $(
 444   | ITvarQuote                  --  '
 445   | ITtyQuote                   --  ''
 446
 447   -- Arrow notation extension
 448   | ITproc
 449   | ITrec
 450   | IToparenbar                 --  (|
 451   | ITcparenbar                 --  |)
 452   | ITlarrowtail                --  -<
 453   | ITrarrowtail                --  >-
 454   | ITLarrowtail                --  -<<
 455   | ITRarrowtail                --  >>-
 456
 457   | ITunknown String            -- Used when the lexer can't make sense of it
 458   | ITeof                       -- end of file token
 459 #ifdef DEBUG
 460   deriving Show -- debugging
 461 #endif
 462
 463 isSpecial :: Token -> Bool
 464 -- If we see M.x, where x is a keyword, but
 465 -- is special, we treat is as just plain M.x,
 466 -- not as a keyword.
 467 isSpecial ITas          = True
 468 isSpecial IThiding      = True
 469 isSpecial ITqualified   = True
 470 isSpecial ITforall      = True
 471 isSpecial ITexport      = True
 472 isSpecial ITlabel       = True
 473 isSpecial ITdynamic     = True
 474 isSpecial ITsafe        = True
 475 isSpecial ITthreadsafe  = True
 476 isSpecial ITunsafe      = True
 477 isSpecial ITccallconv   = True
 478 isSpecial ITstdcallconv = True
 479 isSpecial ITmdo         = True
 480 isSpecial _             = False
 481
 482 -- the bitmap provided as the third component indicates whether the
 483 -- corresponding extension keyword is valid under the extension options
 484 -- provided to the compiler; if the extension corresponding to *any* of the
 485 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 486 -- facilitates using a keyword in two different extensions that can be
 487 -- activated independently)
 488 --
 489 reservedWordsFM = listToUFM $
 490         map (\(x, y, z) -> (mkFastString x, (y, z)))
 491        [( "_",          ITunderscore,   0 ),
 492         ( "as",         ITas,           0 ),
 493         ( "case",       ITcase,         0 ),
 494         ( "class",      ITclass,        0 ),
 495         ( "data",       ITdata,         0 ),
 496         ( "default",    ITdefault,      0 ),
 497         ( "deriving",   ITderiving,     0 ),
 498         ( "do",         ITdo,           0 ),
 499         ( "else",       ITelse,         0 ),
 500         ( "hiding",     IThiding,       0 ),
 501         ( "if",         ITif,           0 ),
 502         ( "import",     ITimport,       0 ),
 503         ( "in",         ITin,           0 ),
 504         ( "infix",      ITinfix,        0 ),
 505         ( "infixl",     ITinfixl,       0 ),
 506         ( "infixr",     ITinfixr,       0 ),
 507         ( "instance",   ITinstance,     0 ),
 508         ( "let",        ITlet,          0 ),
 509         ( "module",     ITmodule,       0 ),
 510         ( "newtype",    ITnewtype,      0 ),
 511         ( "of",         ITof,           0 ),
 512         ( "qualified",  ITqualified,    0 ),
 513         ( "then",       ITthen,         0 ),
 514         ( "type",       ITtype,         0 ),
 515         ( "where",      ITwhere,        0 ),
 516         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 517
 518         ( "forall",     ITforall,        bit tvBit),
 519         ( "mdo",        ITmdo,           bit glaExtsBit),
 520
 521         ( "foreign",    ITforeign,       bit ffiBit),
 522         ( "export",     ITexport,        bit ffiBit),
 523         ( "label",      ITlabel,         bit ffiBit),
 524         ( "dynamic",    ITdynamic,       bit ffiBit),
 525         ( "safe",       ITsafe,          bit ffiBit),
 526         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 527         ( "unsafe",     ITunsafe,        bit ffiBit),
 528         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 529         ( "ccall",      ITccallconv,     bit ffiBit),
 530         ( "dotnet",     ITdotnet,        bit ffiBit),
 531
 532         ( "rec",        ITrec,           bit arrowsBit),
 533         ( "proc",       ITproc,          bit arrowsBit)
 534      ]
 535
 536 reservedSymsFM = listToUFM $
 537         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 538       [ ("..",  ITdotdot,       0)
 539        ,(":",   ITcolon,        0)      -- (:) is a reserved op,
 540                                                 -- meaning only list cons
 541        ,("::",  ITdcolon,       0)
 542        ,("=",   ITequal,        0)
 543        ,("\\",  ITlam,          0)
 544        ,("|",   ITvbar,         0)
 545        ,("<-",  ITlarrow,       0)
 546        ,("->",  ITrarrow,       0)
 547        ,("@",   ITat,           0)
 548        ,("~",   ITtilde,        0)
 549        ,("=>",  ITdarrow,       0)
 550        ,("-",   ITminus,        0)
 551        ,("!",   ITbang,         0)
 552
 553        ,("*",   ITstar,         bit glaExtsBit) -- For data T (a::*) = MkT
 554        ,(".",   ITdot,          bit tvBit)      -- For 'forall a . t'
 555
 556        ,("-<",  ITlarrowtail,   bit arrowsBit)
 557        ,(">-",  ITrarrowtail,   bit arrowsBit)
 558        ,("-<<", ITLarrowtail,   bit arrowsBit)
 559        ,(">>-", ITRarrowtail,   bit arrowsBit)
 560
 561 #if __GLASGOW_HASKELL__ >= 605
 562        ,("∀", ITforall,       bit tvBit)
 563        ,("→",   ITrarrow,     0)
 564        ,("←",   ITlarrow,     0)
 565        ,("⋯",         ITdotdot,       0)
 566 #endif
 567        ]
 568
 569 -- -----------------------------------------------------------------------------
 570 -- Lexer actions
 571
 572 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 573
 574 special :: Token -> Action
 575 special tok span _buf len = return (L span tok)
 576
 577 token, layout_token :: Token -> Action
 578 token t span buf len = return (L span t)
 579 layout_token t span buf len = pushLexState layout >> return (L span t)
 580
 581 idtoken :: (StringBuffer -> Int -> Token) -> Action
 582 idtoken f span buf len = return (L span $! (f buf len))
 583
 584 skip_one_varid :: (FastString -> Token) -> Action
 585 skip_one_varid f span buf len
 586   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 587
 588 strtoken :: (String -> Token) -> Action
 589 strtoken f span buf len =
 590   return (L span $! (f $! lexemeToString buf len))
 591
 592 init_strtoken :: Int -> (String -> Token) -> Action
 593 -- like strtoken, but drops the last N character(s)
 594 init_strtoken drop f span buf len =
 595   return (L span $! (f $! lexemeToString buf (len-drop)))
 596
 597 begin :: Int -> Action
 598 begin code _span _str _len = do pushLexState code; lexToken
 599
 600 pop :: Action
 601 pop _span _buf _len = do popLexState; lexToken
 602
 603 pop_and :: Action -> Action
 604 pop_and act span buf len = do popLexState; act span buf len
 605
 606 notFollowedBy char _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf /= char
 607
 608 notFollowedBySymbol _ _ _ (AI _ _ buf)
 609   = atEnd buf || currentChar buf `notElem` "!#$%&*+./<=>?@\\^|-~"
 610
 611 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 612
 613 ifExtension pred bits _ _ _ = pred bits
 614
 615 {-
 616   nested comments require traversing by hand, they can't be parsed
 617   using regular expressions.
 618 -}
 619 nested_comment :: Action
 620 nested_comment span _str _len = do
 621   input <- getInput
 622   go 1 input
 623   where go 0 input = do setInput input; lexToken
 624         go n input = do
 625           case alexGetChar input of
 626             Nothing  -> err input
 627             Just (c,input) -> do
 628               case c of
 629                 '-' -> do
 630                   case alexGetChar input of
 631                     Nothing  -> err input
 632                     Just ('\125',input) -> go (n-1) input
 633                     Just (c,_)          -> go n input
 634                 '\123' -> do
 635                   case alexGetChar input of
 636                     Nothing  -> err input
 637                     Just ('-',input') -> go (n+1) input'
 638                     Just (c,input)    -> go n input
 639                 c -> go n input
 640
 641         err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 642
 643 open_brace, close_brace :: Action
 644 open_brace span _str _len = do
 645   ctx <- getContext
 646   setContext (NoLayout:ctx)
 647   return (L span ITocurly)
 648 close_brace span _str _len = do
 649   popContext
 650   return (L span ITccurly)
 651
 652 -- We have to be careful not to count M.<varid> as a qualified name
 653 -- when <varid> is a keyword.  We hack around this by catching
 654 -- the offending tokens afterward, and re-lexing in a different state.
 655 check_qvarid span buf len = do
 656   case lookupUFM reservedWordsFM var of
 657         Just (keyword,exts)
 658           | not (isSpecial keyword) ->
 659           if exts == 0
 660              then try_again
 661              else do
 662                 b <- extension (\i -> exts .&. i /= 0)
 663                 if b then try_again
 664                      else return token
 665         _other -> return token
 666   where
 667         (mod,var) = splitQualName buf len
 668         token     = L span (ITqvarid (mod,var))
 669
 670         try_again = do
 671                 (AI _ offs _) <- getInput
 672                 setInput (AI (srcSpanStart span) (offs-len) buf)
 673                 pushLexState bad_qvarid
 674                 lexToken
 675
 676 qvarid buf len = ITqvarid $! splitQualName buf len
 677 qconid buf len = ITqconid $! splitQualName buf len
 678
 679 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 680 -- takes a StringBuffer and a length, and returns the module name
 681 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 682 -- because of hierarchical module names.
 683 splitQualName orig_buf len = split orig_buf orig_buf
 684   where
 685     split buf dot_buf
 686         | orig_buf `byteDiff` buf >= len  = done dot_buf
 687         | c == '.'                        = found_dot buf'
 688         | otherwise                       = split buf' dot_buf
 689       where
 690        (c,buf') = nextChar buf
 691
 692     -- careful, we might get names like M....
 693     -- so, if the character after the dot is not upper-case, this is
 694     -- the end of the qualifier part.
 695     found_dot buf -- buf points after the '.'
 696         | isUpper c    = split buf' buf
 697         | otherwise    = done buf
 698       where
 699        (c,buf') = nextChar buf
 700
 701     done dot_buf =
 702         (lexemeToFastString orig_buf (qual_size - 1),
 703          lexemeToFastString dot_buf (len - qual_size))
 704       where
 705         qual_size = orig_buf `byteDiff` dot_buf
 706
 707 varid span buf len =
 708   case lookupUFM reservedWordsFM fs of
 709         Just (keyword,0)    -> do
 710                 maybe_layout keyword
 711                 return (L span keyword)
 712         Just (keyword,exts) -> do
 713                 b <- extension (\i -> exts .&. i /= 0)
 714                 if b then do maybe_layout keyword
 715                              return (L span keyword)
 716                      else return (L span (ITvarid fs))
 717         _other -> return (L span (ITvarid fs))
 718   where
 719         fs = lexemeToFastString buf len
 720
 721 conid buf len = ITconid fs
 722   where fs = lexemeToFastString buf len
 723
 724 qvarsym buf len = ITqvarsym $! splitQualName buf len
 725 qconsym buf len = ITqconsym $! splitQualName buf len
 726
 727 varsym = sym ITvarsym
 728 consym = sym ITconsym
 729
 730 sym con span buf len =
 731   case lookupUFM reservedSymsFM fs of
 732         Just (keyword,0)    -> return (L span keyword)
 733         Just (keyword,exts) -> do
 734                 b <- extension (\i -> exts .&. i /= 0)
 735                 if b then return (L span keyword)
 736                      else return (L span $! con fs)
 737         _other -> return (L span $! con fs)
 738   where
 739         fs = lexemeToFastString buf len
 740
 741 tok_decimal span buf len
 742   = return (L span (ITinteger  $! parseInteger buf len 10 octDecDigit))
 743
 744 tok_octal span buf len
 745   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 8 octDecDigit))
 746
 747 tok_hexadecimal span buf len
 748   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 16 hexDigit))
 749
 750 prim_decimal span buf len
 751   = return (L span (ITprimint  $! parseInteger buf (len-1) 10 octDecDigit))
 752
 753 prim_octal span buf len
 754   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 8 octDecDigit))
 755
 756 prim_hexadecimal span buf len
 757   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 16 hexDigit))
 758
 759 tok_float        str = ITrational   $! readRational str
 760 prim_float       str = ITprimfloat  $! readRational str
 761 prim_double      str = ITprimdouble $! readRational str
 762
 763 -- -----------------------------------------------------------------------------
 764 -- Layout processing
 765
 766 -- we're at the first token on a line, insert layout tokens if necessary
 767 do_bol :: Action
 768 do_bol span _str _len = do
 769         pos <- getOffside
 770         case pos of
 771             LT -> do
 772                 --trace "layout: inserting '}'" $ do
 773                 popContext
 774                 -- do NOT pop the lex state, we might have a ';' to insert
 775                 return (L span ITvccurly)
 776             EQ -> do
 777                 --trace "layout: inserting ';'" $ do
 778                 popLexState
 779                 return (L span ITsemi)
 780             GT -> do
 781                 popLexState
 782                 lexToken
 783
 784 -- certain keywords put us in the "layout" state, where we might
 785 -- add an opening curly brace.
 786 maybe_layout ITdo       = pushLexState layout_do
 787 maybe_layout ITmdo      = pushLexState layout_do
 788 maybe_layout ITof       = pushLexState layout
 789 maybe_layout ITlet      = pushLexState layout
 790 maybe_layout ITwhere    = pushLexState layout
 791 maybe_layout ITrec      = pushLexState layout
 792 maybe_layout _          = return ()
 793
 794 -- Pushing a new implicit layout context.  If the indentation of the
 795 -- next token is not greater than the previous layout context, then
 796 -- Haskell 98 says that the new layout context should be empty; that is
 797 -- the lexer must generate {}.
 798 --
 799 -- We are slightly more lenient than this: when the new context is started
 800 -- by a 'do', then we allow the new context to be at the same indentation as
 801 -- the previous context.  This is what the 'strict' argument is for.
 802 --
 803 new_layout_context strict span _buf _len = do
 804     popLexState
 805     (AI _ offset _) <- getInput
 806     ctx <- getContext
 807     case ctx of
 808         Layout prev_off : _  |
 809            (strict     && prev_off >= offset  ||
 810             not strict && prev_off > offset) -> do
 811                 -- token is indented to the left of the previous context.
 812                 -- we must generate a {} sequence now.
 813                 pushLexState layout_left
 814                 return (L span ITvocurly)
 815         other -> do
 816                 setContext (Layout offset : ctx)
 817                 return (L span ITvocurly)
 818
 819 do_layout_left span _buf _len = do
 820     popLexState
 821     pushLexState bol  -- we must be at the start of a line
 822     return (L span ITvccurly)
 823
 824 -- -----------------------------------------------------------------------------
 825 -- LINE pragmas
 826
 827 setLine :: Int -> Action
 828 setLine code span buf len = do
 829   let line = parseInteger buf len 10 octDecDigit
 830   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
 831         -- subtract one: the line number refers to the *following* line
 832   popLexState
 833   pushLexState code
 834   lexToken
 835
 836 setFile :: Int -> Action
 837 setFile code span buf len = do
 838   let file = lexemeToFastString (stepOn buf) (len-2)
 839   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
 840   popLexState
 841   pushLexState code
 842   lexToken
 843
 844 -- -----------------------------------------------------------------------------
 845 -- Strings & Chars
 846
 847 -- This stuff is horrible.  I hates it.
 848
 849 lex_string_tok :: Action
 850 lex_string_tok span buf len = do
 851   tok <- lex_string ""
 852   end <- getSrcLoc
 853   return (L (mkSrcSpan (srcSpanStart span) end) tok)
 854
 855 lex_string :: String -> P Token
 856 lex_string s = do
 857   i <- getInput
 858   case alexGetChar' i of
 859     Nothing -> lit_error
 860
 861     Just ('"',i)  -> do
 862         setInput i
 863         glaexts <- extension glaExtsEnabled
 864         if glaexts
 865           then do
 866             i <- getInput
 867             case alexGetChar' i of
 868               Just ('#',i) -> do
 869                    setInput i
 870                    if any (> '\xFF') s
 871                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
 872                     else let s' = mkZFastString (reverse s) in
 873                          return (ITprimstring s')
 874                         -- mkZFastString is a hack to avoid encoding the
 875                         -- string in UTF-8.  We just want the exact bytes.
 876               _other ->
 877                 return (ITstring (mkFastString (reverse s)))
 878           else
 879                 return (ITstring (mkFastString (reverse s)))
 880
 881     Just ('\\',i)
 882         | Just ('&',i) <- next -> do
 883                 setInput i; lex_string s
 884         | Just (c,i) <- next, is_space c -> do
 885                 setInput i; lex_stringgap s
 886         where next = alexGetChar' i
 887
 888     Just (c, i) -> do
 889         c' <- lex_char c i
 890         lex_string (c':s)
 891
 892 lex_stringgap s = do
 893   c <- getCharOrFail
 894   case c of
 895     '\\' -> lex_string s
 896     c | is_space c -> lex_stringgap s
 897     _other -> lit_error
 898
 899
 900 lex_char_tok :: Action
 901 -- Here we are basically parsing character literals, such as 'x' or '\n'
 902 -- but, when Template Haskell is on, we additionally spot
 903 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
 904 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
 905 -- So we have to do two characters of lookahead: when we see 'x we need to
 906 -- see if there's a trailing quote
 907 lex_char_tok span buf len = do  -- We've seen '
 908    i1 <- getInput       -- Look ahead to first character
 909    let loc = srcSpanStart span
 910    case alexGetChar' i1 of
 911         Nothing -> lit_error
 912
 913         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
 914                   th_exts <- extension thEnabled
 915                   if th_exts then do
 916                         setInput i2
 917                         return (L (mkSrcSpan loc end2)  ITtyQuote)
 918                    else lit_error
 919
 920         Just ('\\', i2@(AI end2 _ _)) -> do     -- We've seen 'backslash
 921                   setInput i2
 922                   lit_ch <- lex_escape
 923                   mc <- getCharOrFail   -- Trailing quote
 924                   if mc == '\'' then finish_char_tok loc lit_ch
 925                                 else do setInput i2; lit_error
 926
 927         Just (c, i2@(AI end2 _ _))
 928                 | not (isAny c) -> lit_error
 929                 | otherwise ->
 930
 931                 -- We've seen 'x, where x is a valid character
 932                 --  (i.e. not newline etc) but not a quote or backslash
 933            case alexGetChar' i2 of      -- Look ahead one more character
 934                 Nothing -> lit_error
 935                 Just ('\'', i3) -> do   -- We've seen 'x'
 936                         setInput i3
 937                         finish_char_tok loc c
 938                 _other -> do            -- We've seen 'x not followed by quote
 939                                         -- If TH is on, just parse the quote only
 940                         th_exts <- extension thEnabled
 941                         let (AI end _ _) = i1
 942                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
 943                                    else do setInput i2; lit_error
 944
 945 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
 946 finish_char_tok loc ch  -- We've already seen the closing quote
 947                         -- Just need to check for trailing #
 948   = do  glaexts <- extension glaExtsEnabled
 949         i@(AI end _ _) <- getInput
 950         if glaexts then do
 951                 case alexGetChar' i of
 952                         Just ('#',i@(AI end _ _)) -> do
 953                                 setInput i
 954                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
 955                         _other ->
 956                                 return (L (mkSrcSpan loc end) (ITchar ch))
 957                 else do
 958                    return (L (mkSrcSpan loc end) (ITchar ch))
 959
 960 lex_char :: Char -> AlexInput -> P Char
 961 lex_char c inp = do
 962   case c of
 963       '\\' -> do setInput inp; lex_escape
 964       c | isAny c -> do setInput inp; return c
 965       _other -> lit_error
 966
 967 isAny c | c > '\xff' = isPrint c
 968         | otherwise  = is_any c
 969
 970 lex_escape :: P Char
 971 lex_escape = do
 972   c <- getCharOrFail
 973   case c of
 974         'a'   -> return '\a'
 975         'b'   -> return '\b'
 976         'f'   -> return '\f'
 977         'n'   -> return '\n'
 978         'r'   -> return '\r'
 979         't'   -> return '\t'
 980         'v'   -> return '\v'
 981         '\\'  -> return '\\'
 982         '"'   -> return '\"'
 983         '\''  -> return '\''
 984         '^'   -> do c <- getCharOrFail
 985                     if c >= '@' && c <= '_'
 986                         then return (chr (ord c - ord '@'))
 987                         else lit_error
 988
 989         'x'   -> readNum is_hexdigit 16 hexDigit
 990         'o'   -> readNum is_octdigit  8 octDecDigit
 991         x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
 992
 993         c1 ->  do
 994            i <- getInput
 995            case alexGetChar' i of
 996             Nothing -> lit_error
 997             Just (c2,i2) ->
 998               case alexGetChar' i2 of
 999                 Nothing -> do setInput i2; lit_error
1000                 Just (c3,i3) ->
1001                    let str = [c1,c2,c3] in
1002                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1003                                      Just rest <- [maybePrefixMatch p str] ] of
1004                           (escape_char,[]):_ -> do
1005                                 setInput i3
1006                                 return escape_char
1007                           (escape_char,_:_):_ -> do
1008                                 setInput i2
1009                                 return escape_char
1010                           [] -> lit_error
1011
1012 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1013 readNum is_digit base conv = do
1014   i <- getInput
1015   c <- getCharOrFail
1016   if is_digit c
1017         then readNum2 is_digit base conv (conv c)
1018         else do setInput i; lit_error
1019
1020 readNum2 is_digit base conv i = do
1021   input <- getInput
1022   read i input
1023   where read i input = do
1024           case alexGetChar' input of
1025             Just (c,input') | is_digit c -> do
1026                 read (i*base + conv c) input'
1027             _other -> do
1028                 if i >= 0 && i <= 0x10FFFF
1029                    then do setInput input; return (chr i)
1030                    else lit_error
1031
1032 silly_escape_chars = [
1033         ("NUL", '\NUL'),
1034         ("SOH", '\SOH'),
1035         ("STX", '\STX'),
1036         ("ETX", '\ETX'),
1037         ("EOT", '\EOT'),
1038         ("ENQ", '\ENQ'),
1039         ("ACK", '\ACK'),
1040         ("BEL", '\BEL'),
1041         ("BS", '\BS'),
1042         ("HT", '\HT'),
1043         ("LF", '\LF'),
1044         ("VT", '\VT'),
1045         ("FF", '\FF'),
1046         ("CR", '\CR'),
1047         ("SO", '\SO'),
1048         ("SI", '\SI'),
1049         ("DLE", '\DLE'),
1050         ("DC1", '\DC1'),
1051         ("DC2", '\DC2'),
1052         ("DC3", '\DC3'),
1053         ("DC4", '\DC4'),
1054         ("NAK", '\NAK'),
1055         ("SYN", '\SYN'),
1056         ("ETB", '\ETB'),
1057         ("CAN", '\CAN'),
1058         ("EM", '\EM'),
1059         ("SUB", '\SUB'),
1060         ("ESC", '\ESC'),
1061         ("FS", '\FS'),
1062         ("GS", '\GS'),
1063         ("RS", '\RS'),
1064         ("US", '\US'),
1065         ("SP", '\SP'),
1066         ("DEL", '\DEL')
1067         ]
1068
1069 -- before calling lit_error, ensure that the current input is pointing to
1070 -- the position of the error in the buffer.  This is so that we can report
1071 -- a correct location to the user, but also so we can detect UTF-8 decoding
1072 -- errors if they occur.
1073 lit_error = lexError "lexical error in string/character literal"
1074
1075 getCharOrFail :: P Char
1076 getCharOrFail =  do
1077   i <- getInput
1078   case alexGetChar' i of
1079         Nothing -> lexError "unexpected end-of-file in string/character literal"
1080         Just (c,i)  -> do setInput i; return c
1081
1082 -- -----------------------------------------------------------------------------
1083 -- The Parse Monad
1084
1085 data LayoutContext
1086   = NoLayout
1087   | Layout !Int
1088
1089 data ParseResult a
1090   = POk PState a
1091   | PFailed
1092         SrcSpan         -- The start and end of the text span related to
1093                         -- the error.  Might be used in environments which can
1094                         -- show this span, e.g. by highlighting it.
1095         Message         -- The error message
1096
1097 data PState = PState {
1098         buffer     :: StringBuffer,
1099         last_loc   :: SrcSpan,  -- pos of previous token
1100         last_offs  :: !Int,     -- offset of the previous token from the
1101                                 -- beginning of  the current line.
1102                                 -- \t is equal to 8 spaces.
1103         last_len   :: !Int,     -- len of previous token
1104         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1105         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1106         context    :: [LayoutContext],
1107         lex_state  :: [Int]
1108      }
1109         -- last_loc and last_len are used when generating error messages,
1110         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1111         -- current token to happyError, we could at least get rid of last_len.
1112         -- Getting rid of last_loc would require finding another way to
1113         -- implement pushCurrentContext (which is only called from one place).
1114
1115 newtype P a = P { unP :: PState -> ParseResult a }
1116
1117 instance Monad P where
1118   return = returnP
1119   (>>=) = thenP
1120   fail = failP
1121
1122 returnP :: a -> P a
1123 returnP a = P $ \s -> POk s a
1124
1125 thenP :: P a -> (a -> P b) -> P b
1126 (P m) `thenP` k = P $ \ s ->
1127         case m s of
1128                 POk s1 a         -> (unP (k a)) s1
1129                 PFailed span err -> PFailed span err
1130
1131 failP :: String -> P a
1132 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1133
1134 failMsgP :: String -> P a
1135 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1136
1137 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1138 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1139
1140 failSpanMsgP :: SrcSpan -> String -> P a
1141 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1142
1143 extension :: (Int -> Bool) -> P Bool
1144 extension p = P $ \s -> POk s (p $! extsBitmap s)
1145
1146 getExts :: P Int
1147 getExts = P $ \s -> POk s (extsBitmap s)
1148
1149 setSrcLoc :: SrcLoc -> P ()
1150 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1151
1152 getSrcLoc :: P SrcLoc
1153 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1154
1155 setLastToken :: SrcSpan -> Int -> P ()
1156 setLastToken loc len = P $ \s -> POk s{ last_loc=loc, last_len=len } ()
1157
1158 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1159
1160 alexInputPrevChar :: AlexInput -> Char
1161 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1162
1163 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1164 alexGetChar (AI loc ofs s)
1165   | atEnd s   = Nothing
1166   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1167                 Just (adj_c, (AI loc' ofs' s'))
1168   where (c,s') = nextChar s
1169         loc'   = advanceSrcLoc loc c
1170         ofs'   = advanceOffs c ofs
1171
1172         non_graphic     = '\x0'
1173         upper           = '\x1'
1174         lower           = '\x2'
1175         digit           = '\x3'
1176         symbol          = '\x4'
1177         space           = '\x5'
1178         other_graphic   = '\x6'
1179
1180         adj_c
1181 #if __GLASGOW_HASKELL__ < 605
1182           = c  -- no Unicode support
1183 #else
1184           | c <= '\x06' = non_graphic
1185           | c <= '\xff' = c
1186           | otherwise =
1187                 case generalCategory c of
1188                   UppercaseLetter       -> upper
1189                   LowercaseLetter       -> lower
1190                   TitlecaseLetter       -> upper
1191                   ModifierLetter        -> other_graphic
1192                   OtherLetter           -> other_graphic
1193                   NonSpacingMark        -> other_graphic
1194                   SpacingCombiningMark  -> other_graphic
1195                   EnclosingMark         -> other_graphic
1196                   DecimalNumber         -> digit
1197                   LetterNumber          -> other_graphic
1198                   OtherNumber           -> other_graphic
1199                   ConnectorPunctuation  -> other_graphic
1200                   DashPunctuation       -> other_graphic
1201                   OpenPunctuation       -> other_graphic
1202                   ClosePunctuation      -> other_graphic
1203                   InitialQuote          -> other_graphic
1204                   FinalQuote            -> other_graphic
1205                   OtherPunctuation      -> other_graphic
1206                   MathSymbol            -> symbol
1207                   CurrencySymbol        -> symbol
1208                   ModifierSymbol        -> symbol
1209                   OtherSymbol           -> symbol
1210                   Space                 -> space
1211                   _other                -> non_graphic
1212 #endif
1213
1214 -- This version does not squash unicode characters, it is used when
1215 -- lexing strings.
1216 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1217 alexGetChar' (AI loc ofs s)
1218   | atEnd s   = Nothing
1219   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1220                 Just (c, (AI loc' ofs' s'))
1221   where (c,s') = nextChar s
1222         loc'   = advanceSrcLoc loc c
1223         ofs'   = advanceOffs c ofs
1224
1225 advanceOffs :: Char -> Int -> Int
1226 advanceOffs '\n' offs = 0
1227 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1228 advanceOffs _    offs = offs + 1
1229
1230 getInput :: P AlexInput
1231 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1232
1233 setInput :: AlexInput -> P ()
1234 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1235
1236 pushLexState :: Int -> P ()
1237 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1238
1239 popLexState :: P Int
1240 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1241
1242 getLexState :: P Int
1243 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1244
1245 -- for reasons of efficiency, flags indicating language extensions (eg,
1246 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1247 -- integer
1248
1249 glaExtsBit, ffiBit, parrBit :: Int
1250 glaExtsBit = 0
1251 ffiBit     = 1
1252 parrBit    = 2
1253 arrowsBit  = 4
1254 thBit      = 5
1255 ipBit      = 6
1256 tvBit      = 7  -- Scoped type variables enables 'forall' keyword
1257
1258 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1259 glaExtsEnabled flags = testBit flags glaExtsBit
1260 ffiEnabled     flags = testBit flags ffiBit
1261 parrEnabled    flags = testBit flags parrBit
1262 arrowsEnabled  flags = testBit flags arrowsBit
1263 thEnabled      flags = testBit flags thBit
1264 ipEnabled      flags = testBit flags ipBit
1265 tvEnabled      flags = testBit flags tvBit
1266
1267 -- create a parse state
1268 --
1269 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1270 mkPState buf loc flags  =
1271   PState {
1272       buffer     = buf,
1273       last_loc   = mkSrcSpan loc loc,
1274       last_offs  = 0,
1275       last_len   = 0,
1276       loc        = loc,
1277       extsBitmap = fromIntegral bitmap,
1278       context    = [],
1279       lex_state  = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1280         -- we begin in the layout state if toplev_layout is set
1281     }
1282     where
1283       bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
1284                .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
1285                .|. parrBit    `setBitIf` dopt Opt_PArr        flags
1286                .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
1287                .|. thBit      `setBitIf` dopt Opt_TH          flags
1288                .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
1289                .|. tvBit      `setBitIf` dopt Opt_ScopedTypeVariables flags
1290       --
1291       setBitIf :: Int -> Bool -> Int
1292       b `setBitIf` cond | cond      = bit b
1293                         | otherwise = 0
1294
1295 getContext :: P [LayoutContext]
1296 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1297
1298 setContext :: [LayoutContext] -> P ()
1299 setContext ctx = P $ \s -> POk s{context=ctx} ()
1300
1301 popContext :: P ()
1302 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1303                            loc = loc, last_len = len, last_loc = last_loc }) ->
1304   case ctx of
1305         (_:tl) -> POk s{ context = tl } ()
1306         []     -> PFailed last_loc (srcParseErr buf len)
1307
1308 -- Push a new layout context at the indentation of the last token read.
1309 -- This is only used at the outer level of a module when the 'module'
1310 -- keyword is missing.
1311 pushCurrentContext :: P ()
1312 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_len=len, context=ctx } ->
1313   POk s{context = Layout (offs-len) : ctx} ()
1314
1315 getOffside :: P Ordering
1316 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1317                 let ord = case stk of
1318                         (Layout n:_) -> compare offs n
1319                         _            -> GT
1320                 in POk s ord
1321
1322 -- ---------------------------------------------------------------------------
1323 -- Construct a parse error
1324
1325 srcParseErr
1326   :: StringBuffer       -- current buffer (placed just after the last token)
1327   -> Int                -- length of the previous token
1328   -> Message
1329 srcParseErr buf len
1330   = hcat [ if null token
1331              then ptext SLIT("parse error (possibly incorrect indentation)")
1332              else hcat [ptext SLIT("parse error on input "),
1333                         char '`', text token, char '\'']
1334     ]
1335   where token = lexemeToString (offsetBytes (-len) buf) len
1336
1337 -- Report a parse failure, giving the span of the previous token as
1338 -- the location of the error.  This is the entry point for errors
1339 -- detected during parsing.
1340 srcParseFail :: P a
1341 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1342                             last_loc = last_loc } ->
1343     PFailed last_loc (srcParseErr buf len)
1344
1345 -- A lexical error is reported at a particular position in the source file,
1346 -- not over a token range.
1347 lexError :: String -> P a
1348 lexError str = do
1349   loc <- getSrcLoc
1350   i@(AI end _ buf) <- getInput
1351   reportLexError loc end buf str
1352
1353 -- -----------------------------------------------------------------------------
1354 -- This is the top-level function: called from the parser each time a
1355 -- new token is to be read from the input.
1356
1357 lexer :: (Located Token -> P a) -> P a
1358 lexer cont = do
1359   tok@(L _ tok__) <- lexToken
1360   --trace ("token: " ++ show tok__) $ do
1361   cont tok
1362
1363 lexToken :: P (Located Token)
1364 lexToken = do
1365   inp@(AI loc1 _ buf) <- getInput
1366   sc <- getLexState
1367   exts <- getExts
1368   case alexScanUser exts inp sc of
1369     AlexEOF -> do let span = mkSrcSpan loc1 loc1
1370                   setLastToken span 0
1371                   return (L span ITeof)
1372     AlexError (AI loc2 _ buf) -> do
1373         reportLexError loc1 loc2 buf "lexical error"
1374     AlexSkip inp2 _ -> do
1375         setInput inp2
1376         lexToken
1377     AlexToken inp2@(AI end _ buf2) len t -> do
1378         setInput inp2
1379         let span = mkSrcSpan loc1 end
1380         let bytes = byteDiff buf buf2
1381         span `seq` setLastToken span bytes
1382         t span buf bytes
1383
1384 -- ToDo: Alex reports the buffer at the start of the erroneous lexeme,
1385 -- but it would be more informative to report the location where the
1386 -- error was actually discovered, especially if this is a decoding
1387 -- error.
1388 reportLexError loc1 loc2 buf str =
1389   let
1390         c = fst (nextChar buf)
1391   in
1392   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1393     then failLocMsgP loc2 loc2 "UTF-8 decoding error"
1394     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1395 }