ghc/compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), lexer, pragState, mkPState, PState(..),
  26    P(..), ParseResult(..), getSrcLoc,
  27    failLocMsgP, failSpanMsgP, srcParseFail,
  28    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  29    getLexState, popLexState, pushLexState,
  30    extension, bangPatEnabled
  31   ) where
  32
  33 #include "HsVersions.h"
  34
  35 import ErrUtils         ( Message )
  36 import Outputable
  37 import StringBuffer
  38 import FastString
  39 import FastTypes
  40 import SrcLoc
  41 import UniqFM
  42 import DynFlags
  43 import Ctype
  44 import Util             ( maybePrefixMatch, readRational )
  45
  46 import DATA_BITS
  47 import Data.Char        ( chr )
  48 import Ratio
  49 --import TRACE
  50
  51 #if __GLASGOW_HASKELL__ >= 605
  52 import Data.Char        ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  53 #else
  54 import Compat.Unicode   ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  55 #endif
  56 }
  57
  58 $unispace    = \x05
  59 $whitechar   = [\ \t\n\r\f\v\xa0 $unispace]
  60 $white_no_nl = $whitechar # \n
  61
  62 $ascdigit  = 0-9
  63 $unidigit  = \x03
  64 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  65 $digit     = [$ascdigit $unidigit]
  66
  67 $special   = [\(\)\,\;\[\]\`\{\}]
  68 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  69 $unisymbol = \x04
  70 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  71
  72 $unilarge  = \x01
  73 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  74 $large     = [$asclarge $unilarge]
  75
  76 $unismall  = \x02
  77 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  78 $small     = [$ascsmall $unismall \_]
  79
  80 $unigraphic = \x06
  81 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  82
  83 $octit     = 0-7
  84 $hexit     = [$decdigit A-F a-f]
  85 $symchar   = [$symbol \:]
  86 $nl        = [\n\r]
  87 $idchar    = [$small $large $digit \']
  88
  89 @varid     = $small $idchar*
  90 @conid     = $large $idchar*
  91
  92 @varsym    = $symbol $symchar*
  93 @consym    = \: $symchar*
  94
  95 @decimal     = $decdigit+
  96 @octal       = $octit+
  97 @hexadecimal = $hexit+
  98 @exponent    = [eE] [\-\+]? @decimal
  99
 100 -- we support the hierarchical module name extension:
 101 @qual = (@conid \.)+
 102
 103 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 104
 105 haskell :-
 106
 107 -- everywhere: skip whitespace and comments
 108 $white_no_nl+                           ;
 109
 110 -- Everywhere: deal with nested comments.  We explicitly rule out
 111 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 112 -- (this can happen even though pragmas will normally take precedence due to
 113 -- longest-match, because pragmas aren't valid in every state, but comments
 114 -- are).
 115 "{-" / { notFollowedBy '#' }            { nested_comment }
 116
 117 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 118 -- more dashes followed by a symbol should be parsed as a varsym, so we
 119 -- have to exclude those.
 120 -- The regex says: "munch all the characters after the dashes, as long as
 121 -- the first one is not a symbol".
 122 "--"\-* [^$symbol :] .*                 ;
 123 "--"\-* / { atEOL }                     ;
 124
 125 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 126 -- blank lines) until we find a non-whitespace character, then do layout
 127 -- processing.
 128 --
 129 -- One slight wibble here: what if the line begins with {-#? In
 130 -- theory, we have to lex the pragma to see if it's one we recognise,
 131 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 132 -- as a nested comment.  We don't bother with this: if the line begins
 133 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 134 <bol> {
 135   \n                                    ;
 136   ^\# (line)?                           { begin line_prag1 }
 137   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 138   ^\# \! .* \n                          ; -- #!, for scripts
 139   ()                                    { do_bol }
 140 }
 141
 142 -- after a layout keyword (let, where, do, of), we begin a new layout
 143 -- context if the curly brace is missing.
 144 -- Careful! This stuff is quite delicate.
 145 <layout, layout_do> {
 146   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 147         -- we might encounter {-# here, but {- has been handled already
 148   \n                                    ;
 149   ^\# (line)?                           { begin line_prag1 }
 150 }
 151
 152 -- do is treated in a subtly different way, see new_layout_context
 153 <layout>    ()                          { new_layout_context True }
 154 <layout_do> ()                          { new_layout_context False }
 155
 156 -- after a new layout context which was found to be to the left of the
 157 -- previous context, we have generated a '{' token, and we now need to
 158 -- generate a matching '}' token.
 159 <layout_left>  ()                       { do_layout_left }
 160
 161 <0,option_prags,glaexts> \n                             { begin bol }
 162
 163 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 164
 165 -- single-line line pragmas, of the form
 166 --    # <line> "<file>" <extra-stuff> \n
 167 <line_prag1> $decdigit+                 { setLine line_prag1a }
 168 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 169 <line_prag1b> .*                        { pop }
 170
 171 -- Haskell-style line pragmas, of the form
 172 --    {-# LINE <line> "<file>" #-}
 173 <line_prag2> $decdigit+                 { setLine line_prag2a }
 174 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 175 <line_prag2b> "#-}"|"-}"                { pop }
 176    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 177    -- with older versions of GHC which generated these.
 178
 179 -- We only want RULES pragmas to be picked up when -fglasgow-exts
 180 -- is on, because the contents of the pragma is always written using
 181 -- glasgow-exts syntax (using forall etc.), so if glasgow exts are not
 182 -- enabled, we're sure to get a parse error.
 183 -- (ToDo: we should really emit a warning when ignoring pragmas)
 184 <glaexts>
 185   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 186
 187 <0,option_prags,glaexts> {
 188   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 189   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 190                                         { token (ITinline_prag False) }
 191   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 192                                         { token ITspec_prag }
 193   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 194         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 195   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 196         $whitechar* (NO(T?)INLINE|no(t?)inline)
 197                                         { token (ITspec_inline_prag False) }
 198   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 199   "{-#" $whitechar* (DEPRECATED|deprecated)
 200                                         { token ITdeprecated_prag }
 201   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 202   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 203   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 204
 205   "{-#"                                 { nested_comment }
 206
 207   -- ToDo: should only be valid inside a pragma:
 208   "#-}"                                 { token ITclose_prag}
 209 }
 210
 211 <option_prags> {
 212   "{-#" $whitechar* (OPTIONS|options)   { lex_string_prag IToptions_prag }
 213   "{-#" $whitechar* (OPTIONS_GHC|options_ghc)
 214                                         { lex_string_prag IToptions_prag }
 215   "{-#" $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
 216   "{-#" $whitechar* (INCLUDE|include)   { lex_string_prag ITinclude_prag }
 217 }
 218
 219 -- '0' state: ordinary lexemes
 220 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 221
 222 -- "special" symbols
 223
 224 <0,glaexts> {
 225   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 226   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 227 }
 228
 229 <0,glaexts> {
 230   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 231   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 232   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 233   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 234   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 235   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 236   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 237   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 238 }
 239
 240 <0,glaexts> {
 241   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 242                                         { special IToparenbar }
 243   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 244 }
 245
 246 <0,glaexts> {
 247   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 248   \% @varid / { ifExtension ipEnabled } { skip_one_varid ITsplitipvarid }
 249 }
 250
 251 <glaexts> {
 252   "(#" / { notFollowedBySymbol }        { token IToubxparen }
 253   "#)"                                  { token ITcubxparen }
 254   "{|"                                  { token ITocurlybar }
 255   "|}"                                  { token ITccurlybar }
 256 }
 257
 258 <0,option_prags,glaexts> {
 259   \(                                    { special IToparen }
 260   \)                                    { special ITcparen }
 261   \[                                    { special ITobrack }
 262   \]                                    { special ITcbrack }
 263   \,                                    { special ITcomma }
 264   \;                                    { special ITsemi }
 265   \`                                    { special ITbackquote }
 266
 267   \{                                    { open_brace }
 268   \}                                    { close_brace }
 269 }
 270
 271 <0,option_prags,glaexts> {
 272   @qual @varid                  { check_qvarid }
 273   @qual @conid                  { idtoken qconid }
 274   @varid                        { varid }
 275   @conid                        { idtoken conid }
 276 }
 277
 278 -- after an illegal qvarid, such as 'M.let',
 279 -- we back up and try again in the bad_qvarid state:
 280 <bad_qvarid> {
 281   @conid                        { pop_and (idtoken conid) }
 282   @qual @conid                  { pop_and (idtoken qconid) }
 283 }
 284
 285 <glaexts> {
 286   @qual @varid "#"+             { idtoken qvarid }
 287   @qual @conid "#"+             { idtoken qconid }
 288   @varid "#"+                   { varid }
 289   @conid "#"+                   { idtoken conid }
 290 }
 291
 292 -- ToDo: M.(,,,)
 293
 294 <0,glaexts> {
 295   @qual @varsym                 { idtoken qvarsym }
 296   @qual @consym                 { idtoken qconsym }
 297   @varsym                       { varsym }
 298   @consym                       { consym }
 299 }
 300
 301 <0,glaexts> {
 302   @decimal                      { tok_decimal }
 303   0[oO] @octal                  { tok_octal }
 304   0[xX] @hexadecimal            { tok_hexadecimal }
 305 }
 306
 307 <glaexts> {
 308   @decimal \#                   { prim_decimal }
 309   0[oO] @octal \#               { prim_octal }
 310   0[xX] @hexadecimal \#         { prim_hexadecimal }
 311 }
 312
 313 <0,glaexts> @floating_point             { strtoken tok_float }
 314 <glaexts>   @floating_point \#          { init_strtoken 1 prim_float }
 315 <glaexts>   @floating_point \# \#       { init_strtoken 2 prim_double }
 316
 317 -- Strings and chars are lexed by hand-written code.  The reason is
 318 -- that even if we recognise the string or char here in the regex
 319 -- lexer, we would still have to parse the string afterward in order
 320 -- to convert it to a String.
 321 <0,glaexts> {
 322   \'                            { lex_char_tok }
 323   \"                            { lex_string_tok }
 324 }
 325
 326 {
 327 -- work around bug in Alex 2.0
 328 #if __GLASGOW_HASKELL__ < 503
 329 unsafeAt arr i = arr ! i
 330 #endif
 331
 332 -- -----------------------------------------------------------------------------
 333 -- The token type
 334
 335 data Token
 336   = ITas                        -- Haskell keywords
 337   | ITcase
 338   | ITclass
 339   | ITdata
 340   | ITdefault
 341   | ITderiving
 342   | ITdo
 343   | ITelse
 344   | IThiding
 345   | ITif
 346   | ITimport
 347   | ITin
 348   | ITinfix
 349   | ITinfixl
 350   | ITinfixr
 351   | ITinstance
 352   | ITlet
 353   | ITmodule
 354   | ITnewtype
 355   | ITof
 356   | ITqualified
 357   | ITthen
 358   | ITtype
 359   | ITwhere
 360   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 361
 362   | ITforall                    -- GHC extension keywords
 363   | ITforeign
 364   | ITexport
 365   | ITlabel
 366   | ITdynamic
 367   | ITsafe
 368   | ITthreadsafe
 369   | ITunsafe
 370   | ITstdcallconv
 371   | ITccallconv
 372   | ITdotnet
 373   | ITmdo
 374
 375         -- Pragmas
 376   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 377   | ITspec_prag                 -- SPECIALISE
 378   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 379   | ITsource_prag
 380   | ITrules_prag
 381   | ITdeprecated_prag
 382   | ITline_prag
 383   | ITscc_prag
 384   | ITcore_prag                 -- hdaume: core annotations
 385   | ITunpack_prag
 386   | ITclose_prag
 387   | IToptions_prag String
 388   | ITinclude_prag String
 389   | ITlanguage_prag
 390
 391   | ITdotdot                    -- reserved symbols
 392   | ITcolon
 393   | ITdcolon
 394   | ITequal
 395   | ITlam
 396   | ITvbar
 397   | ITlarrow
 398   | ITrarrow
 399   | ITat
 400   | ITtilde
 401   | ITdarrow
 402   | ITminus
 403   | ITbang
 404   | ITstar
 405   | ITdot
 406
 407   | ITbiglam                    -- GHC-extension symbols
 408
 409   | ITocurly                    -- special symbols
 410   | ITccurly
 411   | ITocurlybar                 -- {|, for type applications
 412   | ITccurlybar                 -- |}, for type applications
 413   | ITvocurly
 414   | ITvccurly
 415   | ITobrack
 416   | ITopabrack                  -- [:, for parallel arrays with -fparr
 417   | ITcpabrack                  -- :], for parallel arrays with -fparr
 418   | ITcbrack
 419   | IToparen
 420   | ITcparen
 421   | IToubxparen
 422   | ITcubxparen
 423   | ITsemi
 424   | ITcomma
 425   | ITunderscore
 426   | ITbackquote
 427
 428   | ITvarid   FastString        -- identifiers
 429   | ITconid   FastString
 430   | ITvarsym  FastString
 431   | ITconsym  FastString
 432   | ITqvarid  (FastString,FastString)
 433   | ITqconid  (FastString,FastString)
 434   | ITqvarsym (FastString,FastString)
 435   | ITqconsym (FastString,FastString)
 436
 437   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 438   | ITsplitipvarid FastString   -- GHC extension: implicit param: %x
 439
 440   | ITpragma StringBuffer
 441
 442   | ITchar       Char
 443   | ITstring     FastString
 444   | ITinteger    Integer
 445   | ITrational   Rational
 446
 447   | ITprimchar   Char
 448   | ITprimstring FastString
 449   | ITprimint    Integer
 450   | ITprimfloat  Rational
 451   | ITprimdouble Rational
 452
 453   -- MetaHaskell extension tokens
 454   | ITopenExpQuote              --  [| or [e|
 455   | ITopenPatQuote              --  [p|
 456   | ITopenDecQuote              --  [d|
 457   | ITopenTypQuote              --  [t|
 458   | ITcloseQuote                --  |]
 459   | ITidEscape   FastString     --  $x
 460   | ITparenEscape               --  $(
 461   | ITvarQuote                  --  '
 462   | ITtyQuote                   --  ''
 463
 464   -- Arrow notation extension
 465   | ITproc
 466   | ITrec
 467   | IToparenbar                 --  (|
 468   | ITcparenbar                 --  |)
 469   | ITlarrowtail                --  -<
 470   | ITrarrowtail                --  >-
 471   | ITLarrowtail                --  -<<
 472   | ITRarrowtail                --  >>-
 473
 474   | ITunknown String            -- Used when the lexer can't make sense of it
 475   | ITeof                       -- end of file token
 476 #ifdef DEBUG
 477   deriving Show -- debugging
 478 #endif
 479
 480 isSpecial :: Token -> Bool
 481 -- If we see M.x, where x is a keyword, but
 482 -- is special, we treat is as just plain M.x,
 483 -- not as a keyword.
 484 isSpecial ITas          = True
 485 isSpecial IThiding      = True
 486 isSpecial ITqualified   = True
 487 isSpecial ITforall      = True
 488 isSpecial ITexport      = True
 489 isSpecial ITlabel       = True
 490 isSpecial ITdynamic     = True
 491 isSpecial ITsafe        = True
 492 isSpecial ITthreadsafe  = True
 493 isSpecial ITunsafe      = True
 494 isSpecial ITccallconv   = True
 495 isSpecial ITstdcallconv = True
 496 isSpecial ITmdo         = True
 497 isSpecial _             = False
 498
 499 -- the bitmap provided as the third component indicates whether the
 500 -- corresponding extension keyword is valid under the extension options
 501 -- provided to the compiler; if the extension corresponding to *any* of the
 502 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 503 -- facilitates using a keyword in two different extensions that can be
 504 -- activated independently)
 505 --
 506 reservedWordsFM = listToUFM $
 507         map (\(x, y, z) -> (mkFastString x, (y, z)))
 508        [( "_",          ITunderscore,   0 ),
 509         ( "as",         ITas,           0 ),
 510         ( "case",       ITcase,         0 ),
 511         ( "class",      ITclass,        0 ),
 512         ( "data",       ITdata,         0 ),
 513         ( "default",    ITdefault,      0 ),
 514         ( "deriving",   ITderiving,     0 ),
 515         ( "do",         ITdo,           0 ),
 516         ( "else",       ITelse,         0 ),
 517         ( "hiding",     IThiding,       0 ),
 518         ( "if",         ITif,           0 ),
 519         ( "import",     ITimport,       0 ),
 520         ( "in",         ITin,           0 ),
 521         ( "infix",      ITinfix,        0 ),
 522         ( "infixl",     ITinfixl,       0 ),
 523         ( "infixr",     ITinfixr,       0 ),
 524         ( "instance",   ITinstance,     0 ),
 525         ( "let",        ITlet,          0 ),
 526         ( "module",     ITmodule,       0 ),
 527         ( "newtype",    ITnewtype,      0 ),
 528         ( "of",         ITof,           0 ),
 529         ( "qualified",  ITqualified,    0 ),
 530         ( "then",       ITthen,         0 ),
 531         ( "type",       ITtype,         0 ),
 532         ( "where",      ITwhere,        0 ),
 533         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 534
 535         ( "forall",     ITforall,        bit tvBit),
 536         ( "mdo",        ITmdo,           bit glaExtsBit),
 537
 538         ( "foreign",    ITforeign,       bit ffiBit),
 539         ( "export",     ITexport,        bit ffiBit),
 540         ( "label",      ITlabel,         bit ffiBit),
 541         ( "dynamic",    ITdynamic,       bit ffiBit),
 542         ( "safe",       ITsafe,          bit ffiBit),
 543         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 544         ( "unsafe",     ITunsafe,        bit ffiBit),
 545         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 546         ( "ccall",      ITccallconv,     bit ffiBit),
 547         ( "dotnet",     ITdotnet,        bit ffiBit),
 548
 549         ( "rec",        ITrec,           bit arrowsBit),
 550         ( "proc",       ITproc,          bit arrowsBit)
 551      ]
 552
 553 reservedSymsFM = listToUFM $
 554         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 555       [ ("..",  ITdotdot,       0)
 556        ,(":",   ITcolon,        0)      -- (:) is a reserved op,
 557                                                 -- meaning only list cons
 558        ,("::",  ITdcolon,       0)
 559        ,("=",   ITequal,        0)
 560        ,("\\",  ITlam,          0)
 561        ,("|",   ITvbar,         0)
 562        ,("<-",  ITlarrow,       0)
 563        ,("->",  ITrarrow,       0)
 564        ,("@",   ITat,           0)
 565        ,("~",   ITtilde,        0)
 566        ,("=>",  ITdarrow,       0)
 567        ,("-",   ITminus,        0)
 568        ,("!",   ITbang,         0)
 569
 570        ,("*",   ITstar,         bit glaExtsBit) -- For data T (a::*) = MkT
 571        ,(".",   ITdot,          bit tvBit)      -- For 'forall a . t'
 572
 573        ,("-<",  ITlarrowtail,   bit arrowsBit)
 574        ,(">-",  ITrarrowtail,   bit arrowsBit)
 575        ,("-<<", ITLarrowtail,   bit arrowsBit)
 576        ,(">>-", ITRarrowtail,   bit arrowsBit)
 577
 578 #if __GLASGOW_HASKELL__ >= 605
 579        ,("λ",  ITlam,          bit glaExtsBit)
 580        ,("∷",   ITdcolon,       bit glaExtsBit)
 581        ,("⇒",   ITdarrow,     bit glaExtsBit)
 582        ,("∀", ITforall,       bit glaExtsBit)
 583        ,("→",   ITrarrow,     bit glaExtsBit)
 584        ,("←",   ITlarrow,     bit glaExtsBit)
 585        ,("⋯",         ITdotdot,       bit glaExtsBit)
 586 #endif
 587        ]
 588
 589 -- -----------------------------------------------------------------------------
 590 -- Lexer actions
 591
 592 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 593
 594 special :: Token -> Action
 595 special tok span _buf len = return (L span tok)
 596
 597 token, layout_token :: Token -> Action
 598 token t span buf len = return (L span t)
 599 layout_token t span buf len = pushLexState layout >> return (L span t)
 600
 601 idtoken :: (StringBuffer -> Int -> Token) -> Action
 602 idtoken f span buf len = return (L span $! (f buf len))
 603
 604 skip_one_varid :: (FastString -> Token) -> Action
 605 skip_one_varid f span buf len
 606   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 607
 608 strtoken :: (String -> Token) -> Action
 609 strtoken f span buf len =
 610   return (L span $! (f $! lexemeToString buf len))
 611
 612 init_strtoken :: Int -> (String -> Token) -> Action
 613 -- like strtoken, but drops the last N character(s)
 614 init_strtoken drop f span buf len =
 615   return (L span $! (f $! lexemeToString buf (len-drop)))
 616
 617 begin :: Int -> Action
 618 begin code _span _str _len = do pushLexState code; lexToken
 619
 620 pop :: Action
 621 pop _span _buf _len = do popLexState; lexToken
 622
 623 pop_and :: Action -> Action
 624 pop_and act span buf len = do popLexState; act span buf len
 625
 626 notFollowedBy char _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf /= char
 627
 628 notFollowedBySymbol _ _ _ (AI _ _ buf)
 629   = atEnd buf || currentChar buf `notElem` "!#$%&*+./<=>?@\\^|-~"
 630
 631 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 632
 633 ifExtension pred bits _ _ _ = pred bits
 634
 635 {-
 636   nested comments require traversing by hand, they can't be parsed
 637   using regular expressions.
 638 -}
 639 nested_comment :: Action
 640 nested_comment span _str _len = do
 641   input <- getInput
 642   go 1 input
 643   where go 0 input = do setInput input; lexToken
 644         go n input = do
 645           case alexGetChar input of
 646             Nothing  -> err input
 647             Just (c,input) -> do
 648               case c of
 649                 '-' -> do
 650                   case alexGetChar input of
 651                     Nothing  -> err input
 652                     Just ('\125',input) -> go (n-1) input
 653                     Just (c,_)          -> go n input
 654                 '\123' -> do
 655                   case alexGetChar input of
 656                     Nothing  -> err input
 657                     Just ('-',input') -> go (n+1) input'
 658                     Just (c,input)    -> go n input
 659                 c -> go n input
 660
 661         err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 662
 663 open_brace, close_brace :: Action
 664 open_brace span _str _len = do
 665   ctx <- getContext
 666   setContext (NoLayout:ctx)
 667   return (L span ITocurly)
 668 close_brace span _str _len = do
 669   popContext
 670   return (L span ITccurly)
 671
 672 -- We have to be careful not to count M.<varid> as a qualified name
 673 -- when <varid> is a keyword.  We hack around this by catching
 674 -- the offending tokens afterward, and re-lexing in a different state.
 675 check_qvarid span buf len = do
 676   case lookupUFM reservedWordsFM var of
 677         Just (keyword,exts)
 678           | not (isSpecial keyword) ->
 679           if exts == 0
 680              then try_again
 681              else do
 682                 b <- extension (\i -> exts .&. i /= 0)
 683                 if b then try_again
 684                      else return token
 685         _other -> return token
 686   where
 687         (mod,var) = splitQualName buf len
 688         token     = L span (ITqvarid (mod,var))
 689
 690         try_again = do
 691                 (AI _ offs _) <- getInput
 692                 setInput (AI (srcSpanStart span) (offs-len) buf)
 693                 pushLexState bad_qvarid
 694                 lexToken
 695
 696 qvarid buf len = ITqvarid $! splitQualName buf len
 697 qconid buf len = ITqconid $! splitQualName buf len
 698
 699 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 700 -- takes a StringBuffer and a length, and returns the module name
 701 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 702 -- because of hierarchical module names.
 703 splitQualName orig_buf len = split orig_buf orig_buf
 704   where
 705     split buf dot_buf
 706         | orig_buf `byteDiff` buf >= len  = done dot_buf
 707         | c == '.'                        = found_dot buf'
 708         | otherwise                       = split buf' dot_buf
 709       where
 710        (c,buf') = nextChar buf
 711
 712     -- careful, we might get names like M....
 713     -- so, if the character after the dot is not upper-case, this is
 714     -- the end of the qualifier part.
 715     found_dot buf -- buf points after the '.'
 716         | isUpper c    = split buf' buf
 717         | otherwise    = done buf
 718       where
 719        (c,buf') = nextChar buf
 720
 721     done dot_buf =
 722         (lexemeToFastString orig_buf (qual_size - 1),
 723          lexemeToFastString dot_buf (len - qual_size))
 724       where
 725         qual_size = orig_buf `byteDiff` dot_buf
 726
 727 varid span buf len =
 728   case lookupUFM reservedWordsFM fs of
 729         Just (keyword,0)    -> do
 730                 maybe_layout keyword
 731                 return (L span keyword)
 732         Just (keyword,exts) -> do
 733                 b <- extension (\i -> exts .&. i /= 0)
 734                 if b then do maybe_layout keyword
 735                              return (L span keyword)
 736                      else return (L span (ITvarid fs))
 737         _other -> return (L span (ITvarid fs))
 738   where
 739         fs = lexemeToFastString buf len
 740
 741 conid buf len = ITconid fs
 742   where fs = lexemeToFastString buf len
 743
 744 qvarsym buf len = ITqvarsym $! splitQualName buf len
 745 qconsym buf len = ITqconsym $! splitQualName buf len
 746
 747 varsym = sym ITvarsym
 748 consym = sym ITconsym
 749
 750 sym con span buf len =
 751   case lookupUFM reservedSymsFM fs of
 752         Just (keyword,0)    -> return (L span keyword)
 753         Just (keyword,exts) -> do
 754                 b <- extension (\i -> exts .&. i /= 0)
 755                 if b then return (L span keyword)
 756                      else return (L span $! con fs)
 757         _other -> return (L span $! con fs)
 758   where
 759         fs = lexemeToFastString buf len
 760
 761 tok_decimal span buf len
 762   = return (L span (ITinteger  $! parseInteger buf len 10 octDecDigit))
 763
 764 tok_octal span buf len
 765   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 8 octDecDigit))
 766
 767 tok_hexadecimal span buf len
 768   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 16 hexDigit))
 769
 770 prim_decimal span buf len
 771   = return (L span (ITprimint  $! parseInteger buf (len-1) 10 octDecDigit))
 772
 773 prim_octal span buf len
 774   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 8 octDecDigit))
 775
 776 prim_hexadecimal span buf len
 777   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 16 hexDigit))
 778
 779 tok_float        str = ITrational   $! readRational str
 780 prim_float       str = ITprimfloat  $! readRational str
 781 prim_double      str = ITprimdouble $! readRational str
 782
 783 -- -----------------------------------------------------------------------------
 784 -- Layout processing
 785
 786 -- we're at the first token on a line, insert layout tokens if necessary
 787 do_bol :: Action
 788 do_bol span _str _len = do
 789         pos <- getOffside
 790         case pos of
 791             LT -> do
 792                 --trace "layout: inserting '}'" $ do
 793                 popContext
 794                 -- do NOT pop the lex state, we might have a ';' to insert
 795                 return (L span ITvccurly)
 796             EQ -> do
 797                 --trace "layout: inserting ';'" $ do
 798                 popLexState
 799                 return (L span ITsemi)
 800             GT -> do
 801                 popLexState
 802                 lexToken
 803
 804 -- certain keywords put us in the "layout" state, where we might
 805 -- add an opening curly brace.
 806 maybe_layout ITdo       = pushLexState layout_do
 807 maybe_layout ITmdo      = pushLexState layout_do
 808 maybe_layout ITof       = pushLexState layout
 809 maybe_layout ITlet      = pushLexState layout
 810 maybe_layout ITwhere    = pushLexState layout
 811 maybe_layout ITrec      = pushLexState layout
 812 maybe_layout _          = return ()
 813
 814 -- Pushing a new implicit layout context.  If the indentation of the
 815 -- next token is not greater than the previous layout context, then
 816 -- Haskell 98 says that the new layout context should be empty; that is
 817 -- the lexer must generate {}.
 818 --
 819 -- We are slightly more lenient than this: when the new context is started
 820 -- by a 'do', then we allow the new context to be at the same indentation as
 821 -- the previous context.  This is what the 'strict' argument is for.
 822 --
 823 new_layout_context strict span _buf _len = do
 824     popLexState
 825     (AI _ offset _) <- getInput
 826     ctx <- getContext
 827     case ctx of
 828         Layout prev_off : _  |
 829            (strict     && prev_off >= offset  ||
 830             not strict && prev_off > offset) -> do
 831                 -- token is indented to the left of the previous context.
 832                 -- we must generate a {} sequence now.
 833                 pushLexState layout_left
 834                 return (L span ITvocurly)
 835         other -> do
 836                 setContext (Layout offset : ctx)
 837                 return (L span ITvocurly)
 838
 839 do_layout_left span _buf _len = do
 840     popLexState
 841     pushLexState bol  -- we must be at the start of a line
 842     return (L span ITvccurly)
 843
 844 -- -----------------------------------------------------------------------------
 845 -- LINE pragmas
 846
 847 setLine :: Int -> Action
 848 setLine code span buf len = do
 849   let line = parseInteger buf len 10 octDecDigit
 850   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
 851         -- subtract one: the line number refers to the *following* line
 852   popLexState
 853   pushLexState code
 854   lexToken
 855
 856 setFile :: Int -> Action
 857 setFile code span buf len = do
 858   let file = lexemeToFastString (stepOn buf) (len-2)
 859   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
 860   popLexState
 861   pushLexState code
 862   lexToken
 863
 864
 865 -- -----------------------------------------------------------------------------
 866 -- Options, includes and language pragmas.
 867
 868 lex_string_prag :: (String -> Token) -> Action
 869 lex_string_prag mkTok span buf len
 870     = do input <- getInput
 871          start <- getSrcLoc
 872          tok <- go [] input
 873          end <- getSrcLoc
 874          return (L (mkSrcSpan start end) tok)
 875     where go acc input
 876               = if isString input "#-}"
 877                    then do setInput input
 878                            return (mkTok (reverse acc))
 879                    else case alexGetChar input of
 880                           Just (c,i) -> go (c:acc) i
 881                           Nothing -> err input
 882           isString i [] = True
 883           isString i (x:xs)
 884               = case alexGetChar i of
 885                   Just (c,i') | c == x    -> isString i' xs
 886                   _other -> False
 887           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
 888
 889
 890 -- -----------------------------------------------------------------------------
 891 -- Strings & Chars
 892
 893 -- This stuff is horrible.  I hates it.
 894
 895 lex_string_tok :: Action
 896 lex_string_tok span buf len = do
 897   tok <- lex_string ""
 898   end <- getSrcLoc
 899   return (L (mkSrcSpan (srcSpanStart span) end) tok)
 900
 901 lex_string :: String -> P Token
 902 lex_string s = do
 903   i <- getInput
 904   case alexGetChar' i of
 905     Nothing -> lit_error
 906
 907     Just ('"',i)  -> do
 908         setInput i
 909         glaexts <- extension glaExtsEnabled
 910         if glaexts
 911           then do
 912             i <- getInput
 913             case alexGetChar' i of
 914               Just ('#',i) -> do
 915                    setInput i
 916                    if any (> '\xFF') s
 917                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
 918                     else let s' = mkZFastString (reverse s) in
 919                          return (ITprimstring s')
 920                         -- mkZFastString is a hack to avoid encoding the
 921                         -- string in UTF-8.  We just want the exact bytes.
 922               _other ->
 923                 return (ITstring (mkFastString (reverse s)))
 924           else
 925                 return (ITstring (mkFastString (reverse s)))
 926
 927     Just ('\\',i)
 928         | Just ('&',i) <- next -> do
 929                 setInput i; lex_string s
 930         | Just (c,i) <- next, is_space c -> do
 931                 setInput i; lex_stringgap s
 932         where next = alexGetChar' i
 933
 934     Just (c, i) -> do
 935         c' <- lex_char c i
 936         lex_string (c':s)
 937
 938 lex_stringgap s = do
 939   c <- getCharOrFail
 940   case c of
 941     '\\' -> lex_string s
 942     c | is_space c -> lex_stringgap s
 943     _other -> lit_error
 944
 945
 946 lex_char_tok :: Action
 947 -- Here we are basically parsing character literals, such as 'x' or '\n'
 948 -- but, when Template Haskell is on, we additionally spot
 949 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
 950 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
 951 -- So we have to do two characters of lookahead: when we see 'x we need to
 952 -- see if there's a trailing quote
 953 lex_char_tok span buf len = do  -- We've seen '
 954    i1 <- getInput       -- Look ahead to first character
 955    let loc = srcSpanStart span
 956    case alexGetChar' i1 of
 957         Nothing -> lit_error
 958
 959         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
 960                   th_exts <- extension thEnabled
 961                   if th_exts then do
 962                         setInput i2
 963                         return (L (mkSrcSpan loc end2)  ITtyQuote)
 964                    else lit_error
 965
 966         Just ('\\', i2@(AI end2 _ _)) -> do     -- We've seen 'backslash
 967                   setInput i2
 968                   lit_ch <- lex_escape
 969                   mc <- getCharOrFail   -- Trailing quote
 970                   if mc == '\'' then finish_char_tok loc lit_ch
 971                                 else do setInput i2; lit_error
 972
 973         Just (c, i2@(AI end2 _ _))
 974                 | not (isAny c) -> lit_error
 975                 | otherwise ->
 976
 977                 -- We've seen 'x, where x is a valid character
 978                 --  (i.e. not newline etc) but not a quote or backslash
 979            case alexGetChar' i2 of      -- Look ahead one more character
 980                 Nothing -> lit_error
 981                 Just ('\'', i3) -> do   -- We've seen 'x'
 982                         setInput i3
 983                         finish_char_tok loc c
 984                 _other -> do            -- We've seen 'x not followed by quote
 985                                         -- If TH is on, just parse the quote only
 986                         th_exts <- extension thEnabled
 987                         let (AI end _ _) = i1
 988                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
 989                                    else do setInput i2; lit_error
 990
 991 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
 992 finish_char_tok loc ch  -- We've already seen the closing quote
 993                         -- Just need to check for trailing #
 994   = do  glaexts <- extension glaExtsEnabled
 995         i@(AI end _ _) <- getInput
 996         if glaexts then do
 997                 case alexGetChar' i of
 998                         Just ('#',i@(AI end _ _)) -> do
 999                                 setInput i
1000                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1001                         _other ->
1002                                 return (L (mkSrcSpan loc end) (ITchar ch))
1003                 else do
1004                    return (L (mkSrcSpan loc end) (ITchar ch))
1005
1006 lex_char :: Char -> AlexInput -> P Char
1007 lex_char c inp = do
1008   case c of
1009       '\\' -> do setInput inp; lex_escape
1010       c | isAny c -> do setInput inp; return c
1011       _other -> lit_error
1012
1013 isAny c | c > '\xff' = isPrint c
1014         | otherwise  = is_any c
1015
1016 lex_escape :: P Char
1017 lex_escape = do
1018   c <- getCharOrFail
1019   case c of
1020         'a'   -> return '\a'
1021         'b'   -> return '\b'
1022         'f'   -> return '\f'
1023         'n'   -> return '\n'
1024         'r'   -> return '\r'
1025         't'   -> return '\t'
1026         'v'   -> return '\v'
1027         '\\'  -> return '\\'
1028         '"'   -> return '\"'
1029         '\''  -> return '\''
1030         '^'   -> do c <- getCharOrFail
1031                     if c >= '@' && c <= '_'
1032                         then return (chr (ord c - ord '@'))
1033                         else lit_error
1034
1035         'x'   -> readNum is_hexdigit 16 hexDigit
1036         'o'   -> readNum is_octdigit  8 octDecDigit
1037         x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
1038
1039         c1 ->  do
1040            i <- getInput
1041            case alexGetChar' i of
1042             Nothing -> lit_error
1043             Just (c2,i2) ->
1044               case alexGetChar' i2 of
1045                 Nothing -> do setInput i2; lit_error
1046                 Just (c3,i3) ->
1047                    let str = [c1,c2,c3] in
1048                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1049                                      Just rest <- [maybePrefixMatch p str] ] of
1050                           (escape_char,[]):_ -> do
1051                                 setInput i3
1052                                 return escape_char
1053                           (escape_char,_:_):_ -> do
1054                                 setInput i2
1055                                 return escape_char
1056                           [] -> lit_error
1057
1058 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1059 readNum is_digit base conv = do
1060   i <- getInput
1061   c <- getCharOrFail
1062   if is_digit c
1063         then readNum2 is_digit base conv (conv c)
1064         else do setInput i; lit_error
1065
1066 readNum2 is_digit base conv i = do
1067   input <- getInput
1068   read i input
1069   where read i input = do
1070           case alexGetChar' input of
1071             Just (c,input') | is_digit c -> do
1072                 read (i*base + conv c) input'
1073             _other -> do
1074                 if i >= 0 && i <= 0x10FFFF
1075                    then do setInput input; return (chr i)
1076                    else lit_error
1077
1078 silly_escape_chars = [
1079         ("NUL", '\NUL'),
1080         ("SOH", '\SOH'),
1081         ("STX", '\STX'),
1082         ("ETX", '\ETX'),
1083         ("EOT", '\EOT'),
1084         ("ENQ", '\ENQ'),
1085         ("ACK", '\ACK'),
1086         ("BEL", '\BEL'),
1087         ("BS", '\BS'),
1088         ("HT", '\HT'),
1089         ("LF", '\LF'),
1090         ("VT", '\VT'),
1091         ("FF", '\FF'),
1092         ("CR", '\CR'),
1093         ("SO", '\SO'),
1094         ("SI", '\SI'),
1095         ("DLE", '\DLE'),
1096         ("DC1", '\DC1'),
1097         ("DC2", '\DC2'),
1098         ("DC3", '\DC3'),
1099         ("DC4", '\DC4'),
1100         ("NAK", '\NAK'),
1101         ("SYN", '\SYN'),
1102         ("ETB", '\ETB'),
1103         ("CAN", '\CAN'),
1104         ("EM", '\EM'),
1105         ("SUB", '\SUB'),
1106         ("ESC", '\ESC'),
1107         ("FS", '\FS'),
1108         ("GS", '\GS'),
1109         ("RS", '\RS'),
1110         ("US", '\US'),
1111         ("SP", '\SP'),
1112         ("DEL", '\DEL')
1113         ]
1114
1115 -- before calling lit_error, ensure that the current input is pointing to
1116 -- the position of the error in the buffer.  This is so that we can report
1117 -- a correct location to the user, but also so we can detect UTF-8 decoding
1118 -- errors if they occur.
1119 lit_error = lexError "lexical error in string/character literal"
1120
1121 getCharOrFail :: P Char
1122 getCharOrFail =  do
1123   i <- getInput
1124   case alexGetChar' i of
1125         Nothing -> lexError "unexpected end-of-file in string/character literal"
1126         Just (c,i)  -> do setInput i; return c
1127
1128 -- -----------------------------------------------------------------------------
1129 -- The Parse Monad
1130
1131 data LayoutContext
1132   = NoLayout
1133   | Layout !Int
1134
1135 data ParseResult a
1136   = POk PState a
1137   | PFailed
1138         SrcSpan         -- The start and end of the text span related to
1139                         -- the error.  Might be used in environments which can
1140                         -- show this span, e.g. by highlighting it.
1141         Message         -- The error message
1142
1143 data PState = PState {
1144         buffer     :: StringBuffer,
1145         last_loc   :: SrcSpan,  -- pos of previous token
1146         last_offs  :: !Int,     -- offset of the previous token from the
1147                                 -- beginning of  the current line.
1148                                 -- \t is equal to 8 spaces.
1149         last_len   :: !Int,     -- len of previous token
1150         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1151         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1152         context    :: [LayoutContext],
1153         lex_state  :: [Int]
1154      }
1155         -- last_loc and last_len are used when generating error messages,
1156         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1157         -- current token to happyError, we could at least get rid of last_len.
1158         -- Getting rid of last_loc would require finding another way to
1159         -- implement pushCurrentContext (which is only called from one place).
1160
1161 newtype P a = P { unP :: PState -> ParseResult a }
1162
1163 instance Monad P where
1164   return = returnP
1165   (>>=) = thenP
1166   fail = failP
1167
1168 returnP :: a -> P a
1169 returnP a = P $ \s -> POk s a
1170
1171 thenP :: P a -> (a -> P b) -> P b
1172 (P m) `thenP` k = P $ \ s ->
1173         case m s of
1174                 POk s1 a         -> (unP (k a)) s1
1175                 PFailed span err -> PFailed span err
1176
1177 failP :: String -> P a
1178 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1179
1180 failMsgP :: String -> P a
1181 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1182
1183 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1184 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1185
1186 failSpanMsgP :: SrcSpan -> String -> P a
1187 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1188
1189 extension :: (Int -> Bool) -> P Bool
1190 extension p = P $ \s -> POk s (p $! extsBitmap s)
1191
1192 getExts :: P Int
1193 getExts = P $ \s -> POk s (extsBitmap s)
1194
1195 setSrcLoc :: SrcLoc -> P ()
1196 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1197
1198 getSrcLoc :: P SrcLoc
1199 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1200
1201 setLastToken :: SrcSpan -> Int -> P ()
1202 setLastToken loc len = P $ \s -> POk s{ last_loc=loc, last_len=len } ()
1203
1204 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1205
1206 alexInputPrevChar :: AlexInput -> Char
1207 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1208
1209 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1210 alexGetChar (AI loc ofs s)
1211   | atEnd s   = Nothing
1212   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1213                 Just (adj_c, (AI loc' ofs' s'))
1214   where (c,s') = nextChar s
1215         loc'   = advanceSrcLoc loc c
1216         ofs'   = advanceOffs c ofs
1217
1218         non_graphic     = '\x0'
1219         upper           = '\x1'
1220         lower           = '\x2'
1221         digit           = '\x3'
1222         symbol          = '\x4'
1223         space           = '\x5'
1224         other_graphic   = '\x6'
1225
1226         adj_c
1227           | c <= '\x06' = non_graphic
1228           | c <= '\xff' = c
1229           | otherwise =
1230                 case generalCategory c of
1231                   UppercaseLetter       -> upper
1232                   LowercaseLetter       -> lower
1233                   TitlecaseLetter       -> upper
1234                   ModifierLetter        -> other_graphic
1235                   OtherLetter           -> other_graphic
1236                   NonSpacingMark        -> other_graphic
1237                   SpacingCombiningMark  -> other_graphic
1238                   EnclosingMark         -> other_graphic
1239                   DecimalNumber         -> digit
1240                   LetterNumber          -> other_graphic
1241                   OtherNumber           -> other_graphic
1242                   ConnectorPunctuation  -> other_graphic
1243                   DashPunctuation       -> other_graphic
1244                   OpenPunctuation       -> other_graphic
1245                   ClosePunctuation      -> other_graphic
1246                   InitialQuote          -> other_graphic
1247                   FinalQuote            -> other_graphic
1248                   OtherPunctuation      -> other_graphic
1249                   MathSymbol            -> symbol
1250                   CurrencySymbol        -> symbol
1251                   ModifierSymbol        -> symbol
1252                   OtherSymbol           -> symbol
1253                   Space                 -> space
1254                   _other                -> non_graphic
1255
1256 -- This version does not squash unicode characters, it is used when
1257 -- lexing strings.
1258 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1259 alexGetChar' (AI loc ofs s)
1260   | atEnd s   = Nothing
1261   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1262                 Just (c, (AI loc' ofs' s'))
1263   where (c,s') = nextChar s
1264         loc'   = advanceSrcLoc loc c
1265         ofs'   = advanceOffs c ofs
1266
1267 advanceOffs :: Char -> Int -> Int
1268 advanceOffs '\n' offs = 0
1269 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1270 advanceOffs _    offs = offs + 1
1271
1272 getInput :: P AlexInput
1273 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1274
1275 setInput :: AlexInput -> P ()
1276 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1277
1278 pushLexState :: Int -> P ()
1279 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1280
1281 popLexState :: P Int
1282 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1283
1284 getLexState :: P Int
1285 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1286
1287 -- for reasons of efficiency, flags indicating language extensions (eg,
1288 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1289 -- integer
1290
1291 glaExtsBit, ffiBit, parrBit :: Int
1292 glaExtsBit = 0
1293 ffiBit     = 1
1294 parrBit    = 2
1295 arrowsBit  = 4
1296 thBit      = 5
1297 ipBit      = 6
1298 tvBit      = 7  -- Scoped type variables enables 'forall' keyword
1299 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1300                 -- (doesn't affect the lexer)
1301
1302 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1303 glaExtsEnabled flags = testBit flags glaExtsBit
1304 ffiEnabled     flags = testBit flags ffiBit
1305 parrEnabled    flags = testBit flags parrBit
1306 arrowsEnabled  flags = testBit flags arrowsBit
1307 thEnabled      flags = testBit flags thBit
1308 ipEnabled      flags = testBit flags ipBit
1309 tvEnabled      flags = testBit flags tvBit
1310 bangPatEnabled flags = testBit flags bangPatBit
1311
1312 -- PState for parsing options pragmas
1313 --
1314 pragState :: StringBuffer -> SrcLoc -> PState
1315 pragState buf loc  =
1316   PState {
1317       buffer     = buf,
1318       last_loc   = mkSrcSpan loc loc,
1319       last_offs  = 0,
1320       last_len   = 0,
1321       loc        = loc,
1322       extsBitmap = 0,
1323       context    = [],
1324       lex_state  = [bol, option_prags, 0]
1325     }
1326
1327
1328 -- create a parse state
1329 --
1330 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1331 mkPState buf loc flags  =
1332   PState {
1333       buffer     = buf,
1334       last_loc   = mkSrcSpan loc loc,
1335       last_offs  = 0,
1336       last_len   = 0,
1337       loc        = loc,
1338       extsBitmap = fromIntegral bitmap,
1339       context    = [],
1340       lex_state  = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1341         -- we begin in the layout state if toplev_layout is set
1342     }
1343     where
1344       bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
1345                .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
1346                .|. parrBit    `setBitIf` dopt Opt_PArr        flags
1347                .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
1348                .|. thBit      `setBitIf` dopt Opt_TH          flags
1349                .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
1350                .|. tvBit      `setBitIf` dopt Opt_ScopedTypeVariables flags
1351                .|. bangPatBit `setBitIf` dopt Opt_BangPatterns flags
1352       --
1353       setBitIf :: Int -> Bool -> Int
1354       b `setBitIf` cond | cond      = bit b
1355                         | otherwise = 0
1356
1357 getContext :: P [LayoutContext]
1358 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1359
1360 setContext :: [LayoutContext] -> P ()
1361 setContext ctx = P $ \s -> POk s{context=ctx} ()
1362
1363 popContext :: P ()
1364 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1365                            loc = loc, last_len = len, last_loc = last_loc }) ->
1366   case ctx of
1367         (_:tl) -> POk s{ context = tl } ()
1368         []     -> PFailed last_loc (srcParseErr buf len)
1369
1370 -- Push a new layout context at the indentation of the last token read.
1371 -- This is only used at the outer level of a module when the 'module'
1372 -- keyword is missing.
1373 pushCurrentContext :: P ()
1374 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_len=len, context=ctx } ->
1375   POk s{context = Layout (offs-len) : ctx} ()
1376
1377 getOffside :: P Ordering
1378 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1379                 let ord = case stk of
1380                         (Layout n:_) -> compare offs n
1381                         _            -> GT
1382                 in POk s ord
1383
1384 -- ---------------------------------------------------------------------------
1385 -- Construct a parse error
1386
1387 srcParseErr
1388   :: StringBuffer       -- current buffer (placed just after the last token)
1389   -> Int                -- length of the previous token
1390   -> Message
1391 srcParseErr buf len
1392   = hcat [ if null token
1393              then ptext SLIT("parse error (possibly incorrect indentation)")
1394              else hcat [ptext SLIT("parse error on input "),
1395                         char '`', text token, char '\'']
1396     ]
1397   where token = lexemeToString (offsetBytes (-len) buf) len
1398
1399 -- Report a parse failure, giving the span of the previous token as
1400 -- the location of the error.  This is the entry point for errors
1401 -- detected during parsing.
1402 srcParseFail :: P a
1403 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1404                             last_loc = last_loc } ->
1405     PFailed last_loc (srcParseErr buf len)
1406
1407 -- A lexical error is reported at a particular position in the source file,
1408 -- not over a token range.
1409 lexError :: String -> P a
1410 lexError str = do
1411   loc <- getSrcLoc
1412   i@(AI end _ buf) <- getInput
1413   reportLexError loc end buf str
1414
1415 -- -----------------------------------------------------------------------------
1416 -- This is the top-level function: called from the parser each time a
1417 -- new token is to be read from the input.
1418
1419 lexer :: (Located Token -> P a) -> P a
1420 lexer cont = do
1421   tok@(L _ tok__) <- lexToken
1422   --trace ("token: " ++ show tok__) $ do
1423   cont tok
1424
1425 lexToken :: P (Located Token)
1426 lexToken = do
1427   inp@(AI loc1 _ buf) <- getInput
1428   sc <- getLexState
1429   exts <- getExts
1430   case alexScanUser exts inp sc of
1431     AlexEOF -> do let span = mkSrcSpan loc1 loc1
1432                   setLastToken span 0
1433                   return (L span ITeof)
1434     AlexError (AI loc2 _ buf) -> do
1435         reportLexError loc1 loc2 buf "lexical error"
1436     AlexSkip inp2 _ -> do
1437         setInput inp2
1438         lexToken
1439     AlexToken inp2@(AI end _ buf2) len t -> do
1440         setInput inp2
1441         let span = mkSrcSpan loc1 end
1442         let bytes = byteDiff buf buf2
1443         span `seq` setLastToken span bytes
1444         t span buf bytes
1445
1446 -- ToDo: Alex reports the buffer at the start of the erroneous lexeme,
1447 -- but it would be more informative to report the location where the
1448 -- error was actually discovered, especially if this is a decoding
1449 -- error.
1450 reportLexError loc1 loc2 buf str =
1451   let
1452         c = fst (nextChar buf)
1453   in
1454   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1455     then failLocMsgP loc2 loc2 "UTF-8 decoding error"
1456     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1457 }