compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), lexer, pragState, mkPState, PState(..),
  26    P(..), ParseResult(..), getSrcLoc,
  27    failLocMsgP, failSpanMsgP, srcParseFail,
  28    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  29    getLexState, popLexState, pushLexState,
  30    extension, bangPatEnabled
  31   ) where
  32
  33 #include "HsVersions.h"
  34
  35 import ErrUtils         ( Message )
  36 import Outputable
  37 import StringBuffer
  38 import FastString
  39 import FastTypes
  40 import SrcLoc
  41 import UniqFM
  42 import DynFlags
  43 import Ctype
  44 import Util             ( maybePrefixMatch, readRational )
  45
  46 import DATA_BITS
  47 import Data.Char        ( chr )
  48 import Ratio
  49 --import TRACE
  50
  51 #if __GLASGOW_HASKELL__ >= 605
  52 import Data.Char        ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  53 #else
  54 import Compat.Unicode   ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  55 #endif
  56 }
  57
  58 $unispace    = \x05
  59 $whitechar   = [\ \t\n\r\f\v\xa0 $unispace]
  60 $white_no_nl = $whitechar # \n
  61
  62 $ascdigit  = 0-9
  63 $unidigit  = \x03
  64 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  65 $digit     = [$ascdigit $unidigit]
  66
  67 $special   = [\(\)\,\;\[\]\`\{\}]
  68 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7]
  69 $unisymbol = \x04
  70 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  71
  72 $unilarge  = \x01
  73 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  74 $large     = [$asclarge $unilarge]
  75
  76 $unismall  = \x02
  77 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  78 $small     = [$ascsmall $unismall \_]
  79
  80 $unigraphic = \x06
  81 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  82
  83 $octit     = 0-7
  84 $hexit     = [$decdigit A-F a-f]
  85 $symchar   = [$symbol \:]
  86 $nl        = [\n\r]
  87 $idchar    = [$small $large $digit \']
  88
  89 @varid     = $small $idchar*
  90 @conid     = $large $idchar*
  91
  92 @varsym    = $symbol $symchar*
  93 @consym    = \: $symchar*
  94
  95 @decimal     = $decdigit+
  96 @octal       = $octit+
  97 @hexadecimal = $hexit+
  98 @exponent    = [eE] [\-\+]? @decimal
  99
 100 -- we support the hierarchical module name extension:
 101 @qual = (@conid \.)+
 102
 103 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 104
 105 haskell :-
 106
 107 -- everywhere: skip whitespace and comments
 108 $white_no_nl+                           ;
 109
 110 -- Everywhere: deal with nested comments.  We explicitly rule out
 111 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 112 -- (this can happen even though pragmas will normally take precedence due to
 113 -- longest-match, because pragmas aren't valid in every state, but comments
 114 -- are).
 115 "{-" / { notFollowedBy '#' }            { nested_comment }
 116
 117 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 118 -- more dashes followed by a symbol should be parsed as a varsym, so we
 119 -- have to exclude those.
 120 -- The regex says: "munch all the characters after the dashes, as long as
 121 -- the first one is not a symbol".
 122 "--"\-* [^$symbol :] .*                 ;
 123 "--"\-* / { atEOL }                     ;
 124
 125 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 126 -- blank lines) until we find a non-whitespace character, then do layout
 127 -- processing.
 128 --
 129 -- One slight wibble here: what if the line begins with {-#? In
 130 -- theory, we have to lex the pragma to see if it's one we recognise,
 131 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 132 -- as a nested comment.  We don't bother with this: if the line begins
 133 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 134 <bol> {
 135   \n                                    ;
 136   ^\# (line)?                           { begin line_prag1 }
 137   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 138   ^\# \! .* \n                          ; -- #!, for scripts
 139   ()                                    { do_bol }
 140 }
 141
 142 -- after a layout keyword (let, where, do, of), we begin a new layout
 143 -- context if the curly brace is missing.
 144 -- Careful! This stuff is quite delicate.
 145 <layout, layout_do> {
 146   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 147         -- we might encounter {-# here, but {- has been handled already
 148   \n                                    ;
 149   ^\# (line)?                           { begin line_prag1 }
 150 }
 151
 152 -- do is treated in a subtly different way, see new_layout_context
 153 <layout>    ()                          { new_layout_context True }
 154 <layout_do> ()                          { new_layout_context False }
 155
 156 -- after a new layout context which was found to be to the left of the
 157 -- previous context, we have generated a '{' token, and we now need to
 158 -- generate a matching '}' token.
 159 <layout_left>  ()                       { do_layout_left }
 160
 161 <0,option_prags,glaexts> \n                             { begin bol }
 162
 163 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 164
 165 -- single-line line pragmas, of the form
 166 --    # <line> "<file>" <extra-stuff> \n
 167 <line_prag1> $decdigit+                 { setLine line_prag1a }
 168 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 169 <line_prag1b> .*                        { pop }
 170
 171 -- Haskell-style line pragmas, of the form
 172 --    {-# LINE <line> "<file>" #-}
 173 <line_prag2> $decdigit+                 { setLine line_prag2a }
 174 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 175 <line_prag2b> "#-}"|"-}"                { pop }
 176    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 177    -- with older versions of GHC which generated these.
 178
 179 -- We only want RULES pragmas to be picked up when -fglasgow-exts
 180 -- is on, because the contents of the pragma is always written using
 181 -- glasgow-exts syntax (using forall etc.), so if glasgow exts are not
 182 -- enabled, we're sure to get a parse error.
 183 -- (ToDo: we should really emit a warning when ignoring pragmas)
 184 <glaexts>
 185   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 186
 187 <0,option_prags,glaexts> {
 188   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 189   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 190                                         { token (ITinline_prag False) }
 191   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 192                                         { token ITspec_prag }
 193   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 194         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 195   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 196         $whitechar* (NO(T?)INLINE|no(t?)inline)
 197                                         { token (ITspec_inline_prag False) }
 198   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 199   "{-#" $whitechar* (DEPRECATED|deprecated)
 200                                         { token ITdeprecated_prag }
 201   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 202   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 203   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 204
 205   "{-#"                                 { nested_comment }
 206
 207   -- ToDo: should only be valid inside a pragma:
 208   "#-}"                                 { token ITclose_prag}
 209 }
 210
 211 <option_prags> {
 212   "{-#" $whitechar* (OPTIONS|options)   { lex_string_prag IToptions_prag }
 213   "{-#" $whitechar* (OPTIONS_GHC|options_ghc)
 214                                         { lex_string_prag IToptions_prag }
 215   "{-#" $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
 216   "{-#" $whitechar* (INCLUDE|include)   { lex_string_prag ITinclude_prag }
 217 }
 218
 219 <0,option_prags,glaexts> {
 220         -- This is to catch things like {-# OPTIONS OPTIONS_HUGS ...
 221   "{-#" $whitechar* $idchar+            { nested_comment }
 222 }
 223
 224 -- '0' state: ordinary lexemes
 225 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 226
 227 -- "special" symbols
 228
 229 <0,glaexts> {
 230   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 231   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 232 }
 233
 234 <0,glaexts> {
 235   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 236   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 237   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 238   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 239   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 240   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 241   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 242   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 243 }
 244
 245 <0,glaexts> {
 246   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 247                                         { special IToparenbar }
 248   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 249 }
 250
 251 <0,glaexts> {
 252   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 253   \% @varid / { ifExtension ipEnabled } { skip_one_varid ITsplitipvarid }
 254 }
 255
 256 <glaexts> {
 257   "(#" / { notFollowedBySymbol }        { token IToubxparen }
 258   "#)"                                  { token ITcubxparen }
 259   "{|"                                  { token ITocurlybar }
 260   "|}"                                  { token ITccurlybar }
 261 }
 262
 263 <0,option_prags,glaexts> {
 264   \(                                    { special IToparen }
 265   \)                                    { special ITcparen }
 266   \[                                    { special ITobrack }
 267   \]                                    { special ITcbrack }
 268   \,                                    { special ITcomma }
 269   \;                                    { special ITsemi }
 270   \`                                    { special ITbackquote }
 271
 272   \{                                    { open_brace }
 273   \}                                    { close_brace }
 274 }
 275
 276 <0,option_prags,glaexts> {
 277   @qual @varid                  { check_qvarid }
 278   @qual @conid                  { idtoken qconid }
 279   @varid                        { varid }
 280   @conid                        { idtoken conid }
 281 }
 282
 283 -- after an illegal qvarid, such as 'M.let',
 284 -- we back up and try again in the bad_qvarid state:
 285 <bad_qvarid> {
 286   @conid                        { pop_and (idtoken conid) }
 287   @qual @conid                  { pop_and (idtoken qconid) }
 288 }
 289
 290 <glaexts> {
 291   @qual @varid "#"+             { idtoken qvarid }
 292   @qual @conid "#"+             { idtoken qconid }
 293   @varid "#"+                   { varid }
 294   @conid "#"+                   { idtoken conid }
 295 }
 296
 297 -- ToDo: M.(,,,)
 298
 299 <0,glaexts> {
 300   @qual @varsym                 { idtoken qvarsym }
 301   @qual @consym                 { idtoken qconsym }
 302   @varsym                       { varsym }
 303   @consym                       { consym }
 304 }
 305
 306 <0,glaexts> {
 307   @decimal                      { tok_decimal }
 308   0[oO] @octal                  { tok_octal }
 309   0[xX] @hexadecimal            { tok_hexadecimal }
 310 }
 311
 312 <glaexts> {
 313   @decimal \#                   { prim_decimal }
 314   0[oO] @octal \#               { prim_octal }
 315   0[xX] @hexadecimal \#         { prim_hexadecimal }
 316 }
 317
 318 <0,glaexts> @floating_point             { strtoken tok_float }
 319 <glaexts>   @floating_point \#          { init_strtoken 1 prim_float }
 320 <glaexts>   @floating_point \# \#       { init_strtoken 2 prim_double }
 321
 322 -- Strings and chars are lexed by hand-written code.  The reason is
 323 -- that even if we recognise the string or char here in the regex
 324 -- lexer, we would still have to parse the string afterward in order
 325 -- to convert it to a String.
 326 <0,glaexts> {
 327   \'                            { lex_char_tok }
 328   \"                            { lex_string_tok }
 329 }
 330
 331 {
 332 -- work around bug in Alex 2.0
 333 #if __GLASGOW_HASKELL__ < 503
 334 unsafeAt arr i = arr ! i
 335 #endif
 336
 337 -- -----------------------------------------------------------------------------
 338 -- The token type
 339
 340 data Token
 341   = ITas                        -- Haskell keywords
 342   | ITcase
 343   | ITclass
 344   | ITdata
 345   | ITdefault
 346   | ITderiving
 347   | ITdo
 348   | ITelse
 349   | IThiding
 350   | ITif
 351   | ITimport
 352   | ITin
 353   | ITinfix
 354   | ITinfixl
 355   | ITinfixr
 356   | ITinstance
 357   | ITlet
 358   | ITmodule
 359   | ITnewtype
 360   | ITof
 361   | ITqualified
 362   | ITthen
 363   | ITtype
 364   | ITwhere
 365   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 366
 367   | ITforall                    -- GHC extension keywords
 368   | ITforeign
 369   | ITexport
 370   | ITlabel
 371   | ITdynamic
 372   | ITsafe
 373   | ITthreadsafe
 374   | ITunsafe
 375   | ITstdcallconv
 376   | ITccallconv
 377   | ITdotnet
 378   | ITmdo
 379
 380         -- Pragmas
 381   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 382   | ITspec_prag                 -- SPECIALISE
 383   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 384   | ITsource_prag
 385   | ITrules_prag
 386   | ITdeprecated_prag
 387   | ITline_prag
 388   | ITscc_prag
 389   | ITcore_prag                 -- hdaume: core annotations
 390   | ITunpack_prag
 391   | ITclose_prag
 392   | IToptions_prag String
 393   | ITinclude_prag String
 394   | ITlanguage_prag
 395
 396   | ITdotdot                    -- reserved symbols
 397   | ITcolon
 398   | ITdcolon
 399   | ITequal
 400   | ITlam
 401   | ITvbar
 402   | ITlarrow
 403   | ITrarrow
 404   | ITat
 405   | ITtilde
 406   | ITdarrow
 407   | ITminus
 408   | ITbang
 409   | ITstar
 410   | ITdot
 411
 412   | ITbiglam                    -- GHC-extension symbols
 413
 414   | ITocurly                    -- special symbols
 415   | ITccurly
 416   | ITocurlybar                 -- {|, for type applications
 417   | ITccurlybar                 -- |}, for type applications
 418   | ITvocurly
 419   | ITvccurly
 420   | ITobrack
 421   | ITopabrack                  -- [:, for parallel arrays with -fparr
 422   | ITcpabrack                  -- :], for parallel arrays with -fparr
 423   | ITcbrack
 424   | IToparen
 425   | ITcparen
 426   | IToubxparen
 427   | ITcubxparen
 428   | ITsemi
 429   | ITcomma
 430   | ITunderscore
 431   | ITbackquote
 432
 433   | ITvarid   FastString        -- identifiers
 434   | ITconid   FastString
 435   | ITvarsym  FastString
 436   | ITconsym  FastString
 437   | ITqvarid  (FastString,FastString)
 438   | ITqconid  (FastString,FastString)
 439   | ITqvarsym (FastString,FastString)
 440   | ITqconsym (FastString,FastString)
 441
 442   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 443   | ITsplitipvarid FastString   -- GHC extension: implicit param: %x
 444
 445   | ITpragma StringBuffer
 446
 447   | ITchar       Char
 448   | ITstring     FastString
 449   | ITinteger    Integer
 450   | ITrational   Rational
 451
 452   | ITprimchar   Char
 453   | ITprimstring FastString
 454   | ITprimint    Integer
 455   | ITprimfloat  Rational
 456   | ITprimdouble Rational
 457
 458   -- MetaHaskell extension tokens
 459   | ITopenExpQuote              --  [| or [e|
 460   | ITopenPatQuote              --  [p|
 461   | ITopenDecQuote              --  [d|
 462   | ITopenTypQuote              --  [t|
 463   | ITcloseQuote                --  |]
 464   | ITidEscape   FastString     --  $x
 465   | ITparenEscape               --  $(
 466   | ITvarQuote                  --  '
 467   | ITtyQuote                   --  ''
 468
 469   -- Arrow notation extension
 470   | ITproc
 471   | ITrec
 472   | IToparenbar                 --  (|
 473   | ITcparenbar                 --  |)
 474   | ITlarrowtail                --  -<
 475   | ITrarrowtail                --  >-
 476   | ITLarrowtail                --  -<<
 477   | ITRarrowtail                --  >>-
 478
 479   | ITunknown String            -- Used when the lexer can't make sense of it
 480   | ITeof                       -- end of file token
 481 #ifdef DEBUG
 482   deriving Show -- debugging
 483 #endif
 484
 485 isSpecial :: Token -> Bool
 486 -- If we see M.x, where x is a keyword, but
 487 -- is special, we treat is as just plain M.x,
 488 -- not as a keyword.
 489 isSpecial ITas          = True
 490 isSpecial IThiding      = True
 491 isSpecial ITqualified   = True
 492 isSpecial ITforall      = True
 493 isSpecial ITexport      = True
 494 isSpecial ITlabel       = True
 495 isSpecial ITdynamic     = True
 496 isSpecial ITsafe        = True
 497 isSpecial ITthreadsafe  = True
 498 isSpecial ITunsafe      = True
 499 isSpecial ITccallconv   = True
 500 isSpecial ITstdcallconv = True
 501 isSpecial ITmdo         = True
 502 isSpecial _             = False
 503
 504 -- the bitmap provided as the third component indicates whether the
 505 -- corresponding extension keyword is valid under the extension options
 506 -- provided to the compiler; if the extension corresponding to *any* of the
 507 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 508 -- facilitates using a keyword in two different extensions that can be
 509 -- activated independently)
 510 --
 511 reservedWordsFM = listToUFM $
 512         map (\(x, y, z) -> (mkFastString x, (y, z)))
 513        [( "_",          ITunderscore,   0 ),
 514         ( "as",         ITas,           0 ),
 515         ( "case",       ITcase,         0 ),
 516         ( "class",      ITclass,        0 ),
 517         ( "data",       ITdata,         0 ),
 518         ( "default",    ITdefault,      0 ),
 519         ( "deriving",   ITderiving,     0 ),
 520         ( "do",         ITdo,           0 ),
 521         ( "else",       ITelse,         0 ),
 522         ( "hiding",     IThiding,       0 ),
 523         ( "if",         ITif,           0 ),
 524         ( "import",     ITimport,       0 ),
 525         ( "in",         ITin,           0 ),
 526         ( "infix",      ITinfix,        0 ),
 527         ( "infixl",     ITinfixl,       0 ),
 528         ( "infixr",     ITinfixr,       0 ),
 529         ( "instance",   ITinstance,     0 ),
 530         ( "let",        ITlet,          0 ),
 531         ( "module",     ITmodule,       0 ),
 532         ( "newtype",    ITnewtype,      0 ),
 533         ( "of",         ITof,           0 ),
 534         ( "qualified",  ITqualified,    0 ),
 535         ( "then",       ITthen,         0 ),
 536         ( "type",       ITtype,         0 ),
 537         ( "where",      ITwhere,        0 ),
 538         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 539
 540         ( "forall",     ITforall,        bit tvBit),
 541         ( "mdo",        ITmdo,           bit glaExtsBit),
 542
 543         ( "foreign",    ITforeign,       bit ffiBit),
 544         ( "export",     ITexport,        bit ffiBit),
 545         ( "label",      ITlabel,         bit ffiBit),
 546         ( "dynamic",    ITdynamic,       bit ffiBit),
 547         ( "safe",       ITsafe,          bit ffiBit),
 548         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 549         ( "unsafe",     ITunsafe,        bit ffiBit),
 550         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 551         ( "ccall",      ITccallconv,     bit ffiBit),
 552         ( "dotnet",     ITdotnet,        bit ffiBit),
 553
 554         ( "rec",        ITrec,           bit arrowsBit),
 555         ( "proc",       ITproc,          bit arrowsBit)
 556      ]
 557
 558 reservedSymsFM = listToUFM $
 559         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 560       [ ("..",  ITdotdot,       0)
 561        ,(":",   ITcolon,        0)      -- (:) is a reserved op,
 562                                                 -- meaning only list cons
 563        ,("::",  ITdcolon,       0)
 564        ,("=",   ITequal,        0)
 565        ,("\\",  ITlam,          0)
 566        ,("|",   ITvbar,         0)
 567        ,("<-",  ITlarrow,       0)
 568        ,("->",  ITrarrow,       0)
 569        ,("@",   ITat,           0)
 570        ,("~",   ITtilde,        0)
 571        ,("=>",  ITdarrow,       0)
 572        ,("-",   ITminus,        0)
 573        ,("!",   ITbang,         0)
 574
 575        ,("*",   ITstar,         bit glaExtsBit) -- For data T (a::*) = MkT
 576        ,(".",   ITdot,          bit tvBit)      -- For 'forall a . t'
 577
 578        ,("-<",  ITlarrowtail,   bit arrowsBit)
 579        ,(">-",  ITrarrowtail,   bit arrowsBit)
 580        ,("-<<", ITLarrowtail,   bit arrowsBit)
 581        ,(">>-", ITRarrowtail,   bit arrowsBit)
 582
 583 #if __GLASGOW_HASKELL__ >= 605
 584        ,("λ",  ITlam,          bit glaExtsBit)
 585        ,("∷",   ITdcolon,       bit glaExtsBit)
 586        ,("⇒",   ITdarrow,     bit glaExtsBit)
 587        ,("∀", ITforall,       bit glaExtsBit)
 588        ,("→",   ITrarrow,     bit glaExtsBit)
 589        ,("←",   ITlarrow,     bit glaExtsBit)
 590        ,("⋯",         ITdotdot,       bit glaExtsBit)
 591 #endif
 592        ]
 593
 594 -- -----------------------------------------------------------------------------
 595 -- Lexer actions
 596
 597 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 598
 599 special :: Token -> Action
 600 special tok span _buf len = return (L span tok)
 601
 602 token, layout_token :: Token -> Action
 603 token t span buf len = return (L span t)
 604 layout_token t span buf len = pushLexState layout >> return (L span t)
 605
 606 idtoken :: (StringBuffer -> Int -> Token) -> Action
 607 idtoken f span buf len = return (L span $! (f buf len))
 608
 609 skip_one_varid :: (FastString -> Token) -> Action
 610 skip_one_varid f span buf len
 611   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 612
 613 strtoken :: (String -> Token) -> Action
 614 strtoken f span buf len =
 615   return (L span $! (f $! lexemeToString buf len))
 616
 617 init_strtoken :: Int -> (String -> Token) -> Action
 618 -- like strtoken, but drops the last N character(s)
 619 init_strtoken drop f span buf len =
 620   return (L span $! (f $! lexemeToString buf (len-drop)))
 621
 622 begin :: Int -> Action
 623 begin code _span _str _len = do pushLexState code; lexToken
 624
 625 pop :: Action
 626 pop _span _buf _len = do popLexState; lexToken
 627
 628 pop_and :: Action -> Action
 629 pop_and act span buf len = do popLexState; act span buf len
 630
 631 notFollowedBy char _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf /= char
 632
 633 notFollowedBySymbol _ _ _ (AI _ _ buf)
 634   = atEnd buf || currentChar buf `notElem` "!#$%&*+./<=>?@\\^|-~"
 635
 636 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 637
 638 ifExtension pred bits _ _ _ = pred bits
 639
 640 {-
 641   nested comments require traversing by hand, they can't be parsed
 642   using regular expressions.
 643 -}
 644 nested_comment :: Action
 645 nested_comment span _str _len = do
 646   input <- getInput
 647   go 1 input
 648   where go 0 input = do setInput input; lexToken
 649         go n input = do
 650           case alexGetChar input of
 651             Nothing  -> err input
 652             Just (c,input) -> do
 653               case c of
 654                 '-' -> do
 655                   case alexGetChar input of
 656                     Nothing  -> err input
 657                     Just ('\125',input) -> go (n-1) input
 658                     Just (c,_)          -> go n input
 659                 '\123' -> do
 660                   case alexGetChar input of
 661                     Nothing  -> err input
 662                     Just ('-',input') -> go (n+1) input'
 663                     Just (c,input)    -> go n input
 664                 c -> go n input
 665
 666         err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 667
 668 open_brace, close_brace :: Action
 669 open_brace span _str _len = do
 670   ctx <- getContext
 671   setContext (NoLayout:ctx)
 672   return (L span ITocurly)
 673 close_brace span _str _len = do
 674   popContext
 675   return (L span ITccurly)
 676
 677 -- We have to be careful not to count M.<varid> as a qualified name
 678 -- when <varid> is a keyword.  We hack around this by catching
 679 -- the offending tokens afterward, and re-lexing in a different state.
 680 check_qvarid span buf len = do
 681   case lookupUFM reservedWordsFM var of
 682         Just (keyword,exts)
 683           | not (isSpecial keyword) ->
 684           if exts == 0
 685              then try_again
 686              else do
 687                 b <- extension (\i -> exts .&. i /= 0)
 688                 if b then try_again
 689                      else return token
 690         _other -> return token
 691   where
 692         (mod,var) = splitQualName buf len
 693         token     = L span (ITqvarid (mod,var))
 694
 695         try_again = do
 696                 (AI _ offs _) <- getInput
 697                 setInput (AI (srcSpanStart span) (offs-len) buf)
 698                 pushLexState bad_qvarid
 699                 lexToken
 700
 701 qvarid buf len = ITqvarid $! splitQualName buf len
 702 qconid buf len = ITqconid $! splitQualName buf len
 703
 704 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 705 -- takes a StringBuffer and a length, and returns the module name
 706 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 707 -- because of hierarchical module names.
 708 splitQualName orig_buf len = split orig_buf orig_buf
 709   where
 710     split buf dot_buf
 711         | orig_buf `byteDiff` buf >= len  = done dot_buf
 712         | c == '.'                        = found_dot buf'
 713         | otherwise                       = split buf' dot_buf
 714       where
 715        (c,buf') = nextChar buf
 716
 717     -- careful, we might get names like M....
 718     -- so, if the character after the dot is not upper-case, this is
 719     -- the end of the qualifier part.
 720     found_dot buf -- buf points after the '.'
 721         | isUpper c    = split buf' buf
 722         | otherwise    = done buf
 723       where
 724        (c,buf') = nextChar buf
 725
 726     done dot_buf =
 727         (lexemeToFastString orig_buf (qual_size - 1),
 728          lexemeToFastString dot_buf (len - qual_size))
 729       where
 730         qual_size = orig_buf `byteDiff` dot_buf
 731
 732 varid span buf len =
 733   case lookupUFM reservedWordsFM fs of
 734         Just (keyword,0)    -> do
 735                 maybe_layout keyword
 736                 return (L span keyword)
 737         Just (keyword,exts) -> do
 738                 b <- extension (\i -> exts .&. i /= 0)
 739                 if b then do maybe_layout keyword
 740                              return (L span keyword)
 741                      else return (L span (ITvarid fs))
 742         _other -> return (L span (ITvarid fs))
 743   where
 744         fs = lexemeToFastString buf len
 745
 746 conid buf len = ITconid fs
 747   where fs = lexemeToFastString buf len
 748
 749 qvarsym buf len = ITqvarsym $! splitQualName buf len
 750 qconsym buf len = ITqconsym $! splitQualName buf len
 751
 752 varsym = sym ITvarsym
 753 consym = sym ITconsym
 754
 755 sym con span buf len =
 756   case lookupUFM reservedSymsFM fs of
 757         Just (keyword,0)    -> return (L span keyword)
 758         Just (keyword,exts) -> do
 759                 b <- extension (\i -> exts .&. i /= 0)
 760                 if b then return (L span keyword)
 761                      else return (L span $! con fs)
 762         _other -> return (L span $! con fs)
 763   where
 764         fs = lexemeToFastString buf len
 765
 766 tok_decimal span buf len
 767   = return (L span (ITinteger  $! parseInteger buf len 10 octDecDigit))
 768
 769 tok_octal span buf len
 770   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 8 octDecDigit))
 771
 772 tok_hexadecimal span buf len
 773   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 16 hexDigit))
 774
 775 prim_decimal span buf len
 776   = return (L span (ITprimint  $! parseInteger buf (len-1) 10 octDecDigit))
 777
 778 prim_octal span buf len
 779   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 8 octDecDigit))
 780
 781 prim_hexadecimal span buf len
 782   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 16 hexDigit))
 783
 784 tok_float        str = ITrational   $! readRational str
 785 prim_float       str = ITprimfloat  $! readRational str
 786 prim_double      str = ITprimdouble $! readRational str
 787
 788 -- -----------------------------------------------------------------------------
 789 -- Layout processing
 790
 791 -- we're at the first token on a line, insert layout tokens if necessary
 792 do_bol :: Action
 793 do_bol span _str _len = do
 794         pos <- getOffside
 795         case pos of
 796             LT -> do
 797                 --trace "layout: inserting '}'" $ do
 798                 popContext
 799                 -- do NOT pop the lex state, we might have a ';' to insert
 800                 return (L span ITvccurly)
 801             EQ -> do
 802                 --trace "layout: inserting ';'" $ do
 803                 popLexState
 804                 return (L span ITsemi)
 805             GT -> do
 806                 popLexState
 807                 lexToken
 808
 809 -- certain keywords put us in the "layout" state, where we might
 810 -- add an opening curly brace.
 811 maybe_layout ITdo       = pushLexState layout_do
 812 maybe_layout ITmdo      = pushLexState layout_do
 813 maybe_layout ITof       = pushLexState layout
 814 maybe_layout ITlet      = pushLexState layout
 815 maybe_layout ITwhere    = pushLexState layout
 816 maybe_layout ITrec      = pushLexState layout
 817 maybe_layout _          = return ()
 818
 819 -- Pushing a new implicit layout context.  If the indentation of the
 820 -- next token is not greater than the previous layout context, then
 821 -- Haskell 98 says that the new layout context should be empty; that is
 822 -- the lexer must generate {}.
 823 --
 824 -- We are slightly more lenient than this: when the new context is started
 825 -- by a 'do', then we allow the new context to be at the same indentation as
 826 -- the previous context.  This is what the 'strict' argument is for.
 827 --
 828 new_layout_context strict span _buf _len = do
 829     popLexState
 830     (AI _ offset _) <- getInput
 831     ctx <- getContext
 832     case ctx of
 833         Layout prev_off : _  |
 834            (strict     && prev_off >= offset  ||
 835             not strict && prev_off > offset) -> do
 836                 -- token is indented to the left of the previous context.
 837                 -- we must generate a {} sequence now.
 838                 pushLexState layout_left
 839                 return (L span ITvocurly)
 840         other -> do
 841                 setContext (Layout offset : ctx)
 842                 return (L span ITvocurly)
 843
 844 do_layout_left span _buf _len = do
 845     popLexState
 846     pushLexState bol  -- we must be at the start of a line
 847     return (L span ITvccurly)
 848
 849 -- -----------------------------------------------------------------------------
 850 -- LINE pragmas
 851
 852 setLine :: Int -> Action
 853 setLine code span buf len = do
 854   let line = parseInteger buf len 10 octDecDigit
 855   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
 856         -- subtract one: the line number refers to the *following* line
 857   popLexState
 858   pushLexState code
 859   lexToken
 860
 861 setFile :: Int -> Action
 862 setFile code span buf len = do
 863   let file = lexemeToFastString (stepOn buf) (len-2)
 864   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
 865   popLexState
 866   pushLexState code
 867   lexToken
 868
 869
 870 -- -----------------------------------------------------------------------------
 871 -- Options, includes and language pragmas.
 872
 873 lex_string_prag :: (String -> Token) -> Action
 874 lex_string_prag mkTok span buf len
 875     = do input <- getInput
 876          start <- getSrcLoc
 877          tok <- go [] input
 878          end <- getSrcLoc
 879          return (L (mkSrcSpan start end) tok)
 880     where go acc input
 881               = if isString input "#-}"
 882                    then do setInput input
 883                            return (mkTok (reverse acc))
 884                    else case alexGetChar input of
 885                           Just (c,i) -> go (c:acc) i
 886                           Nothing -> err input
 887           isString i [] = True
 888           isString i (x:xs)
 889               = case alexGetChar i of
 890                   Just (c,i') | c == x    -> isString i' xs
 891                   _other -> False
 892           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
 893
 894
 895 -- -----------------------------------------------------------------------------
 896 -- Strings & Chars
 897
 898 -- This stuff is horrible.  I hates it.
 899
 900 lex_string_tok :: Action
 901 lex_string_tok span buf len = do
 902   tok <- lex_string ""
 903   end <- getSrcLoc
 904   return (L (mkSrcSpan (srcSpanStart span) end) tok)
 905
 906 lex_string :: String -> P Token
 907 lex_string s = do
 908   i <- getInput
 909   case alexGetChar' i of
 910     Nothing -> lit_error
 911
 912     Just ('"',i)  -> do
 913         setInput i
 914         glaexts <- extension glaExtsEnabled
 915         if glaexts
 916           then do
 917             i <- getInput
 918             case alexGetChar' i of
 919               Just ('#',i) -> do
 920                    setInput i
 921                    if any (> '\xFF') s
 922                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
 923                     else let s' = mkZFastString (reverse s) in
 924                          return (ITprimstring s')
 925                         -- mkZFastString is a hack to avoid encoding the
 926                         -- string in UTF-8.  We just want the exact bytes.
 927               _other ->
 928                 return (ITstring (mkFastString (reverse s)))
 929           else
 930                 return (ITstring (mkFastString (reverse s)))
 931
 932     Just ('\\',i)
 933         | Just ('&',i) <- next -> do
 934                 setInput i; lex_string s
 935         | Just (c,i) <- next, is_space c -> do
 936                 setInput i; lex_stringgap s
 937         where next = alexGetChar' i
 938
 939     Just (c, i) -> do
 940         c' <- lex_char c i
 941         lex_string (c':s)
 942
 943 lex_stringgap s = do
 944   c <- getCharOrFail
 945   case c of
 946     '\\' -> lex_string s
 947     c | is_space c -> lex_stringgap s
 948     _other -> lit_error
 949
 950
 951 lex_char_tok :: Action
 952 -- Here we are basically parsing character literals, such as 'x' or '\n'
 953 -- but, when Template Haskell is on, we additionally spot
 954 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
 955 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
 956 -- So we have to do two characters of lookahead: when we see 'x we need to
 957 -- see if there's a trailing quote
 958 lex_char_tok span buf len = do  -- We've seen '
 959    i1 <- getInput       -- Look ahead to first character
 960    let loc = srcSpanStart span
 961    case alexGetChar' i1 of
 962         Nothing -> lit_error
 963
 964         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
 965                   th_exts <- extension thEnabled
 966                   if th_exts then do
 967                         setInput i2
 968                         return (L (mkSrcSpan loc end2)  ITtyQuote)
 969                    else lit_error
 970
 971         Just ('\\', i2@(AI end2 _ _)) -> do     -- We've seen 'backslash
 972                   setInput i2
 973                   lit_ch <- lex_escape
 974                   mc <- getCharOrFail   -- Trailing quote
 975                   if mc == '\'' then finish_char_tok loc lit_ch
 976                                 else do setInput i2; lit_error
 977
 978         Just (c, i2@(AI end2 _ _))
 979                 | not (isAny c) -> lit_error
 980                 | otherwise ->
 981
 982                 -- We've seen 'x, where x is a valid character
 983                 --  (i.e. not newline etc) but not a quote or backslash
 984            case alexGetChar' i2 of      -- Look ahead one more character
 985                 Nothing -> lit_error
 986                 Just ('\'', i3) -> do   -- We've seen 'x'
 987                         setInput i3
 988                         finish_char_tok loc c
 989                 _other -> do            -- We've seen 'x not followed by quote
 990                                         -- If TH is on, just parse the quote only
 991                         th_exts <- extension thEnabled
 992                         let (AI end _ _) = i1
 993                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
 994                                    else do setInput i2; lit_error
 995
 996 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
 997 finish_char_tok loc ch  -- We've already seen the closing quote
 998                         -- Just need to check for trailing #
 999   = do  glaexts <- extension glaExtsEnabled
1000         i@(AI end _ _) <- getInput
1001         if glaexts then do
1002                 case alexGetChar' i of
1003                         Just ('#',i@(AI end _ _)) -> do
1004                                 setInput i
1005                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1006                         _other ->
1007                                 return (L (mkSrcSpan loc end) (ITchar ch))
1008                 else do
1009                    return (L (mkSrcSpan loc end) (ITchar ch))
1010
1011 lex_char :: Char -> AlexInput -> P Char
1012 lex_char c inp = do
1013   case c of
1014       '\\' -> do setInput inp; lex_escape
1015       c | isAny c -> do setInput inp; return c
1016       _other -> lit_error
1017
1018 isAny c | c > '\xff' = isPrint c
1019         | otherwise  = is_any c
1020
1021 lex_escape :: P Char
1022 lex_escape = do
1023   c <- getCharOrFail
1024   case c of
1025         'a'   -> return '\a'
1026         'b'   -> return '\b'
1027         'f'   -> return '\f'
1028         'n'   -> return '\n'
1029         'r'   -> return '\r'
1030         't'   -> return '\t'
1031         'v'   -> return '\v'
1032         '\\'  -> return '\\'
1033         '"'   -> return '\"'
1034         '\''  -> return '\''
1035         '^'   -> do c <- getCharOrFail
1036                     if c >= '@' && c <= '_'
1037                         then return (chr (ord c - ord '@'))
1038                         else lit_error
1039
1040         'x'   -> readNum is_hexdigit 16 hexDigit
1041         'o'   -> readNum is_octdigit  8 octDecDigit
1042         x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
1043
1044         c1 ->  do
1045            i <- getInput
1046            case alexGetChar' i of
1047             Nothing -> lit_error
1048             Just (c2,i2) ->
1049               case alexGetChar' i2 of
1050                 Nothing -> do setInput i2; lit_error
1051                 Just (c3,i3) ->
1052                    let str = [c1,c2,c3] in
1053                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1054                                      Just rest <- [maybePrefixMatch p str] ] of
1055                           (escape_char,[]):_ -> do
1056                                 setInput i3
1057                                 return escape_char
1058                           (escape_char,_:_):_ -> do
1059                                 setInput i2
1060                                 return escape_char
1061                           [] -> lit_error
1062
1063 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1064 readNum is_digit base conv = do
1065   i <- getInput
1066   c <- getCharOrFail
1067   if is_digit c
1068         then readNum2 is_digit base conv (conv c)
1069         else do setInput i; lit_error
1070
1071 readNum2 is_digit base conv i = do
1072   input <- getInput
1073   read i input
1074   where read i input = do
1075           case alexGetChar' input of
1076             Just (c,input') | is_digit c -> do
1077                 read (i*base + conv c) input'
1078             _other -> do
1079                 if i >= 0 && i <= 0x10FFFF
1080                    then do setInput input; return (chr i)
1081                    else lit_error
1082
1083 silly_escape_chars = [
1084         ("NUL", '\NUL'),
1085         ("SOH", '\SOH'),
1086         ("STX", '\STX'),
1087         ("ETX", '\ETX'),
1088         ("EOT", '\EOT'),
1089         ("ENQ", '\ENQ'),
1090         ("ACK", '\ACK'),
1091         ("BEL", '\BEL'),
1092         ("BS", '\BS'),
1093         ("HT", '\HT'),
1094         ("LF", '\LF'),
1095         ("VT", '\VT'),
1096         ("FF", '\FF'),
1097         ("CR", '\CR'),
1098         ("SO", '\SO'),
1099         ("SI", '\SI'),
1100         ("DLE", '\DLE'),
1101         ("DC1", '\DC1'),
1102         ("DC2", '\DC2'),
1103         ("DC3", '\DC3'),
1104         ("DC4", '\DC4'),
1105         ("NAK", '\NAK'),
1106         ("SYN", '\SYN'),
1107         ("ETB", '\ETB'),
1108         ("CAN", '\CAN'),
1109         ("EM", '\EM'),
1110         ("SUB", '\SUB'),
1111         ("ESC", '\ESC'),
1112         ("FS", '\FS'),
1113         ("GS", '\GS'),
1114         ("RS", '\RS'),
1115         ("US", '\US'),
1116         ("SP", '\SP'),
1117         ("DEL", '\DEL')
1118         ]
1119
1120 -- before calling lit_error, ensure that the current input is pointing to
1121 -- the position of the error in the buffer.  This is so that we can report
1122 -- a correct location to the user, but also so we can detect UTF-8 decoding
1123 -- errors if they occur.
1124 lit_error = lexError "lexical error in string/character literal"
1125
1126 getCharOrFail :: P Char
1127 getCharOrFail =  do
1128   i <- getInput
1129   case alexGetChar' i of
1130         Nothing -> lexError "unexpected end-of-file in string/character literal"
1131         Just (c,i)  -> do setInput i; return c
1132
1133 -- -----------------------------------------------------------------------------
1134 -- The Parse Monad
1135
1136 data LayoutContext
1137   = NoLayout
1138   | Layout !Int
1139
1140 data ParseResult a
1141   = POk PState a
1142   | PFailed
1143         SrcSpan         -- The start and end of the text span related to
1144                         -- the error.  Might be used in environments which can
1145                         -- show this span, e.g. by highlighting it.
1146         Message         -- The error message
1147
1148 data PState = PState {
1149         buffer     :: StringBuffer,
1150         last_loc   :: SrcSpan,  -- pos of previous token
1151         last_offs  :: !Int,     -- offset of the previous token from the
1152                                 -- beginning of  the current line.
1153                                 -- \t is equal to 8 spaces.
1154         last_len   :: !Int,     -- len of previous token
1155         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1156         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1157         context    :: [LayoutContext],
1158         lex_state  :: [Int]
1159      }
1160         -- last_loc and last_len are used when generating error messages,
1161         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1162         -- current token to happyError, we could at least get rid of last_len.
1163         -- Getting rid of last_loc would require finding another way to
1164         -- implement pushCurrentContext (which is only called from one place).
1165
1166 newtype P a = P { unP :: PState -> ParseResult a }
1167
1168 instance Monad P where
1169   return = returnP
1170   (>>=) = thenP
1171   fail = failP
1172
1173 returnP :: a -> P a
1174 returnP a = P $ \s -> POk s a
1175
1176 thenP :: P a -> (a -> P b) -> P b
1177 (P m) `thenP` k = P $ \ s ->
1178         case m s of
1179                 POk s1 a         -> (unP (k a)) s1
1180                 PFailed span err -> PFailed span err
1181
1182 failP :: String -> P a
1183 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1184
1185 failMsgP :: String -> P a
1186 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1187
1188 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1189 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1190
1191 failSpanMsgP :: SrcSpan -> String -> P a
1192 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1193
1194 extension :: (Int -> Bool) -> P Bool
1195 extension p = P $ \s -> POk s (p $! extsBitmap s)
1196
1197 getExts :: P Int
1198 getExts = P $ \s -> POk s (extsBitmap s)
1199
1200 setSrcLoc :: SrcLoc -> P ()
1201 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1202
1203 getSrcLoc :: P SrcLoc
1204 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1205
1206 setLastToken :: SrcSpan -> Int -> P ()
1207 setLastToken loc len = P $ \s -> POk s{ last_loc=loc, last_len=len } ()
1208
1209 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1210
1211 alexInputPrevChar :: AlexInput -> Char
1212 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1213
1214 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1215 alexGetChar (AI loc ofs s)
1216   | atEnd s   = Nothing
1217   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1218                 --trace (show (ord c)) $
1219                 Just (adj_c, (AI loc' ofs' s'))
1220   where (c,s') = nextChar s
1221         loc'   = advanceSrcLoc loc c
1222         ofs'   = advanceOffs c ofs
1223
1224         non_graphic     = '\x0'
1225         upper           = '\x1'
1226         lower           = '\x2'
1227         digit           = '\x3'
1228         symbol          = '\x4'
1229         space           = '\x5'
1230         other_graphic   = '\x6'
1231
1232         adj_c
1233           | c <= '\x06' = non_graphic
1234           | c <= '\xff' = c
1235           | otherwise =
1236                 case generalCategory c of
1237                   UppercaseLetter       -> upper
1238                   LowercaseLetter       -> lower
1239                   TitlecaseLetter       -> upper
1240                   ModifierLetter        -> other_graphic
1241                   OtherLetter           -> other_graphic
1242                   NonSpacingMark        -> other_graphic
1243                   SpacingCombiningMark  -> other_graphic
1244                   EnclosingMark         -> other_graphic
1245                   DecimalNumber         -> digit
1246                   LetterNumber          -> other_graphic
1247                   OtherNumber           -> other_graphic
1248                   ConnectorPunctuation  -> other_graphic
1249                   DashPunctuation       -> other_graphic
1250                   OpenPunctuation       -> other_graphic
1251                   ClosePunctuation      -> other_graphic
1252                   InitialQuote          -> other_graphic
1253                   FinalQuote            -> other_graphic
1254                   OtherPunctuation      -> other_graphic
1255                   MathSymbol            -> symbol
1256                   CurrencySymbol        -> symbol
1257                   ModifierSymbol        -> symbol
1258                   OtherSymbol           -> symbol
1259                   Space                 -> space
1260                   _other                -> non_graphic
1261
1262 -- This version does not squash unicode characters, it is used when
1263 -- lexing strings.
1264 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1265 alexGetChar' (AI loc ofs s)
1266   | atEnd s   = Nothing
1267   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1268                 --trace (show (ord c)) $
1269                 Just (c, (AI loc' ofs' s'))
1270   where (c,s') = nextChar s
1271         loc'   = advanceSrcLoc loc c
1272         ofs'   = advanceOffs c ofs
1273
1274 advanceOffs :: Char -> Int -> Int
1275 advanceOffs '\n' offs = 0
1276 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1277 advanceOffs _    offs = offs + 1
1278
1279 getInput :: P AlexInput
1280 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1281
1282 setInput :: AlexInput -> P ()
1283 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1284
1285 pushLexState :: Int -> P ()
1286 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1287
1288 popLexState :: P Int
1289 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1290
1291 getLexState :: P Int
1292 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1293
1294 -- for reasons of efficiency, flags indicating language extensions (eg,
1295 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1296 -- integer
1297
1298 glaExtsBit, ffiBit, parrBit :: Int
1299 glaExtsBit = 0
1300 ffiBit     = 1
1301 parrBit    = 2
1302 arrowsBit  = 4
1303 thBit      = 5
1304 ipBit      = 6
1305 tvBit      = 7  -- Scoped type variables enables 'forall' keyword
1306 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1307                 -- (doesn't affect the lexer)
1308
1309 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1310 glaExtsEnabled flags = testBit flags glaExtsBit
1311 ffiEnabled     flags = testBit flags ffiBit
1312 parrEnabled    flags = testBit flags parrBit
1313 arrowsEnabled  flags = testBit flags arrowsBit
1314 thEnabled      flags = testBit flags thBit
1315 ipEnabled      flags = testBit flags ipBit
1316 tvEnabled      flags = testBit flags tvBit
1317 bangPatEnabled flags = testBit flags bangPatBit
1318
1319 -- PState for parsing options pragmas
1320 --
1321 pragState :: StringBuffer -> SrcLoc -> PState
1322 pragState buf loc  =
1323   PState {
1324       buffer     = buf,
1325       last_loc   = mkSrcSpan loc loc,
1326       last_offs  = 0,
1327       last_len   = 0,
1328       loc        = loc,
1329       extsBitmap = 0,
1330       context    = [],
1331       lex_state  = [bol, option_prags, 0]
1332     }
1333
1334
1335 -- create a parse state
1336 --
1337 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1338 mkPState buf loc flags  =
1339   PState {
1340       buffer     = buf,
1341       last_loc   = mkSrcSpan loc loc,
1342       last_offs  = 0,
1343       last_len   = 0,
1344       loc        = loc,
1345       extsBitmap = fromIntegral bitmap,
1346       context    = [],
1347       lex_state  = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1348         -- we begin in the layout state if toplev_layout is set
1349     }
1350     where
1351       bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
1352                .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
1353                .|. parrBit    `setBitIf` dopt Opt_PArr        flags
1354                .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
1355                .|. thBit      `setBitIf` dopt Opt_TH          flags
1356                .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
1357                .|. tvBit      `setBitIf` dopt Opt_ScopedTypeVariables flags
1358                .|. bangPatBit `setBitIf` dopt Opt_BangPatterns flags
1359       --
1360       setBitIf :: Int -> Bool -> Int
1361       b `setBitIf` cond | cond      = bit b
1362                         | otherwise = 0
1363
1364 getContext :: P [LayoutContext]
1365 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1366
1367 setContext :: [LayoutContext] -> P ()
1368 setContext ctx = P $ \s -> POk s{context=ctx} ()
1369
1370 popContext :: P ()
1371 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1372                            loc = loc, last_len = len, last_loc = last_loc }) ->
1373   case ctx of
1374         (_:tl) -> POk s{ context = tl } ()
1375         []     -> PFailed last_loc (srcParseErr buf len)
1376
1377 -- Push a new layout context at the indentation of the last token read.
1378 -- This is only used at the outer level of a module when the 'module'
1379 -- keyword is missing.
1380 pushCurrentContext :: P ()
1381 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_len=len, context=ctx } ->
1382   POk s{context = Layout (offs-len) : ctx} ()
1383
1384 getOffside :: P Ordering
1385 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1386                 let ord = case stk of
1387                         (Layout n:_) -> compare offs n
1388                         _            -> GT
1389                 in POk s ord
1390
1391 -- ---------------------------------------------------------------------------
1392 -- Construct a parse error
1393
1394 srcParseErr
1395   :: StringBuffer       -- current buffer (placed just after the last token)
1396   -> Int                -- length of the previous token
1397   -> Message
1398 srcParseErr buf len
1399   = hcat [ if null token
1400              then ptext SLIT("parse error (possibly incorrect indentation)")
1401              else hcat [ptext SLIT("parse error on input "),
1402                         char '`', text token, char '\'']
1403     ]
1404   where token = lexemeToString (offsetBytes (-len) buf) len
1405
1406 -- Report a parse failure, giving the span of the previous token as
1407 -- the location of the error.  This is the entry point for errors
1408 -- detected during parsing.
1409 srcParseFail :: P a
1410 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1411                             last_loc = last_loc } ->
1412     PFailed last_loc (srcParseErr buf len)
1413
1414 -- A lexical error is reported at a particular position in the source file,
1415 -- not over a token range.
1416 lexError :: String -> P a
1417 lexError str = do
1418   loc <- getSrcLoc
1419   i@(AI end _ buf) <- getInput
1420   reportLexError loc end buf str
1421
1422 -- -----------------------------------------------------------------------------
1423 -- This is the top-level function: called from the parser each time a
1424 -- new token is to be read from the input.
1425
1426 lexer :: (Located Token -> P a) -> P a
1427 lexer cont = do
1428   tok@(L _ tok__) <- lexToken
1429   --trace ("token: " ++ show tok__) $ do
1430   cont tok
1431
1432 lexToken :: P (Located Token)
1433 lexToken = do
1434   inp@(AI loc1 _ buf) <- getInput
1435   sc <- getLexState
1436   exts <- getExts
1437   case alexScanUser exts inp sc of
1438     AlexEOF -> do let span = mkSrcSpan loc1 loc1
1439                   setLastToken span 0
1440                   return (L span ITeof)
1441     AlexError (AI loc2 _ buf) -> do
1442         reportLexError loc1 loc2 buf "lexical error"
1443     AlexSkip inp2 _ -> do
1444         setInput inp2
1445         lexToken
1446     AlexToken inp2@(AI end _ buf2) len t -> do
1447         setInput inp2
1448         let span = mkSrcSpan loc1 end
1449         let bytes = byteDiff buf buf2
1450         span `seq` setLastToken span bytes
1451         t span buf bytes
1452
1453 reportLexError loc1 loc2 buf str
1454   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1455   | otherwise =
1456   let
1457         c = fst (nextChar buf)
1458   in
1459   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1460     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1461     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1462 }