compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), lexer, pragState, mkPState, PState(..),
  26    P(..), ParseResult(..), getSrcLoc,
  27    failLocMsgP, failSpanMsgP, srcParseFail,
  28    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  29    getLexState, popLexState, pushLexState,
  30    extension, bangPatEnabled
  31   ) where
  32
  33 #include "HsVersions.h"
  34
  35 import ErrUtils         ( Message )
  36 import Outputable
  37 import StringBuffer
  38 import FastString
  39 import FastTypes
  40 import SrcLoc
  41 import UniqFM
  42 import DynFlags
  43 import Ctype
  44 import Util             ( maybePrefixMatch, readRational )
  45
  46 import DATA_BITS
  47 import Data.Char        ( chr )
  48 import Ratio
  49 --import TRACE
  50
  51 #if __GLASGOW_HASKELL__ >= 605
  52 import Data.Char        ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  53 #else
  54 import Compat.Unicode   ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  55 #endif
  56 }
  57
  58 $unispace    = \x05
  59 $whitechar   = [\ \t\n\r\f\v\xa0 $unispace]
  60 $white_no_nl = $whitechar # \n
  61
  62 $ascdigit  = 0-9
  63 $unidigit  = \x03
  64 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  65 $digit     = [$ascdigit $unidigit]
  66
  67 $special   = [\(\)\,\;\[\]\`\{\}]
  68 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7]
  69 $unisymbol = \x04
  70 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  71
  72 $unilarge  = \x01
  73 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  74 $large     = [$asclarge $unilarge]
  75
  76 $unismall  = \x02
  77 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  78 $small     = [$ascsmall $unismall \_]
  79
  80 $unigraphic = \x06
  81 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  82
  83 $octit     = 0-7
  84 $hexit     = [$decdigit A-F a-f]
  85 $symchar   = [$symbol \:]
  86 $nl        = [\n\r]
  87 $idchar    = [$small $large $digit \']
  88
  89 @varid     = $small $idchar*
  90 @conid     = $large $idchar*
  91
  92 @varsym    = $symbol $symchar*
  93 @consym    = \: $symchar*
  94
  95 @decimal     = $decdigit+
  96 @octal       = $octit+
  97 @hexadecimal = $hexit+
  98 @exponent    = [eE] [\-\+]? @decimal
  99
 100 -- we support the hierarchical module name extension:
 101 @qual = (@conid \.)+
 102
 103 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 104
 105 haskell :-
 106
 107 -- everywhere: skip whitespace and comments
 108 $white_no_nl+                           ;
 109
 110 -- Everywhere: deal with nested comments.  We explicitly rule out
 111 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 112 -- (this can happen even though pragmas will normally take precedence due to
 113 -- longest-match, because pragmas aren't valid in every state, but comments
 114 -- are).
 115 "{-" / { notFollowedBy '#' }            { nested_comment }
 116
 117 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 118 -- more dashes followed by a symbol should be parsed as a varsym, so we
 119 -- have to exclude those.
 120 -- The regex says: "munch all the characters after the dashes, as long as
 121 -- the first one is not a symbol".
 122 "--"\-* [^$symbol :] .*                 ;
 123 "--"\-* / { atEOL }                     ;
 124
 125 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 126 -- blank lines) until we find a non-whitespace character, then do layout
 127 -- processing.
 128 --
 129 -- One slight wibble here: what if the line begins with {-#? In
 130 -- theory, we have to lex the pragma to see if it's one we recognise,
 131 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 132 -- as a nested comment.  We don't bother with this: if the line begins
 133 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 134 <bol> {
 135   \n                                    ;
 136   ^\# (line)?                           { begin line_prag1 }
 137   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 138   ^\# \! .* \n                          ; -- #!, for scripts
 139   ()                                    { do_bol }
 140 }
 141
 142 -- after a layout keyword (let, where, do, of), we begin a new layout
 143 -- context if the curly brace is missing.
 144 -- Careful! This stuff is quite delicate.
 145 <layout, layout_do> {
 146   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 147         -- we might encounter {-# here, but {- has been handled already
 148   \n                                    ;
 149   ^\# (line)?                           { begin line_prag1 }
 150 }
 151
 152 -- do is treated in a subtly different way, see new_layout_context
 153 <layout>    ()                          { new_layout_context True }
 154 <layout_do> ()                          { new_layout_context False }
 155
 156 -- after a new layout context which was found to be to the left of the
 157 -- previous context, we have generated a '{' token, and we now need to
 158 -- generate a matching '}' token.
 159 <layout_left>  ()                       { do_layout_left }
 160
 161 <0,option_prags,glaexts> \n                             { begin bol }
 162
 163 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 164
 165 -- single-line line pragmas, of the form
 166 --    # <line> "<file>" <extra-stuff> \n
 167 <line_prag1> $decdigit+                 { setLine line_prag1a }
 168 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 169 <line_prag1b> .*                        { pop }
 170
 171 -- Haskell-style line pragmas, of the form
 172 --    {-# LINE <line> "<file>" #-}
 173 <line_prag2> $decdigit+                 { setLine line_prag2a }
 174 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 175 <line_prag2b> "#-}"|"-}"                { pop }
 176    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 177    -- with older versions of GHC which generated these.
 178
 179 -- We only want RULES pragmas to be picked up when -fglasgow-exts
 180 -- is on, because the contents of the pragma is always written using
 181 -- glasgow-exts syntax (using forall etc.), so if glasgow exts are not
 182 -- enabled, we're sure to get a parse error.
 183 -- (ToDo: we should really emit a warning when ignoring pragmas)
 184 <glaexts>
 185   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 186
 187 <0,option_prags,glaexts> {
 188   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 189   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 190                                         { token (ITinline_prag False) }
 191   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 192                                         { token ITspec_prag }
 193   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 194         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 195   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 196         $whitechar* (NO(T?)INLINE|no(t?)inline)
 197                                         { token (ITspec_inline_prag False) }
 198   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 199   "{-#" $whitechar* (DEPRECATED|deprecated)
 200                                         { token ITdeprecated_prag }
 201   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 202   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 203   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 204
 205   "{-#"                                 { nested_comment }
 206
 207   -- ToDo: should only be valid inside a pragma:
 208   "#-}"                                 { token ITclose_prag}
 209 }
 210
 211 <option_prags> {
 212   "{-#" $whitechar* (OPTIONS|options)   { lex_string_prag IToptions_prag }
 213   "{-#" $whitechar* (OPTIONS_GHC|options_ghc)
 214                                         { lex_string_prag IToptions_prag }
 215   "{-#" $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
 216   "{-#" $whitechar* (INCLUDE|include)   { lex_string_prag ITinclude_prag }
 217 }
 218
 219 <0,option_prags,glaexts> {
 220         -- This is to catch things like {-# OPTIONS OPTIONS_HUGS ...
 221   "{-#" $whitechar* $idchar+            { nested_comment }
 222 }
 223
 224 -- '0' state: ordinary lexemes
 225 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 226
 227 -- "special" symbols
 228
 229 <0,glaexts> {
 230   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 231   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 232 }
 233
 234 <0,glaexts> {
 235   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 236   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 237   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 238   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 239   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 240   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 241   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 242   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 243 }
 244
 245 <0,glaexts> {
 246   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 247                                         { special IToparenbar }
 248   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 249 }
 250
 251 <0,glaexts> {
 252   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 253   \% @varid / { ifExtension ipEnabled } { skip_one_varid ITsplitipvarid }
 254 }
 255
 256 <glaexts> {
 257   "(#" / { notFollowedBySymbol }        { token IToubxparen }
 258   "#)"                                  { token ITcubxparen }
 259   "{|"                                  { token ITocurlybar }
 260   "|}"                                  { token ITccurlybar }
 261 }
 262
 263 <0,option_prags,glaexts> {
 264   \(                                    { special IToparen }
 265   \)                                    { special ITcparen }
 266   \[                                    { special ITobrack }
 267   \]                                    { special ITcbrack }
 268   \,                                    { special ITcomma }
 269   \;                                    { special ITsemi }
 270   \`                                    { special ITbackquote }
 271
 272   \{                                    { open_brace }
 273   \}                                    { close_brace }
 274 }
 275
 276 <0,option_prags,glaexts> {
 277   @qual @varid                  { check_qvarid }
 278   @qual @conid                  { idtoken qconid }
 279   @varid                        { varid }
 280   @conid                        { idtoken conid }
 281 }
 282
 283 -- after an illegal qvarid, such as 'M.let',
 284 -- we back up and try again in the bad_qvarid state:
 285 <bad_qvarid> {
 286   @conid                        { pop_and (idtoken conid) }
 287   @qual @conid                  { pop_and (idtoken qconid) }
 288 }
 289
 290 <glaexts> {
 291   @qual @varid "#"+             { idtoken qvarid }
 292   @qual @conid "#"+             { idtoken qconid }
 293   @varid "#"+                   { varid }
 294   @conid "#"+                   { idtoken conid }
 295 }
 296
 297 -- ToDo: M.(,,,)
 298
 299 <0,glaexts> {
 300   @qual @varsym                 { idtoken qvarsym }
 301   @qual @consym                 { idtoken qconsym }
 302   @varsym                       { varsym }
 303   @consym                       { consym }
 304 }
 305
 306 <0,glaexts> {
 307   @decimal                      { tok_decimal }
 308   0[oO] @octal                  { tok_octal }
 309   0[xX] @hexadecimal            { tok_hexadecimal }
 310 }
 311
 312 <glaexts> {
 313   @decimal \#                   { prim_decimal }
 314   0[oO] @octal \#               { prim_octal }
 315   0[xX] @hexadecimal \#         { prim_hexadecimal }
 316 }
 317
 318 <0,glaexts> @floating_point             { strtoken tok_float }
 319 <glaexts>   @floating_point \#          { init_strtoken 1 prim_float }
 320 <glaexts>   @floating_point \# \#       { init_strtoken 2 prim_double }
 321
 322 -- Strings and chars are lexed by hand-written code.  The reason is
 323 -- that even if we recognise the string or char here in the regex
 324 -- lexer, we would still have to parse the string afterward in order
 325 -- to convert it to a String.
 326 <0,glaexts> {
 327   \'                            { lex_char_tok }
 328   \"                            { lex_string_tok }
 329 }
 330
 331 {
 332 -- work around bug in Alex 2.0
 333 #if __GLASGOW_HASKELL__ < 503
 334 unsafeAt arr i = arr ! i
 335 #endif
 336
 337 -- -----------------------------------------------------------------------------
 338 -- The token type
 339
 340 data Token
 341   = ITas                        -- Haskell keywords
 342   | ITcase
 343   | ITclass
 344   | ITdata
 345   | ITdefault
 346   | ITderiving
 347   | ITdo
 348   | ITelse
 349   | IThiding
 350   | ITif
 351   | ITimport
 352   | ITin
 353   | ITinfix
 354   | ITinfixl
 355   | ITinfixr
 356   | ITinstance
 357   | ITlet
 358   | ITmodule
 359   | ITnewtype
 360   | ITof
 361   | ITqualified
 362   | ITthen
 363   | ITtype
 364   | ITwhere
 365   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 366
 367   | ITforall                    -- GHC extension keywords
 368   | ITforeign
 369   | ITexport
 370   | ITlabel
 371   | ITdynamic
 372   | ITsafe
 373   | ITthreadsafe
 374   | ITunsafe
 375   | ITstdcallconv
 376   | ITccallconv
 377   | ITdotnet
 378   | ITmdo
 379   | ITiso
 380
 381         -- Pragmas
 382   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 383   | ITspec_prag                 -- SPECIALISE
 384   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 385   | ITsource_prag
 386   | ITrules_prag
 387   | ITdeprecated_prag
 388   | ITline_prag
 389   | ITscc_prag
 390   | ITcore_prag                 -- hdaume: core annotations
 391   | ITunpack_prag
 392   | ITclose_prag
 393   | IToptions_prag String
 394   | ITinclude_prag String
 395   | ITlanguage_prag
 396
 397   | ITdotdot                    -- reserved symbols
 398   | ITcolon
 399   | ITdcolon
 400   | ITequal
 401   | ITlam
 402   | ITvbar
 403   | ITlarrow
 404   | ITrarrow
 405   | ITat
 406   | ITtilde
 407   | ITdarrow
 408   | ITminus
 409   | ITbang
 410   | ITstar
 411   | ITdot
 412
 413   | ITbiglam                    -- GHC-extension symbols
 414
 415   | ITocurly                    -- special symbols
 416   | ITccurly
 417   | ITocurlybar                 -- {|, for type applications
 418   | ITccurlybar                 -- |}, for type applications
 419   | ITvocurly
 420   | ITvccurly
 421   | ITobrack
 422   | ITopabrack                  -- [:, for parallel arrays with -fparr
 423   | ITcpabrack                  -- :], for parallel arrays with -fparr
 424   | ITcbrack
 425   | IToparen
 426   | ITcparen
 427   | IToubxparen
 428   | ITcubxparen
 429   | ITsemi
 430   | ITcomma
 431   | ITunderscore
 432   | ITbackquote
 433
 434   | ITvarid   FastString        -- identifiers
 435   | ITconid   FastString
 436   | ITvarsym  FastString
 437   | ITconsym  FastString
 438   | ITqvarid  (FastString,FastString)
 439   | ITqconid  (FastString,FastString)
 440   | ITqvarsym (FastString,FastString)
 441   | ITqconsym (FastString,FastString)
 442
 443   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 444   | ITsplitipvarid FastString   -- GHC extension: implicit param: %x
 445
 446   | ITpragma StringBuffer
 447
 448   | ITchar       Char
 449   | ITstring     FastString
 450   | ITinteger    Integer
 451   | ITrational   Rational
 452
 453   | ITprimchar   Char
 454   | ITprimstring FastString
 455   | ITprimint    Integer
 456   | ITprimfloat  Rational
 457   | ITprimdouble Rational
 458
 459   -- MetaHaskell extension tokens
 460   | ITopenExpQuote              --  [| or [e|
 461   | ITopenPatQuote              --  [p|
 462   | ITopenDecQuote              --  [d|
 463   | ITopenTypQuote              --  [t|
 464   | ITcloseQuote                --  |]
 465   | ITidEscape   FastString     --  $x
 466   | ITparenEscape               --  $(
 467   | ITvarQuote                  --  '
 468   | ITtyQuote                   --  ''
 469
 470   -- Arrow notation extension
 471   | ITproc
 472   | ITrec
 473   | IToparenbar                 --  (|
 474   | ITcparenbar                 --  |)
 475   | ITlarrowtail                --  -<
 476   | ITrarrowtail                --  >-
 477   | ITLarrowtail                --  -<<
 478   | ITRarrowtail                --  >>-
 479
 480   | ITunknown String            -- Used when the lexer can't make sense of it
 481   | ITeof                       -- end of file token
 482 #ifdef DEBUG
 483   deriving Show -- debugging
 484 #endif
 485
 486 isSpecial :: Token -> Bool
 487 -- If we see M.x, where x is a keyword, but
 488 -- is special, we treat is as just plain M.x,
 489 -- not as a keyword.
 490 isSpecial ITas          = True
 491 isSpecial IThiding      = True
 492 isSpecial ITqualified   = True
 493 isSpecial ITforall      = True
 494 isSpecial ITexport      = True
 495 isSpecial ITlabel       = True
 496 isSpecial ITdynamic     = True
 497 isSpecial ITsafe        = True
 498 isSpecial ITthreadsafe  = True
 499 isSpecial ITunsafe      = True
 500 isSpecial ITccallconv   = True
 501 isSpecial ITstdcallconv = True
 502 isSpecial ITmdo         = True
 503 isSpecial ITiso         = True
 504 isSpecial _             = False
 505
 506 -- the bitmap provided as the third component indicates whether the
 507 -- corresponding extension keyword is valid under the extension options
 508 -- provided to the compiler; if the extension corresponding to *any* of the
 509 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 510 -- facilitates using a keyword in two different extensions that can be
 511 -- activated independently)
 512 --
 513 reservedWordsFM = listToUFM $
 514         map (\(x, y, z) -> (mkFastString x, (y, z)))
 515        [( "_",          ITunderscore,   0 ),
 516         ( "as",         ITas,           0 ),
 517         ( "case",       ITcase,         0 ),
 518         ( "class",      ITclass,        0 ),
 519         ( "data",       ITdata,         0 ),
 520         ( "default",    ITdefault,      0 ),
 521         ( "deriving",   ITderiving,     0 ),
 522         ( "do",         ITdo,           0 ),
 523         ( "else",       ITelse,         0 ),
 524         ( "hiding",     IThiding,       0 ),
 525         ( "if",         ITif,           0 ),
 526         ( "import",     ITimport,       0 ),
 527         ( "in",         ITin,           0 ),
 528         ( "infix",      ITinfix,        0 ),
 529         ( "infixl",     ITinfixl,       0 ),
 530         ( "infixr",     ITinfixr,       0 ),
 531         ( "instance",   ITinstance,     0 ),
 532         ( "let",        ITlet,          0 ),
 533         ( "module",     ITmodule,       0 ),
 534         ( "newtype",    ITnewtype,      0 ),
 535         ( "of",         ITof,           0 ),
 536         ( "qualified",  ITqualified,    0 ),
 537         ( "then",       ITthen,         0 ),
 538         ( "type",       ITtype,         0 ),
 539         ( "where",      ITwhere,        0 ),
 540         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 541
 542         ( "forall",     ITforall,        bit tvBit),
 543         ( "mdo",        ITmdo,           bit glaExtsBit),
 544         ( "iso",        ITiso,           bit glaExtsBit),
 545
 546         ( "foreign",    ITforeign,       bit ffiBit),
 547         ( "export",     ITexport,        bit ffiBit),
 548         ( "label",      ITlabel,         bit ffiBit),
 549         ( "dynamic",    ITdynamic,       bit ffiBit),
 550         ( "safe",       ITsafe,          bit ffiBit),
 551         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 552         ( "unsafe",     ITunsafe,        bit ffiBit),
 553         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 554         ( "ccall",      ITccallconv,     bit ffiBit),
 555         ( "dotnet",     ITdotnet,        bit ffiBit),
 556
 557         ( "rec",        ITrec,           bit arrowsBit),
 558         ( "proc",       ITproc,          bit arrowsBit)
 559      ]
 560
 561 reservedSymsFM = listToUFM $
 562         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 563       [ ("..",  ITdotdot,       0)
 564        ,(":",   ITcolon,        0)      -- (:) is a reserved op,
 565                                                 -- meaning only list cons
 566        ,("::",  ITdcolon,       0)
 567        ,("=",   ITequal,        0)
 568        ,("\\",  ITlam,          0)
 569        ,("|",   ITvbar,         0)
 570        ,("<-",  ITlarrow,       0)
 571        ,("->",  ITrarrow,       0)
 572        ,("@",   ITat,           0)
 573        ,("~",   ITtilde,        0)
 574        ,("=>",  ITdarrow,       0)
 575        ,("-",   ITminus,        0)
 576        ,("!",   ITbang,         0)
 577
 578        ,("*",   ITstar,         bit glaExtsBit) -- For data T (a::*) = MkT
 579        ,(".",   ITdot,          bit tvBit)      -- For 'forall a . t'
 580
 581        ,("-<",  ITlarrowtail,   bit arrowsBit)
 582        ,(">-",  ITrarrowtail,   bit arrowsBit)
 583        ,("-<<", ITLarrowtail,   bit arrowsBit)
 584        ,(">>-", ITRarrowtail,   bit arrowsBit)
 585
 586 #if __GLASGOW_HASKELL__ >= 605
 587        ,("λ",  ITlam,          bit glaExtsBit)
 588        ,("∷",   ITdcolon,       bit glaExtsBit)
 589        ,("⇒",   ITdarrow,     bit glaExtsBit)
 590        ,("∀", ITforall,       bit glaExtsBit)
 591        ,("→",   ITrarrow,     bit glaExtsBit)
 592        ,("←",   ITlarrow,     bit glaExtsBit)
 593        ,("⋯",         ITdotdot,       bit glaExtsBit)
 594         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 595         -- form part of a large operator.  This would let us have a better
 596         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 597 #endif
 598        ]
 599
 600 -- -----------------------------------------------------------------------------
 601 -- Lexer actions
 602
 603 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 604
 605 special :: Token -> Action
 606 special tok span _buf len = return (L span tok)
 607
 608 token, layout_token :: Token -> Action
 609 token t span buf len = return (L span t)
 610 layout_token t span buf len = pushLexState layout >> return (L span t)
 611
 612 idtoken :: (StringBuffer -> Int -> Token) -> Action
 613 idtoken f span buf len = return (L span $! (f buf len))
 614
 615 skip_one_varid :: (FastString -> Token) -> Action
 616 skip_one_varid f span buf len
 617   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 618
 619 strtoken :: (String -> Token) -> Action
 620 strtoken f span buf len =
 621   return (L span $! (f $! lexemeToString buf len))
 622
 623 init_strtoken :: Int -> (String -> Token) -> Action
 624 -- like strtoken, but drops the last N character(s)
 625 init_strtoken drop f span buf len =
 626   return (L span $! (f $! lexemeToString buf (len-drop)))
 627
 628 begin :: Int -> Action
 629 begin code _span _str _len = do pushLexState code; lexToken
 630
 631 pop :: Action
 632 pop _span _buf _len = do popLexState; lexToken
 633
 634 pop_and :: Action -> Action
 635 pop_and act span buf len = do popLexState; act span buf len
 636
 637 notFollowedBy char _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf /= char
 638
 639 notFollowedBySymbol _ _ _ (AI _ _ buf)
 640   = atEnd buf || currentChar buf `notElem` "!#$%&*+./<=>?@\\^|-~"
 641
 642 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 643
 644 ifExtension pred bits _ _ _ = pred bits
 645
 646 {-
 647   nested comments require traversing by hand, they can't be parsed
 648   using regular expressions.
 649 -}
 650 nested_comment :: Action
 651 nested_comment span _str _len = do
 652   input <- getInput
 653   go 1 input
 654   where go 0 input = do setInput input; lexToken
 655         go n input = do
 656           case alexGetChar input of
 657             Nothing  -> err input
 658             Just (c,input) -> do
 659               case c of
 660                 '-' -> do
 661                   case alexGetChar input of
 662                     Nothing  -> err input
 663                     Just ('\125',input) -> go (n-1) input
 664                     Just (c,_)          -> go n input
 665                 '\123' -> do
 666                   case alexGetChar input of
 667                     Nothing  -> err input
 668                     Just ('-',input') -> go (n+1) input'
 669                     Just (c,input)    -> go n input
 670                 c -> go n input
 671
 672         err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 673
 674 open_brace, close_brace :: Action
 675 open_brace span _str _len = do
 676   ctx <- getContext
 677   setContext (NoLayout:ctx)
 678   return (L span ITocurly)
 679 close_brace span _str _len = do
 680   popContext
 681   return (L span ITccurly)
 682
 683 -- We have to be careful not to count M.<varid> as a qualified name
 684 -- when <varid> is a keyword.  We hack around this by catching
 685 -- the offending tokens afterward, and re-lexing in a different state.
 686 check_qvarid span buf len = do
 687   case lookupUFM reservedWordsFM var of
 688         Just (keyword,exts)
 689           | not (isSpecial keyword) ->
 690           if exts == 0
 691              then try_again
 692              else do
 693                 b <- extension (\i -> exts .&. i /= 0)
 694                 if b then try_again
 695                      else return token
 696         _other -> return token
 697   where
 698         (mod,var) = splitQualName buf len
 699         token     = L span (ITqvarid (mod,var))
 700
 701         try_again = do
 702                 (AI _ offs _) <- getInput
 703                 setInput (AI (srcSpanStart span) (offs-len) buf)
 704                 pushLexState bad_qvarid
 705                 lexToken
 706
 707 qvarid buf len = ITqvarid $! splitQualName buf len
 708 qconid buf len = ITqconid $! splitQualName buf len
 709
 710 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 711 -- takes a StringBuffer and a length, and returns the module name
 712 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 713 -- because of hierarchical module names.
 714 splitQualName orig_buf len = split orig_buf orig_buf
 715   where
 716     split buf dot_buf
 717         | orig_buf `byteDiff` buf >= len  = done dot_buf
 718         | c == '.'                        = found_dot buf'
 719         | otherwise                       = split buf' dot_buf
 720       where
 721        (c,buf') = nextChar buf
 722
 723     -- careful, we might get names like M....
 724     -- so, if the character after the dot is not upper-case, this is
 725     -- the end of the qualifier part.
 726     found_dot buf -- buf points after the '.'
 727         | isUpper c    = split buf' buf
 728         | otherwise    = done buf
 729       where
 730        (c,buf') = nextChar buf
 731
 732     done dot_buf =
 733         (lexemeToFastString orig_buf (qual_size - 1),
 734          lexemeToFastString dot_buf (len - qual_size))
 735       where
 736         qual_size = orig_buf `byteDiff` dot_buf
 737
 738 varid span buf len =
 739   case lookupUFM reservedWordsFM fs of
 740         Just (keyword,0)    -> do
 741                 maybe_layout keyword
 742                 return (L span keyword)
 743         Just (keyword,exts) -> do
 744                 b <- extension (\i -> exts .&. i /= 0)
 745                 if b then do maybe_layout keyword
 746                              return (L span keyword)
 747                      else return (L span (ITvarid fs))
 748         _other -> return (L span (ITvarid fs))
 749   where
 750         fs = lexemeToFastString buf len
 751
 752 conid buf len = ITconid fs
 753   where fs = lexemeToFastString buf len
 754
 755 qvarsym buf len = ITqvarsym $! splitQualName buf len
 756 qconsym buf len = ITqconsym $! splitQualName buf len
 757
 758 varsym = sym ITvarsym
 759 consym = sym ITconsym
 760
 761 sym con span buf len =
 762   case lookupUFM reservedSymsFM fs of
 763         Just (keyword,0)    -> return (L span keyword)
 764         Just (keyword,exts) -> do
 765                 b <- extension (\i -> exts .&. i /= 0)
 766                 if b then return (L span keyword)
 767                      else return (L span $! con fs)
 768         _other -> return (L span $! con fs)
 769   where
 770         fs = lexemeToFastString buf len
 771
 772 tok_decimal span buf len
 773   = return (L span (ITinteger  $! parseInteger buf len 10 octDecDigit))
 774
 775 tok_octal span buf len
 776   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 8 octDecDigit))
 777
 778 tok_hexadecimal span buf len
 779   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 16 hexDigit))
 780
 781 prim_decimal span buf len
 782   = return (L span (ITprimint  $! parseInteger buf (len-1) 10 octDecDigit))
 783
 784 prim_octal span buf len
 785   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 8 octDecDigit))
 786
 787 prim_hexadecimal span buf len
 788   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 16 hexDigit))
 789
 790 tok_float        str = ITrational   $! readRational str
 791 prim_float       str = ITprimfloat  $! readRational str
 792 prim_double      str = ITprimdouble $! readRational str
 793
 794 -- -----------------------------------------------------------------------------
 795 -- Layout processing
 796
 797 -- we're at the first token on a line, insert layout tokens if necessary
 798 do_bol :: Action
 799 do_bol span _str _len = do
 800         pos <- getOffside
 801         case pos of
 802             LT -> do
 803                 --trace "layout: inserting '}'" $ do
 804                 popContext
 805                 -- do NOT pop the lex state, we might have a ';' to insert
 806                 return (L span ITvccurly)
 807             EQ -> do
 808                 --trace "layout: inserting ';'" $ do
 809                 popLexState
 810                 return (L span ITsemi)
 811             GT -> do
 812                 popLexState
 813                 lexToken
 814
 815 -- certain keywords put us in the "layout" state, where we might
 816 -- add an opening curly brace.
 817 maybe_layout ITdo       = pushLexState layout_do
 818 maybe_layout ITmdo      = pushLexState layout_do
 819 maybe_layout ITof       = pushLexState layout
 820 maybe_layout ITlet      = pushLexState layout
 821 maybe_layout ITwhere    = pushLexState layout
 822 maybe_layout ITrec      = pushLexState layout
 823 maybe_layout _          = return ()
 824
 825 -- Pushing a new implicit layout context.  If the indentation of the
 826 -- next token is not greater than the previous layout context, then
 827 -- Haskell 98 says that the new layout context should be empty; that is
 828 -- the lexer must generate {}.
 829 --
 830 -- We are slightly more lenient than this: when the new context is started
 831 -- by a 'do', then we allow the new context to be at the same indentation as
 832 -- the previous context.  This is what the 'strict' argument is for.
 833 --
 834 new_layout_context strict span _buf _len = do
 835     popLexState
 836     (AI _ offset _) <- getInput
 837     ctx <- getContext
 838     case ctx of
 839         Layout prev_off : _  |
 840            (strict     && prev_off >= offset  ||
 841             not strict && prev_off > offset) -> do
 842                 -- token is indented to the left of the previous context.
 843                 -- we must generate a {} sequence now.
 844                 pushLexState layout_left
 845                 return (L span ITvocurly)
 846         other -> do
 847                 setContext (Layout offset : ctx)
 848                 return (L span ITvocurly)
 849
 850 do_layout_left span _buf _len = do
 851     popLexState
 852     pushLexState bol  -- we must be at the start of a line
 853     return (L span ITvccurly)
 854
 855 -- -----------------------------------------------------------------------------
 856 -- LINE pragmas
 857
 858 setLine :: Int -> Action
 859 setLine code span buf len = do
 860   let line = parseInteger buf len 10 octDecDigit
 861   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
 862         -- subtract one: the line number refers to the *following* line
 863   popLexState
 864   pushLexState code
 865   lexToken
 866
 867 setFile :: Int -> Action
 868 setFile code span buf len = do
 869   let file = lexemeToFastString (stepOn buf) (len-2)
 870   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
 871   popLexState
 872   pushLexState code
 873   lexToken
 874
 875
 876 -- -----------------------------------------------------------------------------
 877 -- Options, includes and language pragmas.
 878
 879 lex_string_prag :: (String -> Token) -> Action
 880 lex_string_prag mkTok span buf len
 881     = do input <- getInput
 882          start <- getSrcLoc
 883          tok <- go [] input
 884          end <- getSrcLoc
 885          return (L (mkSrcSpan start end) tok)
 886     where go acc input
 887               = if isString input "#-}"
 888                    then do setInput input
 889                            return (mkTok (reverse acc))
 890                    else case alexGetChar input of
 891                           Just (c,i) -> go (c:acc) i
 892                           Nothing -> err input
 893           isString i [] = True
 894           isString i (x:xs)
 895               = case alexGetChar i of
 896                   Just (c,i') | c == x    -> isString i' xs
 897                   _other -> False
 898           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
 899
 900
 901 -- -----------------------------------------------------------------------------
 902 -- Strings & Chars
 903
 904 -- This stuff is horrible.  I hates it.
 905
 906 lex_string_tok :: Action
 907 lex_string_tok span buf len = do
 908   tok <- lex_string ""
 909   end <- getSrcLoc
 910   return (L (mkSrcSpan (srcSpanStart span) end) tok)
 911
 912 lex_string :: String -> P Token
 913 lex_string s = do
 914   i <- getInput
 915   case alexGetChar' i of
 916     Nothing -> lit_error
 917
 918     Just ('"',i)  -> do
 919         setInput i
 920         glaexts <- extension glaExtsEnabled
 921         if glaexts
 922           then do
 923             i <- getInput
 924             case alexGetChar' i of
 925               Just ('#',i) -> do
 926                    setInput i
 927                    if any (> '\xFF') s
 928                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
 929                     else let s' = mkZFastString (reverse s) in
 930                          return (ITprimstring s')
 931                         -- mkZFastString is a hack to avoid encoding the
 932                         -- string in UTF-8.  We just want the exact bytes.
 933               _other ->
 934                 return (ITstring (mkFastString (reverse s)))
 935           else
 936                 return (ITstring (mkFastString (reverse s)))
 937
 938     Just ('\\',i)
 939         | Just ('&',i) <- next -> do
 940                 setInput i; lex_string s
 941         | Just (c,i) <- next, is_space c -> do
 942                 setInput i; lex_stringgap s
 943         where next = alexGetChar' i
 944
 945     Just (c, i) -> do
 946         c' <- lex_char c i
 947         lex_string (c':s)
 948
 949 lex_stringgap s = do
 950   c <- getCharOrFail
 951   case c of
 952     '\\' -> lex_string s
 953     c | is_space c -> lex_stringgap s
 954     _other -> lit_error
 955
 956
 957 lex_char_tok :: Action
 958 -- Here we are basically parsing character literals, such as 'x' or '\n'
 959 -- but, when Template Haskell is on, we additionally spot
 960 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
 961 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
 962 -- So we have to do two characters of lookahead: when we see 'x we need to
 963 -- see if there's a trailing quote
 964 lex_char_tok span buf len = do  -- We've seen '
 965    i1 <- getInput       -- Look ahead to first character
 966    let loc = srcSpanStart span
 967    case alexGetChar' i1 of
 968         Nothing -> lit_error
 969
 970         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
 971                   th_exts <- extension thEnabled
 972                   if th_exts then do
 973                         setInput i2
 974                         return (L (mkSrcSpan loc end2)  ITtyQuote)
 975                    else lit_error
 976
 977         Just ('\\', i2@(AI end2 _ _)) -> do     -- We've seen 'backslash
 978                   setInput i2
 979                   lit_ch <- lex_escape
 980                   mc <- getCharOrFail   -- Trailing quote
 981                   if mc == '\'' then finish_char_tok loc lit_ch
 982                                 else do setInput i2; lit_error
 983
 984         Just (c, i2@(AI end2 _ _))
 985                 | not (isAny c) -> lit_error
 986                 | otherwise ->
 987
 988                 -- We've seen 'x, where x is a valid character
 989                 --  (i.e. not newline etc) but not a quote or backslash
 990            case alexGetChar' i2 of      -- Look ahead one more character
 991                 Nothing -> lit_error
 992                 Just ('\'', i3) -> do   -- We've seen 'x'
 993                         setInput i3
 994                         finish_char_tok loc c
 995                 _other -> do            -- We've seen 'x not followed by quote
 996                                         -- If TH is on, just parse the quote only
 997                         th_exts <- extension thEnabled
 998                         let (AI end _ _) = i1
 999                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1000                                    else do setInput i2; lit_error
1001
1002 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1003 finish_char_tok loc ch  -- We've already seen the closing quote
1004                         -- Just need to check for trailing #
1005   = do  glaexts <- extension glaExtsEnabled
1006         i@(AI end _ _) <- getInput
1007         if glaexts then do
1008                 case alexGetChar' i of
1009                         Just ('#',i@(AI end _ _)) -> do
1010                                 setInput i
1011                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1012                         _other ->
1013                                 return (L (mkSrcSpan loc end) (ITchar ch))
1014                 else do
1015                    return (L (mkSrcSpan loc end) (ITchar ch))
1016
1017 lex_char :: Char -> AlexInput -> P Char
1018 lex_char c inp = do
1019   case c of
1020       '\\' -> do setInput inp; lex_escape
1021       c | isAny c -> do setInput inp; return c
1022       _other -> lit_error
1023
1024 isAny c | c > '\xff' = isPrint c
1025         | otherwise  = is_any c
1026
1027 lex_escape :: P Char
1028 lex_escape = do
1029   c <- getCharOrFail
1030   case c of
1031         'a'   -> return '\a'
1032         'b'   -> return '\b'
1033         'f'   -> return '\f'
1034         'n'   -> return '\n'
1035         'r'   -> return '\r'
1036         't'   -> return '\t'
1037         'v'   -> return '\v'
1038         '\\'  -> return '\\'
1039         '"'   -> return '\"'
1040         '\''  -> return '\''
1041         '^'   -> do c <- getCharOrFail
1042                     if c >= '@' && c <= '_'
1043                         then return (chr (ord c - ord '@'))
1044                         else lit_error
1045
1046         'x'   -> readNum is_hexdigit 16 hexDigit
1047         'o'   -> readNum is_octdigit  8 octDecDigit
1048         x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
1049
1050         c1 ->  do
1051            i <- getInput
1052            case alexGetChar' i of
1053             Nothing -> lit_error
1054             Just (c2,i2) ->
1055               case alexGetChar' i2 of
1056                 Nothing -> do setInput i2; lit_error
1057                 Just (c3,i3) ->
1058                    let str = [c1,c2,c3] in
1059                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1060                                      Just rest <- [maybePrefixMatch p str] ] of
1061                           (escape_char,[]):_ -> do
1062                                 setInput i3
1063                                 return escape_char
1064                           (escape_char,_:_):_ -> do
1065                                 setInput i2
1066                                 return escape_char
1067                           [] -> lit_error
1068
1069 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1070 readNum is_digit base conv = do
1071   i <- getInput
1072   c <- getCharOrFail
1073   if is_digit c
1074         then readNum2 is_digit base conv (conv c)
1075         else do setInput i; lit_error
1076
1077 readNum2 is_digit base conv i = do
1078   input <- getInput
1079   read i input
1080   where read i input = do
1081           case alexGetChar' input of
1082             Just (c,input') | is_digit c -> do
1083                 read (i*base + conv c) input'
1084             _other -> do
1085                 if i >= 0 && i <= 0x10FFFF
1086                    then do setInput input; return (chr i)
1087                    else lit_error
1088
1089 silly_escape_chars = [
1090         ("NUL", '\NUL'),
1091         ("SOH", '\SOH'),
1092         ("STX", '\STX'),
1093         ("ETX", '\ETX'),
1094         ("EOT", '\EOT'),
1095         ("ENQ", '\ENQ'),
1096         ("ACK", '\ACK'),
1097         ("BEL", '\BEL'),
1098         ("BS", '\BS'),
1099         ("HT", '\HT'),
1100         ("LF", '\LF'),
1101         ("VT", '\VT'),
1102         ("FF", '\FF'),
1103         ("CR", '\CR'),
1104         ("SO", '\SO'),
1105         ("SI", '\SI'),
1106         ("DLE", '\DLE'),
1107         ("DC1", '\DC1'),
1108         ("DC2", '\DC2'),
1109         ("DC3", '\DC3'),
1110         ("DC4", '\DC4'),
1111         ("NAK", '\NAK'),
1112         ("SYN", '\SYN'),
1113         ("ETB", '\ETB'),
1114         ("CAN", '\CAN'),
1115         ("EM", '\EM'),
1116         ("SUB", '\SUB'),
1117         ("ESC", '\ESC'),
1118         ("FS", '\FS'),
1119         ("GS", '\GS'),
1120         ("RS", '\RS'),
1121         ("US", '\US'),
1122         ("SP", '\SP'),
1123         ("DEL", '\DEL')
1124         ]
1125
1126 -- before calling lit_error, ensure that the current input is pointing to
1127 -- the position of the error in the buffer.  This is so that we can report
1128 -- a correct location to the user, but also so we can detect UTF-8 decoding
1129 -- errors if they occur.
1130 lit_error = lexError "lexical error in string/character literal"
1131
1132 getCharOrFail :: P Char
1133 getCharOrFail =  do
1134   i <- getInput
1135   case alexGetChar' i of
1136         Nothing -> lexError "unexpected end-of-file in string/character literal"
1137         Just (c,i)  -> do setInput i; return c
1138
1139 -- -----------------------------------------------------------------------------
1140 -- The Parse Monad
1141
1142 data LayoutContext
1143   = NoLayout
1144   | Layout !Int
1145
1146 data ParseResult a
1147   = POk PState a
1148   | PFailed
1149         SrcSpan         -- The start and end of the text span related to
1150                         -- the error.  Might be used in environments which can
1151                         -- show this span, e.g. by highlighting it.
1152         Message         -- The error message
1153
1154 data PState = PState {
1155         buffer     :: StringBuffer,
1156         last_loc   :: SrcSpan,  -- pos of previous token
1157         last_offs  :: !Int,     -- offset of the previous token from the
1158                                 -- beginning of  the current line.
1159                                 -- \t is equal to 8 spaces.
1160         last_len   :: !Int,     -- len of previous token
1161         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1162         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1163         context    :: [LayoutContext],
1164         lex_state  :: [Int]
1165      }
1166         -- last_loc and last_len are used when generating error messages,
1167         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1168         -- current token to happyError, we could at least get rid of last_len.
1169         -- Getting rid of last_loc would require finding another way to
1170         -- implement pushCurrentContext (which is only called from one place).
1171
1172 newtype P a = P { unP :: PState -> ParseResult a }
1173
1174 instance Monad P where
1175   return = returnP
1176   (>>=) = thenP
1177   fail = failP
1178
1179 returnP :: a -> P a
1180 returnP a = P $ \s -> POk s a
1181
1182 thenP :: P a -> (a -> P b) -> P b
1183 (P m) `thenP` k = P $ \ s ->
1184         case m s of
1185                 POk s1 a         -> (unP (k a)) s1
1186                 PFailed span err -> PFailed span err
1187
1188 failP :: String -> P a
1189 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1190
1191 failMsgP :: String -> P a
1192 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1193
1194 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1195 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1196
1197 failSpanMsgP :: SrcSpan -> String -> P a
1198 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1199
1200 extension :: (Int -> Bool) -> P Bool
1201 extension p = P $ \s -> POk s (p $! extsBitmap s)
1202
1203 getExts :: P Int
1204 getExts = P $ \s -> POk s (extsBitmap s)
1205
1206 setSrcLoc :: SrcLoc -> P ()
1207 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1208
1209 getSrcLoc :: P SrcLoc
1210 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1211
1212 setLastToken :: SrcSpan -> Int -> P ()
1213 setLastToken loc len = P $ \s -> POk s{ last_loc=loc, last_len=len } ()
1214
1215 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1216
1217 alexInputPrevChar :: AlexInput -> Char
1218 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1219
1220 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1221 alexGetChar (AI loc ofs s)
1222   | atEnd s   = Nothing
1223   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1224                 --trace (show (ord c)) $
1225                 Just (adj_c, (AI loc' ofs' s'))
1226   where (c,s') = nextChar s
1227         loc'   = advanceSrcLoc loc c
1228         ofs'   = advanceOffs c ofs
1229
1230         non_graphic     = '\x0'
1231         upper           = '\x1'
1232         lower           = '\x2'
1233         digit           = '\x3'
1234         symbol          = '\x4'
1235         space           = '\x5'
1236         other_graphic   = '\x6'
1237
1238         adj_c
1239           | c <= '\x06' = non_graphic
1240           | c <= '\xff' = c
1241           | otherwise =
1242                 case generalCategory c of
1243                   UppercaseLetter       -> upper
1244                   LowercaseLetter       -> lower
1245                   TitlecaseLetter       -> upper
1246                   ModifierLetter        -> other_graphic
1247                   OtherLetter           -> other_graphic
1248                   NonSpacingMark        -> other_graphic
1249                   SpacingCombiningMark  -> other_graphic
1250                   EnclosingMark         -> other_graphic
1251                   DecimalNumber         -> digit
1252                   LetterNumber          -> other_graphic
1253                   OtherNumber           -> other_graphic
1254                   ConnectorPunctuation  -> other_graphic
1255                   DashPunctuation       -> other_graphic
1256                   OpenPunctuation       -> other_graphic
1257                   ClosePunctuation      -> other_graphic
1258                   InitialQuote          -> other_graphic
1259                   FinalQuote            -> other_graphic
1260                   OtherPunctuation      -> other_graphic
1261                   MathSymbol            -> symbol
1262                   CurrencySymbol        -> symbol
1263                   ModifierSymbol        -> symbol
1264                   OtherSymbol           -> symbol
1265                   Space                 -> space
1266                   _other                -> non_graphic
1267
1268 -- This version does not squash unicode characters, it is used when
1269 -- lexing strings.
1270 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1271 alexGetChar' (AI loc ofs s)
1272   | atEnd s   = Nothing
1273   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1274                 --trace (show (ord c)) $
1275                 Just (c, (AI loc' ofs' s'))
1276   where (c,s') = nextChar s
1277         loc'   = advanceSrcLoc loc c
1278         ofs'   = advanceOffs c ofs
1279
1280 advanceOffs :: Char -> Int -> Int
1281 advanceOffs '\n' offs = 0
1282 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1283 advanceOffs _    offs = offs + 1
1284
1285 getInput :: P AlexInput
1286 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1287
1288 setInput :: AlexInput -> P ()
1289 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1290
1291 pushLexState :: Int -> P ()
1292 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1293
1294 popLexState :: P Int
1295 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1296
1297 getLexState :: P Int
1298 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1299
1300 -- for reasons of efficiency, flags indicating language extensions (eg,
1301 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1302 -- integer
1303
1304 glaExtsBit, ffiBit, parrBit :: Int
1305 glaExtsBit = 0
1306 ffiBit     = 1
1307 parrBit    = 2
1308 arrowsBit  = 4
1309 thBit      = 5
1310 ipBit      = 6
1311 tvBit      = 7  -- Scoped type variables enables 'forall' keyword
1312 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1313                 -- (doesn't affect the lexer)
1314
1315 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1316 glaExtsEnabled flags = testBit flags glaExtsBit
1317 ffiEnabled     flags = testBit flags ffiBit
1318 parrEnabled    flags = testBit flags parrBit
1319 arrowsEnabled  flags = testBit flags arrowsBit
1320 thEnabled      flags = testBit flags thBit
1321 ipEnabled      flags = testBit flags ipBit
1322 tvEnabled      flags = testBit flags tvBit
1323 bangPatEnabled flags = testBit flags bangPatBit
1324
1325 -- PState for parsing options pragmas
1326 --
1327 pragState :: StringBuffer -> SrcLoc -> PState
1328 pragState buf loc  =
1329   PState {
1330       buffer     = buf,
1331       last_loc   = mkSrcSpan loc loc,
1332       last_offs  = 0,
1333       last_len   = 0,
1334       loc        = loc,
1335       extsBitmap = 0,
1336       context    = [],
1337       lex_state  = [bol, option_prags, 0]
1338     }
1339
1340
1341 -- create a parse state
1342 --
1343 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1344 mkPState buf loc flags  =
1345   PState {
1346       buffer     = buf,
1347       last_loc   = mkSrcSpan loc loc,
1348       last_offs  = 0,
1349       last_len   = 0,
1350       loc        = loc,
1351       extsBitmap = fromIntegral bitmap,
1352       context    = [],
1353       lex_state  = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1354         -- we begin in the layout state if toplev_layout is set
1355     }
1356     where
1357       bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
1358                .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
1359                .|. parrBit    `setBitIf` dopt Opt_PArr        flags
1360                .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
1361                .|. thBit      `setBitIf` dopt Opt_TH          flags
1362                .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
1363                .|. tvBit      `setBitIf` dopt Opt_ScopedTypeVariables flags
1364                .|. bangPatBit `setBitIf` dopt Opt_BangPatterns flags
1365       --
1366       setBitIf :: Int -> Bool -> Int
1367       b `setBitIf` cond | cond      = bit b
1368                         | otherwise = 0
1369
1370 getContext :: P [LayoutContext]
1371 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1372
1373 setContext :: [LayoutContext] -> P ()
1374 setContext ctx = P $ \s -> POk s{context=ctx} ()
1375
1376 popContext :: P ()
1377 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1378                            loc = loc, last_len = len, last_loc = last_loc }) ->
1379   case ctx of
1380         (_:tl) -> POk s{ context = tl } ()
1381         []     -> PFailed last_loc (srcParseErr buf len)
1382
1383 -- Push a new layout context at the indentation of the last token read.
1384 -- This is only used at the outer level of a module when the 'module'
1385 -- keyword is missing.
1386 pushCurrentContext :: P ()
1387 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_len=len, context=ctx } ->
1388   POk s{context = Layout (offs-len) : ctx} ()
1389
1390 getOffside :: P Ordering
1391 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1392                 let ord = case stk of
1393                         (Layout n:_) -> compare offs n
1394                         _            -> GT
1395                 in POk s ord
1396
1397 -- ---------------------------------------------------------------------------
1398 -- Construct a parse error
1399
1400 srcParseErr
1401   :: StringBuffer       -- current buffer (placed just after the last token)
1402   -> Int                -- length of the previous token
1403   -> Message
1404 srcParseErr buf len
1405   = hcat [ if null token
1406              then ptext SLIT("parse error (possibly incorrect indentation)")
1407              else hcat [ptext SLIT("parse error on input "),
1408                         char '`', text token, char '\'']
1409     ]
1410   where token = lexemeToString (offsetBytes (-len) buf) len
1411
1412 -- Report a parse failure, giving the span of the previous token as
1413 -- the location of the error.  This is the entry point for errors
1414 -- detected during parsing.
1415 srcParseFail :: P a
1416 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1417                             last_loc = last_loc } ->
1418     PFailed last_loc (srcParseErr buf len)
1419
1420 -- A lexical error is reported at a particular position in the source file,
1421 -- not over a token range.
1422 lexError :: String -> P a
1423 lexError str = do
1424   loc <- getSrcLoc
1425   i@(AI end _ buf) <- getInput
1426   reportLexError loc end buf str
1427
1428 -- -----------------------------------------------------------------------------
1429 -- This is the top-level function: called from the parser each time a
1430 -- new token is to be read from the input.
1431
1432 lexer :: (Located Token -> P a) -> P a
1433 lexer cont = do
1434   tok@(L _ tok__) <- lexToken
1435   --trace ("token: " ++ show tok__) $ do
1436   cont tok
1437
1438 lexToken :: P (Located Token)
1439 lexToken = do
1440   inp@(AI loc1 _ buf) <- getInput
1441   sc <- getLexState
1442   exts <- getExts
1443   case alexScanUser exts inp sc of
1444     AlexEOF -> do let span = mkSrcSpan loc1 loc1
1445                   setLastToken span 0
1446                   return (L span ITeof)
1447     AlexError (AI loc2 _ buf) -> do
1448         reportLexError loc1 loc2 buf "lexical error"
1449     AlexSkip inp2 _ -> do
1450         setInput inp2
1451         lexToken
1452     AlexToken inp2@(AI end _ buf2) len t -> do
1453         setInput inp2
1454         let span = mkSrcSpan loc1 end
1455         let bytes = byteDiff buf buf2
1456         span `seq` setLastToken span bytes
1457         t span buf bytes
1458
1459 reportLexError loc1 loc2 buf str
1460   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1461   | otherwise =
1462   let
1463         c = fst (nextChar buf)
1464   in
1465   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1466     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1467     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1468 }