compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), lexer, pragState, mkPState, PState(..),
  26    P(..), ParseResult(..), getSrcLoc,
  27    failLocMsgP, failSpanMsgP, srcParseFail,
  28    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  29    getLexState, popLexState, pushLexState,
  30    extension, bangPatEnabled
  31   ) where
  32
  33 #include "HsVersions.h"
  34
  35 import ErrUtils         ( Message )
  36 import Outputable
  37 import StringBuffer
  38 import FastString
  39 import FastTypes
  40 import SrcLoc
  41 import UniqFM
  42 import DynFlags
  43 import Ctype
  44 import Util             ( maybePrefixMatch, readRational )
  45
  46 import DATA_BITS
  47 import Data.Char        ( chr )
  48 import Ratio
  49 --import TRACE
  50
  51 #if __GLASGOW_HASKELL__ >= 605
  52 import Data.Char        ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  53 #else
  54 import Compat.Unicode   ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  55 #endif
  56 }
  57
  58 $unispace    = \x05
  59 $whitechar   = [\ \t\n\r\f\v\xa0 $unispace]
  60 $white_no_nl = $whitechar # \n
  61
  62 $ascdigit  = 0-9
  63 $unidigit  = \x03
  64 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  65 $digit     = [$ascdigit $unidigit]
  66
  67 $special   = [\(\)\,\;\[\]\`\{\}]
  68 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7]
  69 $unisymbol = \x04
  70 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  71
  72 $unilarge  = \x01
  73 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  74 $large     = [$asclarge $unilarge]
  75
  76 $unismall  = \x02
  77 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  78 $small     = [$ascsmall $unismall \_]
  79
  80 $unigraphic = \x06
  81 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  82
  83 $octit     = 0-7
  84 $hexit     = [$decdigit A-F a-f]
  85 $symchar   = [$symbol \:]
  86 $nl        = [\n\r]
  87 $idchar    = [$small $large $digit \']
  88
  89 @varid     = $small $idchar*
  90 @conid     = $large $idchar*
  91
  92 @varsym    = $symbol $symchar*
  93 @consym    = \: $symchar*
  94
  95 @decimal     = $decdigit+
  96 @octal       = $octit+
  97 @hexadecimal = $hexit+
  98 @exponent    = [eE] [\-\+]? @decimal
  99
 100 -- we support the hierarchical module name extension:
 101 @qual = (@conid \.)+
 102
 103 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 104
 105 haskell :-
 106
 107 -- everywhere: skip whitespace and comments
 108 $white_no_nl+                           ;
 109
 110 -- Everywhere: deal with nested comments.  We explicitly rule out
 111 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 112 -- (this can happen even though pragmas will normally take precedence due to
 113 -- longest-match, because pragmas aren't valid in every state, but comments
 114 -- are).
 115 "{-" / { notFollowedBy '#' }            { nested_comment }
 116
 117 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 118 -- more dashes followed by a symbol should be parsed as a varsym, so we
 119 -- have to exclude those.
 120 -- The regex says: "munch all the characters after the dashes, as long as
 121 -- the first one is not a symbol".
 122 "--"\-* [^$symbol :] .*                 ;
 123 "--"\-* / { atEOL }                     ;
 124
 125 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 126 -- blank lines) until we find a non-whitespace character, then do layout
 127 -- processing.
 128 --
 129 -- One slight wibble here: what if the line begins with {-#? In
 130 -- theory, we have to lex the pragma to see if it's one we recognise,
 131 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 132 -- as a nested comment.  We don't bother with this: if the line begins
 133 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 134 <bol> {
 135   \n                                    ;
 136   ^\# (line)?                           { begin line_prag1 }
 137   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 138   ^\# \! .* \n                          ; -- #!, for scripts
 139   ()                                    { do_bol }
 140 }
 141
 142 -- after a layout keyword (let, where, do, of), we begin a new layout
 143 -- context if the curly brace is missing.
 144 -- Careful! This stuff is quite delicate.
 145 <layout, layout_do> {
 146   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 147         -- we might encounter {-# here, but {- has been handled already
 148   \n                                    ;
 149   ^\# (line)?                           { begin line_prag1 }
 150 }
 151
 152 -- do is treated in a subtly different way, see new_layout_context
 153 <layout>    ()                          { new_layout_context True }
 154 <layout_do> ()                          { new_layout_context False }
 155
 156 -- after a new layout context which was found to be to the left of the
 157 -- previous context, we have generated a '{' token, and we now need to
 158 -- generate a matching '}' token.
 159 <layout_left>  ()                       { do_layout_left }
 160
 161 <0,option_prags,glaexts> \n                             { begin bol }
 162
 163 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 164
 165 -- single-line line pragmas, of the form
 166 --    # <line> "<file>" <extra-stuff> \n
 167 <line_prag1> $decdigit+                 { setLine line_prag1a }
 168 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 169 <line_prag1b> .*                        { pop }
 170
 171 -- Haskell-style line pragmas, of the form
 172 --    {-# LINE <line> "<file>" #-}
 173 <line_prag2> $decdigit+                 { setLine line_prag2a }
 174 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 175 <line_prag2b> "#-}"|"-}"                { pop }
 176    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 177    -- with older versions of GHC which generated these.
 178
 179 -- We only want RULES pragmas to be picked up when -fglasgow-exts
 180 -- is on, because the contents of the pragma is always written using
 181 -- glasgow-exts syntax (using forall etc.), so if glasgow exts are not
 182 -- enabled, we're sure to get a parse error.
 183 -- (ToDo: we should really emit a warning when ignoring pragmas)
 184 <glaexts>
 185   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 186
 187 <0,option_prags,glaexts> {
 188   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 189   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 190                                         { token (ITinline_prag False) }
 191   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 192                                         { token ITspec_prag }
 193   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 194         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 195   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 196         $whitechar* (NO(T?)INLINE|no(t?)inline)
 197                                         { token (ITspec_inline_prag False) }
 198   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 199   "{-#" $whitechar* (DEPRECATED|deprecated)
 200                                         { token ITdeprecated_prag }
 201   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 202   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 203   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 204
 205   "{-#"                                 { nested_comment }
 206
 207   -- ToDo: should only be valid inside a pragma:
 208   "#-}"                                 { token ITclose_prag}
 209 }
 210
 211 <option_prags> {
 212   "{-#" $whitechar* (OPTIONS|options)   { lex_string_prag IToptions_prag }
 213   "{-#" $whitechar* (OPTIONS_GHC|options_ghc)
 214                                         { lex_string_prag IToptions_prag }
 215   "{-#" $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
 216   "{-#" $whitechar* (INCLUDE|include)   { lex_string_prag ITinclude_prag }
 217 }
 218
 219 <0,option_prags,glaexts> {
 220         -- This is to catch things like {-# OPTIONS OPTIONS_HUGS ...
 221   "{-#" $whitechar* $idchar+            { nested_comment }
 222 }
 223
 224 -- '0' state: ordinary lexemes
 225 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 226
 227 -- "special" symbols
 228
 229 <0,glaexts> {
 230   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 231   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 232 }
 233
 234 <0,glaexts> {
 235   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 236   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 237   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 238   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 239   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 240   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 241   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 242   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 243 }
 244
 245 <0,glaexts> {
 246   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 247                                         { special IToparenbar }
 248   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 249 }
 250
 251 <0,glaexts> {
 252   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 253   \% @varid / { ifExtension ipEnabled } { skip_one_varid ITsplitipvarid }
 254 }
 255
 256 <glaexts> {
 257   "(#" / { notFollowedBySymbol }        { token IToubxparen }
 258   "#)"                                  { token ITcubxparen }
 259   "{|"                                  { token ITocurlybar }
 260   "|}"                                  { token ITccurlybar }
 261 }
 262
 263 <0,option_prags,glaexts> {
 264   \(                                    { special IToparen }
 265   \)                                    { special ITcparen }
 266   \[                                    { special ITobrack }
 267   \]                                    { special ITcbrack }
 268   \,                                    { special ITcomma }
 269   \;                                    { special ITsemi }
 270   \`                                    { special ITbackquote }
 271
 272   \{                                    { open_brace }
 273   \}                                    { close_brace }
 274 }
 275
 276 <0,option_prags,glaexts> {
 277   @qual @varid                  { check_qvarid }
 278   @qual @conid                  { idtoken qconid }
 279   @varid                        { varid }
 280   @conid                        { idtoken conid }
 281 }
 282
 283 -- after an illegal qvarid, such as 'M.let',
 284 -- we back up and try again in the bad_qvarid state:
 285 <bad_qvarid> {
 286   @conid                        { pop_and (idtoken conid) }
 287   @qual @conid                  { pop_and (idtoken qconid) }
 288 }
 289
 290 <glaexts> {
 291   @qual @varid "#"+             { idtoken qvarid }
 292   @qual @conid "#"+             { idtoken qconid }
 293   @varid "#"+                   { varid }
 294   @conid "#"+                   { idtoken conid }
 295 }
 296
 297 -- ToDo: M.(,,,)
 298
 299 <0,glaexts> {
 300   @qual @varsym                 { idtoken qvarsym }
 301   @qual @consym                 { idtoken qconsym }
 302   @varsym                       { varsym }
 303   @consym                       { consym }
 304 }
 305
 306 <0,glaexts> {
 307   @decimal                      { tok_decimal }
 308   0[oO] @octal                  { tok_octal }
 309   0[xX] @hexadecimal            { tok_hexadecimal }
 310 }
 311
 312 <glaexts> {
 313   @decimal \#                   { prim_decimal }
 314   0[oO] @octal \#               { prim_octal }
 315   0[xX] @hexadecimal \#         { prim_hexadecimal }
 316 }
 317
 318 <0,glaexts> @floating_point             { strtoken tok_float }
 319 <glaexts>   @floating_point \#          { init_strtoken 1 prim_float }
 320 <glaexts>   @floating_point \# \#       { init_strtoken 2 prim_double }
 321
 322 -- Strings and chars are lexed by hand-written code.  The reason is
 323 -- that even if we recognise the string or char here in the regex
 324 -- lexer, we would still have to parse the string afterward in order
 325 -- to convert it to a String.
 326 <0,glaexts> {
 327   \'                            { lex_char_tok }
 328   \"                            { lex_string_tok }
 329 }
 330
 331 {
 332 -- work around bug in Alex 2.0
 333 #if __GLASGOW_HASKELL__ < 503
 334 unsafeAt arr i = arr ! i
 335 #endif
 336
 337 -- -----------------------------------------------------------------------------
 338 -- The token type
 339
 340 data Token
 341   = ITas                        -- Haskell keywords
 342   | ITcase
 343   | ITclass
 344   | ITdata
 345   | ITdefault
 346   | ITderiving
 347   | ITdo
 348   | ITelse
 349   | IThiding
 350   | ITif
 351   | ITimport
 352   | ITin
 353   | ITinfix
 354   | ITinfixl
 355   | ITinfixr
 356   | ITinstance
 357   | ITlet
 358   | ITmodule
 359   | ITnewtype
 360   | ITof
 361   | ITqualified
 362   | ITthen
 363   | ITtype
 364   | ITwhere
 365   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 366
 367   | ITforall                    -- GHC extension keywords
 368   | ITforeign
 369   | ITexport
 370   | ITlabel
 371   | ITdynamic
 372   | ITsafe
 373   | ITthreadsafe
 374   | ITunsafe
 375   | ITstdcallconv
 376   | ITccallconv
 377   | ITdotnet
 378   | ITmdo
 379   | ITiso
 380   | ITfamily
 381
 382         -- Pragmas
 383   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 384   | ITspec_prag                 -- SPECIALISE
 385   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 386   | ITsource_prag
 387   | ITrules_prag
 388   | ITdeprecated_prag
 389   | ITline_prag
 390   | ITscc_prag
 391   | ITcore_prag                 -- hdaume: core annotations
 392   | ITunpack_prag
 393   | ITclose_prag
 394   | IToptions_prag String
 395   | ITinclude_prag String
 396   | ITlanguage_prag
 397
 398   | ITdotdot                    -- reserved symbols
 399   | ITcolon
 400   | ITdcolon
 401   | ITequal
 402   | ITlam
 403   | ITvbar
 404   | ITlarrow
 405   | ITrarrow
 406   | ITat
 407   | ITtilde
 408   | ITdarrow
 409   | ITminus
 410   | ITbang
 411   | ITstar
 412   | ITdot
 413
 414   | ITbiglam                    -- GHC-extension symbols
 415
 416   | ITocurly                    -- special symbols
 417   | ITccurly
 418   | ITocurlybar                 -- {|, for type applications
 419   | ITccurlybar                 -- |}, for type applications
 420   | ITvocurly
 421   | ITvccurly
 422   | ITobrack
 423   | ITopabrack                  -- [:, for parallel arrays with -fparr
 424   | ITcpabrack                  -- :], for parallel arrays with -fparr
 425   | ITcbrack
 426   | IToparen
 427   | ITcparen
 428   | IToubxparen
 429   | ITcubxparen
 430   | ITsemi
 431   | ITcomma
 432   | ITunderscore
 433   | ITbackquote
 434
 435   | ITvarid   FastString        -- identifiers
 436   | ITconid   FastString
 437   | ITvarsym  FastString
 438   | ITconsym  FastString
 439   | ITqvarid  (FastString,FastString)
 440   | ITqconid  (FastString,FastString)
 441   | ITqvarsym (FastString,FastString)
 442   | ITqconsym (FastString,FastString)
 443
 444   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 445   | ITsplitipvarid FastString   -- GHC extension: implicit param: %x
 446
 447   | ITpragma StringBuffer
 448
 449   | ITchar       Char
 450   | ITstring     FastString
 451   | ITinteger    Integer
 452   | ITrational   Rational
 453
 454   | ITprimchar   Char
 455   | ITprimstring FastString
 456   | ITprimint    Integer
 457   | ITprimfloat  Rational
 458   | ITprimdouble Rational
 459
 460   -- MetaHaskell extension tokens
 461   | ITopenExpQuote              --  [| or [e|
 462   | ITopenPatQuote              --  [p|
 463   | ITopenDecQuote              --  [d|
 464   | ITopenTypQuote              --  [t|
 465   | ITcloseQuote                --  |]
 466   | ITidEscape   FastString     --  $x
 467   | ITparenEscape               --  $(
 468   | ITvarQuote                  --  '
 469   | ITtyQuote                   --  ''
 470
 471   -- Arrow notation extension
 472   | ITproc
 473   | ITrec
 474   | IToparenbar                 --  (|
 475   | ITcparenbar                 --  |)
 476   | ITlarrowtail                --  -<
 477   | ITrarrowtail                --  >-
 478   | ITLarrowtail                --  -<<
 479   | ITRarrowtail                --  >>-
 480
 481   | ITunknown String            -- Used when the lexer can't make sense of it
 482   | ITeof                       -- end of file token
 483 #ifdef DEBUG
 484   deriving Show -- debugging
 485 #endif
 486
 487 isSpecial :: Token -> Bool
 488 -- If we see M.x, where x is a keyword, but
 489 -- is special, we treat is as just plain M.x,
 490 -- not as a keyword.
 491 isSpecial ITas          = True
 492 isSpecial IThiding      = True
 493 isSpecial ITqualified   = True
 494 isSpecial ITforall      = True
 495 isSpecial ITexport      = True
 496 isSpecial ITlabel       = True
 497 isSpecial ITdynamic     = True
 498 isSpecial ITsafe        = True
 499 isSpecial ITthreadsafe  = True
 500 isSpecial ITunsafe      = True
 501 isSpecial ITccallconv   = True
 502 isSpecial ITstdcallconv = True
 503 isSpecial ITmdo         = True
 504 isSpecial ITiso         = True
 505 isSpecial ITfamily      = True
 506 isSpecial _             = False
 507
 508 -- the bitmap provided as the third component indicates whether the
 509 -- corresponding extension keyword is valid under the extension options
 510 -- provided to the compiler; if the extension corresponding to *any* of the
 511 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 512 -- facilitates using a keyword in two different extensions that can be
 513 -- activated independently)
 514 --
 515 reservedWordsFM = listToUFM $
 516         map (\(x, y, z) -> (mkFastString x, (y, z)))
 517        [( "_",          ITunderscore,   0 ),
 518         ( "as",         ITas,           0 ),
 519         ( "case",       ITcase,         0 ),
 520         ( "class",      ITclass,        0 ),
 521         ( "data",       ITdata,         0 ),
 522         ( "default",    ITdefault,      0 ),
 523         ( "deriving",   ITderiving,     0 ),
 524         ( "do",         ITdo,           0 ),
 525         ( "else",       ITelse,         0 ),
 526         ( "hiding",     IThiding,       0 ),
 527         ( "if",         ITif,           0 ),
 528         ( "import",     ITimport,       0 ),
 529         ( "in",         ITin,           0 ),
 530         ( "infix",      ITinfix,        0 ),
 531         ( "infixl",     ITinfixl,       0 ),
 532         ( "infixr",     ITinfixr,       0 ),
 533         ( "instance",   ITinstance,     0 ),
 534         ( "let",        ITlet,          0 ),
 535         ( "module",     ITmodule,       0 ),
 536         ( "newtype",    ITnewtype,      0 ),
 537         ( "of",         ITof,           0 ),
 538         ( "qualified",  ITqualified,    0 ),
 539         ( "then",       ITthen,         0 ),
 540         ( "type",       ITtype,         0 ),
 541         ( "where",      ITwhere,        0 ),
 542         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 543
 544         ( "forall",     ITforall,        bit tvBit),
 545         ( "mdo",        ITmdo,           bit glaExtsBit),
 546         ( "iso",        ITiso,           bit glaExtsBit),
 547         ( "family",     ITfamily,        bit glaExtsBit),
 548
 549         ( "foreign",    ITforeign,       bit ffiBit),
 550         ( "export",     ITexport,        bit ffiBit),
 551         ( "label",      ITlabel,         bit ffiBit),
 552         ( "dynamic",    ITdynamic,       bit ffiBit),
 553         ( "safe",       ITsafe,          bit ffiBit),
 554         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 555         ( "unsafe",     ITunsafe,        bit ffiBit),
 556         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 557         ( "ccall",      ITccallconv,     bit ffiBit),
 558         ( "dotnet",     ITdotnet,        bit ffiBit),
 559
 560         ( "rec",        ITrec,           bit arrowsBit),
 561         ( "proc",       ITproc,          bit arrowsBit)
 562      ]
 563
 564 reservedSymsFM = listToUFM $
 565         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 566       [ ("..",  ITdotdot,       0)
 567        ,(":",   ITcolon,        0)      -- (:) is a reserved op,
 568                                                 -- meaning only list cons
 569        ,("::",  ITdcolon,       0)
 570        ,("=",   ITequal,        0)
 571        ,("\\",  ITlam,          0)
 572        ,("|",   ITvbar,         0)
 573        ,("<-",  ITlarrow,       0)
 574        ,("->",  ITrarrow,       0)
 575        ,("@",   ITat,           0)
 576        ,("~",   ITtilde,        0)
 577        ,("=>",  ITdarrow,       0)
 578        ,("-",   ITminus,        0)
 579        ,("!",   ITbang,         0)
 580
 581        ,("*",   ITstar,         bit glaExtsBit) -- For data T (a::*) = MkT
 582        ,(".",   ITdot,          bit tvBit)      -- For 'forall a . t'
 583
 584        ,("-<",  ITlarrowtail,   bit arrowsBit)
 585        ,(">-",  ITrarrowtail,   bit arrowsBit)
 586        ,("-<<", ITLarrowtail,   bit arrowsBit)
 587        ,(">>-", ITRarrowtail,   bit arrowsBit)
 588
 589 #if __GLASGOW_HASKELL__ >= 605
 590        ,("λ",  ITlam,          bit glaExtsBit)
 591        ,("∷",   ITdcolon,       bit glaExtsBit)
 592        ,("⇒",   ITdarrow,     bit glaExtsBit)
 593        ,("∀", ITforall,       bit glaExtsBit)
 594        ,("→",   ITrarrow,     bit glaExtsBit)
 595        ,("←",   ITlarrow,     bit glaExtsBit)
 596        ,("⋯",         ITdotdot,       bit glaExtsBit)
 597         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 598         -- form part of a large operator.  This would let us have a better
 599         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 600 #endif
 601        ]
 602
 603 -- -----------------------------------------------------------------------------
 604 -- Lexer actions
 605
 606 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 607
 608 special :: Token -> Action
 609 special tok span _buf len = return (L span tok)
 610
 611 token, layout_token :: Token -> Action
 612 token t span buf len = return (L span t)
 613 layout_token t span buf len = pushLexState layout >> return (L span t)
 614
 615 idtoken :: (StringBuffer -> Int -> Token) -> Action
 616 idtoken f span buf len = return (L span $! (f buf len))
 617
 618 skip_one_varid :: (FastString -> Token) -> Action
 619 skip_one_varid f span buf len
 620   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 621
 622 strtoken :: (String -> Token) -> Action
 623 strtoken f span buf len =
 624   return (L span $! (f $! lexemeToString buf len))
 625
 626 init_strtoken :: Int -> (String -> Token) -> Action
 627 -- like strtoken, but drops the last N character(s)
 628 init_strtoken drop f span buf len =
 629   return (L span $! (f $! lexemeToString buf (len-drop)))
 630
 631 begin :: Int -> Action
 632 begin code _span _str _len = do pushLexState code; lexToken
 633
 634 pop :: Action
 635 pop _span _buf _len = do popLexState; lexToken
 636
 637 pop_and :: Action -> Action
 638 pop_and act span buf len = do popLexState; act span buf len
 639
 640 notFollowedBy char _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf /= char
 641
 642 notFollowedBySymbol _ _ _ (AI _ _ buf)
 643   = atEnd buf || currentChar buf `notElem` "!#$%&*+./<=>?@\\^|-~"
 644
 645 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 646
 647 ifExtension pred bits _ _ _ = pred bits
 648
 649 {-
 650   nested comments require traversing by hand, they can't be parsed
 651   using regular expressions.
 652 -}
 653 nested_comment :: Action
 654 nested_comment span _str _len = do
 655   input <- getInput
 656   go 1 input
 657   where go 0 input = do setInput input; lexToken
 658         go n input = do
 659           case alexGetChar input of
 660             Nothing  -> err input
 661             Just (c,input) -> do
 662               case c of
 663                 '-' -> do
 664                   case alexGetChar input of
 665                     Nothing  -> err input
 666                     Just ('\125',input) -> go (n-1) input
 667                     Just (c,_)          -> go n input
 668                 '\123' -> do
 669                   case alexGetChar input of
 670                     Nothing  -> err input
 671                     Just ('-',input') -> go (n+1) input'
 672                     Just (c,input)    -> go n input
 673                 c -> go n input
 674
 675         err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 676
 677 open_brace, close_brace :: Action
 678 open_brace span _str _len = do
 679   ctx <- getContext
 680   setContext (NoLayout:ctx)
 681   return (L span ITocurly)
 682 close_brace span _str _len = do
 683   popContext
 684   return (L span ITccurly)
 685
 686 -- We have to be careful not to count M.<varid> as a qualified name
 687 -- when <varid> is a keyword.  We hack around this by catching
 688 -- the offending tokens afterward, and re-lexing in a different state.
 689 check_qvarid span buf len = do
 690   case lookupUFM reservedWordsFM var of
 691         Just (keyword,exts)
 692           | not (isSpecial keyword) ->
 693           if exts == 0
 694              then try_again
 695              else do
 696                 b <- extension (\i -> exts .&. i /= 0)
 697                 if b then try_again
 698                      else return token
 699         _other -> return token
 700   where
 701         (mod,var) = splitQualName buf len
 702         token     = L span (ITqvarid (mod,var))
 703
 704         try_again = do
 705                 (AI _ offs _) <- getInput
 706                 setInput (AI (srcSpanStart span) (offs-len) buf)
 707                 pushLexState bad_qvarid
 708                 lexToken
 709
 710 qvarid buf len = ITqvarid $! splitQualName buf len
 711 qconid buf len = ITqconid $! splitQualName buf len
 712
 713 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 714 -- takes a StringBuffer and a length, and returns the module name
 715 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 716 -- because of hierarchical module names.
 717 splitQualName orig_buf len = split orig_buf orig_buf
 718   where
 719     split buf dot_buf
 720         | orig_buf `byteDiff` buf >= len  = done dot_buf
 721         | c == '.'                        = found_dot buf'
 722         | otherwise                       = split buf' dot_buf
 723       where
 724        (c,buf') = nextChar buf
 725
 726     -- careful, we might get names like M....
 727     -- so, if the character after the dot is not upper-case, this is
 728     -- the end of the qualifier part.
 729     found_dot buf -- buf points after the '.'
 730         | isUpper c    = split buf' buf
 731         | otherwise    = done buf
 732       where
 733        (c,buf') = nextChar buf
 734
 735     done dot_buf =
 736         (lexemeToFastString orig_buf (qual_size - 1),
 737          lexemeToFastString dot_buf (len - qual_size))
 738       where
 739         qual_size = orig_buf `byteDiff` dot_buf
 740
 741 varid span buf len =
 742   case lookupUFM reservedWordsFM fs of
 743         Just (keyword,0)    -> do
 744                 maybe_layout keyword
 745                 return (L span keyword)
 746         Just (keyword,exts) -> do
 747                 b <- extension (\i -> exts .&. i /= 0)
 748                 if b then do maybe_layout keyword
 749                              return (L span keyword)
 750                      else return (L span (ITvarid fs))
 751         _other -> return (L span (ITvarid fs))
 752   where
 753         fs = lexemeToFastString buf len
 754
 755 conid buf len = ITconid fs
 756   where fs = lexemeToFastString buf len
 757
 758 qvarsym buf len = ITqvarsym $! splitQualName buf len
 759 qconsym buf len = ITqconsym $! splitQualName buf len
 760
 761 varsym = sym ITvarsym
 762 consym = sym ITconsym
 763
 764 sym con span buf len =
 765   case lookupUFM reservedSymsFM fs of
 766         Just (keyword,0)    -> return (L span keyword)
 767         Just (keyword,exts) -> do
 768                 b <- extension (\i -> exts .&. i /= 0)
 769                 if b then return (L span keyword)
 770                      else return (L span $! con fs)
 771         _other -> return (L span $! con fs)
 772   where
 773         fs = lexemeToFastString buf len
 774
 775 tok_decimal span buf len
 776   = return (L span (ITinteger  $! parseInteger buf len 10 octDecDigit))
 777
 778 tok_octal span buf len
 779   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 8 octDecDigit))
 780
 781 tok_hexadecimal span buf len
 782   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 16 hexDigit))
 783
 784 prim_decimal span buf len
 785   = return (L span (ITprimint  $! parseInteger buf (len-1) 10 octDecDigit))
 786
 787 prim_octal span buf len
 788   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 8 octDecDigit))
 789
 790 prim_hexadecimal span buf len
 791   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 16 hexDigit))
 792
 793 tok_float        str = ITrational   $! readRational str
 794 prim_float       str = ITprimfloat  $! readRational str
 795 prim_double      str = ITprimdouble $! readRational str
 796
 797 -- -----------------------------------------------------------------------------
 798 -- Layout processing
 799
 800 -- we're at the first token on a line, insert layout tokens if necessary
 801 do_bol :: Action
 802 do_bol span _str _len = do
 803         pos <- getOffside
 804         case pos of
 805             LT -> do
 806                 --trace "layout: inserting '}'" $ do
 807                 popContext
 808                 -- do NOT pop the lex state, we might have a ';' to insert
 809                 return (L span ITvccurly)
 810             EQ -> do
 811                 --trace "layout: inserting ';'" $ do
 812                 popLexState
 813                 return (L span ITsemi)
 814             GT -> do
 815                 popLexState
 816                 lexToken
 817
 818 -- certain keywords put us in the "layout" state, where we might
 819 -- add an opening curly brace.
 820 maybe_layout ITdo       = pushLexState layout_do
 821 maybe_layout ITmdo      = pushLexState layout_do
 822 maybe_layout ITof       = pushLexState layout
 823 maybe_layout ITlet      = pushLexState layout
 824 maybe_layout ITwhere    = pushLexState layout
 825 maybe_layout ITrec      = pushLexState layout
 826 maybe_layout _          = return ()
 827
 828 -- Pushing a new implicit layout context.  If the indentation of the
 829 -- next token is not greater than the previous layout context, then
 830 -- Haskell 98 says that the new layout context should be empty; that is
 831 -- the lexer must generate {}.
 832 --
 833 -- We are slightly more lenient than this: when the new context is started
 834 -- by a 'do', then we allow the new context to be at the same indentation as
 835 -- the previous context.  This is what the 'strict' argument is for.
 836 --
 837 new_layout_context strict span _buf _len = do
 838     popLexState
 839     (AI _ offset _) <- getInput
 840     ctx <- getContext
 841     case ctx of
 842         Layout prev_off : _  |
 843            (strict     && prev_off >= offset  ||
 844             not strict && prev_off > offset) -> do
 845                 -- token is indented to the left of the previous context.
 846                 -- we must generate a {} sequence now.
 847                 pushLexState layout_left
 848                 return (L span ITvocurly)
 849         other -> do
 850                 setContext (Layout offset : ctx)
 851                 return (L span ITvocurly)
 852
 853 do_layout_left span _buf _len = do
 854     popLexState
 855     pushLexState bol  -- we must be at the start of a line
 856     return (L span ITvccurly)
 857
 858 -- -----------------------------------------------------------------------------
 859 -- LINE pragmas
 860
 861 setLine :: Int -> Action
 862 setLine code span buf len = do
 863   let line = parseInteger buf len 10 octDecDigit
 864   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
 865         -- subtract one: the line number refers to the *following* line
 866   popLexState
 867   pushLexState code
 868   lexToken
 869
 870 setFile :: Int -> Action
 871 setFile code span buf len = do
 872   let file = lexemeToFastString (stepOn buf) (len-2)
 873   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
 874   popLexState
 875   pushLexState code
 876   lexToken
 877
 878
 879 -- -----------------------------------------------------------------------------
 880 -- Options, includes and language pragmas.
 881
 882 lex_string_prag :: (String -> Token) -> Action
 883 lex_string_prag mkTok span buf len
 884     = do input <- getInput
 885          start <- getSrcLoc
 886          tok <- go [] input
 887          end <- getSrcLoc
 888          return (L (mkSrcSpan start end) tok)
 889     where go acc input
 890               = if isString input "#-}"
 891                    then do setInput input
 892                            return (mkTok (reverse acc))
 893                    else case alexGetChar input of
 894                           Just (c,i) -> go (c:acc) i
 895                           Nothing -> err input
 896           isString i [] = True
 897           isString i (x:xs)
 898               = case alexGetChar i of
 899                   Just (c,i') | c == x    -> isString i' xs
 900                   _other -> False
 901           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
 902
 903
 904 -- -----------------------------------------------------------------------------
 905 -- Strings & Chars
 906
 907 -- This stuff is horrible.  I hates it.
 908
 909 lex_string_tok :: Action
 910 lex_string_tok span buf len = do
 911   tok <- lex_string ""
 912   end <- getSrcLoc
 913   return (L (mkSrcSpan (srcSpanStart span) end) tok)
 914
 915 lex_string :: String -> P Token
 916 lex_string s = do
 917   i <- getInput
 918   case alexGetChar' i of
 919     Nothing -> lit_error
 920
 921     Just ('"',i)  -> do
 922         setInput i
 923         glaexts <- extension glaExtsEnabled
 924         if glaexts
 925           then do
 926             i <- getInput
 927             case alexGetChar' i of
 928               Just ('#',i) -> do
 929                    setInput i
 930                    if any (> '\xFF') s
 931                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
 932                     else let s' = mkZFastString (reverse s) in
 933                          return (ITprimstring s')
 934                         -- mkZFastString is a hack to avoid encoding the
 935                         -- string in UTF-8.  We just want the exact bytes.
 936               _other ->
 937                 return (ITstring (mkFastString (reverse s)))
 938           else
 939                 return (ITstring (mkFastString (reverse s)))
 940
 941     Just ('\\',i)
 942         | Just ('&',i) <- next -> do
 943                 setInput i; lex_string s
 944         | Just (c,i) <- next, is_space c -> do
 945                 setInput i; lex_stringgap s
 946         where next = alexGetChar' i
 947
 948     Just (c, i) -> do
 949         c' <- lex_char c i
 950         lex_string (c':s)
 951
 952 lex_stringgap s = do
 953   c <- getCharOrFail
 954   case c of
 955     '\\' -> lex_string s
 956     c | is_space c -> lex_stringgap s
 957     _other -> lit_error
 958
 959
 960 lex_char_tok :: Action
 961 -- Here we are basically parsing character literals, such as 'x' or '\n'
 962 -- but, when Template Haskell is on, we additionally spot
 963 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
 964 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
 965 -- So we have to do two characters of lookahead: when we see 'x we need to
 966 -- see if there's a trailing quote
 967 lex_char_tok span buf len = do  -- We've seen '
 968    i1 <- getInput       -- Look ahead to first character
 969    let loc = srcSpanStart span
 970    case alexGetChar' i1 of
 971         Nothing -> lit_error
 972
 973         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
 974                   th_exts <- extension thEnabled
 975                   if th_exts then do
 976                         setInput i2
 977                         return (L (mkSrcSpan loc end2)  ITtyQuote)
 978                    else lit_error
 979
 980         Just ('\\', i2@(AI end2 _ _)) -> do     -- We've seen 'backslash
 981                   setInput i2
 982                   lit_ch <- lex_escape
 983                   mc <- getCharOrFail   -- Trailing quote
 984                   if mc == '\'' then finish_char_tok loc lit_ch
 985                                 else do setInput i2; lit_error
 986
 987         Just (c, i2@(AI end2 _ _))
 988                 | not (isAny c) -> lit_error
 989                 | otherwise ->
 990
 991                 -- We've seen 'x, where x is a valid character
 992                 --  (i.e. not newline etc) but not a quote or backslash
 993            case alexGetChar' i2 of      -- Look ahead one more character
 994                 Nothing -> lit_error
 995                 Just ('\'', i3) -> do   -- We've seen 'x'
 996                         setInput i3
 997                         finish_char_tok loc c
 998                 _other -> do            -- We've seen 'x not followed by quote
 999                                         -- If TH is on, just parse the quote only
1000                         th_exts <- extension thEnabled
1001                         let (AI end _ _) = i1
1002                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1003                                    else do setInput i2; lit_error
1004
1005 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1006 finish_char_tok loc ch  -- We've already seen the closing quote
1007                         -- Just need to check for trailing #
1008   = do  glaexts <- extension glaExtsEnabled
1009         i@(AI end _ _) <- getInput
1010         if glaexts then do
1011                 case alexGetChar' i of
1012                         Just ('#',i@(AI end _ _)) -> do
1013                                 setInput i
1014                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1015                         _other ->
1016                                 return (L (mkSrcSpan loc end) (ITchar ch))
1017                 else do
1018                    return (L (mkSrcSpan loc end) (ITchar ch))
1019
1020 lex_char :: Char -> AlexInput -> P Char
1021 lex_char c inp = do
1022   case c of
1023       '\\' -> do setInput inp; lex_escape
1024       c | isAny c -> do setInput inp; return c
1025       _other -> lit_error
1026
1027 isAny c | c > '\xff' = isPrint c
1028         | otherwise  = is_any c
1029
1030 lex_escape :: P Char
1031 lex_escape = do
1032   c <- getCharOrFail
1033   case c of
1034         'a'   -> return '\a'
1035         'b'   -> return '\b'
1036         'f'   -> return '\f'
1037         'n'   -> return '\n'
1038         'r'   -> return '\r'
1039         't'   -> return '\t'
1040         'v'   -> return '\v'
1041         '\\'  -> return '\\'
1042         '"'   -> return '\"'
1043         '\''  -> return '\''
1044         '^'   -> do c <- getCharOrFail
1045                     if c >= '@' && c <= '_'
1046                         then return (chr (ord c - ord '@'))
1047                         else lit_error
1048
1049         'x'   -> readNum is_hexdigit 16 hexDigit
1050         'o'   -> readNum is_octdigit  8 octDecDigit
1051         x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
1052
1053         c1 ->  do
1054            i <- getInput
1055            case alexGetChar' i of
1056             Nothing -> lit_error
1057             Just (c2,i2) ->
1058               case alexGetChar' i2 of
1059                 Nothing -> do setInput i2; lit_error
1060                 Just (c3,i3) ->
1061                    let str = [c1,c2,c3] in
1062                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1063                                      Just rest <- [maybePrefixMatch p str] ] of
1064                           (escape_char,[]):_ -> do
1065                                 setInput i3
1066                                 return escape_char
1067                           (escape_char,_:_):_ -> do
1068                                 setInput i2
1069                                 return escape_char
1070                           [] -> lit_error
1071
1072 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1073 readNum is_digit base conv = do
1074   i <- getInput
1075   c <- getCharOrFail
1076   if is_digit c
1077         then readNum2 is_digit base conv (conv c)
1078         else do setInput i; lit_error
1079
1080 readNum2 is_digit base conv i = do
1081   input <- getInput
1082   read i input
1083   where read i input = do
1084           case alexGetChar' input of
1085             Just (c,input') | is_digit c -> do
1086                 read (i*base + conv c) input'
1087             _other -> do
1088                 if i >= 0 && i <= 0x10FFFF
1089                    then do setInput input; return (chr i)
1090                    else lit_error
1091
1092 silly_escape_chars = [
1093         ("NUL", '\NUL'),
1094         ("SOH", '\SOH'),
1095         ("STX", '\STX'),
1096         ("ETX", '\ETX'),
1097         ("EOT", '\EOT'),
1098         ("ENQ", '\ENQ'),
1099         ("ACK", '\ACK'),
1100         ("BEL", '\BEL'),
1101         ("BS", '\BS'),
1102         ("HT", '\HT'),
1103         ("LF", '\LF'),
1104         ("VT", '\VT'),
1105         ("FF", '\FF'),
1106         ("CR", '\CR'),
1107         ("SO", '\SO'),
1108         ("SI", '\SI'),
1109         ("DLE", '\DLE'),
1110         ("DC1", '\DC1'),
1111         ("DC2", '\DC2'),
1112         ("DC3", '\DC3'),
1113         ("DC4", '\DC4'),
1114         ("NAK", '\NAK'),
1115         ("SYN", '\SYN'),
1116         ("ETB", '\ETB'),
1117         ("CAN", '\CAN'),
1118         ("EM", '\EM'),
1119         ("SUB", '\SUB'),
1120         ("ESC", '\ESC'),
1121         ("FS", '\FS'),
1122         ("GS", '\GS'),
1123         ("RS", '\RS'),
1124         ("US", '\US'),
1125         ("SP", '\SP'),
1126         ("DEL", '\DEL')
1127         ]
1128
1129 -- before calling lit_error, ensure that the current input is pointing to
1130 -- the position of the error in the buffer.  This is so that we can report
1131 -- a correct location to the user, but also so we can detect UTF-8 decoding
1132 -- errors if they occur.
1133 lit_error = lexError "lexical error in string/character literal"
1134
1135 getCharOrFail :: P Char
1136 getCharOrFail =  do
1137   i <- getInput
1138   case alexGetChar' i of
1139         Nothing -> lexError "unexpected end-of-file in string/character literal"
1140         Just (c,i)  -> do setInput i; return c
1141
1142 -- -----------------------------------------------------------------------------
1143 -- The Parse Monad
1144
1145 data LayoutContext
1146   = NoLayout
1147   | Layout !Int
1148
1149 data ParseResult a
1150   = POk PState a
1151   | PFailed
1152         SrcSpan         -- The start and end of the text span related to
1153                         -- the error.  Might be used in environments which can
1154                         -- show this span, e.g. by highlighting it.
1155         Message         -- The error message
1156
1157 data PState = PState {
1158         buffer     :: StringBuffer,
1159         last_loc   :: SrcSpan,  -- pos of previous token
1160         last_offs  :: !Int,     -- offset of the previous token from the
1161                                 -- beginning of  the current line.
1162                                 -- \t is equal to 8 spaces.
1163         last_len   :: !Int,     -- len of previous token
1164         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1165         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1166         context    :: [LayoutContext],
1167         lex_state  :: [Int]
1168      }
1169         -- last_loc and last_len are used when generating error messages,
1170         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1171         -- current token to happyError, we could at least get rid of last_len.
1172         -- Getting rid of last_loc would require finding another way to
1173         -- implement pushCurrentContext (which is only called from one place).
1174
1175 newtype P a = P { unP :: PState -> ParseResult a }
1176
1177 instance Monad P where
1178   return = returnP
1179   (>>=) = thenP
1180   fail = failP
1181
1182 returnP :: a -> P a
1183 returnP a = P $ \s -> POk s a
1184
1185 thenP :: P a -> (a -> P b) -> P b
1186 (P m) `thenP` k = P $ \ s ->
1187         case m s of
1188                 POk s1 a         -> (unP (k a)) s1
1189                 PFailed span err -> PFailed span err
1190
1191 failP :: String -> P a
1192 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1193
1194 failMsgP :: String -> P a
1195 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1196
1197 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1198 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1199
1200 failSpanMsgP :: SrcSpan -> String -> P a
1201 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1202
1203 extension :: (Int -> Bool) -> P Bool
1204 extension p = P $ \s -> POk s (p $! extsBitmap s)
1205
1206 getExts :: P Int
1207 getExts = P $ \s -> POk s (extsBitmap s)
1208
1209 setSrcLoc :: SrcLoc -> P ()
1210 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1211
1212 getSrcLoc :: P SrcLoc
1213 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1214
1215 setLastToken :: SrcSpan -> Int -> P ()
1216 setLastToken loc len = P $ \s -> POk s{ last_loc=loc, last_len=len } ()
1217
1218 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1219
1220 alexInputPrevChar :: AlexInput -> Char
1221 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1222
1223 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1224 alexGetChar (AI loc ofs s)
1225   | atEnd s   = Nothing
1226   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1227                 --trace (show (ord c)) $
1228                 Just (adj_c, (AI loc' ofs' s'))
1229   where (c,s') = nextChar s
1230         loc'   = advanceSrcLoc loc c
1231         ofs'   = advanceOffs c ofs
1232
1233         non_graphic     = '\x0'
1234         upper           = '\x1'
1235         lower           = '\x2'
1236         digit           = '\x3'
1237         symbol          = '\x4'
1238         space           = '\x5'
1239         other_graphic   = '\x6'
1240
1241         adj_c
1242           | c <= '\x06' = non_graphic
1243           | c <= '\xff' = c
1244           | otherwise =
1245                 case generalCategory c of
1246                   UppercaseLetter       -> upper
1247                   LowercaseLetter       -> lower
1248                   TitlecaseLetter       -> upper
1249                   ModifierLetter        -> other_graphic
1250                   OtherLetter           -> other_graphic
1251                   NonSpacingMark        -> other_graphic
1252                   SpacingCombiningMark  -> other_graphic
1253                   EnclosingMark         -> other_graphic
1254                   DecimalNumber         -> digit
1255                   LetterNumber          -> other_graphic
1256                   OtherNumber           -> other_graphic
1257                   ConnectorPunctuation  -> other_graphic
1258                   DashPunctuation       -> other_graphic
1259                   OpenPunctuation       -> other_graphic
1260                   ClosePunctuation      -> other_graphic
1261                   InitialQuote          -> other_graphic
1262                   FinalQuote            -> other_graphic
1263                   OtherPunctuation      -> other_graphic
1264                   MathSymbol            -> symbol
1265                   CurrencySymbol        -> symbol
1266                   ModifierSymbol        -> symbol
1267                   OtherSymbol           -> symbol
1268                   Space                 -> space
1269                   _other                -> non_graphic
1270
1271 -- This version does not squash unicode characters, it is used when
1272 -- lexing strings.
1273 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1274 alexGetChar' (AI loc ofs s)
1275   | atEnd s   = Nothing
1276   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1277                 --trace (show (ord c)) $
1278                 Just (c, (AI loc' ofs' s'))
1279   where (c,s') = nextChar s
1280         loc'   = advanceSrcLoc loc c
1281         ofs'   = advanceOffs c ofs
1282
1283 advanceOffs :: Char -> Int -> Int
1284 advanceOffs '\n' offs = 0
1285 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1286 advanceOffs _    offs = offs + 1
1287
1288 getInput :: P AlexInput
1289 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1290
1291 setInput :: AlexInput -> P ()
1292 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1293
1294 pushLexState :: Int -> P ()
1295 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1296
1297 popLexState :: P Int
1298 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1299
1300 getLexState :: P Int
1301 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1302
1303 -- for reasons of efficiency, flags indicating language extensions (eg,
1304 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1305 -- integer
1306
1307 glaExtsBit, ffiBit, parrBit :: Int
1308 glaExtsBit = 0
1309 ffiBit     = 1
1310 parrBit    = 2
1311 arrowsBit  = 4
1312 thBit      = 5
1313 ipBit      = 6
1314 tvBit      = 7  -- Scoped type variables enables 'forall' keyword
1315 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1316                 -- (doesn't affect the lexer)
1317
1318 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1319 glaExtsEnabled flags = testBit flags glaExtsBit
1320 ffiEnabled     flags = testBit flags ffiBit
1321 parrEnabled    flags = testBit flags parrBit
1322 arrowsEnabled  flags = testBit flags arrowsBit
1323 thEnabled      flags = testBit flags thBit
1324 ipEnabled      flags = testBit flags ipBit
1325 tvEnabled      flags = testBit flags tvBit
1326 bangPatEnabled flags = testBit flags bangPatBit
1327
1328 -- PState for parsing options pragmas
1329 --
1330 pragState :: StringBuffer -> SrcLoc -> PState
1331 pragState buf loc  =
1332   PState {
1333       buffer     = buf,
1334       last_loc   = mkSrcSpan loc loc,
1335       last_offs  = 0,
1336       last_len   = 0,
1337       loc        = loc,
1338       extsBitmap = 0,
1339       context    = [],
1340       lex_state  = [bol, option_prags, 0]
1341     }
1342
1343
1344 -- create a parse state
1345 --
1346 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1347 mkPState buf loc flags  =
1348   PState {
1349       buffer     = buf,
1350       last_loc   = mkSrcSpan loc loc,
1351       last_offs  = 0,
1352       last_len   = 0,
1353       loc        = loc,
1354       extsBitmap = fromIntegral bitmap,
1355       context    = [],
1356       lex_state  = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1357         -- we begin in the layout state if toplev_layout is set
1358     }
1359     where
1360       bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
1361                .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
1362                .|. parrBit    `setBitIf` dopt Opt_PArr        flags
1363                .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
1364                .|. thBit      `setBitIf` dopt Opt_TH          flags
1365                .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
1366                .|. tvBit      `setBitIf` dopt Opt_ScopedTypeVariables flags
1367                .|. bangPatBit `setBitIf` dopt Opt_BangPatterns flags
1368       --
1369       setBitIf :: Int -> Bool -> Int
1370       b `setBitIf` cond | cond      = bit b
1371                         | otherwise = 0
1372
1373 getContext :: P [LayoutContext]
1374 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1375
1376 setContext :: [LayoutContext] -> P ()
1377 setContext ctx = P $ \s -> POk s{context=ctx} ()
1378
1379 popContext :: P ()
1380 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1381                            loc = loc, last_len = len, last_loc = last_loc }) ->
1382   case ctx of
1383         (_:tl) -> POk s{ context = tl } ()
1384         []     -> PFailed last_loc (srcParseErr buf len)
1385
1386 -- Push a new layout context at the indentation of the last token read.
1387 -- This is only used at the outer level of a module when the 'module'
1388 -- keyword is missing.
1389 pushCurrentContext :: P ()
1390 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_len=len, context=ctx } ->
1391   POk s{context = Layout (offs-len) : ctx} ()
1392
1393 getOffside :: P Ordering
1394 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1395                 let ord = case stk of
1396                         (Layout n:_) -> compare offs n
1397                         _            -> GT
1398                 in POk s ord
1399
1400 -- ---------------------------------------------------------------------------
1401 -- Construct a parse error
1402
1403 srcParseErr
1404   :: StringBuffer       -- current buffer (placed just after the last token)
1405   -> Int                -- length of the previous token
1406   -> Message
1407 srcParseErr buf len
1408   = hcat [ if null token
1409              then ptext SLIT("parse error (possibly incorrect indentation)")
1410              else hcat [ptext SLIT("parse error on input "),
1411                         char '`', text token, char '\'']
1412     ]
1413   where token = lexemeToString (offsetBytes (-len) buf) len
1414
1415 -- Report a parse failure, giving the span of the previous token as
1416 -- the location of the error.  This is the entry point for errors
1417 -- detected during parsing.
1418 srcParseFail :: P a
1419 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1420                             last_loc = last_loc } ->
1421     PFailed last_loc (srcParseErr buf len)
1422
1423 -- A lexical error is reported at a particular position in the source file,
1424 -- not over a token range.
1425 lexError :: String -> P a
1426 lexError str = do
1427   loc <- getSrcLoc
1428   i@(AI end _ buf) <- getInput
1429   reportLexError loc end buf str
1430
1431 -- -----------------------------------------------------------------------------
1432 -- This is the top-level function: called from the parser each time a
1433 -- new token is to be read from the input.
1434
1435 lexer :: (Located Token -> P a) -> P a
1436 lexer cont = do
1437   tok@(L _ tok__) <- lexToken
1438   --trace ("token: " ++ show tok__) $ do
1439   cont tok
1440
1441 lexToken :: P (Located Token)
1442 lexToken = do
1443   inp@(AI loc1 _ buf) <- getInput
1444   sc <- getLexState
1445   exts <- getExts
1446   case alexScanUser exts inp sc of
1447     AlexEOF -> do let span = mkSrcSpan loc1 loc1
1448                   setLastToken span 0
1449                   return (L span ITeof)
1450     AlexError (AI loc2 _ buf) -> do
1451         reportLexError loc1 loc2 buf "lexical error"
1452     AlexSkip inp2 _ -> do
1453         setInput inp2
1454         lexToken
1455     AlexToken inp2@(AI end _ buf2) len t -> do
1456         setInput inp2
1457         let span = mkSrcSpan loc1 end
1458         let bytes = byteDiff buf buf2
1459         span `seq` setLastToken span bytes
1460         t span buf bytes
1461
1462 reportLexError loc1 loc2 buf str
1463   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1464   | otherwise =
1465   let
1466         c = fst (nextChar buf)
1467   in
1468   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1469     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1470     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1471 }