compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), lexer, pragState, mkPState, PState(..),
  26    P(..), ParseResult(..), getSrcLoc,
  27    failLocMsgP, failSpanMsgP, srcParseFail,
  28    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  29    getLexState, popLexState, pushLexState,
  30    extension, glaExtsEnabled, bangPatEnabled
  31   ) where
  32
  33 #include "HsVersions.h"
  34
  35 import ErrUtils         ( Message )
  36 import Outputable
  37 import StringBuffer
  38 import FastString
  39 import FastTypes
  40 import SrcLoc
  41 import UniqFM
  42 import DynFlags
  43 import Ctype
  44 import Util             ( maybePrefixMatch, readRational )
  45
  46 import DATA_BITS
  47 import Data.Char        ( chr )
  48 import Ratio
  49 --import TRACE
  50
  51 #if __GLASGOW_HASKELL__ >= 605
  52 import Data.Char        ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  53 #else
  54 import Compat.Unicode   ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  55 #endif
  56 }
  57
  58 $unispace    = \x05
  59 $whitechar   = [\ \t\n\r\f\v\xa0 $unispace]
  60 $white_no_nl = $whitechar # \n
  61
  62 $ascdigit  = 0-9
  63 $unidigit  = \x03
  64 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  65 $digit     = [$ascdigit $unidigit]
  66
  67 $special   = [\(\)\,\;\[\]\`\{\}]
  68 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7]
  69 $unisymbol = \x04
  70 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  71
  72 $unilarge  = \x01
  73 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  74 $large     = [$asclarge $unilarge]
  75
  76 $unismall  = \x02
  77 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  78 $small     = [$ascsmall $unismall \_]
  79
  80 $unigraphic = \x06
  81 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  82
  83 $octit     = 0-7
  84 $hexit     = [$decdigit A-F a-f]
  85 $symchar   = [$symbol \:]
  86 $nl        = [\n\r]
  87 $idchar    = [$small $large $digit \']
  88
  89 @varid     = $small $idchar*
  90 @conid     = $large $idchar*
  91
  92 @varsym    = $symbol $symchar*
  93 @consym    = \: $symchar*
  94
  95 @decimal     = $decdigit+
  96 @octal       = $octit+
  97 @hexadecimal = $hexit+
  98 @exponent    = [eE] [\-\+]? @decimal
  99
 100 -- we support the hierarchical module name extension:
 101 @qual = (@conid \.)+
 102
 103 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 104
 105 haskell :-
 106
 107 -- everywhere: skip whitespace and comments
 108 $white_no_nl+                           ;
 109
 110 -- Everywhere: deal with nested comments.  We explicitly rule out
 111 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 112 -- (this can happen even though pragmas will normally take precedence due to
 113 -- longest-match, because pragmas aren't valid in every state, but comments
 114 -- are).
 115 "{-" / { notFollowedBy '#' }            { nested_comment }
 116
 117 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 118 -- more dashes followed by a symbol should be parsed as a varsym, so we
 119 -- have to exclude those.
 120 -- The regex says: "munch all the characters after the dashes, as long as
 121 -- the first one is not a symbol".
 122 "--"\-* [^$symbol :] .*                 ;
 123 "--"\-* / { atEOL }                     ;
 124
 125 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 126 -- blank lines) until we find a non-whitespace character, then do layout
 127 -- processing.
 128 --
 129 -- One slight wibble here: what if the line begins with {-#? In
 130 -- theory, we have to lex the pragma to see if it's one we recognise,
 131 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 132 -- as a nested comment.  We don't bother with this: if the line begins
 133 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 134 <bol> {
 135   \n                                    ;
 136   ^\# (line)?                           { begin line_prag1 }
 137   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 138   ^\# \! .* \n                          ; -- #!, for scripts
 139   ()                                    { do_bol }
 140 }
 141
 142 -- after a layout keyword (let, where, do, of), we begin a new layout
 143 -- context if the curly brace is missing.
 144 -- Careful! This stuff is quite delicate.
 145 <layout, layout_do> {
 146   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 147         -- we might encounter {-# here, but {- has been handled already
 148   \n                                    ;
 149   ^\# (line)?                           { begin line_prag1 }
 150 }
 151
 152 -- do is treated in a subtly different way, see new_layout_context
 153 <layout>    ()                          { new_layout_context True }
 154 <layout_do> ()                          { new_layout_context False }
 155
 156 -- after a new layout context which was found to be to the left of the
 157 -- previous context, we have generated a '{' token, and we now need to
 158 -- generate a matching '}' token.
 159 <layout_left>  ()                       { do_layout_left }
 160
 161 <0,option_prags,glaexts> \n                             { begin bol }
 162
 163 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 164
 165 -- single-line line pragmas, of the form
 166 --    # <line> "<file>" <extra-stuff> \n
 167 <line_prag1> $decdigit+                 { setLine line_prag1a }
 168 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 169 <line_prag1b> .*                        { pop }
 170
 171 -- Haskell-style line pragmas, of the form
 172 --    {-# LINE <line> "<file>" #-}
 173 <line_prag2> $decdigit+                 { setLine line_prag2a }
 174 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 175 <line_prag2b> "#-}"|"-}"                { pop }
 176    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 177    -- with older versions of GHC which generated these.
 178
 179 -- We only want RULES pragmas to be picked up when -fglasgow-exts
 180 -- is on, because the contents of the pragma is always written using
 181 -- glasgow-exts syntax (using forall etc.), so if glasgow exts are not
 182 -- enabled, we're sure to get a parse error.
 183 -- (ToDo: we should really emit a warning when ignoring pragmas)
 184 <glaexts>
 185   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 186
 187 <0,option_prags,glaexts> {
 188   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 189   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 190                                         { token (ITinline_prag False) }
 191   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 192                                         { token ITspec_prag }
 193   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 194         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 195   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 196         $whitechar* (NO(T?)INLINE|no(t?)inline)
 197                                         { token (ITspec_inline_prag False) }
 198   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 199   "{-#" $whitechar* (DEPRECATED|deprecated)
 200                                         { token ITdeprecated_prag }
 201   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 202   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 203   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 204
 205   "{-#"                                 { nested_comment }
 206
 207   -- ToDo: should only be valid inside a pragma:
 208   "#-}"                                 { token ITclose_prag}
 209 }
 210
 211 <option_prags> {
 212   "{-#" $whitechar* (OPTIONS|options)   { lex_string_prag IToptions_prag }
 213   "{-#" $whitechar* (OPTIONS_GHC|options_ghc)
 214                                         { lex_string_prag IToptions_prag }
 215   "{-#" $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
 216   "{-#" $whitechar* (INCLUDE|include)   { lex_string_prag ITinclude_prag }
 217 }
 218
 219 <0,option_prags,glaexts> {
 220         -- This is to catch things like {-# OPTIONS OPTIONS_HUGS ...
 221   "{-#" $whitechar* $idchar+            { nested_comment }
 222 }
 223
 224 -- '0' state: ordinary lexemes
 225 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 226
 227 -- "special" symbols
 228
 229 <0,glaexts> {
 230   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 231   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 232 }
 233
 234 <0,glaexts> {
 235   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 236   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 237   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 238   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 239   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 240   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 241   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 242   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 243 }
 244
 245 <0,glaexts> {
 246   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 247                                         { special IToparenbar }
 248   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 249 }
 250
 251 <0,glaexts> {
 252   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 253 }
 254
 255 <glaexts> {
 256   "(#" / { notFollowedBySymbol }        { token IToubxparen }
 257   "#)"                                  { token ITcubxparen }
 258   "{|"                                  { token ITocurlybar }
 259   "|}"                                  { token ITccurlybar }
 260 }
 261
 262 <0,option_prags,glaexts> {
 263   \(                                    { special IToparen }
 264   \)                                    { special ITcparen }
 265   \[                                    { special ITobrack }
 266   \]                                    { special ITcbrack }
 267   \,                                    { special ITcomma }
 268   \;                                    { special ITsemi }
 269   \`                                    { special ITbackquote }
 270
 271   \{                                    { open_brace }
 272   \}                                    { close_brace }
 273 }
 274
 275 <0,option_prags,glaexts> {
 276   @qual @varid                  { check_qvarid }
 277   @qual @conid                  { idtoken qconid }
 278   @varid                        { varid }
 279   @conid                        { idtoken conid }
 280 }
 281
 282 -- after an illegal qvarid, such as 'M.let',
 283 -- we back up and try again in the bad_qvarid state:
 284 <bad_qvarid> {
 285   @conid                        { pop_and (idtoken conid) }
 286   @qual @conid                  { pop_and (idtoken qconid) }
 287 }
 288
 289 <glaexts> {
 290   @qual @varid "#"+             { idtoken qvarid }
 291   @qual @conid "#"+             { idtoken qconid }
 292   @varid "#"+                   { varid }
 293   @conid "#"+                   { idtoken conid }
 294 }
 295
 296 -- ToDo: M.(,,,)
 297
 298 <0,glaexts> {
 299   @qual @varsym                 { idtoken qvarsym }
 300   @qual @consym                 { idtoken qconsym }
 301   @varsym                       { varsym }
 302   @consym                       { consym }
 303 }
 304
 305 <0,glaexts> {
 306   @decimal                      { tok_decimal }
 307   0[oO] @octal                  { tok_octal }
 308   0[xX] @hexadecimal            { tok_hexadecimal }
 309 }
 310
 311 <glaexts> {
 312   @decimal \#                   { prim_decimal }
 313   0[oO] @octal \#               { prim_octal }
 314   0[xX] @hexadecimal \#         { prim_hexadecimal }
 315 }
 316
 317 <0,glaexts> @floating_point             { strtoken tok_float }
 318 <glaexts>   @floating_point \#          { init_strtoken 1 prim_float }
 319 <glaexts>   @floating_point \# \#       { init_strtoken 2 prim_double }
 320
 321 -- Strings and chars are lexed by hand-written code.  The reason is
 322 -- that even if we recognise the string or char here in the regex
 323 -- lexer, we would still have to parse the string afterward in order
 324 -- to convert it to a String.
 325 <0,glaexts> {
 326   \'                            { lex_char_tok }
 327   \"                            { lex_string_tok }
 328 }
 329
 330 {
 331 -- work around bug in Alex 2.0
 332 #if __GLASGOW_HASKELL__ < 503
 333 unsafeAt arr i = arr ! i
 334 #endif
 335
 336 -- -----------------------------------------------------------------------------
 337 -- The token type
 338
 339 data Token
 340   = ITas                        -- Haskell keywords
 341   | ITcase
 342   | ITclass
 343   | ITdata
 344   | ITdefault
 345   | ITderiving
 346   | ITdo
 347   | ITelse
 348   | ITfor
 349   | IThiding
 350   | ITif
 351   | ITimport
 352   | ITin
 353   | ITinfix
 354   | ITinfixl
 355   | ITinfixr
 356   | ITinstance
 357   | ITlet
 358   | ITmodule
 359   | ITnewtype
 360   | ITof
 361   | ITqualified
 362   | ITthen
 363   | ITtype
 364   | ITwhere
 365   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 366
 367   | ITforall                    -- GHC extension keywords
 368   | ITforeign
 369   | ITexport
 370   | ITlabel
 371   | ITdynamic
 372   | ITsafe
 373   | ITthreadsafe
 374   | ITunsafe
 375   | ITstdcallconv
 376   | ITccallconv
 377   | ITdotnet
 378   | ITmdo
 379   | ITiso
 380   | ITfamily
 381
 382         -- Pragmas
 383   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 384   | ITspec_prag                 -- SPECIALISE
 385   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 386   | ITsource_prag
 387   | ITrules_prag
 388   | ITdeprecated_prag
 389   | ITline_prag
 390   | ITscc_prag
 391   | ITcore_prag                 -- hdaume: core annotations
 392   | ITunpack_prag
 393   | ITclose_prag
 394   | IToptions_prag String
 395   | ITinclude_prag String
 396   | ITlanguage_prag
 397
 398   | ITdotdot                    -- reserved symbols
 399   | ITcolon
 400   | ITdcolon
 401   | ITequal
 402   | ITlam
 403   | ITvbar
 404   | ITlarrow
 405   | ITrarrow
 406   | ITat
 407   | ITtilde
 408   | ITdarrow
 409   | ITminus
 410   | ITbang
 411   | ITstar
 412   | ITdot
 413
 414   | ITbiglam                    -- GHC-extension symbols
 415
 416   | ITocurly                    -- special symbols
 417   | ITccurly
 418   | ITocurlybar                 -- {|, for type applications
 419   | ITccurlybar                 -- |}, for type applications
 420   | ITvocurly
 421   | ITvccurly
 422   | ITobrack
 423   | ITopabrack                  -- [:, for parallel arrays with -fparr
 424   | ITcpabrack                  -- :], for parallel arrays with -fparr
 425   | ITcbrack
 426   | IToparen
 427   | ITcparen
 428   | IToubxparen
 429   | ITcubxparen
 430   | ITsemi
 431   | ITcomma
 432   | ITunderscore
 433   | ITbackquote
 434
 435   | ITvarid   FastString        -- identifiers
 436   | ITconid   FastString
 437   | ITvarsym  FastString
 438   | ITconsym  FastString
 439   | ITqvarid  (FastString,FastString)
 440   | ITqconid  (FastString,FastString)
 441   | ITqvarsym (FastString,FastString)
 442   | ITqconsym (FastString,FastString)
 443
 444   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 445
 446   | ITpragma StringBuffer
 447
 448   | ITchar       Char
 449   | ITstring     FastString
 450   | ITinteger    Integer
 451   | ITrational   Rational
 452
 453   | ITprimchar   Char
 454   | ITprimstring FastString
 455   | ITprimint    Integer
 456   | ITprimfloat  Rational
 457   | ITprimdouble Rational
 458
 459   -- MetaHaskell extension tokens
 460   | ITopenExpQuote              --  [| or [e|
 461   | ITopenPatQuote              --  [p|
 462   | ITopenDecQuote              --  [d|
 463   | ITopenTypQuote              --  [t|
 464   | ITcloseQuote                --  |]
 465   | ITidEscape   FastString     --  $x
 466   | ITparenEscape               --  $(
 467   | ITvarQuote                  --  '
 468   | ITtyQuote                   --  ''
 469
 470   -- Arrow notation extension
 471   | ITproc
 472   | ITrec
 473   | IToparenbar                 --  (|
 474   | ITcparenbar                 --  |)
 475   | ITlarrowtail                --  -<
 476   | ITrarrowtail                --  >-
 477   | ITLarrowtail                --  -<<
 478   | ITRarrowtail                --  >>-
 479
 480   | ITunknown String            -- Used when the lexer can't make sense of it
 481   | ITeof                       -- end of file token
 482 #ifdef DEBUG
 483   deriving Show -- debugging
 484 #endif
 485
 486 isSpecial :: Token -> Bool
 487 -- If we see M.x, where x is a keyword, but
 488 -- is special, we treat is as just plain M.x,
 489 -- not as a keyword.
 490 isSpecial ITas          = True
 491 isSpecial IThiding      = True
 492 isSpecial ITfor         = True
 493 isSpecial ITqualified   = True
 494 isSpecial ITforall      = True
 495 isSpecial ITexport      = True
 496 isSpecial ITlabel       = True
 497 isSpecial ITdynamic     = True
 498 isSpecial ITsafe        = True
 499 isSpecial ITthreadsafe  = True
 500 isSpecial ITunsafe      = True
 501 isSpecial ITccallconv   = True
 502 isSpecial ITstdcallconv = True
 503 isSpecial ITmdo         = True
 504 isSpecial ITiso         = True
 505 isSpecial ITfamily      = True
 506 isSpecial _             = False
 507
 508 -- the bitmap provided as the third component indicates whether the
 509 -- corresponding extension keyword is valid under the extension options
 510 -- provided to the compiler; if the extension corresponding to *any* of the
 511 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 512 -- facilitates using a keyword in two different extensions that can be
 513 -- activated independently)
 514 --
 515 reservedWordsFM = listToUFM $
 516         map (\(x, y, z) -> (mkFastString x, (y, z)))
 517        [( "_",          ITunderscore,   0 ),
 518         ( "as",         ITas,           0 ),
 519         ( "case",       ITcase,         0 ),
 520         ( "class",      ITclass,        0 ),
 521         ( "data",       ITdata,         0 ),
 522         ( "default",    ITdefault,      0 ),
 523         ( "deriving",   ITderiving,     0 ),
 524         ( "do",         ITdo,           0 ),
 525         ( "else",       ITelse,         0 ),
 526         ( "for",        ITfor,          0 ),
 527         ( "hiding",     IThiding,       0 ),
 528         ( "if",         ITif,           0 ),
 529         ( "import",     ITimport,       0 ),
 530         ( "in",         ITin,           0 ),
 531         ( "infix",      ITinfix,        0 ),
 532         ( "infixl",     ITinfixl,       0 ),
 533         ( "infixr",     ITinfixr,       0 ),
 534         ( "instance",   ITinstance,     0 ),
 535         ( "let",        ITlet,          0 ),
 536         ( "module",     ITmodule,       0 ),
 537         ( "newtype",    ITnewtype,      0 ),
 538         ( "of",         ITof,           0 ),
 539         ( "qualified",  ITqualified,    0 ),
 540         ( "then",       ITthen,         0 ),
 541         ( "type",       ITtype,         0 ),
 542         ( "where",      ITwhere,        0 ),
 543         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 544
 545         ( "forall",     ITforall,        bit tvBit),
 546         ( "mdo",        ITmdo,           bit glaExtsBit),
 547         ( "family",     ITfamily,        bit idxTysBit),
 548
 549         ( "foreign",    ITforeign,       bit ffiBit),
 550         ( "export",     ITexport,        bit ffiBit),
 551         ( "label",      ITlabel,         bit ffiBit),
 552         ( "dynamic",    ITdynamic,       bit ffiBit),
 553         ( "safe",       ITsafe,          bit ffiBit),
 554         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 555         ( "unsafe",     ITunsafe,        bit ffiBit),
 556         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 557         ( "ccall",      ITccallconv,     bit ffiBit),
 558         ( "dotnet",     ITdotnet,        bit ffiBit),
 559
 560         ( "rec",        ITrec,           bit arrowsBit),
 561         ( "proc",       ITproc,          bit arrowsBit)
 562      ]
 563
 564 reservedSymsFM = listToUFM $
 565         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 566       [ ("..",  ITdotdot,       0)
 567        ,(":",   ITcolon,        0)      -- (:) is a reserved op,
 568                                                 -- meaning only list cons
 569        ,("::",  ITdcolon,       0)
 570        ,("=",   ITequal,        0)
 571        ,("\\",  ITlam,          0)
 572        ,("|",   ITvbar,         0)
 573        ,("<-",  ITlarrow,       0)
 574        ,("->",  ITrarrow,       0)
 575        ,("@",   ITat,           0)
 576        ,("~",   ITtilde,        0)
 577        ,("=>",  ITdarrow,       0)
 578        ,("-",   ITminus,        0)
 579        ,("!",   ITbang,         0)
 580
 581        ,("*",   ITstar,         bit glaExtsBit .|.
 582                                 bit idxTysBit)      -- For data T (a::*) = MkT
 583        ,(".",   ITdot,          bit tvBit)          -- For 'forall a . t'
 584
 585        ,("-<",  ITlarrowtail,   bit arrowsBit)
 586        ,(">-",  ITrarrowtail,   bit arrowsBit)
 587        ,("-<<", ITLarrowtail,   bit arrowsBit)
 588        ,(">>-", ITRarrowtail,   bit arrowsBit)
 589
 590 #if __GLASGOW_HASKELL__ >= 605
 591        ,("λ",  ITlam,          bit glaExtsBit)
 592        ,("∷",   ITdcolon,       bit glaExtsBit)
 593        ,("⇒",   ITdarrow,     bit glaExtsBit)
 594        ,("∀", ITforall,       bit glaExtsBit)
 595        ,("→",   ITrarrow,     bit glaExtsBit)
 596        ,("←",   ITlarrow,     bit glaExtsBit)
 597        ,("⋯",         ITdotdot,       bit glaExtsBit)
 598         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 599         -- form part of a large operator.  This would let us have a better
 600         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 601 #endif
 602        ]
 603
 604 -- -----------------------------------------------------------------------------
 605 -- Lexer actions
 606
 607 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 608
 609 special :: Token -> Action
 610 special tok span _buf len = return (L span tok)
 611
 612 token, layout_token :: Token -> Action
 613 token t span buf len = return (L span t)
 614 layout_token t span buf len = pushLexState layout >> return (L span t)
 615
 616 idtoken :: (StringBuffer -> Int -> Token) -> Action
 617 idtoken f span buf len = return (L span $! (f buf len))
 618
 619 skip_one_varid :: (FastString -> Token) -> Action
 620 skip_one_varid f span buf len
 621   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 622
 623 strtoken :: (String -> Token) -> Action
 624 strtoken f span buf len =
 625   return (L span $! (f $! lexemeToString buf len))
 626
 627 init_strtoken :: Int -> (String -> Token) -> Action
 628 -- like strtoken, but drops the last N character(s)
 629 init_strtoken drop f span buf len =
 630   return (L span $! (f $! lexemeToString buf (len-drop)))
 631
 632 begin :: Int -> Action
 633 begin code _span _str _len = do pushLexState code; lexToken
 634
 635 pop :: Action
 636 pop _span _buf _len = do popLexState; lexToken
 637
 638 pop_and :: Action -> Action
 639 pop_and act span buf len = do popLexState; act span buf len
 640
 641 notFollowedBy char _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf /= char
 642
 643 notFollowedBySymbol _ _ _ (AI _ _ buf)
 644   = atEnd buf || currentChar buf `notElem` "!#$%&*+./<=>?@\\^|-~"
 645
 646 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 647
 648 ifExtension pred bits _ _ _ = pred bits
 649
 650 {-
 651   nested comments require traversing by hand, they can't be parsed
 652   using regular expressions.
 653 -}
 654 nested_comment :: Action
 655 nested_comment span _str _len = do
 656   input <- getInput
 657   go 1 input
 658   where go 0 input = do setInput input; lexToken
 659         go n input = do
 660           case alexGetChar input of
 661             Nothing  -> err input
 662             Just (c,input) -> do
 663               case c of
 664                 '-' -> do
 665                   case alexGetChar input of
 666                     Nothing  -> err input
 667                     Just ('\125',input) -> go (n-1) input
 668                     Just (c,_)          -> go n input
 669                 '\123' -> do
 670                   case alexGetChar input of
 671                     Nothing  -> err input
 672                     Just ('-',input') -> go (n+1) input'
 673                     Just (c,input)    -> go n input
 674                 c -> go n input
 675
 676         err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 677
 678 open_brace, close_brace :: Action
 679 open_brace span _str _len = do
 680   ctx <- getContext
 681   setContext (NoLayout:ctx)
 682   return (L span ITocurly)
 683 close_brace span _str _len = do
 684   popContext
 685   return (L span ITccurly)
 686
 687 -- We have to be careful not to count M.<varid> as a qualified name
 688 -- when <varid> is a keyword.  We hack around this by catching
 689 -- the offending tokens afterward, and re-lexing in a different state.
 690 check_qvarid span buf len = do
 691   case lookupUFM reservedWordsFM var of
 692         Just (keyword,exts)
 693           | not (isSpecial keyword) ->
 694           if exts == 0
 695              then try_again
 696              else do
 697                 b <- extension (\i -> exts .&. i /= 0)
 698                 if b then try_again
 699                      else return token
 700         _other -> return token
 701   where
 702         (mod,var) = splitQualName buf len
 703         token     = L span (ITqvarid (mod,var))
 704
 705         try_again = do
 706                 (AI _ offs _) <- getInput
 707                 setInput (AI (srcSpanStart span) (offs-len) buf)
 708                 pushLexState bad_qvarid
 709                 lexToken
 710
 711 qvarid buf len = ITqvarid $! splitQualName buf len
 712 qconid buf len = ITqconid $! splitQualName buf len
 713
 714 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 715 -- takes a StringBuffer and a length, and returns the module name
 716 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 717 -- because of hierarchical module names.
 718 splitQualName orig_buf len = split orig_buf orig_buf
 719   where
 720     split buf dot_buf
 721         | orig_buf `byteDiff` buf >= len  = done dot_buf
 722         | c == '.'                        = found_dot buf'
 723         | otherwise                       = split buf' dot_buf
 724       where
 725        (c,buf') = nextChar buf
 726
 727     -- careful, we might get names like M....
 728     -- so, if the character after the dot is not upper-case, this is
 729     -- the end of the qualifier part.
 730     found_dot buf -- buf points after the '.'
 731         | isUpper c    = split buf' buf
 732         | otherwise    = done buf
 733       where
 734        (c,buf') = nextChar buf
 735
 736     done dot_buf =
 737         (lexemeToFastString orig_buf (qual_size - 1),
 738          lexemeToFastString dot_buf (len - qual_size))
 739       where
 740         qual_size = orig_buf `byteDiff` dot_buf
 741
 742 varid span buf len =
 743   case lookupUFM reservedWordsFM fs of
 744         Just (keyword,0)    -> do
 745                 maybe_layout keyword
 746                 return (L span keyword)
 747         Just (keyword,exts) -> do
 748                 b <- extension (\i -> exts .&. i /= 0)
 749                 if b then do maybe_layout keyword
 750                              return (L span keyword)
 751                      else return (L span (ITvarid fs))
 752         _other -> return (L span (ITvarid fs))
 753   where
 754         fs = lexemeToFastString buf len
 755
 756 conid buf len = ITconid fs
 757   where fs = lexemeToFastString buf len
 758
 759 qvarsym buf len = ITqvarsym $! splitQualName buf len
 760 qconsym buf len = ITqconsym $! splitQualName buf len
 761
 762 varsym = sym ITvarsym
 763 consym = sym ITconsym
 764
 765 sym con span buf len =
 766   case lookupUFM reservedSymsFM fs of
 767         Just (keyword,0)    -> return (L span keyword)
 768         Just (keyword,exts) -> do
 769                 b <- extension (\i -> exts .&. i /= 0)
 770                 if b then return (L span keyword)
 771                      else return (L span $! con fs)
 772         _other -> return (L span $! con fs)
 773   where
 774         fs = lexemeToFastString buf len
 775
 776 tok_decimal span buf len
 777   = return (L span (ITinteger  $! parseInteger buf len 10 octDecDigit))
 778
 779 tok_octal span buf len
 780   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 8 octDecDigit))
 781
 782 tok_hexadecimal span buf len
 783   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 16 hexDigit))
 784
 785 prim_decimal span buf len
 786   = return (L span (ITprimint  $! parseInteger buf (len-1) 10 octDecDigit))
 787
 788 prim_octal span buf len
 789   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 8 octDecDigit))
 790
 791 prim_hexadecimal span buf len
 792   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 16 hexDigit))
 793
 794 tok_float        str = ITrational   $! readRational str
 795 prim_float       str = ITprimfloat  $! readRational str
 796 prim_double      str = ITprimdouble $! readRational str
 797
 798 -- -----------------------------------------------------------------------------
 799 -- Layout processing
 800
 801 -- we're at the first token on a line, insert layout tokens if necessary
 802 do_bol :: Action
 803 do_bol span _str _len = do
 804         pos <- getOffside
 805         case pos of
 806             LT -> do
 807                 --trace "layout: inserting '}'" $ do
 808                 popContext
 809                 -- do NOT pop the lex state, we might have a ';' to insert
 810                 return (L span ITvccurly)
 811             EQ -> do
 812                 --trace "layout: inserting ';'" $ do
 813                 popLexState
 814                 return (L span ITsemi)
 815             GT -> do
 816                 popLexState
 817                 lexToken
 818
 819 -- certain keywords put us in the "layout" state, where we might
 820 -- add an opening curly brace.
 821 maybe_layout ITdo       = pushLexState layout_do
 822 maybe_layout ITmdo      = pushLexState layout_do
 823 maybe_layout ITof       = pushLexState layout
 824 maybe_layout ITlet      = pushLexState layout
 825 maybe_layout ITwhere    = pushLexState layout
 826 maybe_layout ITrec      = pushLexState layout
 827 maybe_layout _          = return ()
 828
 829 -- Pushing a new implicit layout context.  If the indentation of the
 830 -- next token is not greater than the previous layout context, then
 831 -- Haskell 98 says that the new layout context should be empty; that is
 832 -- the lexer must generate {}.
 833 --
 834 -- We are slightly more lenient than this: when the new context is started
 835 -- by a 'do', then we allow the new context to be at the same indentation as
 836 -- the previous context.  This is what the 'strict' argument is for.
 837 --
 838 new_layout_context strict span _buf _len = do
 839     popLexState
 840     (AI _ offset _) <- getInput
 841     ctx <- getContext
 842     case ctx of
 843         Layout prev_off : _  |
 844            (strict     && prev_off >= offset  ||
 845             not strict && prev_off > offset) -> do
 846                 -- token is indented to the left of the previous context.
 847                 -- we must generate a {} sequence now.
 848                 pushLexState layout_left
 849                 return (L span ITvocurly)
 850         other -> do
 851                 setContext (Layout offset : ctx)
 852                 return (L span ITvocurly)
 853
 854 do_layout_left span _buf _len = do
 855     popLexState
 856     pushLexState bol  -- we must be at the start of a line
 857     return (L span ITvccurly)
 858
 859 -- -----------------------------------------------------------------------------
 860 -- LINE pragmas
 861
 862 setLine :: Int -> Action
 863 setLine code span buf len = do
 864   let line = parseInteger buf len 10 octDecDigit
 865   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
 866         -- subtract one: the line number refers to the *following* line
 867   popLexState
 868   pushLexState code
 869   lexToken
 870
 871 setFile :: Int -> Action
 872 setFile code span buf len = do
 873   let file = lexemeToFastString (stepOn buf) (len-2)
 874   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
 875   popLexState
 876   pushLexState code
 877   lexToken
 878
 879
 880 -- -----------------------------------------------------------------------------
 881 -- Options, includes and language pragmas.
 882
 883 lex_string_prag :: (String -> Token) -> Action
 884 lex_string_prag mkTok span buf len
 885     = do input <- getInput
 886          start <- getSrcLoc
 887          tok <- go [] input
 888          end <- getSrcLoc
 889          return (L (mkSrcSpan start end) tok)
 890     where go acc input
 891               = if isString input "#-}"
 892                    then do setInput input
 893                            return (mkTok (reverse acc))
 894                    else case alexGetChar input of
 895                           Just (c,i) -> go (c:acc) i
 896                           Nothing -> err input
 897           isString i [] = True
 898           isString i (x:xs)
 899               = case alexGetChar i of
 900                   Just (c,i') | c == x    -> isString i' xs
 901                   _other -> False
 902           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
 903
 904
 905 -- -----------------------------------------------------------------------------
 906 -- Strings & Chars
 907
 908 -- This stuff is horrible.  I hates it.
 909
 910 lex_string_tok :: Action
 911 lex_string_tok span buf len = do
 912   tok <- lex_string ""
 913   end <- getSrcLoc
 914   return (L (mkSrcSpan (srcSpanStart span) end) tok)
 915
 916 lex_string :: String -> P Token
 917 lex_string s = do
 918   i <- getInput
 919   case alexGetChar' i of
 920     Nothing -> lit_error
 921
 922     Just ('"',i)  -> do
 923         setInput i
 924         glaexts <- extension glaExtsEnabled
 925         if glaexts
 926           then do
 927             i <- getInput
 928             case alexGetChar' i of
 929               Just ('#',i) -> do
 930                    setInput i
 931                    if any (> '\xFF') s
 932                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
 933                     else let s' = mkZFastString (reverse s) in
 934                          return (ITprimstring s')
 935                         -- mkZFastString is a hack to avoid encoding the
 936                         -- string in UTF-8.  We just want the exact bytes.
 937               _other ->
 938                 return (ITstring (mkFastString (reverse s)))
 939           else
 940                 return (ITstring (mkFastString (reverse s)))
 941
 942     Just ('\\',i)
 943         | Just ('&',i) <- next -> do
 944                 setInput i; lex_string s
 945         | Just (c,i) <- next, is_space c -> do
 946                 setInput i; lex_stringgap s
 947         where next = alexGetChar' i
 948
 949     Just (c, i) -> do
 950         c' <- lex_char c i
 951         lex_string (c':s)
 952
 953 lex_stringgap s = do
 954   c <- getCharOrFail
 955   case c of
 956     '\\' -> lex_string s
 957     c | is_space c -> lex_stringgap s
 958     _other -> lit_error
 959
 960
 961 lex_char_tok :: Action
 962 -- Here we are basically parsing character literals, such as 'x' or '\n'
 963 -- but, when Template Haskell is on, we additionally spot
 964 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
 965 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
 966 -- So we have to do two characters of lookahead: when we see 'x we need to
 967 -- see if there's a trailing quote
 968 lex_char_tok span buf len = do  -- We've seen '
 969    i1 <- getInput       -- Look ahead to first character
 970    let loc = srcSpanStart span
 971    case alexGetChar' i1 of
 972         Nothing -> lit_error
 973
 974         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
 975                   th_exts <- extension thEnabled
 976                   if th_exts then do
 977                         setInput i2
 978                         return (L (mkSrcSpan loc end2)  ITtyQuote)
 979                    else lit_error
 980
 981         Just ('\\', i2@(AI end2 _ _)) -> do     -- We've seen 'backslash
 982                   setInput i2
 983                   lit_ch <- lex_escape
 984                   mc <- getCharOrFail   -- Trailing quote
 985                   if mc == '\'' then finish_char_tok loc lit_ch
 986                                 else do setInput i2; lit_error
 987
 988         Just (c, i2@(AI end2 _ _))
 989                 | not (isAny c) -> lit_error
 990                 | otherwise ->
 991
 992                 -- We've seen 'x, where x is a valid character
 993                 --  (i.e. not newline etc) but not a quote or backslash
 994            case alexGetChar' i2 of      -- Look ahead one more character
 995                 Nothing -> lit_error
 996                 Just ('\'', i3) -> do   -- We've seen 'x'
 997                         setInput i3
 998                         finish_char_tok loc c
 999                 _other -> do            -- We've seen 'x not followed by quote
1000                                         -- If TH is on, just parse the quote only
1001                         th_exts <- extension thEnabled
1002                         let (AI end _ _) = i1
1003                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1004                                    else do setInput i2; lit_error
1005
1006 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1007 finish_char_tok loc ch  -- We've already seen the closing quote
1008                         -- Just need to check for trailing #
1009   = do  glaexts <- extension glaExtsEnabled
1010         i@(AI end _ _) <- getInput
1011         if glaexts then do
1012                 case alexGetChar' i of
1013                         Just ('#',i@(AI end _ _)) -> do
1014                                 setInput i
1015                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1016                         _other ->
1017                                 return (L (mkSrcSpan loc end) (ITchar ch))
1018                 else do
1019                    return (L (mkSrcSpan loc end) (ITchar ch))
1020
1021 lex_char :: Char -> AlexInput -> P Char
1022 lex_char c inp = do
1023   case c of
1024       '\\' -> do setInput inp; lex_escape
1025       c | isAny c -> do setInput inp; return c
1026       _other -> lit_error
1027
1028 isAny c | c > '\xff' = isPrint c
1029         | otherwise  = is_any c
1030
1031 lex_escape :: P Char
1032 lex_escape = do
1033   c <- getCharOrFail
1034   case c of
1035         'a'   -> return '\a'
1036         'b'   -> return '\b'
1037         'f'   -> return '\f'
1038         'n'   -> return '\n'
1039         'r'   -> return '\r'
1040         't'   -> return '\t'
1041         'v'   -> return '\v'
1042         '\\'  -> return '\\'
1043         '"'   -> return '\"'
1044         '\''  -> return '\''
1045         '^'   -> do c <- getCharOrFail
1046                     if c >= '@' && c <= '_'
1047                         then return (chr (ord c - ord '@'))
1048                         else lit_error
1049
1050         'x'   -> readNum is_hexdigit 16 hexDigit
1051         'o'   -> readNum is_octdigit  8 octDecDigit
1052         x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
1053
1054         c1 ->  do
1055            i <- getInput
1056            case alexGetChar' i of
1057             Nothing -> lit_error
1058             Just (c2,i2) ->
1059               case alexGetChar' i2 of
1060                 Nothing -> do setInput i2; lit_error
1061                 Just (c3,i3) ->
1062                    let str = [c1,c2,c3] in
1063                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1064                                      Just rest <- [maybePrefixMatch p str] ] of
1065                           (escape_char,[]):_ -> do
1066                                 setInput i3
1067                                 return escape_char
1068                           (escape_char,_:_):_ -> do
1069                                 setInput i2
1070                                 return escape_char
1071                           [] -> lit_error
1072
1073 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1074 readNum is_digit base conv = do
1075   i <- getInput
1076   c <- getCharOrFail
1077   if is_digit c
1078         then readNum2 is_digit base conv (conv c)
1079         else do setInput i; lit_error
1080
1081 readNum2 is_digit base conv i = do
1082   input <- getInput
1083   read i input
1084   where read i input = do
1085           case alexGetChar' input of
1086             Just (c,input') | is_digit c -> do
1087                 read (i*base + conv c) input'
1088             _other -> do
1089                 if i >= 0 && i <= 0x10FFFF
1090                    then do setInput input; return (chr i)
1091                    else lit_error
1092
1093 silly_escape_chars = [
1094         ("NUL", '\NUL'),
1095         ("SOH", '\SOH'),
1096         ("STX", '\STX'),
1097         ("ETX", '\ETX'),
1098         ("EOT", '\EOT'),
1099         ("ENQ", '\ENQ'),
1100         ("ACK", '\ACK'),
1101         ("BEL", '\BEL'),
1102         ("BS", '\BS'),
1103         ("HT", '\HT'),
1104         ("LF", '\LF'),
1105         ("VT", '\VT'),
1106         ("FF", '\FF'),
1107         ("CR", '\CR'),
1108         ("SO", '\SO'),
1109         ("SI", '\SI'),
1110         ("DLE", '\DLE'),
1111         ("DC1", '\DC1'),
1112         ("DC2", '\DC2'),
1113         ("DC3", '\DC3'),
1114         ("DC4", '\DC4'),
1115         ("NAK", '\NAK'),
1116         ("SYN", '\SYN'),
1117         ("ETB", '\ETB'),
1118         ("CAN", '\CAN'),
1119         ("EM", '\EM'),
1120         ("SUB", '\SUB'),
1121         ("ESC", '\ESC'),
1122         ("FS", '\FS'),
1123         ("GS", '\GS'),
1124         ("RS", '\RS'),
1125         ("US", '\US'),
1126         ("SP", '\SP'),
1127         ("DEL", '\DEL')
1128         ]
1129
1130 -- before calling lit_error, ensure that the current input is pointing to
1131 -- the position of the error in the buffer.  This is so that we can report
1132 -- a correct location to the user, but also so we can detect UTF-8 decoding
1133 -- errors if they occur.
1134 lit_error = lexError "lexical error in string/character literal"
1135
1136 getCharOrFail :: P Char
1137 getCharOrFail =  do
1138   i <- getInput
1139   case alexGetChar' i of
1140         Nothing -> lexError "unexpected end-of-file in string/character literal"
1141         Just (c,i)  -> do setInput i; return c
1142
1143 -- -----------------------------------------------------------------------------
1144 -- The Parse Monad
1145
1146 data LayoutContext
1147   = NoLayout
1148   | Layout !Int
1149
1150 data ParseResult a
1151   = POk PState a
1152   | PFailed
1153         SrcSpan         -- The start and end of the text span related to
1154                         -- the error.  Might be used in environments which can
1155                         -- show this span, e.g. by highlighting it.
1156         Message         -- The error message
1157
1158 data PState = PState {
1159         buffer     :: StringBuffer,
1160         last_loc   :: SrcSpan,  -- pos of previous token
1161         last_offs  :: !Int,     -- offset of the previous token from the
1162                                 -- beginning of  the current line.
1163                                 -- \t is equal to 8 spaces.
1164         last_len   :: !Int,     -- len of previous token
1165         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1166         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1167         context    :: [LayoutContext],
1168         lex_state  :: [Int]
1169      }
1170         -- last_loc and last_len are used when generating error messages,
1171         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1172         -- current token to happyError, we could at least get rid of last_len.
1173         -- Getting rid of last_loc would require finding another way to
1174         -- implement pushCurrentContext (which is only called from one place).
1175
1176 newtype P a = P { unP :: PState -> ParseResult a }
1177
1178 instance Monad P where
1179   return = returnP
1180   (>>=) = thenP
1181   fail = failP
1182
1183 returnP :: a -> P a
1184 returnP a = P $ \s -> POk s a
1185
1186 thenP :: P a -> (a -> P b) -> P b
1187 (P m) `thenP` k = P $ \ s ->
1188         case m s of
1189                 POk s1 a         -> (unP (k a)) s1
1190                 PFailed span err -> PFailed span err
1191
1192 failP :: String -> P a
1193 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1194
1195 failMsgP :: String -> P a
1196 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1197
1198 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1199 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1200
1201 failSpanMsgP :: SrcSpan -> String -> P a
1202 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1203
1204 extension :: (Int -> Bool) -> P Bool
1205 extension p = P $ \s -> POk s (p $! extsBitmap s)
1206
1207 getExts :: P Int
1208 getExts = P $ \s -> POk s (extsBitmap s)
1209
1210 setSrcLoc :: SrcLoc -> P ()
1211 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1212
1213 getSrcLoc :: P SrcLoc
1214 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1215
1216 setLastToken :: SrcSpan -> Int -> P ()
1217 setLastToken loc len = P $ \s -> POk s{ last_loc=loc, last_len=len } ()
1218
1219 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1220
1221 alexInputPrevChar :: AlexInput -> Char
1222 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1223
1224 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1225 alexGetChar (AI loc ofs s)
1226   | atEnd s   = Nothing
1227   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1228                 --trace (show (ord c)) $
1229                 Just (adj_c, (AI loc' ofs' s'))
1230   where (c,s') = nextChar s
1231         loc'   = advanceSrcLoc loc c
1232         ofs'   = advanceOffs c ofs
1233
1234         non_graphic     = '\x0'
1235         upper           = '\x1'
1236         lower           = '\x2'
1237         digit           = '\x3'
1238         symbol          = '\x4'
1239         space           = '\x5'
1240         other_graphic   = '\x6'
1241
1242         adj_c
1243           | c <= '\x06' = non_graphic
1244           | c <= '\xff' = c
1245           | otherwise =
1246                 case generalCategory c of
1247                   UppercaseLetter       -> upper
1248                   LowercaseLetter       -> lower
1249                   TitlecaseLetter       -> upper
1250                   ModifierLetter        -> other_graphic
1251                   OtherLetter           -> other_graphic
1252                   NonSpacingMark        -> other_graphic
1253                   SpacingCombiningMark  -> other_graphic
1254                   EnclosingMark         -> other_graphic
1255                   DecimalNumber         -> digit
1256                   LetterNumber          -> other_graphic
1257                   OtherNumber           -> other_graphic
1258                   ConnectorPunctuation  -> other_graphic
1259                   DashPunctuation       -> other_graphic
1260                   OpenPunctuation       -> other_graphic
1261                   ClosePunctuation      -> other_graphic
1262                   InitialQuote          -> other_graphic
1263                   FinalQuote            -> other_graphic
1264                   OtherPunctuation      -> other_graphic
1265                   MathSymbol            -> symbol
1266                   CurrencySymbol        -> symbol
1267                   ModifierSymbol        -> symbol
1268                   OtherSymbol           -> symbol
1269                   Space                 -> space
1270                   _other                -> non_graphic
1271
1272 -- This version does not squash unicode characters, it is used when
1273 -- lexing strings.
1274 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1275 alexGetChar' (AI loc ofs s)
1276   | atEnd s   = Nothing
1277   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1278                 --trace (show (ord c)) $
1279                 Just (c, (AI loc' ofs' s'))
1280   where (c,s') = nextChar s
1281         loc'   = advanceSrcLoc loc c
1282         ofs'   = advanceOffs c ofs
1283
1284 advanceOffs :: Char -> Int -> Int
1285 advanceOffs '\n' offs = 0
1286 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1287 advanceOffs _    offs = offs + 1
1288
1289 getInput :: P AlexInput
1290 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1291
1292 setInput :: AlexInput -> P ()
1293 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1294
1295 pushLexState :: Int -> P ()
1296 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1297
1298 popLexState :: P Int
1299 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1300
1301 getLexState :: P Int
1302 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1303
1304 -- for reasons of efficiency, flags indicating language extensions (eg,
1305 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1306 -- integer
1307
1308 glaExtsBit, ffiBit, parrBit :: Int
1309 glaExtsBit = 0
1310 ffiBit     = 1
1311 parrBit    = 2
1312 arrowsBit  = 4
1313 thBit      = 5
1314 ipBit      = 6
1315 tvBit      = 7  -- Scoped type variables enables 'forall' keyword
1316 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1317                 -- (doesn't affect the lexer)
1318 idxTysBit  = 9  -- indexed type families: 'family' keyword and kind sigs
1319
1320 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1321 glaExtsEnabled flags = testBit flags glaExtsBit
1322 ffiEnabled     flags = testBit flags ffiBit
1323 parrEnabled    flags = testBit flags parrBit
1324 arrowsEnabled  flags = testBit flags arrowsBit
1325 thEnabled      flags = testBit flags thBit
1326 ipEnabled      flags = testBit flags ipBit
1327 tvEnabled      flags = testBit flags tvBit
1328 bangPatEnabled flags = testBit flags bangPatBit
1329 idxTysEnabled  flags = testBit flags idxTysBit
1330
1331 -- PState for parsing options pragmas
1332 --
1333 pragState :: StringBuffer -> SrcLoc -> PState
1334 pragState buf loc  =
1335   PState {
1336       buffer     = buf,
1337       last_loc   = mkSrcSpan loc loc,
1338       last_offs  = 0,
1339       last_len   = 0,
1340       loc        = loc,
1341       extsBitmap = 0,
1342       context    = [],
1343       lex_state  = [bol, option_prags, 0]
1344     }
1345
1346
1347 -- create a parse state
1348 --
1349 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1350 mkPState buf loc flags  =
1351   PState {
1352       buffer     = buf,
1353       last_loc   = mkSrcSpan loc loc,
1354       last_offs  = 0,
1355       last_len   = 0,
1356       loc        = loc,
1357       extsBitmap = fromIntegral bitmap,
1358       context    = [],
1359       lex_state  = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1360         -- we begin in the layout state if toplev_layout is set
1361     }
1362     where
1363       bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
1364                .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
1365                .|. parrBit    `setBitIf` dopt Opt_PArr        flags
1366                .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
1367                .|. thBit      `setBitIf` dopt Opt_TH          flags
1368                .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
1369                .|. tvBit      `setBitIf` dopt Opt_ScopedTypeVariables flags
1370                .|. bangPatBit `setBitIf` dopt Opt_BangPatterns flags
1371                .|. idxTysBit  `setBitIf` dopt Opt_IndexedTypes flags
1372       --
1373       setBitIf :: Int -> Bool -> Int
1374       b `setBitIf` cond | cond      = bit b
1375                         | otherwise = 0
1376
1377 getContext :: P [LayoutContext]
1378 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1379
1380 setContext :: [LayoutContext] -> P ()
1381 setContext ctx = P $ \s -> POk s{context=ctx} ()
1382
1383 popContext :: P ()
1384 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1385                            loc = loc, last_len = len, last_loc = last_loc }) ->
1386   case ctx of
1387         (_:tl) -> POk s{ context = tl } ()
1388         []     -> PFailed last_loc (srcParseErr buf len)
1389
1390 -- Push a new layout context at the indentation of the last token read.
1391 -- This is only used at the outer level of a module when the 'module'
1392 -- keyword is missing.
1393 pushCurrentContext :: P ()
1394 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_len=len, context=ctx } ->
1395   POk s{context = Layout (offs-len) : ctx} ()
1396
1397 getOffside :: P Ordering
1398 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1399                 let ord = case stk of
1400                         (Layout n:_) -> compare offs n
1401                         _            -> GT
1402                 in POk s ord
1403
1404 -- ---------------------------------------------------------------------------
1405 -- Construct a parse error
1406
1407 srcParseErr
1408   :: StringBuffer       -- current buffer (placed just after the last token)
1409   -> Int                -- length of the previous token
1410   -> Message
1411 srcParseErr buf len
1412   = hcat [ if null token
1413              then ptext SLIT("parse error (possibly incorrect indentation)")
1414              else hcat [ptext SLIT("parse error on input "),
1415                         char '`', text token, char '\'']
1416     ]
1417   where token = lexemeToString (offsetBytes (-len) buf) len
1418
1419 -- Report a parse failure, giving the span of the previous token as
1420 -- the location of the error.  This is the entry point for errors
1421 -- detected during parsing.
1422 srcParseFail :: P a
1423 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1424                             last_loc = last_loc } ->
1425     PFailed last_loc (srcParseErr buf len)
1426
1427 -- A lexical error is reported at a particular position in the source file,
1428 -- not over a token range.
1429 lexError :: String -> P a
1430 lexError str = do
1431   loc <- getSrcLoc
1432   i@(AI end _ buf) <- getInput
1433   reportLexError loc end buf str
1434
1435 -- -----------------------------------------------------------------------------
1436 -- This is the top-level function: called from the parser each time a
1437 -- new token is to be read from the input.
1438
1439 lexer :: (Located Token -> P a) -> P a
1440 lexer cont = do
1441   tok@(L _ tok__) <- lexToken
1442   --trace ("token: " ++ show tok__) $ do
1443   cont tok
1444
1445 lexToken :: P (Located Token)
1446 lexToken = do
1447   inp@(AI loc1 _ buf) <- getInput
1448   sc <- getLexState
1449   exts <- getExts
1450   case alexScanUser exts inp sc of
1451     AlexEOF -> do let span = mkSrcSpan loc1 loc1
1452                   setLastToken span 0
1453                   return (L span ITeof)
1454     AlexError (AI loc2 _ buf) -> do
1455         reportLexError loc1 loc2 buf "lexical error"
1456     AlexSkip inp2 _ -> do
1457         setInput inp2
1458         lexToken
1459     AlexToken inp2@(AI end _ buf2) len t -> do
1460         setInput inp2
1461         let span = mkSrcSpan loc1 end
1462         let bytes = byteDiff buf buf2
1463         span `seq` setLastToken span bytes
1464         t span buf bytes
1465
1466 reportLexError loc1 loc2 buf str
1467   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1468   | otherwise =
1469   let
1470         c = fst (nextChar buf)
1471   in
1472   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1473     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1474     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1475 }