compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), lexer, pragState, mkPState, PState(..),
  26    P(..), ParseResult(..), getSrcLoc,
  27    failLocMsgP, failSpanMsgP, srcParseFail,
  28    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  29    getLexState, popLexState, pushLexState,
  30    extension, bangPatEnabled
  31   ) where
  32
  33 #include "HsVersions.h"
  34
  35 import ErrUtils         ( Message )
  36 import Outputable
  37 import StringBuffer
  38 import FastString
  39 import FastTypes
  40 import SrcLoc
  41 import UniqFM
  42 import DynFlags
  43 import Ctype
  44 import Util             ( maybePrefixMatch, readRational )
  45
  46 import DATA_BITS
  47 import Data.Char        ( chr )
  48 import Ratio
  49 --import TRACE
  50
  51 #if __GLASGOW_HASKELL__ >= 605
  52 import Data.Char        ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  53 #else
  54 import Compat.Unicode   ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  55 #endif
  56 }
  57
  58 $unispace    = \x05
  59 $whitechar   = [\ \t\n\r\f\v\xa0 $unispace]
  60 $white_no_nl = $whitechar # \n
  61
  62 $ascdigit  = 0-9
  63 $unidigit  = \x03
  64 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  65 $digit     = [$ascdigit $unidigit]
  66
  67 $special   = [\(\)\,\;\[\]\`\{\}]
  68 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7]
  69 $unisymbol = \x04
  70 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  71
  72 $unilarge  = \x01
  73 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  74 $large     = [$asclarge $unilarge]
  75
  76 $unismall  = \x02
  77 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  78 $small     = [$ascsmall $unismall \_]
  79
  80 $unigraphic = \x06
  81 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  82
  83 $octit     = 0-7
  84 $hexit     = [$decdigit A-F a-f]
  85 $symchar   = [$symbol \:]
  86 $nl        = [\n\r]
  87 $idchar    = [$small $large $digit \']
  88
  89 @varid     = $small $idchar*
  90 @conid     = $large $idchar*
  91
  92 @varsym    = $symbol $symchar*
  93 @consym    = \: $symchar*
  94
  95 @decimal     = $decdigit+
  96 @octal       = $octit+
  97 @hexadecimal = $hexit+
  98 @exponent    = [eE] [\-\+]? @decimal
  99
 100 -- we support the hierarchical module name extension:
 101 @qual = (@conid \.)+
 102
 103 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 104
 105 haskell :-
 106
 107 -- everywhere: skip whitespace and comments
 108 $white_no_nl+                           ;
 109
 110 -- Everywhere: deal with nested comments.  We explicitly rule out
 111 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 112 -- (this can happen even though pragmas will normally take precedence due to
 113 -- longest-match, because pragmas aren't valid in every state, but comments
 114 -- are).
 115 "{-" / { notFollowedBy '#' }            { nested_comment }
 116
 117 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 118 -- more dashes followed by a symbol should be parsed as a varsym, so we
 119 -- have to exclude those.
 120 -- The regex says: "munch all the characters after the dashes, as long as
 121 -- the first one is not a symbol".
 122 "--"\-* [^$symbol :] .*                 ;
 123 "--"\-* / { atEOL }                     ;
 124
 125 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 126 -- blank lines) until we find a non-whitespace character, then do layout
 127 -- processing.
 128 --
 129 -- One slight wibble here: what if the line begins with {-#? In
 130 -- theory, we have to lex the pragma to see if it's one we recognise,
 131 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 132 -- as a nested comment.  We don't bother with this: if the line begins
 133 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 134 <bol> {
 135   \n                                    ;
 136   ^\# (line)?                           { begin line_prag1 }
 137   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 138   ^\# \! .* \n                          ; -- #!, for scripts
 139   ()                                    { do_bol }
 140 }
 141
 142 -- after a layout keyword (let, where, do, of), we begin a new layout
 143 -- context if the curly brace is missing.
 144 -- Careful! This stuff is quite delicate.
 145 <layout, layout_do> {
 146   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 147         -- we might encounter {-# here, but {- has been handled already
 148   \n                                    ;
 149   ^\# (line)?                           { begin line_prag1 }
 150 }
 151
 152 -- do is treated in a subtly different way, see new_layout_context
 153 <layout>    ()                          { new_layout_context True }
 154 <layout_do> ()                          { new_layout_context False }
 155
 156 -- after a new layout context which was found to be to the left of the
 157 -- previous context, we have generated a '{' token, and we now need to
 158 -- generate a matching '}' token.
 159 <layout_left>  ()                       { do_layout_left }
 160
 161 <0,option_prags,glaexts> \n                             { begin bol }
 162
 163 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 164
 165 -- single-line line pragmas, of the form
 166 --    # <line> "<file>" <extra-stuff> \n
 167 <line_prag1> $decdigit+                 { setLine line_prag1a }
 168 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 169 <line_prag1b> .*                        { pop }
 170
 171 -- Haskell-style line pragmas, of the form
 172 --    {-# LINE <line> "<file>" #-}
 173 <line_prag2> $decdigit+                 { setLine line_prag2a }
 174 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 175 <line_prag2b> "#-}"|"-}"                { pop }
 176    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 177    -- with older versions of GHC which generated these.
 178
 179 -- We only want RULES pragmas to be picked up when -fglasgow-exts
 180 -- is on, because the contents of the pragma is always written using
 181 -- glasgow-exts syntax (using forall etc.), so if glasgow exts are not
 182 -- enabled, we're sure to get a parse error.
 183 -- (ToDo: we should really emit a warning when ignoring pragmas)
 184 <glaexts>
 185   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 186
 187 <0,option_prags,glaexts> {
 188   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 189   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 190                                         { token (ITinline_prag False) }
 191   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 192                                         { token ITspec_prag }
 193   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 194         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 195   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 196         $whitechar* (NO(T?)INLINE|no(t?)inline)
 197                                         { token (ITspec_inline_prag False) }
 198   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 199   "{-#" $whitechar* (DEPRECATED|deprecated)
 200                                         { token ITdeprecated_prag }
 201   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 202   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 203   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 204
 205   "{-#"                                 { nested_comment }
 206
 207   -- ToDo: should only be valid inside a pragma:
 208   "#-}"                                 { token ITclose_prag}
 209 }
 210
 211 <option_prags> {
 212   "{-#" $whitechar* (OPTIONS|options)   { lex_string_prag IToptions_prag }
 213   "{-#" $whitechar* (OPTIONS_GHC|options_ghc)
 214                                         { lex_string_prag IToptions_prag }
 215   "{-#" $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
 216   "{-#" $whitechar* (INCLUDE|include)   { lex_string_prag ITinclude_prag }
 217 }
 218
 219 <0,option_prags,glaexts> {
 220         -- This is to catch things like {-# OPTIONS OPTIONS_HUGS ...
 221   "{-#" $whitechar* $idchar+            { nested_comment }
 222 }
 223
 224 -- '0' state: ordinary lexemes
 225 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 226
 227 -- "special" symbols
 228
 229 <0,glaexts> {
 230   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 231   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 232 }
 233
 234 <0,glaexts> {
 235   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 236   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 237   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 238   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 239   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 240   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 241   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 242   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 243 }
 244
 245 <0,glaexts> {
 246   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 247                                         { special IToparenbar }
 248   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 249 }
 250
 251 <0,glaexts> {
 252   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 253   \% @varid / { ifExtension ipEnabled } { skip_one_varid ITsplitipvarid }
 254 }
 255
 256 <glaexts> {
 257   "(#" / { notFollowedBySymbol }        { token IToubxparen }
 258   "#)"                                  { token ITcubxparen }
 259   "{|"                                  { token ITocurlybar }
 260   "|}"                                  { token ITccurlybar }
 261 }
 262
 263 <0,option_prags,glaexts> {
 264   \(                                    { special IToparen }
 265   \)                                    { special ITcparen }
 266   \[                                    { special ITobrack }
 267   \]                                    { special ITcbrack }
 268   \,                                    { special ITcomma }
 269   \;                                    { special ITsemi }
 270   \`                                    { special ITbackquote }
 271
 272   \{                                    { open_brace }
 273   \}                                    { close_brace }
 274 }
 275
 276 <0,option_prags,glaexts> {
 277   @qual @varid                  { check_qvarid }
 278   @qual @conid                  { idtoken qconid }
 279   @varid                        { varid }
 280   @conid                        { idtoken conid }
 281 }
 282
 283 -- after an illegal qvarid, such as 'M.let',
 284 -- we back up and try again in the bad_qvarid state:
 285 <bad_qvarid> {
 286   @conid                        { pop_and (idtoken conid) }
 287   @qual @conid                  { pop_and (idtoken qconid) }
 288 }
 289
 290 <glaexts> {
 291   @qual @varid "#"+             { idtoken qvarid }
 292   @qual @conid "#"+             { idtoken qconid }
 293   @varid "#"+                   { varid }
 294   @conid "#"+                   { idtoken conid }
 295 }
 296
 297 -- ToDo: M.(,,,)
 298
 299 <0,glaexts> {
 300   @qual @varsym                 { idtoken qvarsym }
 301   @qual @consym                 { idtoken qconsym }
 302   @varsym                       { varsym }
 303   @consym                       { consym }
 304 }
 305
 306 <0,glaexts> {
 307   @decimal                      { tok_decimal }
 308   0[oO] @octal                  { tok_octal }
 309   0[xX] @hexadecimal            { tok_hexadecimal }
 310 }
 311
 312 <glaexts> {
 313   @decimal \#                   { prim_decimal }
 314   0[oO] @octal \#               { prim_octal }
 315   0[xX] @hexadecimal \#         { prim_hexadecimal }
 316 }
 317
 318 <0,glaexts> @floating_point             { strtoken tok_float }
 319 <glaexts>   @floating_point \#          { init_strtoken 1 prim_float }
 320 <glaexts>   @floating_point \# \#       { init_strtoken 2 prim_double }
 321
 322 -- Strings and chars are lexed by hand-written code.  The reason is
 323 -- that even if we recognise the string or char here in the regex
 324 -- lexer, we would still have to parse the string afterward in order
 325 -- to convert it to a String.
 326 <0,glaexts> {
 327   \'                            { lex_char_tok }
 328   \"                            { lex_string_tok }
 329 }
 330
 331 {
 332 -- work around bug in Alex 2.0
 333 #if __GLASGOW_HASKELL__ < 503
 334 unsafeAt arr i = arr ! i
 335 #endif
 336
 337 -- -----------------------------------------------------------------------------
 338 -- The token type
 339
 340 data Token
 341   = ITas                        -- Haskell keywords
 342   | ITcase
 343   | ITclass
 344   | ITdata
 345   | ITdefault
 346   | ITderiving
 347   | ITdo
 348   | ITelse
 349   | IThiding
 350   | ITif
 351   | ITimport
 352   | ITin
 353   | ITinfix
 354   | ITinfixl
 355   | ITinfixr
 356   | ITinstance
 357   | ITlet
 358   | ITmodule
 359   | ITnewtype
 360   | ITof
 361   | ITqualified
 362   | ITthen
 363   | ITtype
 364   | ITwhere
 365   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 366
 367   | ITforall                    -- GHC extension keywords
 368   | ITforeign
 369   | ITexport
 370   | ITlabel
 371   | ITdynamic
 372   | ITsafe
 373   | ITthreadsafe
 374   | ITunsafe
 375   | ITstdcallconv
 376   | ITccallconv
 377   | ITdotnet
 378   | ITmdo
 379
 380         -- Pragmas
 381   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 382   | ITspec_prag                 -- SPECIALISE
 383   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 384   | ITsource_prag
 385   | ITrules_prag
 386   | ITdeprecated_prag
 387   | ITline_prag
 388   | ITscc_prag
 389   | ITcore_prag                 -- hdaume: core annotations
 390   | ITunpack_prag
 391   | ITclose_prag
 392   | IToptions_prag String
 393   | ITinclude_prag String
 394   | ITlanguage_prag
 395
 396   | ITdotdot                    -- reserved symbols
 397   | ITcolon
 398   | ITdcolon
 399   | ITequal
 400   | ITlam
 401   | ITvbar
 402   | ITlarrow
 403   | ITrarrow
 404   | ITat
 405   | ITtilde
 406   | ITdarrow
 407   | ITminus
 408   | ITbang
 409   | ITstar
 410   | ITdot
 411
 412   | ITbiglam                    -- GHC-extension symbols
 413
 414   | ITocurly                    -- special symbols
 415   | ITccurly
 416   | ITocurlybar                 -- {|, for type applications
 417   | ITccurlybar                 -- |}, for type applications
 418   | ITvocurly
 419   | ITvccurly
 420   | ITobrack
 421   | ITopabrack                  -- [:, for parallel arrays with -fparr
 422   | ITcpabrack                  -- :], for parallel arrays with -fparr
 423   | ITcbrack
 424   | IToparen
 425   | ITcparen
 426   | IToubxparen
 427   | ITcubxparen
 428   | ITsemi
 429   | ITcomma
 430   | ITunderscore
 431   | ITbackquote
 432
 433   | ITvarid   FastString        -- identifiers
 434   | ITconid   FastString
 435   | ITvarsym  FastString
 436   | ITconsym  FastString
 437   | ITqvarid  (FastString,FastString)
 438   | ITqconid  (FastString,FastString)
 439   | ITqvarsym (FastString,FastString)
 440   | ITqconsym (FastString,FastString)
 441
 442   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 443   | ITsplitipvarid FastString   -- GHC extension: implicit param: %x
 444
 445   | ITpragma StringBuffer
 446
 447   | ITchar       Char
 448   | ITstring     FastString
 449   | ITinteger    Integer
 450   | ITrational   Rational
 451
 452   | ITprimchar   Char
 453   | ITprimstring FastString
 454   | ITprimint    Integer
 455   | ITprimfloat  Rational
 456   | ITprimdouble Rational
 457
 458   -- MetaHaskell extension tokens
 459   | ITopenExpQuote              --  [| or [e|
 460   | ITopenPatQuote              --  [p|
 461   | ITopenDecQuote              --  [d|
 462   | ITopenTypQuote              --  [t|
 463   | ITcloseQuote                --  |]
 464   | ITidEscape   FastString     --  $x
 465   | ITparenEscape               --  $(
 466   | ITvarQuote                  --  '
 467   | ITtyQuote                   --  ''
 468
 469   -- Arrow notation extension
 470   | ITproc
 471   | ITrec
 472   | IToparenbar                 --  (|
 473   | ITcparenbar                 --  |)
 474   | ITlarrowtail                --  -<
 475   | ITrarrowtail                --  >-
 476   | ITLarrowtail                --  -<<
 477   | ITRarrowtail                --  >>-
 478
 479   | ITunknown String            -- Used when the lexer can't make sense of it
 480   | ITeof                       -- end of file token
 481 #ifdef DEBUG
 482   deriving Show -- debugging
 483 #endif
 484
 485 isSpecial :: Token -> Bool
 486 -- If we see M.x, where x is a keyword, but
 487 -- is special, we treat is as just plain M.x,
 488 -- not as a keyword.
 489 isSpecial ITas          = True
 490 isSpecial IThiding      = True
 491 isSpecial ITqualified   = True
 492 isSpecial ITforall      = True
 493 isSpecial ITexport      = True
 494 isSpecial ITlabel       = True
 495 isSpecial ITdynamic     = True
 496 isSpecial ITsafe        = True
 497 isSpecial ITthreadsafe  = True
 498 isSpecial ITunsafe      = True
 499 isSpecial ITccallconv   = True
 500 isSpecial ITstdcallconv = True
 501 isSpecial ITmdo         = True
 502 isSpecial _             = False
 503
 504 -- the bitmap provided as the third component indicates whether the
 505 -- corresponding extension keyword is valid under the extension options
 506 -- provided to the compiler; if the extension corresponding to *any* of the
 507 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 508 -- facilitates using a keyword in two different extensions that can be
 509 -- activated independently)
 510 --
 511 reservedWordsFM = listToUFM $
 512         map (\(x, y, z) -> (mkFastString x, (y, z)))
 513        [( "_",          ITunderscore,   0 ),
 514         ( "as",         ITas,           0 ),
 515         ( "case",       ITcase,         0 ),
 516         ( "class",      ITclass,        0 ),
 517         ( "data",       ITdata,         0 ),
 518         ( "default",    ITdefault,      0 ),
 519         ( "deriving",   ITderiving,     0 ),
 520         ( "do",         ITdo,           0 ),
 521         ( "else",       ITelse,         0 ),
 522         ( "hiding",     IThiding,       0 ),
 523         ( "if",         ITif,           0 ),
 524         ( "import",     ITimport,       0 ),
 525         ( "in",         ITin,           0 ),
 526         ( "infix",      ITinfix,        0 ),
 527         ( "infixl",     ITinfixl,       0 ),
 528         ( "infixr",     ITinfixr,       0 ),
 529         ( "instance",   ITinstance,     0 ),
 530         ( "let",        ITlet,          0 ),
 531         ( "module",     ITmodule,       0 ),
 532         ( "newtype",    ITnewtype,      0 ),
 533         ( "of",         ITof,           0 ),
 534         ( "qualified",  ITqualified,    0 ),
 535         ( "then",       ITthen,         0 ),
 536         ( "type",       ITtype,         0 ),
 537         ( "where",      ITwhere,        0 ),
 538         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 539
 540         ( "forall",     ITforall,        bit tvBit),
 541         ( "mdo",        ITmdo,           bit glaExtsBit),
 542
 543         ( "foreign",    ITforeign,       bit ffiBit),
 544         ( "export",     ITexport,        bit ffiBit),
 545         ( "label",      ITlabel,         bit ffiBit),
 546         ( "dynamic",    ITdynamic,       bit ffiBit),
 547         ( "safe",       ITsafe,          bit ffiBit),
 548         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 549         ( "unsafe",     ITunsafe,        bit ffiBit),
 550         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 551         ( "ccall",      ITccallconv,     bit ffiBit),
 552         ( "dotnet",     ITdotnet,        bit ffiBit),
 553
 554         ( "rec",        ITrec,           bit arrowsBit),
 555         ( "proc",       ITproc,          bit arrowsBit)
 556      ]
 557
 558 reservedSymsFM = listToUFM $
 559         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 560       [ ("..",  ITdotdot,       0)
 561        ,(":",   ITcolon,        0)      -- (:) is a reserved op,
 562                                                 -- meaning only list cons
 563        ,("::",  ITdcolon,       0)
 564        ,("=",   ITequal,        0)
 565        ,("\\",  ITlam,          0)
 566        ,("|",   ITvbar,         0)
 567        ,("<-",  ITlarrow,       0)
 568        ,("->",  ITrarrow,       0)
 569        ,("@",   ITat,           0)
 570        ,("~",   ITtilde,        0)
 571        ,("=>",  ITdarrow,       0)
 572        ,("-",   ITminus,        0)
 573        ,("!",   ITbang,         0)
 574
 575        ,("*",   ITstar,         bit glaExtsBit) -- For data T (a::*) = MkT
 576        ,(".",   ITdot,          bit tvBit)      -- For 'forall a . t'
 577
 578        ,("-<",  ITlarrowtail,   bit arrowsBit)
 579        ,(">-",  ITrarrowtail,   bit arrowsBit)
 580        ,("-<<", ITLarrowtail,   bit arrowsBit)
 581        ,(">>-", ITRarrowtail,   bit arrowsBit)
 582
 583 #if __GLASGOW_HASKELL__ >= 605
 584        ,("λ",  ITlam,          bit glaExtsBit)
 585        ,("∷",   ITdcolon,       bit glaExtsBit)
 586        ,("⇒",   ITdarrow,     bit glaExtsBit)
 587        ,("∀", ITforall,       bit glaExtsBit)
 588        ,("→",   ITrarrow,     bit glaExtsBit)
 589        ,("←",   ITlarrow,     bit glaExtsBit)
 590        ,("⋯",         ITdotdot,       bit glaExtsBit)
 591         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 592         -- form part of a large operator.  This would let us have a better
 593         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 594 #endif
 595        ]
 596
 597 -- -----------------------------------------------------------------------------
 598 -- Lexer actions
 599
 600 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 601
 602 special :: Token -> Action
 603 special tok span _buf len = return (L span tok)
 604
 605 token, layout_token :: Token -> Action
 606 token t span buf len = return (L span t)
 607 layout_token t span buf len = pushLexState layout >> return (L span t)
 608
 609 idtoken :: (StringBuffer -> Int -> Token) -> Action
 610 idtoken f span buf len = return (L span $! (f buf len))
 611
 612 skip_one_varid :: (FastString -> Token) -> Action
 613 skip_one_varid f span buf len
 614   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 615
 616 strtoken :: (String -> Token) -> Action
 617 strtoken f span buf len =
 618   return (L span $! (f $! lexemeToString buf len))
 619
 620 init_strtoken :: Int -> (String -> Token) -> Action
 621 -- like strtoken, but drops the last N character(s)
 622 init_strtoken drop f span buf len =
 623   return (L span $! (f $! lexemeToString buf (len-drop)))
 624
 625 begin :: Int -> Action
 626 begin code _span _str _len = do pushLexState code; lexToken
 627
 628 pop :: Action
 629 pop _span _buf _len = do popLexState; lexToken
 630
 631 pop_and :: Action -> Action
 632 pop_and act span buf len = do popLexState; act span buf len
 633
 634 notFollowedBy char _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf /= char
 635
 636 notFollowedBySymbol _ _ _ (AI _ _ buf)
 637   = atEnd buf || currentChar buf `notElem` "!#$%&*+./<=>?@\\^|-~"
 638
 639 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 640
 641 ifExtension pred bits _ _ _ = pred bits
 642
 643 {-
 644   nested comments require traversing by hand, they can't be parsed
 645   using regular expressions.
 646 -}
 647 nested_comment :: Action
 648 nested_comment span _str _len = do
 649   input <- getInput
 650   go 1 input
 651   where go 0 input = do setInput input; lexToken
 652         go n input = do
 653           case alexGetChar input of
 654             Nothing  -> err input
 655             Just (c,input) -> do
 656               case c of
 657                 '-' -> do
 658                   case alexGetChar input of
 659                     Nothing  -> err input
 660                     Just ('\125',input) -> go (n-1) input
 661                     Just (c,_)          -> go n input
 662                 '\123' -> do
 663                   case alexGetChar input of
 664                     Nothing  -> err input
 665                     Just ('-',input') -> go (n+1) input'
 666                     Just (c,input)    -> go n input
 667                 c -> go n input
 668
 669         err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 670
 671 open_brace, close_brace :: Action
 672 open_brace span _str _len = do
 673   ctx <- getContext
 674   setContext (NoLayout:ctx)
 675   return (L span ITocurly)
 676 close_brace span _str _len = do
 677   popContext
 678   return (L span ITccurly)
 679
 680 -- We have to be careful not to count M.<varid> as a qualified name
 681 -- when <varid> is a keyword.  We hack around this by catching
 682 -- the offending tokens afterward, and re-lexing in a different state.
 683 check_qvarid span buf len = do
 684   case lookupUFM reservedWordsFM var of
 685         Just (keyword,exts)
 686           | not (isSpecial keyword) ->
 687           if exts == 0
 688              then try_again
 689              else do
 690                 b <- extension (\i -> exts .&. i /= 0)
 691                 if b then try_again
 692                      else return token
 693         _other -> return token
 694   where
 695         (mod,var) = splitQualName buf len
 696         token     = L span (ITqvarid (mod,var))
 697
 698         try_again = do
 699                 (AI _ offs _) <- getInput
 700                 setInput (AI (srcSpanStart span) (offs-len) buf)
 701                 pushLexState bad_qvarid
 702                 lexToken
 703
 704 qvarid buf len = ITqvarid $! splitQualName buf len
 705 qconid buf len = ITqconid $! splitQualName buf len
 706
 707 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 708 -- takes a StringBuffer and a length, and returns the module name
 709 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 710 -- because of hierarchical module names.
 711 splitQualName orig_buf len = split orig_buf orig_buf
 712   where
 713     split buf dot_buf
 714         | orig_buf `byteDiff` buf >= len  = done dot_buf
 715         | c == '.'                        = found_dot buf'
 716         | otherwise                       = split buf' dot_buf
 717       where
 718        (c,buf') = nextChar buf
 719
 720     -- careful, we might get names like M....
 721     -- so, if the character after the dot is not upper-case, this is
 722     -- the end of the qualifier part.
 723     found_dot buf -- buf points after the '.'
 724         | isUpper c    = split buf' buf
 725         | otherwise    = done buf
 726       where
 727        (c,buf') = nextChar buf
 728
 729     done dot_buf =
 730         (lexemeToFastString orig_buf (qual_size - 1),
 731          lexemeToFastString dot_buf (len - qual_size))
 732       where
 733         qual_size = orig_buf `byteDiff` dot_buf
 734
 735 varid span buf len =
 736   case lookupUFM reservedWordsFM fs of
 737         Just (keyword,0)    -> do
 738                 maybe_layout keyword
 739                 return (L span keyword)
 740         Just (keyword,exts) -> do
 741                 b <- extension (\i -> exts .&. i /= 0)
 742                 if b then do maybe_layout keyword
 743                              return (L span keyword)
 744                      else return (L span (ITvarid fs))
 745         _other -> return (L span (ITvarid fs))
 746   where
 747         fs = lexemeToFastString buf len
 748
 749 conid buf len = ITconid fs
 750   where fs = lexemeToFastString buf len
 751
 752 qvarsym buf len = ITqvarsym $! splitQualName buf len
 753 qconsym buf len = ITqconsym $! splitQualName buf len
 754
 755 varsym = sym ITvarsym
 756 consym = sym ITconsym
 757
 758 sym con span buf len =
 759   case lookupUFM reservedSymsFM fs of
 760         Just (keyword,0)    -> return (L span keyword)
 761         Just (keyword,exts) -> do
 762                 b <- extension (\i -> exts .&. i /= 0)
 763                 if b then return (L span keyword)
 764                      else return (L span $! con fs)
 765         _other -> return (L span $! con fs)
 766   where
 767         fs = lexemeToFastString buf len
 768
 769 tok_decimal span buf len
 770   = return (L span (ITinteger  $! parseInteger buf len 10 octDecDigit))
 771
 772 tok_octal span buf len
 773   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 8 octDecDigit))
 774
 775 tok_hexadecimal span buf len
 776   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 16 hexDigit))
 777
 778 prim_decimal span buf len
 779   = return (L span (ITprimint  $! parseInteger buf (len-1) 10 octDecDigit))
 780
 781 prim_octal span buf len
 782   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 8 octDecDigit))
 783
 784 prim_hexadecimal span buf len
 785   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 16 hexDigit))
 786
 787 tok_float        str = ITrational   $! readRational str
 788 prim_float       str = ITprimfloat  $! readRational str
 789 prim_double      str = ITprimdouble $! readRational str
 790
 791 -- -----------------------------------------------------------------------------
 792 -- Layout processing
 793
 794 -- we're at the first token on a line, insert layout tokens if necessary
 795 do_bol :: Action
 796 do_bol span _str _len = do
 797         pos <- getOffside
 798         case pos of
 799             LT -> do
 800                 --trace "layout: inserting '}'" $ do
 801                 popContext
 802                 -- do NOT pop the lex state, we might have a ';' to insert
 803                 return (L span ITvccurly)
 804             EQ -> do
 805                 --trace "layout: inserting ';'" $ do
 806                 popLexState
 807                 return (L span ITsemi)
 808             GT -> do
 809                 popLexState
 810                 lexToken
 811
 812 -- certain keywords put us in the "layout" state, where we might
 813 -- add an opening curly brace.
 814 maybe_layout ITdo       = pushLexState layout_do
 815 maybe_layout ITmdo      = pushLexState layout_do
 816 maybe_layout ITof       = pushLexState layout
 817 maybe_layout ITlet      = pushLexState layout
 818 maybe_layout ITwhere    = pushLexState layout
 819 maybe_layout ITrec      = pushLexState layout
 820 maybe_layout _          = return ()
 821
 822 -- Pushing a new implicit layout context.  If the indentation of the
 823 -- next token is not greater than the previous layout context, then
 824 -- Haskell 98 says that the new layout context should be empty; that is
 825 -- the lexer must generate {}.
 826 --
 827 -- We are slightly more lenient than this: when the new context is started
 828 -- by a 'do', then we allow the new context to be at the same indentation as
 829 -- the previous context.  This is what the 'strict' argument is for.
 830 --
 831 new_layout_context strict span _buf _len = do
 832     popLexState
 833     (AI _ offset _) <- getInput
 834     ctx <- getContext
 835     case ctx of
 836         Layout prev_off : _  |
 837            (strict     && prev_off >= offset  ||
 838             not strict && prev_off > offset) -> do
 839                 -- token is indented to the left of the previous context.
 840                 -- we must generate a {} sequence now.
 841                 pushLexState layout_left
 842                 return (L span ITvocurly)
 843         other -> do
 844                 setContext (Layout offset : ctx)
 845                 return (L span ITvocurly)
 846
 847 do_layout_left span _buf _len = do
 848     popLexState
 849     pushLexState bol  -- we must be at the start of a line
 850     return (L span ITvccurly)
 851
 852 -- -----------------------------------------------------------------------------
 853 -- LINE pragmas
 854
 855 setLine :: Int -> Action
 856 setLine code span buf len = do
 857   let line = parseInteger buf len 10 octDecDigit
 858   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
 859         -- subtract one: the line number refers to the *following* line
 860   popLexState
 861   pushLexState code
 862   lexToken
 863
 864 setFile :: Int -> Action
 865 setFile code span buf len = do
 866   let file = lexemeToFastString (stepOn buf) (len-2)
 867   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
 868   popLexState
 869   pushLexState code
 870   lexToken
 871
 872
 873 -- -----------------------------------------------------------------------------
 874 -- Options, includes and language pragmas.
 875
 876 lex_string_prag :: (String -> Token) -> Action
 877 lex_string_prag mkTok span buf len
 878     = do input <- getInput
 879          start <- getSrcLoc
 880          tok <- go [] input
 881          end <- getSrcLoc
 882          return (L (mkSrcSpan start end) tok)
 883     where go acc input
 884               = if isString input "#-}"
 885                    then do setInput input
 886                            return (mkTok (reverse acc))
 887                    else case alexGetChar input of
 888                           Just (c,i) -> go (c:acc) i
 889                           Nothing -> err input
 890           isString i [] = True
 891           isString i (x:xs)
 892               = case alexGetChar i of
 893                   Just (c,i') | c == x    -> isString i' xs
 894                   _other -> False
 895           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
 896
 897
 898 -- -----------------------------------------------------------------------------
 899 -- Strings & Chars
 900
 901 -- This stuff is horrible.  I hates it.
 902
 903 lex_string_tok :: Action
 904 lex_string_tok span buf len = do
 905   tok <- lex_string ""
 906   end <- getSrcLoc
 907   return (L (mkSrcSpan (srcSpanStart span) end) tok)
 908
 909 lex_string :: String -> P Token
 910 lex_string s = do
 911   i <- getInput
 912   case alexGetChar' i of
 913     Nothing -> lit_error
 914
 915     Just ('"',i)  -> do
 916         setInput i
 917         glaexts <- extension glaExtsEnabled
 918         if glaexts
 919           then do
 920             i <- getInput
 921             case alexGetChar' i of
 922               Just ('#',i) -> do
 923                    setInput i
 924                    if any (> '\xFF') s
 925                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
 926                     else let s' = mkZFastString (reverse s) in
 927                          return (ITprimstring s')
 928                         -- mkZFastString is a hack to avoid encoding the
 929                         -- string in UTF-8.  We just want the exact bytes.
 930               _other ->
 931                 return (ITstring (mkFastString (reverse s)))
 932           else
 933                 return (ITstring (mkFastString (reverse s)))
 934
 935     Just ('\\',i)
 936         | Just ('&',i) <- next -> do
 937                 setInput i; lex_string s
 938         | Just (c,i) <- next, is_space c -> do
 939                 setInput i; lex_stringgap s
 940         where next = alexGetChar' i
 941
 942     Just (c, i) -> do
 943         c' <- lex_char c i
 944         lex_string (c':s)
 945
 946 lex_stringgap s = do
 947   c <- getCharOrFail
 948   case c of
 949     '\\' -> lex_string s
 950     c | is_space c -> lex_stringgap s
 951     _other -> lit_error
 952
 953
 954 lex_char_tok :: Action
 955 -- Here we are basically parsing character literals, such as 'x' or '\n'
 956 -- but, when Template Haskell is on, we additionally spot
 957 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
 958 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
 959 -- So we have to do two characters of lookahead: when we see 'x we need to
 960 -- see if there's a trailing quote
 961 lex_char_tok span buf len = do  -- We've seen '
 962    i1 <- getInput       -- Look ahead to first character
 963    let loc = srcSpanStart span
 964    case alexGetChar' i1 of
 965         Nothing -> lit_error
 966
 967         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
 968                   th_exts <- extension thEnabled
 969                   if th_exts then do
 970                         setInput i2
 971                         return (L (mkSrcSpan loc end2)  ITtyQuote)
 972                    else lit_error
 973
 974         Just ('\\', i2@(AI end2 _ _)) -> do     -- We've seen 'backslash
 975                   setInput i2
 976                   lit_ch <- lex_escape
 977                   mc <- getCharOrFail   -- Trailing quote
 978                   if mc == '\'' then finish_char_tok loc lit_ch
 979                                 else do setInput i2; lit_error
 980
 981         Just (c, i2@(AI end2 _ _))
 982                 | not (isAny c) -> lit_error
 983                 | otherwise ->
 984
 985                 -- We've seen 'x, where x is a valid character
 986                 --  (i.e. not newline etc) but not a quote or backslash
 987            case alexGetChar' i2 of      -- Look ahead one more character
 988                 Nothing -> lit_error
 989                 Just ('\'', i3) -> do   -- We've seen 'x'
 990                         setInput i3
 991                         finish_char_tok loc c
 992                 _other -> do            -- We've seen 'x not followed by quote
 993                                         -- If TH is on, just parse the quote only
 994                         th_exts <- extension thEnabled
 995                         let (AI end _ _) = i1
 996                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
 997                                    else do setInput i2; lit_error
 998
 999 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1000 finish_char_tok loc ch  -- We've already seen the closing quote
1001                         -- Just need to check for trailing #
1002   = do  glaexts <- extension glaExtsEnabled
1003         i@(AI end _ _) <- getInput
1004         if glaexts then do
1005                 case alexGetChar' i of
1006                         Just ('#',i@(AI end _ _)) -> do
1007                                 setInput i
1008                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1009                         _other ->
1010                                 return (L (mkSrcSpan loc end) (ITchar ch))
1011                 else do
1012                    return (L (mkSrcSpan loc end) (ITchar ch))
1013
1014 lex_char :: Char -> AlexInput -> P Char
1015 lex_char c inp = do
1016   case c of
1017       '\\' -> do setInput inp; lex_escape
1018       c | isAny c -> do setInput inp; return c
1019       _other -> lit_error
1020
1021 isAny c | c > '\xff' = isPrint c
1022         | otherwise  = is_any c
1023
1024 lex_escape :: P Char
1025 lex_escape = do
1026   c <- getCharOrFail
1027   case c of
1028         'a'   -> return '\a'
1029         'b'   -> return '\b'
1030         'f'   -> return '\f'
1031         'n'   -> return '\n'
1032         'r'   -> return '\r'
1033         't'   -> return '\t'
1034         'v'   -> return '\v'
1035         '\\'  -> return '\\'
1036         '"'   -> return '\"'
1037         '\''  -> return '\''
1038         '^'   -> do c <- getCharOrFail
1039                     if c >= '@' && c <= '_'
1040                         then return (chr (ord c - ord '@'))
1041                         else lit_error
1042
1043         'x'   -> readNum is_hexdigit 16 hexDigit
1044         'o'   -> readNum is_octdigit  8 octDecDigit
1045         x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
1046
1047         c1 ->  do
1048            i <- getInput
1049            case alexGetChar' i of
1050             Nothing -> lit_error
1051             Just (c2,i2) ->
1052               case alexGetChar' i2 of
1053                 Nothing -> do setInput i2; lit_error
1054                 Just (c3,i3) ->
1055                    let str = [c1,c2,c3] in
1056                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1057                                      Just rest <- [maybePrefixMatch p str] ] of
1058                           (escape_char,[]):_ -> do
1059                                 setInput i3
1060                                 return escape_char
1061                           (escape_char,_:_):_ -> do
1062                                 setInput i2
1063                                 return escape_char
1064                           [] -> lit_error
1065
1066 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1067 readNum is_digit base conv = do
1068   i <- getInput
1069   c <- getCharOrFail
1070   if is_digit c
1071         then readNum2 is_digit base conv (conv c)
1072         else do setInput i; lit_error
1073
1074 readNum2 is_digit base conv i = do
1075   input <- getInput
1076   read i input
1077   where read i input = do
1078           case alexGetChar' input of
1079             Just (c,input') | is_digit c -> do
1080                 read (i*base + conv c) input'
1081             _other -> do
1082                 if i >= 0 && i <= 0x10FFFF
1083                    then do setInput input; return (chr i)
1084                    else lit_error
1085
1086 silly_escape_chars = [
1087         ("NUL", '\NUL'),
1088         ("SOH", '\SOH'),
1089         ("STX", '\STX'),
1090         ("ETX", '\ETX'),
1091         ("EOT", '\EOT'),
1092         ("ENQ", '\ENQ'),
1093         ("ACK", '\ACK'),
1094         ("BEL", '\BEL'),
1095         ("BS", '\BS'),
1096         ("HT", '\HT'),
1097         ("LF", '\LF'),
1098         ("VT", '\VT'),
1099         ("FF", '\FF'),
1100         ("CR", '\CR'),
1101         ("SO", '\SO'),
1102         ("SI", '\SI'),
1103         ("DLE", '\DLE'),
1104         ("DC1", '\DC1'),
1105         ("DC2", '\DC2'),
1106         ("DC3", '\DC3'),
1107         ("DC4", '\DC4'),
1108         ("NAK", '\NAK'),
1109         ("SYN", '\SYN'),
1110         ("ETB", '\ETB'),
1111         ("CAN", '\CAN'),
1112         ("EM", '\EM'),
1113         ("SUB", '\SUB'),
1114         ("ESC", '\ESC'),
1115         ("FS", '\FS'),
1116         ("GS", '\GS'),
1117         ("RS", '\RS'),
1118         ("US", '\US'),
1119         ("SP", '\SP'),
1120         ("DEL", '\DEL')
1121         ]
1122
1123 -- before calling lit_error, ensure that the current input is pointing to
1124 -- the position of the error in the buffer.  This is so that we can report
1125 -- a correct location to the user, but also so we can detect UTF-8 decoding
1126 -- errors if they occur.
1127 lit_error = lexError "lexical error in string/character literal"
1128
1129 getCharOrFail :: P Char
1130 getCharOrFail =  do
1131   i <- getInput
1132   case alexGetChar' i of
1133         Nothing -> lexError "unexpected end-of-file in string/character literal"
1134         Just (c,i)  -> do setInput i; return c
1135
1136 -- -----------------------------------------------------------------------------
1137 -- The Parse Monad
1138
1139 data LayoutContext
1140   = NoLayout
1141   | Layout !Int
1142
1143 data ParseResult a
1144   = POk PState a
1145   | PFailed
1146         SrcSpan         -- The start and end of the text span related to
1147                         -- the error.  Might be used in environments which can
1148                         -- show this span, e.g. by highlighting it.
1149         Message         -- The error message
1150
1151 data PState = PState {
1152         buffer     :: StringBuffer,
1153         last_loc   :: SrcSpan,  -- pos of previous token
1154         last_offs  :: !Int,     -- offset of the previous token from the
1155                                 -- beginning of  the current line.
1156                                 -- \t is equal to 8 spaces.
1157         last_len   :: !Int,     -- len of previous token
1158         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1159         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1160         context    :: [LayoutContext],
1161         lex_state  :: [Int]
1162      }
1163         -- last_loc and last_len are used when generating error messages,
1164         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1165         -- current token to happyError, we could at least get rid of last_len.
1166         -- Getting rid of last_loc would require finding another way to
1167         -- implement pushCurrentContext (which is only called from one place).
1168
1169 newtype P a = P { unP :: PState -> ParseResult a }
1170
1171 instance Monad P where
1172   return = returnP
1173   (>>=) = thenP
1174   fail = failP
1175
1176 returnP :: a -> P a
1177 returnP a = P $ \s -> POk s a
1178
1179 thenP :: P a -> (a -> P b) -> P b
1180 (P m) `thenP` k = P $ \ s ->
1181         case m s of
1182                 POk s1 a         -> (unP (k a)) s1
1183                 PFailed span err -> PFailed span err
1184
1185 failP :: String -> P a
1186 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1187
1188 failMsgP :: String -> P a
1189 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1190
1191 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1192 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1193
1194 failSpanMsgP :: SrcSpan -> String -> P a
1195 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1196
1197 extension :: (Int -> Bool) -> P Bool
1198 extension p = P $ \s -> POk s (p $! extsBitmap s)
1199
1200 getExts :: P Int
1201 getExts = P $ \s -> POk s (extsBitmap s)
1202
1203 setSrcLoc :: SrcLoc -> P ()
1204 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1205
1206 getSrcLoc :: P SrcLoc
1207 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1208
1209 setLastToken :: SrcSpan -> Int -> P ()
1210 setLastToken loc len = P $ \s -> POk s{ last_loc=loc, last_len=len } ()
1211
1212 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1213
1214 alexInputPrevChar :: AlexInput -> Char
1215 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1216
1217 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1218 alexGetChar (AI loc ofs s)
1219   | atEnd s   = Nothing
1220   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1221                 --trace (show (ord c)) $
1222                 Just (adj_c, (AI loc' ofs' s'))
1223   where (c,s') = nextChar s
1224         loc'   = advanceSrcLoc loc c
1225         ofs'   = advanceOffs c ofs
1226
1227         non_graphic     = '\x0'
1228         upper           = '\x1'
1229         lower           = '\x2'
1230         digit           = '\x3'
1231         symbol          = '\x4'
1232         space           = '\x5'
1233         other_graphic   = '\x6'
1234
1235         adj_c
1236           | c <= '\x06' = non_graphic
1237           | c <= '\xff' = c
1238           | otherwise =
1239                 case generalCategory c of
1240                   UppercaseLetter       -> upper
1241                   LowercaseLetter       -> lower
1242                   TitlecaseLetter       -> upper
1243                   ModifierLetter        -> other_graphic
1244                   OtherLetter           -> other_graphic
1245                   NonSpacingMark        -> other_graphic
1246                   SpacingCombiningMark  -> other_graphic
1247                   EnclosingMark         -> other_graphic
1248                   DecimalNumber         -> digit
1249                   LetterNumber          -> other_graphic
1250                   OtherNumber           -> other_graphic
1251                   ConnectorPunctuation  -> other_graphic
1252                   DashPunctuation       -> other_graphic
1253                   OpenPunctuation       -> other_graphic
1254                   ClosePunctuation      -> other_graphic
1255                   InitialQuote          -> other_graphic
1256                   FinalQuote            -> other_graphic
1257                   OtherPunctuation      -> other_graphic
1258                   MathSymbol            -> symbol
1259                   CurrencySymbol        -> symbol
1260                   ModifierSymbol        -> symbol
1261                   OtherSymbol           -> symbol
1262                   Space                 -> space
1263                   _other                -> non_graphic
1264
1265 -- This version does not squash unicode characters, it is used when
1266 -- lexing strings.
1267 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1268 alexGetChar' (AI loc ofs s)
1269   | atEnd s   = Nothing
1270   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1271                 --trace (show (ord c)) $
1272                 Just (c, (AI loc' ofs' s'))
1273   where (c,s') = nextChar s
1274         loc'   = advanceSrcLoc loc c
1275         ofs'   = advanceOffs c ofs
1276
1277 advanceOffs :: Char -> Int -> Int
1278 advanceOffs '\n' offs = 0
1279 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1280 advanceOffs _    offs = offs + 1
1281
1282 getInput :: P AlexInput
1283 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1284
1285 setInput :: AlexInput -> P ()
1286 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1287
1288 pushLexState :: Int -> P ()
1289 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1290
1291 popLexState :: P Int
1292 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1293
1294 getLexState :: P Int
1295 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1296
1297 -- for reasons of efficiency, flags indicating language extensions (eg,
1298 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1299 -- integer
1300
1301 glaExtsBit, ffiBit, parrBit :: Int
1302 glaExtsBit = 0
1303 ffiBit     = 1
1304 parrBit    = 2
1305 arrowsBit  = 4
1306 thBit      = 5
1307 ipBit      = 6
1308 tvBit      = 7  -- Scoped type variables enables 'forall' keyword
1309 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1310                 -- (doesn't affect the lexer)
1311
1312 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1313 glaExtsEnabled flags = testBit flags glaExtsBit
1314 ffiEnabled     flags = testBit flags ffiBit
1315 parrEnabled    flags = testBit flags parrBit
1316 arrowsEnabled  flags = testBit flags arrowsBit
1317 thEnabled      flags = testBit flags thBit
1318 ipEnabled      flags = testBit flags ipBit
1319 tvEnabled      flags = testBit flags tvBit
1320 bangPatEnabled flags = testBit flags bangPatBit
1321
1322 -- PState for parsing options pragmas
1323 --
1324 pragState :: StringBuffer -> SrcLoc -> PState
1325 pragState buf loc  =
1326   PState {
1327       buffer     = buf,
1328       last_loc   = mkSrcSpan loc loc,
1329       last_offs  = 0,
1330       last_len   = 0,
1331       loc        = loc,
1332       extsBitmap = 0,
1333       context    = [],
1334       lex_state  = [bol, option_prags, 0]
1335     }
1336
1337
1338 -- create a parse state
1339 --
1340 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1341 mkPState buf loc flags  =
1342   PState {
1343       buffer     = buf,
1344       last_loc   = mkSrcSpan loc loc,
1345       last_offs  = 0,
1346       last_len   = 0,
1347       loc        = loc,
1348       extsBitmap = fromIntegral bitmap,
1349       context    = [],
1350       lex_state  = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1351         -- we begin in the layout state if toplev_layout is set
1352     }
1353     where
1354       bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
1355                .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
1356                .|. parrBit    `setBitIf` dopt Opt_PArr        flags
1357                .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
1358                .|. thBit      `setBitIf` dopt Opt_TH          flags
1359                .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
1360                .|. tvBit      `setBitIf` dopt Opt_ScopedTypeVariables flags
1361                .|. bangPatBit `setBitIf` dopt Opt_BangPatterns flags
1362       --
1363       setBitIf :: Int -> Bool -> Int
1364       b `setBitIf` cond | cond      = bit b
1365                         | otherwise = 0
1366
1367 getContext :: P [LayoutContext]
1368 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1369
1370 setContext :: [LayoutContext] -> P ()
1371 setContext ctx = P $ \s -> POk s{context=ctx} ()
1372
1373 popContext :: P ()
1374 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1375                            loc = loc, last_len = len, last_loc = last_loc }) ->
1376   case ctx of
1377         (_:tl) -> POk s{ context = tl } ()
1378         []     -> PFailed last_loc (srcParseErr buf len)
1379
1380 -- Push a new layout context at the indentation of the last token read.
1381 -- This is only used at the outer level of a module when the 'module'
1382 -- keyword is missing.
1383 pushCurrentContext :: P ()
1384 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_len=len, context=ctx } ->
1385   POk s{context = Layout (offs-len) : ctx} ()
1386
1387 getOffside :: P Ordering
1388 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1389                 let ord = case stk of
1390                         (Layout n:_) -> compare offs n
1391                         _            -> GT
1392                 in POk s ord
1393
1394 -- ---------------------------------------------------------------------------
1395 -- Construct a parse error
1396
1397 srcParseErr
1398   :: StringBuffer       -- current buffer (placed just after the last token)
1399   -> Int                -- length of the previous token
1400   -> Message
1401 srcParseErr buf len
1402   = hcat [ if null token
1403              then ptext SLIT("parse error (possibly incorrect indentation)")
1404              else hcat [ptext SLIT("parse error on input "),
1405                         char '`', text token, char '\'']
1406     ]
1407   where token = lexemeToString (offsetBytes (-len) buf) len
1408
1409 -- Report a parse failure, giving the span of the previous token as
1410 -- the location of the error.  This is the entry point for errors
1411 -- detected during parsing.
1412 srcParseFail :: P a
1413 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1414                             last_loc = last_loc } ->
1415     PFailed last_loc (srcParseErr buf len)
1416
1417 -- A lexical error is reported at a particular position in the source file,
1418 -- not over a token range.
1419 lexError :: String -> P a
1420 lexError str = do
1421   loc <- getSrcLoc
1422   i@(AI end _ buf) <- getInput
1423   reportLexError loc end buf str
1424
1425 -- -----------------------------------------------------------------------------
1426 -- This is the top-level function: called from the parser each time a
1427 -- new token is to be read from the input.
1428
1429 lexer :: (Located Token -> P a) -> P a
1430 lexer cont = do
1431   tok@(L _ tok__) <- lexToken
1432   --trace ("token: " ++ show tok__) $ do
1433   cont tok
1434
1435 lexToken :: P (Located Token)
1436 lexToken = do
1437   inp@(AI loc1 _ buf) <- getInput
1438   sc <- getLexState
1439   exts <- getExts
1440   case alexScanUser exts inp sc of
1441     AlexEOF -> do let span = mkSrcSpan loc1 loc1
1442                   setLastToken span 0
1443                   return (L span ITeof)
1444     AlexError (AI loc2 _ buf) -> do
1445         reportLexError loc1 loc2 buf "lexical error"
1446     AlexSkip inp2 _ -> do
1447         setInput inp2
1448         lexToken
1449     AlexToken inp2@(AI end _ buf2) len t -> do
1450         setInput inp2
1451         let span = mkSrcSpan loc1 end
1452         let bytes = byteDiff buf buf2
1453         span `seq` setLastToken span bytes
1454         t span buf bytes
1455
1456 reportLexError loc1 loc2 buf str
1457   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1458   | otherwise =
1459   let
1460         c = fst (nextChar buf)
1461   in
1462   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1463     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1464     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1465 }