compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), lexer, pragState, mkPState, PState(..),
  26    P(..), ParseResult(..), getSrcLoc,
  27    failLocMsgP, failSpanMsgP, srcParseFail,
  28    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  29    getLexState, popLexState, pushLexState,
  30    extension, glaExtsEnabled, bangPatEnabled
  31   ) where
  32
  33 #include "HsVersions.h"
  34
  35 import ErrUtils         ( Message )
  36 import Outputable
  37 import StringBuffer
  38 import FastString
  39 import FastTypes
  40 import SrcLoc
  41 import UniqFM
  42 import DynFlags
  43 import Ctype
  44 import Util             ( maybePrefixMatch, readRational )
  45
  46 import Data.Bits
  47 import Data.Char        ( chr, isSpace )
  48 import Data.Ratio
  49 import Debug.Trace
  50
  51 #if __GLASGOW_HASKELL__ >= 605
  52 import Data.Char        ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  53 #else
  54 import Compat.Unicode   ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  55 #endif
  56 }
  57
  58 $unispace    = \x05
  59 $whitechar   = [\ \t\n\r\f\v\xa0 $unispace]
  60 $white_no_nl = $whitechar # \n
  61
  62 $ascdigit  = 0-9
  63 $unidigit  = \x03
  64 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  65 $digit     = [$ascdigit $unidigit]
  66
  67 $special   = [\(\)\,\;\[\]\`\{\}]
  68 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7]
  69 $unisymbol = \x04
  70 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  71
  72 $unilarge  = \x01
  73 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  74 $large     = [$asclarge $unilarge]
  75
  76 $unismall  = \x02
  77 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  78 $small     = [$ascsmall $unismall \_]
  79
  80 $unigraphic = \x06
  81 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  82
  83 $octit     = 0-7
  84 $hexit     = [$decdigit A-F a-f]
  85 $symchar   = [$symbol \:]
  86 $nl        = [\n\r]
  87 $idchar    = [$small $large $digit \']
  88
  89 $docsym    = [\| \^ \* \$]
  90
  91 @varid     = $small $idchar*
  92 @conid     = $large $idchar*
  93
  94 @varsym    = $symbol $symchar*
  95 @consym    = \: $symchar*
  96
  97 @decimal     = $decdigit+
  98 @octal       = $octit+
  99 @hexadecimal = $hexit+
 100 @exponent    = [eE] [\-\+]? @decimal
 101
 102 -- we support the hierarchical module name extension:
 103 @qual = (@conid \.)+
 104
 105 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 106
 107 haskell :-
 108
 109 -- everywhere: skip whitespace and comments
 110 $white_no_nl+                           ;
 111
 112 -- Everywhere: deal with nested comments.  We explicitly rule out
 113 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 114 -- (this can happen even though pragmas will normally take precedence due to
 115 -- longest-match, because pragmas aren't valid in every state, but comments
 116 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 117 -- set.
 118
 119 "{-" / { isNormalComment } { nested_comment lexToken }
 120
 121 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 122 -- more dashes followed by a symbol should be parsed as a varsym, so we
 123 -- have to exclude those.
 124
 125 -- Since Haddock comments aren't valid in every state, we need to rule them
 126 -- out here.
 127
 128 -- The following two rules match comments that begin with two dashes, but
 129 -- continue with a different character. The rules test that this character
 130 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 131 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 132 -- have a Haddock comment). The rules then munch the rest of the line.
 133
 134 "-- " ~$docsym .* ;
 135 "--" [^$symbol : \ ] .* ;
 136
 137 -- Next, match Haddock comments if no -haddock flag
 138
 139 "-- " $docsym .* / { ifExtension (not . haddockEnabled) } ;
 140
 141 -- Now, when we've matched comments that begin with 2 dashes and continue
 142 -- with a different character, we need to match comments that begin with three
 143 -- or more dashes (which clearly can't be Haddock comments). We only need to
 144 -- make sure that the first non-dash character isn't a symbol, and munch the
 145 -- rest of the line.
 146
 147 "---"\-* [^$symbol :] .* ;
 148
 149 -- Since the previous rules all match dashes followed by at least one
 150 -- character, we also need to match a whole line filled with just dashes.
 151
 152 "--"\-* / { atEOL } ;
 153
 154 -- We need this rule since none of the other single line comment rules
 155 -- actually match this case.
 156
 157 "-- " / { atEOL } ;
 158
 159 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 160 -- blank lines) until we find a non-whitespace character, then do layout
 161 -- processing.
 162 --
 163 -- One slight wibble here: what if the line begins with {-#? In
 164 -- theory, we have to lex the pragma to see if it's one we recognise,
 165 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 166 -- as a nested comment.  We don't bother with this: if the line begins
 167 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 168 <bol> {
 169   \n                                    ;
 170   ^\# (line)?                           { begin line_prag1 }
 171   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 172   ^\# \! .* \n                          ; -- #!, for scripts
 173   ()                                    { do_bol }
 174 }
 175
 176 -- after a layout keyword (let, where, do, of), we begin a new layout
 177 -- context if the curly brace is missing.
 178 -- Careful! This stuff is quite delicate.
 179 <layout, layout_do> {
 180   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 181         -- we might encounter {-# here, but {- has been handled already
 182   \n                                    ;
 183   ^\# (line)?                           { begin line_prag1 }
 184 }
 185
 186 -- do is treated in a subtly different way, see new_layout_context
 187 <layout>    ()                          { new_layout_context True }
 188 <layout_do> ()                          { new_layout_context False }
 189
 190 -- after a new layout context which was found to be to the left of the
 191 -- previous context, we have generated a '{' token, and we now need to
 192 -- generate a matching '}' token.
 193 <layout_left>  ()                       { do_layout_left }
 194
 195 <0,option_prags,glaexts> \n                             { begin bol }
 196
 197 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 198
 199 -- single-line line pragmas, of the form
 200 --    # <line> "<file>" <extra-stuff> \n
 201 <line_prag1> $decdigit+                 { setLine line_prag1a }
 202 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 203 <line_prag1b> .*                        { pop }
 204
 205 -- Haskell-style line pragmas, of the form
 206 --    {-# LINE <line> "<file>" #-}
 207 <line_prag2> $decdigit+                 { setLine line_prag2a }
 208 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 209 <line_prag2b> "#-}"|"-}"                { pop }
 210    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 211    -- with older versions of GHC which generated these.
 212
 213 -- We only want RULES pragmas to be picked up when -fglasgow-exts
 214 -- is on, because the contents of the pragma is always written using
 215 -- glasgow-exts syntax (using forall etc.), so if glasgow exts are not
 216 -- enabled, we're sure to get a parse error.
 217 -- (ToDo: we should really emit a warning when ignoring pragmas)
 218 <glaexts>
 219   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 220
 221 <0,option_prags,glaexts> {
 222   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 223   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 224                                         { token (ITinline_prag False) }
 225   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 226                                         { token ITspec_prag }
 227   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 228         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 229   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 230         $whitechar* (NO(T?)INLINE|no(t?)inline)
 231                                         { token (ITspec_inline_prag False) }
 232   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 233   "{-#" $whitechar* (DEPRECATED|deprecated)
 234                                         { token ITdeprecated_prag }
 235   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 236   "{-#" $whitechar* (GENERATED|generated)
 237                                         { token ITgenerated_prag }
 238   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 239   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 240
 241   "{-#" $whitechar* (DOCOPTIONS|docoptions)
 242   / { ifExtension haddockEnabled }     { lex_string_prag ITdocOptions }
 243
 244  "{-#"                                 { nested_comment lexToken }
 245
 246   -- ToDo: should only be valid inside a pragma:
 247   "#-}"                                 { token ITclose_prag}
 248 }
 249
 250 <option_prags> {
 251   "{-#" $whitechar* (OPTIONS|options)   { lex_string_prag IToptions_prag }
 252   "{-#" $whitechar* (OPTIONS_GHC|options_ghc)
 253                                         { lex_string_prag IToptions_prag }
 254   "{-#" $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
 255   "{-#" $whitechar* (INCLUDE|include)   { lex_string_prag ITinclude_prag }
 256 }
 257
 258 <0,option_prags,glaexts> {
 259         -- This is to catch things like {-# OPTIONS OPTIONS_HUGS ...
 260   "{-#" $whitechar* $idchar+            { nested_comment lexToken }
 261 }
 262
 263 -- '0' state: ordinary lexemes
 264 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 265
 266 -- Haddock comments
 267
 268 <0,glaexts> {
 269   "-- " / $docsym    { multiline_doc_comment }
 270   "{-" \ ? / $docsym { nested_doc_comment }
 271 }
 272
 273 -- "special" symbols
 274
 275 <0,glaexts> {
 276   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 277   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 278 }
 279
 280 <0,glaexts> {
 281   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 282   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 283   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 284   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 285   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 286   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 287   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 288   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 289 }
 290
 291 <0,glaexts> {
 292   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 293                                         { special IToparenbar }
 294   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 295 }
 296
 297 <0,glaexts> {
 298   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 299 }
 300
 301 <glaexts> {
 302   "(#" / { notFollowedBySymbol }        { token IToubxparen }
 303   "#)"                                  { token ITcubxparen }
 304   "{|"                                  { token ITocurlybar }
 305   "|}"                                  { token ITccurlybar }
 306 }
 307
 308 <0,option_prags,glaexts> {
 309   \(                                    { special IToparen }
 310   \)                                    { special ITcparen }
 311   \[                                    { special ITobrack }
 312   \]                                    { special ITcbrack }
 313   \,                                    { special ITcomma }
 314   \;                                    { special ITsemi }
 315   \`                                    { special ITbackquote }
 316
 317   \{                                    { open_brace }
 318   \}                                    { close_brace }
 319 }
 320
 321 <0,option_prags,glaexts> {
 322   @qual @varid                  { check_qvarid }
 323   @qual @conid                  { idtoken qconid }
 324   @varid                        { varid }
 325   @conid                        { idtoken conid }
 326 }
 327
 328 -- after an illegal qvarid, such as 'M.let',
 329 -- we back up and try again in the bad_qvarid state:
 330 <bad_qvarid> {
 331   @conid                        { pop_and (idtoken conid) }
 332   @qual @conid                  { pop_and (idtoken qconid) }
 333 }
 334
 335 <glaexts> {
 336   @qual @varid "#"+             { idtoken qvarid }
 337   @qual @conid "#"+             { idtoken qconid }
 338   @varid "#"+                   { varid }
 339   @conid "#"+                   { idtoken conid }
 340 }
 341
 342 -- ToDo: M.(,,,)
 343
 344 <0,glaexts> {
 345   @qual @varsym                 { idtoken qvarsym }
 346   @qual @consym                 { idtoken qconsym }
 347   @varsym                       { varsym }
 348   @consym                       { consym }
 349 }
 350
 351 <0,glaexts> {
 352   @decimal                      { tok_decimal }
 353   0[oO] @octal                  { tok_octal }
 354   0[xX] @hexadecimal            { tok_hexadecimal }
 355 }
 356
 357 <glaexts> {
 358   @decimal \#                   { prim_decimal }
 359   0[oO] @octal \#               { prim_octal }
 360   0[xX] @hexadecimal \#         { prim_hexadecimal }
 361 }
 362
 363 <0,glaexts> @floating_point             { strtoken tok_float }
 364 <glaexts>   @floating_point \#          { init_strtoken 1 prim_float }
 365 <glaexts>   @floating_point \# \#       { init_strtoken 2 prim_double }
 366
 367 -- Strings and chars are lexed by hand-written code.  The reason is
 368 -- that even if we recognise the string or char here in the regex
 369 -- lexer, we would still have to parse the string afterward in order
 370 -- to convert it to a String.
 371 <0,glaexts> {
 372   \'                            { lex_char_tok }
 373   \"                            { lex_string_tok }
 374 }
 375
 376 {
 377 -- work around bug in Alex 2.0
 378 #if __GLASGOW_HASKELL__ < 503
 379 unsafeAt arr i = arr ! i
 380 #endif
 381
 382 -- -----------------------------------------------------------------------------
 383 -- The token type
 384
 385 data Token
 386   = ITas                        -- Haskell keywords
 387   | ITcase
 388   | ITclass
 389   | ITdata
 390   | ITdefault
 391   | ITderiving
 392   | ITderive
 393   | ITdo
 394   | ITelse
 395   | IThiding
 396   | ITif
 397   | ITimport
 398   | ITin
 399   | ITinfix
 400   | ITinfixl
 401   | ITinfixr
 402   | ITinstance
 403   | ITlet
 404   | ITmodule
 405   | ITnewtype
 406   | ITof
 407   | ITqualified
 408   | ITthen
 409   | ITtype
 410   | ITwhere
 411   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 412
 413   | ITforall                    -- GHC extension keywords
 414   | ITforeign
 415   | ITexport
 416   | ITlabel
 417   | ITdynamic
 418   | ITsafe
 419   | ITthreadsafe
 420   | ITunsafe
 421   | ITstdcallconv
 422   | ITccallconv
 423   | ITdotnet
 424   | ITmdo
 425   | ITfamily
 426
 427         -- Pragmas
 428   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 429   | ITspec_prag                 -- SPECIALISE
 430   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 431   | ITsource_prag
 432   | ITrules_prag
 433   | ITdeprecated_prag
 434   | ITline_prag
 435   | ITscc_prag
 436   | ITgenerated_prag
 437   | ITcore_prag                 -- hdaume: core annotations
 438   | ITunpack_prag
 439   | ITclose_prag
 440   | IToptions_prag String
 441   | ITinclude_prag String
 442   | ITlanguage_prag
 443
 444   | ITdotdot                    -- reserved symbols
 445   | ITcolon
 446   | ITdcolon
 447   | ITequal
 448   | ITlam
 449   | ITvbar
 450   | ITlarrow
 451   | ITrarrow
 452   | ITat
 453   | ITtilde
 454   | ITdarrow
 455   | ITminus
 456   | ITbang
 457   | ITstar
 458   | ITdot
 459
 460   | ITbiglam                    -- GHC-extension symbols
 461
 462   | ITocurly                    -- special symbols
 463   | ITccurly
 464   | ITocurlybar                 -- {|, for type applications
 465   | ITccurlybar                 -- |}, for type applications
 466   | ITvocurly
 467   | ITvccurly
 468   | ITobrack
 469   | ITopabrack                  -- [:, for parallel arrays with -fparr
 470   | ITcpabrack                  -- :], for parallel arrays with -fparr
 471   | ITcbrack
 472   | IToparen
 473   | ITcparen
 474   | IToubxparen
 475   | ITcubxparen
 476   | ITsemi
 477   | ITcomma
 478   | ITunderscore
 479   | ITbackquote
 480
 481   | ITvarid   FastString        -- identifiers
 482   | ITconid   FastString
 483   | ITvarsym  FastString
 484   | ITconsym  FastString
 485   | ITqvarid  (FastString,FastString)
 486   | ITqconid  (FastString,FastString)
 487   | ITqvarsym (FastString,FastString)
 488   | ITqconsym (FastString,FastString)
 489
 490   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 491
 492   | ITpragma StringBuffer
 493
 494   | ITchar       Char
 495   | ITstring     FastString
 496   | ITinteger    Integer
 497   | ITrational   Rational
 498
 499   | ITprimchar   Char
 500   | ITprimstring FastString
 501   | ITprimint    Integer
 502   | ITprimfloat  Rational
 503   | ITprimdouble Rational
 504
 505   -- MetaHaskell extension tokens
 506   | ITopenExpQuote              --  [| or [e|
 507   | ITopenPatQuote              --  [p|
 508   | ITopenDecQuote              --  [d|
 509   | ITopenTypQuote              --  [t|
 510   | ITcloseQuote                --  |]
 511   | ITidEscape   FastString     --  $x
 512   | ITparenEscape               --  $(
 513   | ITvarQuote                  --  '
 514   | ITtyQuote                   --  ''
 515
 516   -- Arrow notation extension
 517   | ITproc
 518   | ITrec
 519   | IToparenbar                 --  (|
 520   | ITcparenbar                 --  |)
 521   | ITlarrowtail                --  -<
 522   | ITrarrowtail                --  >-
 523   | ITLarrowtail                --  -<<
 524   | ITRarrowtail                --  >>-
 525
 526   | ITunknown String            -- Used when the lexer can't make sense of it
 527   | ITeof                       -- end of file token
 528
 529   -- Documentation annotations
 530   | ITdocCommentNext  String     -- something beginning '-- |'
 531   | ITdocCommentPrev  String     -- something beginning '-- ^'
 532   | ITdocCommentNamed String     -- something beginning '-- $'
 533   | ITdocSection      Int String -- a section heading
 534   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 535
 536 #ifdef DEBUG
 537   deriving Show -- debugging
 538 #endif
 539
 540 isSpecial :: Token -> Bool
 541 -- If we see M.x, where x is a keyword, but
 542 -- is special, we treat is as just plain M.x,
 543 -- not as a keyword.
 544 isSpecial ITas          = True
 545 isSpecial IThiding      = True
 546 isSpecial ITderive      = True
 547 isSpecial ITqualified   = True
 548 isSpecial ITforall      = True
 549 isSpecial ITexport      = True
 550 isSpecial ITlabel       = True
 551 isSpecial ITdynamic     = True
 552 isSpecial ITsafe        = True
 553 isSpecial ITthreadsafe  = True
 554 isSpecial ITunsafe      = True
 555 isSpecial ITccallconv   = True
 556 isSpecial ITstdcallconv = True
 557 isSpecial ITmdo         = True
 558 isSpecial ITfamily      = True
 559 isSpecial _             = False
 560
 561 -- the bitmap provided as the third component indicates whether the
 562 -- corresponding extension keyword is valid under the extension options
 563 -- provided to the compiler; if the extension corresponding to *any* of the
 564 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 565 -- facilitates using a keyword in two different extensions that can be
 566 -- activated independently)
 567 --
 568 reservedWordsFM = listToUFM $
 569         map (\(x, y, z) -> (mkFastString x, (y, z)))
 570        [( "_",          ITunderscore,   0 ),
 571         ( "as",         ITas,           0 ),
 572         ( "case",       ITcase,         0 ),
 573         ( "class",      ITclass,        0 ),
 574         ( "data",       ITdata,         0 ),
 575         ( "default",    ITdefault,      0 ),
 576         ( "deriving",   ITderiving,     0 ),
 577         ( "derive",     ITderive,       0 ),
 578         ( "do",         ITdo,           0 ),
 579         ( "else",       ITelse,         0 ),
 580         ( "hiding",     IThiding,       0 ),
 581         ( "if",         ITif,           0 ),
 582         ( "import",     ITimport,       0 ),
 583         ( "in",         ITin,           0 ),
 584         ( "infix",      ITinfix,        0 ),
 585         ( "infixl",     ITinfixl,       0 ),
 586         ( "infixr",     ITinfixr,       0 ),
 587         ( "instance",   ITinstance,     0 ),
 588         ( "let",        ITlet,          0 ),
 589         ( "module",     ITmodule,       0 ),
 590         ( "newtype",    ITnewtype,      0 ),
 591         ( "of",         ITof,           0 ),
 592         ( "qualified",  ITqualified,    0 ),
 593         ( "then",       ITthen,         0 ),
 594         ( "type",       ITtype,         0 ),
 595         ( "where",      ITwhere,        0 ),
 596         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 597
 598         ( "forall",     ITforall,        bit tvBit),
 599         ( "mdo",        ITmdo,           bit glaExtsBit),
 600         ( "family",     ITfamily,        bit idxTysBit),
 601
 602         ( "foreign",    ITforeign,       bit ffiBit),
 603         ( "export",     ITexport,        bit ffiBit),
 604         ( "label",      ITlabel,         bit ffiBit),
 605         ( "dynamic",    ITdynamic,       bit ffiBit),
 606         ( "safe",       ITsafe,          bit ffiBit),
 607         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 608         ( "unsafe",     ITunsafe,        bit ffiBit),
 609         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 610         ( "ccall",      ITccallconv,     bit ffiBit),
 611         ( "dotnet",     ITdotnet,        bit ffiBit),
 612
 613         ( "rec",        ITrec,           bit arrowsBit),
 614         ( "proc",       ITproc,          bit arrowsBit)
 615      ]
 616
 617 reservedSymsFM = listToUFM $
 618         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 619       [ ("..",  ITdotdot,       0)
 620        ,(":",   ITcolon,        0)      -- (:) is a reserved op,
 621                                                 -- meaning only list cons
 622        ,("::",  ITdcolon,       0)
 623        ,("=",   ITequal,        0)
 624        ,("\\",  ITlam,          0)
 625        ,("|",   ITvbar,         0)
 626        ,("<-",  ITlarrow,       0)
 627        ,("->",  ITrarrow,       0)
 628        ,("@",   ITat,           0)
 629        ,("~",   ITtilde,        0)
 630        ,("=>",  ITdarrow,       0)
 631        ,("-",   ITminus,        0)
 632        ,("!",   ITbang,         0)
 633
 634        ,("*",   ITstar,         bit glaExtsBit .|.
 635                                 bit idxTysBit)      -- For data T (a::*) = MkT
 636        ,(".",   ITdot,          bit tvBit)          -- For 'forall a . t'
 637
 638        ,("-<",  ITlarrowtail,   bit arrowsBit)
 639        ,(">-",  ITrarrowtail,   bit arrowsBit)
 640        ,("-<<", ITLarrowtail,   bit arrowsBit)
 641        ,(">>-", ITRarrowtail,   bit arrowsBit)
 642
 643 #if __GLASGOW_HASKELL__ >= 605
 644        ,("λ",  ITlam,          bit glaExtsBit)
 645        ,("∷",   ITdcolon,       bit glaExtsBit)
 646        ,("⇒",   ITdarrow,     bit glaExtsBit)
 647        ,("∀", ITforall,       bit glaExtsBit)
 648        ,("→",   ITrarrow,     bit glaExtsBit)
 649        ,("←",   ITlarrow,     bit glaExtsBit)
 650        ,("?",   ITdotdot,       bit glaExtsBit)
 651         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 652         -- form part of a large operator.  This would let us have a better
 653         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 654 #endif
 655        ]
 656
 657 -- -----------------------------------------------------------------------------
 658 -- Lexer actions
 659
 660 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 661
 662 special :: Token -> Action
 663 special tok span _buf len = return (L span tok)
 664
 665 token, layout_token :: Token -> Action
 666 token t span buf len = return (L span t)
 667 layout_token t span buf len = pushLexState layout >> return (L span t)
 668
 669 idtoken :: (StringBuffer -> Int -> Token) -> Action
 670 idtoken f span buf len = return (L span $! (f buf len))
 671
 672 skip_one_varid :: (FastString -> Token) -> Action
 673 skip_one_varid f span buf len
 674   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 675
 676 strtoken :: (String -> Token) -> Action
 677 strtoken f span buf len =
 678   return (L span $! (f $! lexemeToString buf len))
 679
 680 init_strtoken :: Int -> (String -> Token) -> Action
 681 -- like strtoken, but drops the last N character(s)
 682 init_strtoken drop f span buf len =
 683   return (L span $! (f $! lexemeToString buf (len-drop)))
 684
 685 begin :: Int -> Action
 686 begin code _span _str _len = do pushLexState code; lexToken
 687
 688 pop :: Action
 689 pop _span _buf _len = do popLexState; lexToken
 690
 691 pop_and :: Action -> Action
 692 pop_and act span buf len = do popLexState; act span buf len
 693
 694 {-# INLINE nextCharIs #-}
 695 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 696
 697 notFollowedBy char _ _ _ (AI _ _ buf)
 698   = nextCharIs buf (/=char)
 699
 700 notFollowedBySymbol _ _ _ (AI _ _ buf)
 701   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 702
 703 isNormalComment bits _ _ (AI _ _ buf)
 704   | haddockEnabled bits = notFollowedByDocOrPragma
 705   | otherwise           = nextCharIs buf (/='#')
 706   where
 707     notFollowedByDocOrPragma
 708         = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 709
 710 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 711
 712 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 713   = if haddockEnabled bits then False else (p buf)
 714
 715 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 716
 717 ifExtension pred bits _ _ _ = pred bits
 718
 719 multiline_doc_comment :: Action
 720 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 721   where
 722     worker commentAcc input docType oneLine = case alexGetChar input of
 723       Just ('\n', input')
 724         | oneLine -> docCommentEnd input commentAcc docType buf span
 725         | otherwise -> case checkIfCommentLine input' of
 726           Just input -> worker ('\n':commentAcc) input docType False
 727           Nothing -> docCommentEnd input commentAcc docType buf span
 728       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 729       Nothing -> docCommentEnd input commentAcc docType buf span
 730
 731     checkIfCommentLine input = check (dropNonNewlineSpace input)
 732       where
 733         check input = case alexGetChar input of
 734           Just ('-', input) -> case alexGetChar input of
 735             Just ('-', input) -> case alexGetChar input of
 736               Just (c, _) | c /= '-' -> Just input
 737               _ -> Nothing
 738             _ -> Nothing
 739           _ -> Nothing
 740
 741         dropNonNewlineSpace input = case alexGetChar input of
 742           Just (c, input')
 743             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 744             | otherwise -> input
 745           Nothing -> input
 746
 747 {-
 748   nested comments require traversing by hand, they can't be parsed
 749   using regular expressions.
 750 -}
 751 nested_comment :: P (Located Token) -> Action
 752 nested_comment cont span _str _len = do
 753   input <- getInput
 754   go 1 input
 755   where
 756     go 0 input = do setInput input; cont
 757     go n input = case alexGetChar input of
 758       Nothing -> errBrace input span
 759       Just ('-',input) -> case alexGetChar input of
 760         Nothing  -> errBrace input span
 761         Just ('\125',input) -> go (n-1) input
 762         Just (c,_)          -> go n input
 763       Just ('\123',input) -> case alexGetChar input of
 764         Nothing  -> errBrace input span
 765         Just ('-',input) -> go (n+1) input
 766         Just (c,_)       -> go n input
 767       Just (c,input) -> go n input
 768
 769 nested_doc_comment :: Action
 770 nested_doc_comment span buf _len = withLexedDocType (go "")
 771   where
 772     go commentAcc input docType _ = case alexGetChar input of
 773       Nothing -> errBrace input span
 774       Just ('-',input) -> case alexGetChar input of
 775         Nothing -> errBrace input span
 776         Just ('\125',input@(AI end _ buf2)) ->
 777           docCommentEnd input commentAcc docType buf span
 778         Just (c,_) -> go ('-':commentAcc) input docType False
 779       Just ('\123', input) -> case alexGetChar input of
 780         Nothing  -> errBrace input span
 781         Just ('-',input) -> do
 782           setInput input
 783           let cont = do input <- getInput; go commentAcc input docType False
 784           nested_comment cont span buf _len
 785         Just (c,_) -> go ('\123':commentAcc) input docType False
 786       Just (c,input) -> go (c:commentAcc) input docType False
 787
 788 withLexedDocType lexDocComment = do
 789   input <- getInput
 790   case alexGetChar input of
 791     Nothing -> error "Can't happen"
 792     Just ('|', input) -> lexDocComment input ITdocCommentNext False
 793     Just ('^', input) -> lexDocComment input ITdocCommentPrev False
 794     Just ('$', input) -> lexDocComment input ITdocCommentNamed False
 795     Just ('*', input) -> lexDocSection 1 input
 796  where
 797     lexDocSection n input = case alexGetChar input of
 798       Just ('*', input) -> lexDocSection (n+1) input
 799       Just (c, _) -> lexDocComment input (ITdocSection n) True
 800       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 801
 802 -- docCommentEnd
 803 -------------------------------------------------------------------------------
 804 -- This function is quite tricky. We can't just return a new token, we also
 805 -- need to update the state of the parser. Why? Because the token is longer
 806 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 807 -- it writes the wrong token length to the parser state. This function is
 808 -- called afterwards, so it can just update the state.
 809
 810 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 811 -- which is something that the original lexer didn't account for.
 812 -- I have added last_line_len in the parser state which represents the length
 813 -- of the part of the token that is on the last line. It is now used for layout
 814 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 815 -- was before, the full length of the token, and it is now only used for error
 816 -- messages. /Waern
 817
 818 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 819                  SrcSpan -> P (Located Token)
 820 docCommentEnd input commentAcc docType buf span = do
 821   setInput input
 822   let (AI loc last_offs nextBuf) = input
 823       comment = reverse commentAcc
 824       span' = mkSrcSpan (srcSpanStart span) loc
 825       last_len = byteDiff buf nextBuf
 826
 827       last_line_len = if (last_offs - last_len < 0)
 828         then last_offs
 829         else last_len
 830
 831   span `seq` setLastToken span' last_len last_line_len
 832   return (L span' (docType comment))
 833
 834 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 835
 836 open_brace, close_brace :: Action
 837 open_brace span _str _len = do
 838   ctx <- getContext
 839   setContext (NoLayout:ctx)
 840   return (L span ITocurly)
 841 close_brace span _str _len = do
 842   popContext
 843   return (L span ITccurly)
 844
 845 -- We have to be careful not to count M.<varid> as a qualified name
 846 -- when <varid> is a keyword.  We hack around this by catching
 847 -- the offending tokens afterward, and re-lexing in a different state.
 848 check_qvarid span buf len = do
 849   case lookupUFM reservedWordsFM var of
 850         Just (keyword,exts)
 851           | not (isSpecial keyword) ->
 852           if exts == 0
 853              then try_again
 854              else do
 855                 b <- extension (\i -> exts .&. i /= 0)
 856                 if b then try_again
 857                      else return token
 858         _other -> return token
 859   where
 860         (mod,var) = splitQualName buf len
 861         token     = L span (ITqvarid (mod,var))
 862
 863         try_again = do
 864                 (AI _ offs _) <- getInput
 865                 setInput (AI (srcSpanStart span) (offs-len) buf)
 866                 pushLexState bad_qvarid
 867                 lexToken
 868
 869 qvarid buf len = ITqvarid $! splitQualName buf len
 870 qconid buf len = ITqconid $! splitQualName buf len
 871
 872 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 873 -- takes a StringBuffer and a length, and returns the module name
 874 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 875 -- because of hierarchical module names.
 876 splitQualName orig_buf len = split orig_buf orig_buf
 877   where
 878     split buf dot_buf
 879         | orig_buf `byteDiff` buf >= len  = done dot_buf
 880         | c == '.'                        = found_dot buf'
 881         | otherwise                       = split buf' dot_buf
 882       where
 883        (c,buf') = nextChar buf
 884
 885     -- careful, we might get names like M....
 886     -- so, if the character after the dot is not upper-case, this is
 887     -- the end of the qualifier part.
 888     found_dot buf -- buf points after the '.'
 889         | isUpper c    = split buf' buf
 890         | otherwise    = done buf
 891       where
 892        (c,buf') = nextChar buf
 893
 894     done dot_buf =
 895         (lexemeToFastString orig_buf (qual_size - 1),
 896          lexemeToFastString dot_buf (len - qual_size))
 897       where
 898         qual_size = orig_buf `byteDiff` dot_buf
 899
 900 varid span buf len =
 901   case lookupUFM reservedWordsFM fs of
 902         Just (keyword,0)    -> do
 903                 maybe_layout keyword
 904                 return (L span keyword)
 905         Just (keyword,exts) -> do
 906                 b <- extension (\i -> exts .&. i /= 0)
 907                 if b then do maybe_layout keyword
 908                              return (L span keyword)
 909                      else return (L span (ITvarid fs))
 910         _other -> return (L span (ITvarid fs))
 911   where
 912         fs = lexemeToFastString buf len
 913
 914 conid buf len = ITconid fs
 915   where fs = lexemeToFastString buf len
 916
 917 qvarsym buf len = ITqvarsym $! splitQualName buf len
 918 qconsym buf len = ITqconsym $! splitQualName buf len
 919
 920 varsym = sym ITvarsym
 921 consym = sym ITconsym
 922
 923 sym con span buf len =
 924   case lookupUFM reservedSymsFM fs of
 925         Just (keyword,0)    -> return (L span keyword)
 926         Just (keyword,exts) -> do
 927                 b <- extension (\i -> exts .&. i /= 0)
 928                 if b then return (L span keyword)
 929                      else return (L span $! con fs)
 930         _other -> return (L span $! con fs)
 931   where
 932         fs = lexemeToFastString buf len
 933
 934 tok_decimal span buf len
 935   = return (L span (ITinteger  $! parseInteger buf len 10 octDecDigit))
 936
 937 tok_octal span buf len
 938   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 8 octDecDigit))
 939
 940 tok_hexadecimal span buf len
 941   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 16 hexDigit))
 942
 943 prim_decimal span buf len
 944   = return (L span (ITprimint  $! parseInteger buf (len-1) 10 octDecDigit))
 945
 946 prim_octal span buf len
 947   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 8 octDecDigit))
 948
 949 prim_hexadecimal span buf len
 950   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 16 hexDigit))
 951
 952 tok_float        str = ITrational   $! readRational str
 953 prim_float       str = ITprimfloat  $! readRational str
 954 prim_double      str = ITprimdouble $! readRational str
 955
 956 -- -----------------------------------------------------------------------------
 957 -- Layout processing
 958
 959 -- we're at the first token on a line, insert layout tokens if necessary
 960 do_bol :: Action
 961 do_bol span _str _len = do
 962         pos <- getOffside
 963         case pos of
 964             LT -> do
 965                 --trace "layout: inserting '}'" $ do
 966                 popContext
 967                 -- do NOT pop the lex state, we might have a ';' to insert
 968                 return (L span ITvccurly)
 969             EQ -> do
 970                 --trace "layout: inserting ';'" $ do
 971                 popLexState
 972                 return (L span ITsemi)
 973             GT -> do
 974                 popLexState
 975                 lexToken
 976
 977 -- certain keywords put us in the "layout" state, where we might
 978 -- add an opening curly brace.
 979 maybe_layout ITdo       = pushLexState layout_do
 980 maybe_layout ITmdo      = pushLexState layout_do
 981 maybe_layout ITof       = pushLexState layout
 982 maybe_layout ITlet      = pushLexState layout
 983 maybe_layout ITwhere    = pushLexState layout
 984 maybe_layout ITrec      = pushLexState layout
 985 maybe_layout _          = return ()
 986
 987 -- Pushing a new implicit layout context.  If the indentation of the
 988 -- next token is not greater than the previous layout context, then
 989 -- Haskell 98 says that the new layout context should be empty; that is
 990 -- the lexer must generate {}.
 991 --
 992 -- We are slightly more lenient than this: when the new context is started
 993 -- by a 'do', then we allow the new context to be at the same indentation as
 994 -- the previous context.  This is what the 'strict' argument is for.
 995 --
 996 new_layout_context strict span _buf _len = do
 997     popLexState
 998     (AI _ offset _) <- getInput
 999     ctx <- getContext
1000     case ctx of
1001         Layout prev_off : _  |
1002            (strict     && prev_off >= offset  ||
1003             not strict && prev_off > offset) -> do
1004                 -- token is indented to the left of the previous context.
1005                 -- we must generate a {} sequence now.
1006                 pushLexState layout_left
1007                 return (L span ITvocurly)
1008         other -> do
1009                 setContext (Layout offset : ctx)
1010                 return (L span ITvocurly)
1011
1012 do_layout_left span _buf _len = do
1013     popLexState
1014     pushLexState bol  -- we must be at the start of a line
1015     return (L span ITvccurly)
1016
1017 -- -----------------------------------------------------------------------------
1018 -- LINE pragmas
1019
1020 setLine :: Int -> Action
1021 setLine code span buf len = do
1022   let line = parseInteger buf len 10 octDecDigit
1023   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
1024         -- subtract one: the line number refers to the *following* line
1025   popLexState
1026   pushLexState code
1027   lexToken
1028
1029 setFile :: Int -> Action
1030 setFile code span buf len = do
1031   let file = lexemeToFastString (stepOn buf) (len-2)
1032   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1033   popLexState
1034   pushLexState code
1035   lexToken
1036
1037
1038 -- -----------------------------------------------------------------------------
1039 -- Options, includes and language pragmas.
1040
1041 lex_string_prag :: (String -> Token) -> Action
1042 lex_string_prag mkTok span buf len
1043     = do input <- getInput
1044          start <- getSrcLoc
1045          tok <- go [] input
1046          end <- getSrcLoc
1047          return (L (mkSrcSpan start end) tok)
1048     where go acc input
1049               = if isString input "#-}"
1050                    then do setInput input
1051                            return (mkTok (reverse acc))
1052                    else case alexGetChar input of
1053                           Just (c,i) -> go (c:acc) i
1054                           Nothing -> err input
1055           isString i [] = True
1056           isString i (x:xs)
1057               = case alexGetChar i of
1058                   Just (c,i') | c == x    -> isString i' xs
1059                   _other -> False
1060           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1061
1062
1063 -- -----------------------------------------------------------------------------
1064 -- Strings & Chars
1065
1066 -- This stuff is horrible.  I hates it.
1067
1068 lex_string_tok :: Action
1069 lex_string_tok span buf len = do
1070   tok <- lex_string ""
1071   end <- getSrcLoc
1072   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1073
1074 lex_string :: String -> P Token
1075 lex_string s = do
1076   i <- getInput
1077   case alexGetChar' i of
1078     Nothing -> lit_error
1079
1080     Just ('"',i)  -> do
1081         setInput i
1082         glaexts <- extension glaExtsEnabled
1083         if glaexts
1084           then do
1085             i <- getInput
1086             case alexGetChar' i of
1087               Just ('#',i) -> do
1088                    setInput i
1089                    if any (> '\xFF') s
1090                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1091                     else let s' = mkZFastString (reverse s) in
1092                          return (ITprimstring s')
1093                         -- mkZFastString is a hack to avoid encoding the
1094                         -- string in UTF-8.  We just want the exact bytes.
1095               _other ->
1096                 return (ITstring (mkFastString (reverse s)))
1097           else
1098                 return (ITstring (mkFastString (reverse s)))
1099
1100     Just ('\\',i)
1101         | Just ('&',i) <- next -> do
1102                 setInput i; lex_string s
1103         | Just (c,i) <- next, is_space c -> do
1104                 setInput i; lex_stringgap s
1105         where next = alexGetChar' i
1106
1107     Just (c, i) -> do
1108         c' <- lex_char c i
1109         lex_string (c':s)
1110
1111 lex_stringgap s = do
1112   c <- getCharOrFail
1113   case c of
1114     '\\' -> lex_string s
1115     c | is_space c -> lex_stringgap s
1116     _other -> lit_error
1117
1118
1119 lex_char_tok :: Action
1120 -- Here we are basically parsing character literals, such as 'x' or '\n'
1121 -- but, when Template Haskell is on, we additionally spot
1122 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1123 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
1124 -- So we have to do two characters of lookahead: when we see 'x we need to
1125 -- see if there's a trailing quote
1126 lex_char_tok span buf len = do  -- We've seen '
1127    i1 <- getInput       -- Look ahead to first character
1128    let loc = srcSpanStart span
1129    case alexGetChar' i1 of
1130         Nothing -> lit_error
1131
1132         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
1133                   th_exts <- extension thEnabled
1134                   if th_exts then do
1135                         setInput i2
1136                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1137                    else lit_error
1138
1139         Just ('\\', i2@(AI end2 _ _)) -> do     -- We've seen 'backslash
1140                   setInput i2
1141                   lit_ch <- lex_escape
1142                   mc <- getCharOrFail   -- Trailing quote
1143                   if mc == '\'' then finish_char_tok loc lit_ch
1144                                 else do setInput i2; lit_error
1145
1146         Just (c, i2@(AI end2 _ _))
1147                 | not (isAny c) -> lit_error
1148                 | otherwise ->
1149
1150                 -- We've seen 'x, where x is a valid character
1151                 --  (i.e. not newline etc) but not a quote or backslash
1152            case alexGetChar' i2 of      -- Look ahead one more character
1153                 Nothing -> lit_error
1154                 Just ('\'', i3) -> do   -- We've seen 'x'
1155                         setInput i3
1156                         finish_char_tok loc c
1157                 _other -> do            -- We've seen 'x not followed by quote
1158                                         -- If TH is on, just parse the quote only
1159                         th_exts <- extension thEnabled
1160                         let (AI end _ _) = i1
1161                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1162                                    else do setInput i2; lit_error
1163
1164 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1165 finish_char_tok loc ch  -- We've already seen the closing quote
1166                         -- Just need to check for trailing #
1167   = do  glaexts <- extension glaExtsEnabled
1168         i@(AI end _ _) <- getInput
1169         if glaexts then do
1170                 case alexGetChar' i of
1171                         Just ('#',i@(AI end _ _)) -> do
1172                                 setInput i
1173                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1174                         _other ->
1175                                 return (L (mkSrcSpan loc end) (ITchar ch))
1176                 else do
1177                    return (L (mkSrcSpan loc end) (ITchar ch))
1178
1179 lex_char :: Char -> AlexInput -> P Char
1180 lex_char c inp = do
1181   case c of
1182       '\\' -> do setInput inp; lex_escape
1183       c | isAny c -> do setInput inp; return c
1184       _other -> lit_error
1185
1186 isAny c | c > '\xff' = isPrint c
1187         | otherwise  = is_any c
1188
1189 lex_escape :: P Char
1190 lex_escape = do
1191   c <- getCharOrFail
1192   case c of
1193         'a'   -> return '\a'
1194         'b'   -> return '\b'
1195         'f'   -> return '\f'
1196         'n'   -> return '\n'
1197         'r'   -> return '\r'
1198         't'   -> return '\t'
1199         'v'   -> return '\v'
1200         '\\'  -> return '\\'
1201         '"'   -> return '\"'
1202         '\''  -> return '\''
1203         '^'   -> do c <- getCharOrFail
1204                     if c >= '@' && c <= '_'
1205                         then return (chr (ord c - ord '@'))
1206                         else lit_error
1207
1208         'x'   -> readNum is_hexdigit 16 hexDigit
1209         'o'   -> readNum is_octdigit  8 octDecDigit
1210         x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
1211
1212         c1 ->  do
1213            i <- getInput
1214            case alexGetChar' i of
1215             Nothing -> lit_error
1216             Just (c2,i2) ->
1217               case alexGetChar' i2 of
1218                 Nothing -> do setInput i2; lit_error
1219                 Just (c3,i3) ->
1220                    let str = [c1,c2,c3] in
1221                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1222                                      Just rest <- [maybePrefixMatch p str] ] of
1223                           (escape_char,[]):_ -> do
1224                                 setInput i3
1225                                 return escape_char
1226                           (escape_char,_:_):_ -> do
1227                                 setInput i2
1228                                 return escape_char
1229                           [] -> lit_error
1230
1231 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1232 readNum is_digit base conv = do
1233   i <- getInput
1234   c <- getCharOrFail
1235   if is_digit c
1236         then readNum2 is_digit base conv (conv c)
1237         else do setInput i; lit_error
1238
1239 readNum2 is_digit base conv i = do
1240   input <- getInput
1241   read i input
1242   where read i input = do
1243           case alexGetChar' input of
1244             Just (c,input') | is_digit c -> do
1245                 read (i*base + conv c) input'
1246             _other -> do
1247                 if i >= 0 && i <= 0x10FFFF
1248                    then do setInput input; return (chr i)
1249                    else lit_error
1250
1251 silly_escape_chars = [
1252         ("NUL", '\NUL'),
1253         ("SOH", '\SOH'),
1254         ("STX", '\STX'),
1255         ("ETX", '\ETX'),
1256         ("EOT", '\EOT'),
1257         ("ENQ", '\ENQ'),
1258         ("ACK", '\ACK'),
1259         ("BEL", '\BEL'),
1260         ("BS", '\BS'),
1261         ("HT", '\HT'),
1262         ("LF", '\LF'),
1263         ("VT", '\VT'),
1264         ("FF", '\FF'),
1265         ("CR", '\CR'),
1266         ("SO", '\SO'),
1267         ("SI", '\SI'),
1268         ("DLE", '\DLE'),
1269         ("DC1", '\DC1'),
1270         ("DC2", '\DC2'),
1271         ("DC3", '\DC3'),
1272         ("DC4", '\DC4'),
1273         ("NAK", '\NAK'),
1274         ("SYN", '\SYN'),
1275         ("ETB", '\ETB'),
1276         ("CAN", '\CAN'),
1277         ("EM", '\EM'),
1278         ("SUB", '\SUB'),
1279         ("ESC", '\ESC'),
1280         ("FS", '\FS'),
1281         ("GS", '\GS'),
1282         ("RS", '\RS'),
1283         ("US", '\US'),
1284         ("SP", '\SP'),
1285         ("DEL", '\DEL')
1286         ]
1287
1288 -- before calling lit_error, ensure that the current input is pointing to
1289 -- the position of the error in the buffer.  This is so that we can report
1290 -- a correct location to the user, but also so we can detect UTF-8 decoding
1291 -- errors if they occur.
1292 lit_error = lexError "lexical error in string/character literal"
1293
1294 getCharOrFail :: P Char
1295 getCharOrFail =  do
1296   i <- getInput
1297   case alexGetChar' i of
1298         Nothing -> lexError "unexpected end-of-file in string/character literal"
1299         Just (c,i)  -> do setInput i; return c
1300
1301 -- -----------------------------------------------------------------------------
1302 -- The Parse Monad
1303
1304 data LayoutContext
1305   = NoLayout
1306   | Layout !Int
1307   deriving Show
1308
1309 data ParseResult a
1310   = POk PState a
1311   | PFailed
1312         SrcSpan         -- The start and end of the text span related to
1313                         -- the error.  Might be used in environments which can
1314                         -- show this span, e.g. by highlighting it.
1315         Message         -- The error message
1316
1317 data PState = PState {
1318         buffer     :: StringBuffer,
1319         last_loc   :: SrcSpan,  -- pos of previous token
1320         last_offs  :: !Int,     -- offset of the previous token from the
1321                                 -- beginning of  the current line.
1322                                 -- \t is equal to 8 spaces.
1323         last_len   :: !Int,     -- len of previous token
1324   last_line_len :: !Int,
1325         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1326         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1327         context    :: [LayoutContext],
1328         lex_state  :: [Int]
1329      }
1330         -- last_loc and last_len are used when generating error messages,
1331         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1332         -- current token to happyError, we could at least get rid of last_len.
1333         -- Getting rid of last_loc would require finding another way to
1334         -- implement pushCurrentContext (which is only called from one place).
1335
1336 newtype P a = P { unP :: PState -> ParseResult a }
1337
1338 instance Monad P where
1339   return = returnP
1340   (>>=) = thenP
1341   fail = failP
1342
1343 returnP :: a -> P a
1344 returnP a = P $ \s -> POk s a
1345
1346 thenP :: P a -> (a -> P b) -> P b
1347 (P m) `thenP` k = P $ \ s ->
1348         case m s of
1349                 POk s1 a         -> (unP (k a)) s1
1350                 PFailed span err -> PFailed span err
1351
1352 failP :: String -> P a
1353 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1354
1355 failMsgP :: String -> P a
1356 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1357
1358 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1359 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1360
1361 failSpanMsgP :: SrcSpan -> String -> P a
1362 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1363
1364 extension :: (Int -> Bool) -> P Bool
1365 extension p = P $ \s -> POk s (p $! extsBitmap s)
1366
1367 getExts :: P Int
1368 getExts = P $ \s -> POk s (extsBitmap s)
1369
1370 setSrcLoc :: SrcLoc -> P ()
1371 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1372
1373 getSrcLoc :: P SrcLoc
1374 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1375
1376 setLastToken :: SrcSpan -> Int -> Int -> P ()
1377 setLastToken loc len line_len = P $ \s -> POk s {
1378   last_loc=loc,
1379   last_len=len,
1380   last_line_len=line_len
1381 } ()
1382
1383 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1384
1385 alexInputPrevChar :: AlexInput -> Char
1386 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1387
1388 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1389 alexGetChar (AI loc ofs s)
1390   | atEnd s   = Nothing
1391   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1392                 --trace (show (ord c)) $
1393                 Just (adj_c, (AI loc' ofs' s'))
1394   where (c,s') = nextChar s
1395         loc'   = advanceSrcLoc loc c
1396         ofs'   = advanceOffs c ofs
1397
1398         non_graphic     = '\x0'
1399         upper           = '\x1'
1400         lower           = '\x2'
1401         digit           = '\x3'
1402         symbol          = '\x4'
1403         space           = '\x5'
1404         other_graphic   = '\x6'
1405
1406         adj_c
1407           | c <= '\x06' = non_graphic
1408           | c <= '\xff' = c
1409           | otherwise =
1410                 case generalCategory c of
1411                   UppercaseLetter       -> upper
1412                   LowercaseLetter       -> lower
1413                   TitlecaseLetter       -> upper
1414                   ModifierLetter        -> other_graphic
1415                   OtherLetter           -> other_graphic
1416                   NonSpacingMark        -> other_graphic
1417                   SpacingCombiningMark  -> other_graphic
1418                   EnclosingMark         -> other_graphic
1419                   DecimalNumber         -> digit
1420                   LetterNumber          -> other_graphic
1421                   OtherNumber           -> other_graphic
1422                   ConnectorPunctuation  -> other_graphic
1423                   DashPunctuation       -> other_graphic
1424                   OpenPunctuation       -> other_graphic
1425                   ClosePunctuation      -> other_graphic
1426                   InitialQuote          -> other_graphic
1427                   FinalQuote            -> other_graphic
1428                   OtherPunctuation      -> other_graphic
1429                   MathSymbol            -> symbol
1430                   CurrencySymbol        -> symbol
1431                   ModifierSymbol        -> symbol
1432                   OtherSymbol           -> symbol
1433                   Space                 -> space
1434                   _other                -> non_graphic
1435
1436 -- This version does not squash unicode characters, it is used when
1437 -- lexing strings.
1438 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1439 alexGetChar' (AI loc ofs s)
1440   | atEnd s   = Nothing
1441   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1442                 --trace (show (ord c)) $
1443                 Just (c, (AI loc' ofs' s'))
1444   where (c,s') = nextChar s
1445         loc'   = advanceSrcLoc loc c
1446         ofs'   = advanceOffs c ofs
1447
1448 advanceOffs :: Char -> Int -> Int
1449 advanceOffs '\n' offs = 0
1450 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1451 advanceOffs _    offs = offs + 1
1452
1453 getInput :: P AlexInput
1454 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1455
1456 setInput :: AlexInput -> P ()
1457 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1458
1459 pushLexState :: Int -> P ()
1460 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1461
1462 popLexState :: P Int
1463 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1464
1465 getLexState :: P Int
1466 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1467
1468 -- for reasons of efficiency, flags indicating language extensions (eg,
1469 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1470 -- integer
1471
1472 glaExtsBit, ffiBit, parrBit :: Int
1473 glaExtsBit = 0
1474 ffiBit     = 1
1475 parrBit    = 2
1476 arrowsBit  = 4
1477 thBit      = 5
1478 ipBit      = 6
1479 tvBit      = 7  -- Scoped type variables enables 'forall' keyword
1480 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1481                 -- (doesn't affect the lexer)
1482 idxTysBit  = 9  -- indexed type families: 'family' keyword and kind sigs
1483 haddockBit = 10 -- Lex and parse Haddock comments
1484
1485 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1486 glaExtsEnabled flags = testBit flags glaExtsBit
1487 ffiEnabled     flags = testBit flags ffiBit
1488 parrEnabled    flags = testBit flags parrBit
1489 arrowsEnabled  flags = testBit flags arrowsBit
1490 thEnabled      flags = testBit flags thBit
1491 ipEnabled      flags = testBit flags ipBit
1492 tvEnabled      flags = testBit flags tvBit
1493 bangPatEnabled flags = testBit flags bangPatBit
1494 idxTysEnabled  flags = testBit flags idxTysBit
1495 haddockEnabled flags = testBit flags haddockBit
1496
1497 -- PState for parsing options pragmas
1498 --
1499 pragState :: StringBuffer -> SrcLoc -> PState
1500 pragState buf loc  =
1501   PState {
1502       buffer          = buf,
1503       last_loc      = mkSrcSpan loc loc,
1504       last_offs     = 0,
1505       last_len      = 0,
1506       last_line_len = 0,
1507       loc           = loc,
1508       extsBitmap    = 0,
1509       context       = [],
1510       lex_state     = [bol, option_prags, 0]
1511     }
1512
1513
1514 -- create a parse state
1515 --
1516 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1517 mkPState buf loc flags  =
1518   PState {
1519       buffer          = buf,
1520       last_loc      = mkSrcSpan loc loc,
1521       last_offs     = 0,
1522       last_len      = 0,
1523       last_line_len = 0,
1524       loc           = loc,
1525       extsBitmap    = fromIntegral bitmap,
1526       context       = [],
1527       lex_state     = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1528         -- we begin in the layout state if toplev_layout is set
1529     }
1530     where
1531       bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
1532                .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
1533                .|. parrBit    `setBitIf` dopt Opt_PArr        flags
1534                .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
1535                .|. thBit      `setBitIf` dopt Opt_TH          flags
1536                .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
1537                .|. tvBit      `setBitIf` dopt Opt_ScopedTypeVariables flags
1538                .|. bangPatBit `setBitIf` dopt Opt_BangPatterns flags
1539                .|. idxTysBit  `setBitIf` dopt Opt_IndexedTypes flags
1540                .|. haddockBit `setBitIf` dopt Opt_Haddock     flags
1541       --
1542       setBitIf :: Int -> Bool -> Int
1543       b `setBitIf` cond | cond      = bit b
1544                         | otherwise = 0
1545
1546 getContext :: P [LayoutContext]
1547 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1548
1549 setContext :: [LayoutContext] -> P ()
1550 setContext ctx = P $ \s -> POk s{context=ctx} ()
1551
1552 popContext :: P ()
1553 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1554                            loc = loc, last_len = len, last_loc = last_loc }) ->
1555   case ctx of
1556         (_:tl) -> POk s{ context = tl } ()
1557         []     -> PFailed last_loc (srcParseErr buf len)
1558
1559 -- Push a new layout context at the indentation of the last token read.
1560 -- This is only used at the outer level of a module when the 'module'
1561 -- keyword is missing.
1562 pushCurrentContext :: P ()
1563 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1564     POk s{context = Layout (offs-len) : ctx} ()
1565 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1566
1567 getOffside :: P Ordering
1568 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1569                 let ord = case stk of
1570                         (Layout n:_) -> compare offs n
1571                         _            -> GT
1572                 in POk s ord
1573
1574 -- ---------------------------------------------------------------------------
1575 -- Construct a parse error
1576
1577 srcParseErr
1578   :: StringBuffer       -- current buffer (placed just after the last token)
1579   -> Int                -- length of the previous token
1580   -> Message
1581 srcParseErr buf len
1582   = hcat [ if null token
1583              then ptext SLIT("parse error (possibly incorrect indentation)")
1584              else hcat [ptext SLIT("parse error on input "),
1585                         char '`', text token, char '\'']
1586     ]
1587   where token = lexemeToString (offsetBytes (-len) buf) len
1588
1589 -- Report a parse failure, giving the span of the previous token as
1590 -- the location of the error.  This is the entry point for errors
1591 -- detected during parsing.
1592 srcParseFail :: P a
1593 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1594                             last_loc = last_loc } ->
1595     PFailed last_loc (srcParseErr buf len)
1596
1597 -- A lexical error is reported at a particular position in the source file,
1598 -- not over a token range.
1599 lexError :: String -> P a
1600 lexError str = do
1601   loc <- getSrcLoc
1602   i@(AI end _ buf) <- getInput
1603   reportLexError loc end buf str
1604
1605 -- -----------------------------------------------------------------------------
1606 -- This is the top-level function: called from the parser each time a
1607 -- new token is to be read from the input.
1608
1609 lexer :: (Located Token -> P a) -> P a
1610 lexer cont = do
1611   tok@(L span tok__) <- lexToken
1612 --  trace ("token: " ++ show tok__) $ do
1613   cont tok
1614
1615 lexToken :: P (Located Token)
1616 lexToken = do
1617   inp@(AI loc1 _ buf) <- getInput
1618   sc <- getLexState
1619   exts <- getExts
1620   case alexScanUser exts inp sc of
1621     AlexEOF -> do let span = mkSrcSpan loc1 loc1
1622                   setLastToken span 0 0
1623                   return (L span ITeof)
1624     AlexError (AI loc2 _ buf) -> do
1625         reportLexError loc1 loc2 buf "lexical error"
1626     AlexSkip inp2 _ -> do
1627         setInput inp2
1628         lexToken
1629     AlexToken inp2@(AI end _ buf2) len t -> do
1630     setInput inp2
1631     let span = mkSrcSpan loc1 end
1632     let bytes = byteDiff buf buf2
1633     span `seq` setLastToken span bytes bytes
1634     t span buf bytes
1635
1636 reportLexError loc1 loc2 buf str
1637   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1638   | otherwise =
1639   let
1640         c = fst (nextChar buf)
1641   in
1642   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1643     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1644     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1645 }