compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), lexer, pragState, mkPState, PState(..),
  26    P(..), ParseResult(..), getSrcLoc,
  27    failLocMsgP, failSpanMsgP, srcParseFail,
  28    getMessages,
  29    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  30    getLexState, popLexState, pushLexState,
  31    extension, glaExtsEnabled, bangPatEnabled
  32   ) where
  33
  34 #include "HsVersions.h"
  35
  36 import Bag
  37 import ErrUtils
  38 import Outputable
  39 import StringBuffer
  40 import FastString
  41 import FastTypes
  42 import SrcLoc
  43 import UniqFM
  44 import DynFlags
  45 import Ctype
  46 import Util             ( maybePrefixMatch, readRational )
  47
  48 import Control.Monad
  49 import Data.Bits
  50 import Data.Char        ( chr, isSpace )
  51 import Data.Ratio
  52 import Debug.Trace
  53
  54 #if __GLASGOW_HASKELL__ >= 605
  55 import Data.Char        ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  56 #else
  57 import Compat.Unicode   ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  58 #endif
  59 }
  60
  61 $unispace    = \x05
  62 $whitechar   = [\ \n\r\f\v\xa0 $unispace]
  63 $white_no_nl = $whitechar # \n
  64 $tab         = \t
  65
  66 $ascdigit  = 0-9
  67 $unidigit  = \x03
  68 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  69 $digit     = [$ascdigit $unidigit]
  70
  71 $special   = [\(\)\,\;\[\]\`\{\}]
  72 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7]
  73 $unisymbol = \x04
  74 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  75
  76 $unilarge  = \x01
  77 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  78 $large     = [$asclarge $unilarge]
  79
  80 $unismall  = \x02
  81 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  82 $small     = [$ascsmall $unismall \_]
  83
  84 $unigraphic = \x06
  85 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  86
  87 $octit     = 0-7
  88 $hexit     = [$decdigit A-F a-f]
  89 $symchar   = [$symbol \:]
  90 $nl        = [\n\r]
  91 $idchar    = [$small $large $digit \']
  92
  93 $docsym    = [\| \^ \* \$]
  94
  95 @varid     = $small $idchar*
  96 @conid     = $large $idchar*
  97
  98 @varsym    = $symbol $symchar*
  99 @consym    = \: $symchar*
 100
 101 @decimal     = $decdigit+
 102 @octal       = $octit+
 103 @hexadecimal = $hexit+
 104 @exponent    = [eE] [\-\+]? @decimal
 105
 106 -- we support the hierarchical module name extension:
 107 @qual = (@conid \.)+
 108
 109 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 110
 111 haskell :-
 112
 113 -- everywhere: skip whitespace and comments
 114 $white_no_nl+                           ;
 115 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 116
 117 -- Everywhere: deal with nested comments.  We explicitly rule out
 118 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 119 -- (this can happen even though pragmas will normally take precedence due to
 120 -- longest-match, because pragmas aren't valid in every state, but comments
 121 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 122 -- set.
 123
 124 "{-" / { isNormalComment } { nested_comment lexToken }
 125
 126 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 127 -- more dashes followed by a symbol should be parsed as a varsym, so we
 128 -- have to exclude those.
 129
 130 -- Since Haddock comments aren't valid in every state, we need to rule them
 131 -- out here.
 132
 133 -- The following two rules match comments that begin with two dashes, but
 134 -- continue with a different character. The rules test that this character
 135 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 136 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 137 -- have a Haddock comment). The rules then munch the rest of the line.
 138
 139 "-- " ~$docsym .* ;
 140 "--" [^$symbol : \ ] .* ;
 141
 142 -- Next, match Haddock comments if no -haddock flag
 143
 144 "-- " $docsym .* / { ifExtension (not . haddockEnabled) } ;
 145
 146 -- Now, when we've matched comments that begin with 2 dashes and continue
 147 -- with a different character, we need to match comments that begin with three
 148 -- or more dashes (which clearly can't be Haddock comments). We only need to
 149 -- make sure that the first non-dash character isn't a symbol, and munch the
 150 -- rest of the line.
 151
 152 "---"\-* [^$symbol :] .* ;
 153
 154 -- Since the previous rules all match dashes followed by at least one
 155 -- character, we also need to match a whole line filled with just dashes.
 156
 157 "--"\-* / { atEOL } ;
 158
 159 -- We need this rule since none of the other single line comment rules
 160 -- actually match this case.
 161
 162 "-- " / { atEOL } ;
 163
 164 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 165 -- blank lines) until we find a non-whitespace character, then do layout
 166 -- processing.
 167 --
 168 -- One slight wibble here: what if the line begins with {-#? In
 169 -- theory, we have to lex the pragma to see if it's one we recognise,
 170 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 171 -- as a nested comment.  We don't bother with this: if the line begins
 172 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 173 <bol> {
 174   \n                                    ;
 175   ^\# (line)?                           { begin line_prag1 }
 176   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 177   ^\# \! .* \n                          ; -- #!, for scripts
 178   ()                                    { do_bol }
 179 }
 180
 181 -- after a layout keyword (let, where, do, of), we begin a new layout
 182 -- context if the curly brace is missing.
 183 -- Careful! This stuff is quite delicate.
 184 <layout, layout_do> {
 185   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 186         -- we might encounter {-# here, but {- has been handled already
 187   \n                                    ;
 188   ^\# (line)?                           { begin line_prag1 }
 189 }
 190
 191 -- do is treated in a subtly different way, see new_layout_context
 192 <layout>    ()                          { new_layout_context True }
 193 <layout_do> ()                          { new_layout_context False }
 194
 195 -- after a new layout context which was found to be to the left of the
 196 -- previous context, we have generated a '{' token, and we now need to
 197 -- generate a matching '}' token.
 198 <layout_left>  ()                       { do_layout_left }
 199
 200 <0,option_prags,glaexts> \n                             { begin bol }
 201
 202 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 203
 204 -- single-line line pragmas, of the form
 205 --    # <line> "<file>" <extra-stuff> \n
 206 <line_prag1> $decdigit+                 { setLine line_prag1a }
 207 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 208 <line_prag1b> .*                        { pop }
 209
 210 -- Haskell-style line pragmas, of the form
 211 --    {-# LINE <line> "<file>" #-}
 212 <line_prag2> $decdigit+                 { setLine line_prag2a }
 213 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 214 <line_prag2b> "#-}"|"-}"                { pop }
 215    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 216    -- with older versions of GHC which generated these.
 217
 218 -- We only want RULES pragmas to be picked up when -fglasgow-exts
 219 -- is on, because the contents of the pragma is always written using
 220 -- glasgow-exts syntax (using forall etc.), so if glasgow exts are not
 221 -- enabled, we're sure to get a parse error.
 222 -- (ToDo: we should really emit a warning when ignoring pragmas)
 223 <glaexts>
 224   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 225
 226 <0,option_prags,glaexts> {
 227   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 228   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 229                                         { token (ITinline_prag False) }
 230   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 231                                         { token ITspec_prag }
 232   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 233         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 234   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 235         $whitechar* (NO(T?)INLINE|no(t?)inline)
 236                                         { token (ITspec_inline_prag False) }
 237   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 238   "{-#" $whitechar* (DEPRECATED|deprecated)
 239                                         { token ITdeprecated_prag }
 240   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 241   "{-#" $whitechar* (GENERATED|generated)
 242                                         { token ITgenerated_prag }
 243   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 244   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 245
 246   "{-#" $whitechar* (DOCOPTIONS|docoptions)
 247   / { ifExtension haddockEnabled }     { lex_string_prag ITdocOptions }
 248
 249  "{-#"                                 { nested_comment lexToken }
 250
 251   -- ToDo: should only be valid inside a pragma:
 252   "#-}"                                 { token ITclose_prag}
 253 }
 254
 255 <option_prags> {
 256   "{-#" $whitechar* (OPTIONS|options)   { lex_string_prag IToptions_prag }
 257   "{-#" $whitechar* (OPTIONS_GHC|options_ghc)
 258                                         { lex_string_prag IToptions_prag }
 259   "{-#" $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
 260   "{-#" $whitechar* (INCLUDE|include)   { lex_string_prag ITinclude_prag }
 261 }
 262
 263 <0,option_prags,glaexts> {
 264         -- This is to catch things like {-# OPTIONS OPTIONS_HUGS ...
 265   "{-#" $whitechar* $idchar+            { nested_comment lexToken }
 266 }
 267
 268 -- '0' state: ordinary lexemes
 269 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 270
 271 -- Haddock comments
 272
 273 <0,glaexts> {
 274   "-- " / $docsym    { multiline_doc_comment }
 275   "{-" \ ? / $docsym { nested_doc_comment }
 276 }
 277
 278 -- "special" symbols
 279
 280 <0,glaexts> {
 281   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 282   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 283 }
 284
 285 <0,glaexts> {
 286   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 287   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 288   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 289   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 290   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 291   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 292   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 293   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 294 }
 295
 296 <0,glaexts> {
 297   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 298                                         { special IToparenbar }
 299   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 300 }
 301
 302 <0,glaexts> {
 303   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 304 }
 305
 306 <glaexts> {
 307   "(#" / { notFollowedBySymbol }        { token IToubxparen }
 308   "#)"                                  { token ITcubxparen }
 309   "{|"                                  { token ITocurlybar }
 310   "|}"                                  { token ITccurlybar }
 311 }
 312
 313 <0,option_prags,glaexts> {
 314   \(                                    { special IToparen }
 315   \)                                    { special ITcparen }
 316   \[                                    { special ITobrack }
 317   \]                                    { special ITcbrack }
 318   \,                                    { special ITcomma }
 319   \;                                    { special ITsemi }
 320   \`                                    { special ITbackquote }
 321
 322   \{                                    { open_brace }
 323   \}                                    { close_brace }
 324 }
 325
 326 <0,option_prags,glaexts> {
 327   @qual @varid                  { check_qvarid }
 328   @qual @conid                  { idtoken qconid }
 329   @varid                        { varid }
 330   @conid                        { idtoken conid }
 331 }
 332
 333 -- after an illegal qvarid, such as 'M.let',
 334 -- we back up and try again in the bad_qvarid state:
 335 <bad_qvarid> {
 336   @conid                        { pop_and (idtoken conid) }
 337   @qual @conid                  { pop_and (idtoken qconid) }
 338 }
 339
 340 <glaexts> {
 341   @qual @varid "#"+             { idtoken qvarid }
 342   @qual @conid "#"+             { idtoken qconid }
 343   @varid "#"+                   { varid }
 344   @conid "#"+                   { idtoken conid }
 345 }
 346
 347 -- ToDo: M.(,,,)
 348
 349 <0,glaexts> {
 350   @qual @varsym                 { idtoken qvarsym }
 351   @qual @consym                 { idtoken qconsym }
 352   @varsym                       { varsym }
 353   @consym                       { consym }
 354 }
 355
 356 <0,glaexts> {
 357   @decimal                      { tok_decimal }
 358   0[oO] @octal                  { tok_octal }
 359   0[xX] @hexadecimal            { tok_hexadecimal }
 360 }
 361
 362 <glaexts> {
 363   @decimal \#                   { prim_decimal }
 364   0[oO] @octal \#               { prim_octal }
 365   0[xX] @hexadecimal \#         { prim_hexadecimal }
 366 }
 367
 368 <0,glaexts> @floating_point             { strtoken tok_float }
 369 <glaexts>   @floating_point \#          { init_strtoken 1 prim_float }
 370 <glaexts>   @floating_point \# \#       { init_strtoken 2 prim_double }
 371
 372 -- Strings and chars are lexed by hand-written code.  The reason is
 373 -- that even if we recognise the string or char here in the regex
 374 -- lexer, we would still have to parse the string afterward in order
 375 -- to convert it to a String.
 376 <0,glaexts> {
 377   \'                            { lex_char_tok }
 378   \"                            { lex_string_tok }
 379 }
 380
 381 {
 382 -- work around bug in Alex 2.0
 383 #if __GLASGOW_HASKELL__ < 503
 384 unsafeAt arr i = arr ! i
 385 #endif
 386
 387 -- -----------------------------------------------------------------------------
 388 -- The token type
 389
 390 data Token
 391   = ITas                        -- Haskell keywords
 392   | ITcase
 393   | ITclass
 394   | ITdata
 395   | ITdefault
 396   | ITderiving
 397   | ITderive
 398   | ITdo
 399   | ITelse
 400   | IThiding
 401   | ITif
 402   | ITimport
 403   | ITin
 404   | ITinfix
 405   | ITinfixl
 406   | ITinfixr
 407   | ITinstance
 408   | ITlet
 409   | ITmodule
 410   | ITnewtype
 411   | ITof
 412   | ITqualified
 413   | ITthen
 414   | ITtype
 415   | ITwhere
 416   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 417
 418   | ITforall                    -- GHC extension keywords
 419   | ITforeign
 420   | ITexport
 421   | ITlabel
 422   | ITdynamic
 423   | ITsafe
 424   | ITthreadsafe
 425   | ITunsafe
 426   | ITstdcallconv
 427   | ITccallconv
 428   | ITdotnet
 429   | ITmdo
 430   | ITfamily
 431
 432         -- Pragmas
 433   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 434   | ITspec_prag                 -- SPECIALISE
 435   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 436   | ITsource_prag
 437   | ITrules_prag
 438   | ITdeprecated_prag
 439   | ITline_prag
 440   | ITscc_prag
 441   | ITgenerated_prag
 442   | ITcore_prag                 -- hdaume: core annotations
 443   | ITunpack_prag
 444   | ITclose_prag
 445   | IToptions_prag String
 446   | ITinclude_prag String
 447   | ITlanguage_prag
 448
 449   | ITdotdot                    -- reserved symbols
 450   | ITcolon
 451   | ITdcolon
 452   | ITequal
 453   | ITlam
 454   | ITvbar
 455   | ITlarrow
 456   | ITrarrow
 457   | ITat
 458   | ITtilde
 459   | ITdarrow
 460   | ITminus
 461   | ITbang
 462   | ITstar
 463   | ITdot
 464
 465   | ITbiglam                    -- GHC-extension symbols
 466
 467   | ITocurly                    -- special symbols
 468   | ITccurly
 469   | ITocurlybar                 -- {|, for type applications
 470   | ITccurlybar                 -- |}, for type applications
 471   | ITvocurly
 472   | ITvccurly
 473   | ITobrack
 474   | ITopabrack                  -- [:, for parallel arrays with -fparr
 475   | ITcpabrack                  -- :], for parallel arrays with -fparr
 476   | ITcbrack
 477   | IToparen
 478   | ITcparen
 479   | IToubxparen
 480   | ITcubxparen
 481   | ITsemi
 482   | ITcomma
 483   | ITunderscore
 484   | ITbackquote
 485
 486   | ITvarid   FastString        -- identifiers
 487   | ITconid   FastString
 488   | ITvarsym  FastString
 489   | ITconsym  FastString
 490   | ITqvarid  (FastString,FastString)
 491   | ITqconid  (FastString,FastString)
 492   | ITqvarsym (FastString,FastString)
 493   | ITqconsym (FastString,FastString)
 494
 495   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 496
 497   | ITpragma StringBuffer
 498
 499   | ITchar       Char
 500   | ITstring     FastString
 501   | ITinteger    Integer
 502   | ITrational   Rational
 503
 504   | ITprimchar   Char
 505   | ITprimstring FastString
 506   | ITprimint    Integer
 507   | ITprimfloat  Rational
 508   | ITprimdouble Rational
 509
 510   -- MetaHaskell extension tokens
 511   | ITopenExpQuote              --  [| or [e|
 512   | ITopenPatQuote              --  [p|
 513   | ITopenDecQuote              --  [d|
 514   | ITopenTypQuote              --  [t|
 515   | ITcloseQuote                --  |]
 516   | ITidEscape   FastString     --  $x
 517   | ITparenEscape               --  $(
 518   | ITvarQuote                  --  '
 519   | ITtyQuote                   --  ''
 520
 521   -- Arrow notation extension
 522   | ITproc
 523   | ITrec
 524   | IToparenbar                 --  (|
 525   | ITcparenbar                 --  |)
 526   | ITlarrowtail                --  -<
 527   | ITrarrowtail                --  >-
 528   | ITLarrowtail                --  -<<
 529   | ITRarrowtail                --  >>-
 530
 531   | ITunknown String            -- Used when the lexer can't make sense of it
 532   | ITeof                       -- end of file token
 533
 534   -- Documentation annotations
 535   | ITdocCommentNext  String     -- something beginning '-- |'
 536   | ITdocCommentPrev  String     -- something beginning '-- ^'
 537   | ITdocCommentNamed String     -- something beginning '-- $'
 538   | ITdocSection      Int String -- a section heading
 539   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 540
 541 #ifdef DEBUG
 542   deriving Show -- debugging
 543 #endif
 544
 545 isSpecial :: Token -> Bool
 546 -- If we see M.x, where x is a keyword, but
 547 -- is special, we treat is as just plain M.x,
 548 -- not as a keyword.
 549 isSpecial ITas          = True
 550 isSpecial IThiding      = True
 551 isSpecial ITderive      = True
 552 isSpecial ITqualified   = True
 553 isSpecial ITforall      = True
 554 isSpecial ITexport      = True
 555 isSpecial ITlabel       = True
 556 isSpecial ITdynamic     = True
 557 isSpecial ITsafe        = True
 558 isSpecial ITthreadsafe  = True
 559 isSpecial ITunsafe      = True
 560 isSpecial ITccallconv   = True
 561 isSpecial ITstdcallconv = True
 562 isSpecial ITmdo         = True
 563 isSpecial ITfamily      = True
 564 isSpecial _             = False
 565
 566 -- the bitmap provided as the third component indicates whether the
 567 -- corresponding extension keyword is valid under the extension options
 568 -- provided to the compiler; if the extension corresponding to *any* of the
 569 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 570 -- facilitates using a keyword in two different extensions that can be
 571 -- activated independently)
 572 --
 573 reservedWordsFM = listToUFM $
 574         map (\(x, y, z) -> (mkFastString x, (y, z)))
 575        [( "_",          ITunderscore,   0 ),
 576         ( "as",         ITas,           0 ),
 577         ( "case",       ITcase,         0 ),
 578         ( "class",      ITclass,        0 ),
 579         ( "data",       ITdata,         0 ),
 580         ( "default",    ITdefault,      0 ),
 581         ( "deriving",   ITderiving,     0 ),
 582         ( "derive",     ITderive,       0 ),
 583         ( "do",         ITdo,           0 ),
 584         ( "else",       ITelse,         0 ),
 585         ( "hiding",     IThiding,       0 ),
 586         ( "if",         ITif,           0 ),
 587         ( "import",     ITimport,       0 ),
 588         ( "in",         ITin,           0 ),
 589         ( "infix",      ITinfix,        0 ),
 590         ( "infixl",     ITinfixl,       0 ),
 591         ( "infixr",     ITinfixr,       0 ),
 592         ( "instance",   ITinstance,     0 ),
 593         ( "let",        ITlet,          0 ),
 594         ( "module",     ITmodule,       0 ),
 595         ( "newtype",    ITnewtype,      0 ),
 596         ( "of",         ITof,           0 ),
 597         ( "qualified",  ITqualified,    0 ),
 598         ( "then",       ITthen,         0 ),
 599         ( "type",       ITtype,         0 ),
 600         ( "where",      ITwhere,        0 ),
 601         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 602
 603         ( "forall",     ITforall,        bit tvBit),
 604         ( "mdo",        ITmdo,           bit glaExtsBit),
 605         ( "family",     ITfamily,        bit idxTysBit),
 606
 607         ( "foreign",    ITforeign,       bit ffiBit),
 608         ( "export",     ITexport,        bit ffiBit),
 609         ( "label",      ITlabel,         bit ffiBit),
 610         ( "dynamic",    ITdynamic,       bit ffiBit),
 611         ( "safe",       ITsafe,          bit ffiBit),
 612         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 613         ( "unsafe",     ITunsafe,        bit ffiBit),
 614         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 615         ( "ccall",      ITccallconv,     bit ffiBit),
 616         ( "dotnet",     ITdotnet,        bit ffiBit),
 617
 618         ( "rec",        ITrec,           bit arrowsBit),
 619         ( "proc",       ITproc,          bit arrowsBit)
 620      ]
 621
 622 reservedSymsFM = listToUFM $
 623         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 624       [ ("..",  ITdotdot,       0)
 625        ,(":",   ITcolon,        0)      -- (:) is a reserved op,
 626                                                 -- meaning only list cons
 627        ,("::",  ITdcolon,       0)
 628        ,("=",   ITequal,        0)
 629        ,("\\",  ITlam,          0)
 630        ,("|",   ITvbar,         0)
 631        ,("<-",  ITlarrow,       0)
 632        ,("->",  ITrarrow,       0)
 633        ,("@",   ITat,           0)
 634        ,("~",   ITtilde,        0)
 635        ,("=>",  ITdarrow,       0)
 636        ,("-",   ITminus,        0)
 637        ,("!",   ITbang,         0)
 638
 639        ,("*",   ITstar,         bit glaExtsBit .|.
 640                                 bit idxTysBit)      -- For data T (a::*) = MkT
 641        ,(".",   ITdot,          bit tvBit)          -- For 'forall a . t'
 642
 643        ,("-<",  ITlarrowtail,   bit arrowsBit)
 644        ,(">-",  ITrarrowtail,   bit arrowsBit)
 645        ,("-<<", ITLarrowtail,   bit arrowsBit)
 646        ,(">>-", ITRarrowtail,   bit arrowsBit)
 647
 648 #if __GLASGOW_HASKELL__ >= 605
 649        ,("λ",  ITlam,          bit glaExtsBit)
 650        ,("∷",   ITdcolon,       bit glaExtsBit)
 651        ,("⇒",   ITdarrow,     bit glaExtsBit)
 652        ,("∀", ITforall,       bit glaExtsBit)
 653        ,("→",   ITrarrow,     bit glaExtsBit)
 654        ,("←",   ITlarrow,     bit glaExtsBit)
 655        ,("?",   ITdotdot,       bit glaExtsBit)
 656         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 657         -- form part of a large operator.  This would let us have a better
 658         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 659 #endif
 660        ]
 661
 662 -- -----------------------------------------------------------------------------
 663 -- Lexer actions
 664
 665 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 666
 667 special :: Token -> Action
 668 special tok span _buf len = return (L span tok)
 669
 670 token, layout_token :: Token -> Action
 671 token t span buf len = return (L span t)
 672 layout_token t span buf len = pushLexState layout >> return (L span t)
 673
 674 idtoken :: (StringBuffer -> Int -> Token) -> Action
 675 idtoken f span buf len = return (L span $! (f buf len))
 676
 677 skip_one_varid :: (FastString -> Token) -> Action
 678 skip_one_varid f span buf len
 679   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 680
 681 strtoken :: (String -> Token) -> Action
 682 strtoken f span buf len =
 683   return (L span $! (f $! lexemeToString buf len))
 684
 685 init_strtoken :: Int -> (String -> Token) -> Action
 686 -- like strtoken, but drops the last N character(s)
 687 init_strtoken drop f span buf len =
 688   return (L span $! (f $! lexemeToString buf (len-drop)))
 689
 690 begin :: Int -> Action
 691 begin code _span _str _len = do pushLexState code; lexToken
 692
 693 pop :: Action
 694 pop _span _buf _len = do popLexState; lexToken
 695
 696 pop_and :: Action -> Action
 697 pop_and act span buf len = do popLexState; act span buf len
 698
 699 {-# INLINE nextCharIs #-}
 700 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 701
 702 notFollowedBy char _ _ _ (AI _ _ buf)
 703   = nextCharIs buf (/=char)
 704
 705 notFollowedBySymbol _ _ _ (AI _ _ buf)
 706   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 707
 708 isNormalComment bits _ _ (AI _ _ buf)
 709   | haddockEnabled bits = notFollowedByDocOrPragma
 710   | otherwise           = nextCharIs buf (/='#')
 711   where
 712     notFollowedByDocOrPragma
 713         = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 714
 715 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 716
 717 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 718   = if haddockEnabled bits then False else (p buf)
 719
 720 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 721
 722 ifExtension pred bits _ _ _ = pred bits
 723
 724 multiline_doc_comment :: Action
 725 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 726   where
 727     worker commentAcc input docType oneLine = case alexGetChar input of
 728       Just ('\n', input')
 729         | oneLine -> docCommentEnd input commentAcc docType buf span
 730         | otherwise -> case checkIfCommentLine input' of
 731           Just input -> worker ('\n':commentAcc) input docType False
 732           Nothing -> docCommentEnd input commentAcc docType buf span
 733       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 734       Nothing -> docCommentEnd input commentAcc docType buf span
 735
 736     checkIfCommentLine input = check (dropNonNewlineSpace input)
 737       where
 738         check input = case alexGetChar input of
 739           Just ('-', input) -> case alexGetChar input of
 740             Just ('-', input) -> case alexGetChar input of
 741               Just (c, _) | c /= '-' -> Just input
 742               _ -> Nothing
 743             _ -> Nothing
 744           _ -> Nothing
 745
 746         dropNonNewlineSpace input = case alexGetChar input of
 747           Just (c, input')
 748             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 749             | otherwise -> input
 750           Nothing -> input
 751
 752 {-
 753   nested comments require traversing by hand, they can't be parsed
 754   using regular expressions.
 755 -}
 756 nested_comment :: P (Located Token) -> Action
 757 nested_comment cont span _str _len = do
 758   input <- getInput
 759   go 1 input
 760   where
 761     go 0 input = do setInput input; cont
 762     go n input = case alexGetChar input of
 763       Nothing -> errBrace input span
 764       Just ('-',input) -> case alexGetChar input of
 765         Nothing  -> errBrace input span
 766         Just ('\125',input) -> go (n-1) input
 767         Just (c,_)          -> go n input
 768       Just ('\123',input) -> case alexGetChar input of
 769         Nothing  -> errBrace input span
 770         Just ('-',input) -> go (n+1) input
 771         Just (c,_)       -> go n input
 772       Just (c,input) -> go n input
 773
 774 nested_doc_comment :: Action
 775 nested_doc_comment span buf _len = withLexedDocType (go "")
 776   where
 777     go commentAcc input docType _ = case alexGetChar input of
 778       Nothing -> errBrace input span
 779       Just ('-',input) -> case alexGetChar input of
 780         Nothing -> errBrace input span
 781         Just ('\125',input@(AI end _ buf2)) ->
 782           docCommentEnd input commentAcc docType buf span
 783         Just (c,_) -> go ('-':commentAcc) input docType False
 784       Just ('\123', input) -> case alexGetChar input of
 785         Nothing  -> errBrace input span
 786         Just ('-',input) -> do
 787           setInput input
 788           let cont = do input <- getInput; go commentAcc input docType False
 789           nested_comment cont span buf _len
 790         Just (c,_) -> go ('\123':commentAcc) input docType False
 791       Just (c,input) -> go (c:commentAcc) input docType False
 792
 793 withLexedDocType lexDocComment = do
 794   input <- getInput
 795   case alexGetChar input of
 796     Nothing -> error "Can't happen"
 797     Just ('|', input) -> lexDocComment input ITdocCommentNext False
 798     Just ('^', input) -> lexDocComment input ITdocCommentPrev False
 799     Just ('$', input) -> lexDocComment input ITdocCommentNamed False
 800     Just ('*', input) -> lexDocSection 1 input
 801  where
 802     lexDocSection n input = case alexGetChar input of
 803       Just ('*', input) -> lexDocSection (n+1) input
 804       Just (c, _) -> lexDocComment input (ITdocSection n) True
 805       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 806
 807 -- docCommentEnd
 808 -------------------------------------------------------------------------------
 809 -- This function is quite tricky. We can't just return a new token, we also
 810 -- need to update the state of the parser. Why? Because the token is longer
 811 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 812 -- it writes the wrong token length to the parser state. This function is
 813 -- called afterwards, so it can just update the state.
 814
 815 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 816 -- which is something that the original lexer didn't account for.
 817 -- I have added last_line_len in the parser state which represents the length
 818 -- of the part of the token that is on the last line. It is now used for layout
 819 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 820 -- was before, the full length of the token, and it is now only used for error
 821 -- messages. /Waern
 822
 823 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 824                  SrcSpan -> P (Located Token)
 825 docCommentEnd input commentAcc docType buf span = do
 826   setInput input
 827   let (AI loc last_offs nextBuf) = input
 828       comment = reverse commentAcc
 829       span' = mkSrcSpan (srcSpanStart span) loc
 830       last_len = byteDiff buf nextBuf
 831
 832       last_line_len = if (last_offs - last_len < 0)
 833         then last_offs
 834         else last_len
 835
 836   span `seq` setLastToken span' last_len last_line_len
 837   return (L span' (docType comment))
 838
 839 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 840
 841 open_brace, close_brace :: Action
 842 open_brace span _str _len = do
 843   ctx <- getContext
 844   setContext (NoLayout:ctx)
 845   return (L span ITocurly)
 846 close_brace span _str _len = do
 847   popContext
 848   return (L span ITccurly)
 849
 850 -- We have to be careful not to count M.<varid> as a qualified name
 851 -- when <varid> is a keyword.  We hack around this by catching
 852 -- the offending tokens afterward, and re-lexing in a different state.
 853 check_qvarid span buf len = do
 854   case lookupUFM reservedWordsFM var of
 855         Just (keyword,exts)
 856           | not (isSpecial keyword) ->
 857           if exts == 0
 858              then try_again
 859              else do
 860                 b <- extension (\i -> exts .&. i /= 0)
 861                 if b then try_again
 862                      else return token
 863         _other -> return token
 864   where
 865         (mod,var) = splitQualName buf len
 866         token     = L span (ITqvarid (mod,var))
 867
 868         try_again = do
 869                 (AI _ offs _) <- getInput
 870                 setInput (AI (srcSpanStart span) (offs-len) buf)
 871                 pushLexState bad_qvarid
 872                 lexToken
 873
 874 qvarid buf len = ITqvarid $! splitQualName buf len
 875 qconid buf len = ITqconid $! splitQualName buf len
 876
 877 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 878 -- takes a StringBuffer and a length, and returns the module name
 879 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 880 -- because of hierarchical module names.
 881 splitQualName orig_buf len = split orig_buf orig_buf
 882   where
 883     split buf dot_buf
 884         | orig_buf `byteDiff` buf >= len  = done dot_buf
 885         | c == '.'                        = found_dot buf'
 886         | otherwise                       = split buf' dot_buf
 887       where
 888        (c,buf') = nextChar buf
 889
 890     -- careful, we might get names like M....
 891     -- so, if the character after the dot is not upper-case, this is
 892     -- the end of the qualifier part.
 893     found_dot buf -- buf points after the '.'
 894         | isUpper c    = split buf' buf
 895         | otherwise    = done buf
 896       where
 897        (c,buf') = nextChar buf
 898
 899     done dot_buf =
 900         (lexemeToFastString orig_buf (qual_size - 1),
 901          lexemeToFastString dot_buf (len - qual_size))
 902       where
 903         qual_size = orig_buf `byteDiff` dot_buf
 904
 905 varid span buf len =
 906   case lookupUFM reservedWordsFM fs of
 907         Just (keyword,0)    -> do
 908                 maybe_layout keyword
 909                 return (L span keyword)
 910         Just (keyword,exts) -> do
 911                 b <- extension (\i -> exts .&. i /= 0)
 912                 if b then do maybe_layout keyword
 913                              return (L span keyword)
 914                      else return (L span (ITvarid fs))
 915         _other -> return (L span (ITvarid fs))
 916   where
 917         fs = lexemeToFastString buf len
 918
 919 conid buf len = ITconid fs
 920   where fs = lexemeToFastString buf len
 921
 922 qvarsym buf len = ITqvarsym $! splitQualName buf len
 923 qconsym buf len = ITqconsym $! splitQualName buf len
 924
 925 varsym = sym ITvarsym
 926 consym = sym ITconsym
 927
 928 sym con span buf len =
 929   case lookupUFM reservedSymsFM fs of
 930         Just (keyword,0)    -> return (L span keyword)
 931         Just (keyword,exts) -> do
 932                 b <- extension (\i -> exts .&. i /= 0)
 933                 if b then return (L span keyword)
 934                      else return (L span $! con fs)
 935         _other -> return (L span $! con fs)
 936   where
 937         fs = lexemeToFastString buf len
 938
 939 tok_decimal span buf len
 940   = return (L span (ITinteger  $! parseInteger buf len 10 octDecDigit))
 941
 942 tok_octal span buf len
 943   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 8 octDecDigit))
 944
 945 tok_hexadecimal span buf len
 946   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 16 hexDigit))
 947
 948 prim_decimal span buf len
 949   = return (L span (ITprimint  $! parseInteger buf (len-1) 10 octDecDigit))
 950
 951 prim_octal span buf len
 952   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 8 octDecDigit))
 953
 954 prim_hexadecimal span buf len
 955   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 16 hexDigit))
 956
 957 tok_float        str = ITrational   $! readRational str
 958 prim_float       str = ITprimfloat  $! readRational str
 959 prim_double      str = ITprimdouble $! readRational str
 960
 961 -- -----------------------------------------------------------------------------
 962 -- Layout processing
 963
 964 -- we're at the first token on a line, insert layout tokens if necessary
 965 do_bol :: Action
 966 do_bol span _str _len = do
 967         pos <- getOffside
 968         case pos of
 969             LT -> do
 970                 --trace "layout: inserting '}'" $ do
 971                 popContext
 972                 -- do NOT pop the lex state, we might have a ';' to insert
 973                 return (L span ITvccurly)
 974             EQ -> do
 975                 --trace "layout: inserting ';'" $ do
 976                 popLexState
 977                 return (L span ITsemi)
 978             GT -> do
 979                 popLexState
 980                 lexToken
 981
 982 -- certain keywords put us in the "layout" state, where we might
 983 -- add an opening curly brace.
 984 maybe_layout ITdo       = pushLexState layout_do
 985 maybe_layout ITmdo      = pushLexState layout_do
 986 maybe_layout ITof       = pushLexState layout
 987 maybe_layout ITlet      = pushLexState layout
 988 maybe_layout ITwhere    = pushLexState layout
 989 maybe_layout ITrec      = pushLexState layout
 990 maybe_layout _          = return ()
 991
 992 -- Pushing a new implicit layout context.  If the indentation of the
 993 -- next token is not greater than the previous layout context, then
 994 -- Haskell 98 says that the new layout context should be empty; that is
 995 -- the lexer must generate {}.
 996 --
 997 -- We are slightly more lenient than this: when the new context is started
 998 -- by a 'do', then we allow the new context to be at the same indentation as
 999 -- the previous context.  This is what the 'strict' argument is for.
1000 --
1001 new_layout_context strict span _buf _len = do
1002     popLexState
1003     (AI _ offset _) <- getInput
1004     ctx <- getContext
1005     case ctx of
1006         Layout prev_off : _  |
1007            (strict     && prev_off >= offset  ||
1008             not strict && prev_off > offset) -> do
1009                 -- token is indented to the left of the previous context.
1010                 -- we must generate a {} sequence now.
1011                 pushLexState layout_left
1012                 return (L span ITvocurly)
1013         other -> do
1014                 setContext (Layout offset : ctx)
1015                 return (L span ITvocurly)
1016
1017 do_layout_left span _buf _len = do
1018     popLexState
1019     pushLexState bol  -- we must be at the start of a line
1020     return (L span ITvccurly)
1021
1022 -- -----------------------------------------------------------------------------
1023 -- LINE pragmas
1024
1025 setLine :: Int -> Action
1026 setLine code span buf len = do
1027   let line = parseInteger buf len 10 octDecDigit
1028   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
1029         -- subtract one: the line number refers to the *following* line
1030   popLexState
1031   pushLexState code
1032   lexToken
1033
1034 setFile :: Int -> Action
1035 setFile code span buf len = do
1036   let file = lexemeToFastString (stepOn buf) (len-2)
1037   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1038   popLexState
1039   pushLexState code
1040   lexToken
1041
1042
1043 -- -----------------------------------------------------------------------------
1044 -- Options, includes and language pragmas.
1045
1046 lex_string_prag :: (String -> Token) -> Action
1047 lex_string_prag mkTok span buf len
1048     = do input <- getInput
1049          start <- getSrcLoc
1050          tok <- go [] input
1051          end <- getSrcLoc
1052          return (L (mkSrcSpan start end) tok)
1053     where go acc input
1054               = if isString input "#-}"
1055                    then do setInput input
1056                            return (mkTok (reverse acc))
1057                    else case alexGetChar input of
1058                           Just (c,i) -> go (c:acc) i
1059                           Nothing -> err input
1060           isString i [] = True
1061           isString i (x:xs)
1062               = case alexGetChar i of
1063                   Just (c,i') | c == x    -> isString i' xs
1064                   _other -> False
1065           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1066
1067
1068 -- -----------------------------------------------------------------------------
1069 -- Strings & Chars
1070
1071 -- This stuff is horrible.  I hates it.
1072
1073 lex_string_tok :: Action
1074 lex_string_tok span buf len = do
1075   tok <- lex_string ""
1076   end <- getSrcLoc
1077   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1078
1079 lex_string :: String -> P Token
1080 lex_string s = do
1081   i <- getInput
1082   case alexGetChar' i of
1083     Nothing -> lit_error
1084
1085     Just ('"',i)  -> do
1086         setInput i
1087         glaexts <- extension glaExtsEnabled
1088         if glaexts
1089           then do
1090             i <- getInput
1091             case alexGetChar' i of
1092               Just ('#',i) -> do
1093                    setInput i
1094                    if any (> '\xFF') s
1095                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1096                     else let s' = mkZFastString (reverse s) in
1097                          return (ITprimstring s')
1098                         -- mkZFastString is a hack to avoid encoding the
1099                         -- string in UTF-8.  We just want the exact bytes.
1100               _other ->
1101                 return (ITstring (mkFastString (reverse s)))
1102           else
1103                 return (ITstring (mkFastString (reverse s)))
1104
1105     Just ('\\',i)
1106         | Just ('&',i) <- next -> do
1107                 setInput i; lex_string s
1108         | Just (c,i) <- next, is_space c -> do
1109                 setInput i; lex_stringgap s
1110         where next = alexGetChar' i
1111
1112     Just (c, i) -> do
1113         c' <- lex_char c i
1114         lex_string (c':s)
1115
1116 lex_stringgap s = do
1117   c <- getCharOrFail
1118   case c of
1119     '\\' -> lex_string s
1120     c | is_space c -> lex_stringgap s
1121     _other -> lit_error
1122
1123
1124 lex_char_tok :: Action
1125 -- Here we are basically parsing character literals, such as 'x' or '\n'
1126 -- but, when Template Haskell is on, we additionally spot
1127 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1128 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
1129 -- So we have to do two characters of lookahead: when we see 'x we need to
1130 -- see if there's a trailing quote
1131 lex_char_tok span buf len = do  -- We've seen '
1132    i1 <- getInput       -- Look ahead to first character
1133    let loc = srcSpanStart span
1134    case alexGetChar' i1 of
1135         Nothing -> lit_error
1136
1137         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
1138                   th_exts <- extension thEnabled
1139                   if th_exts then do
1140                         setInput i2
1141                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1142                    else lit_error
1143
1144         Just ('\\', i2@(AI end2 _ _)) -> do     -- We've seen 'backslash
1145                   setInput i2
1146                   lit_ch <- lex_escape
1147                   mc <- getCharOrFail   -- Trailing quote
1148                   if mc == '\'' then finish_char_tok loc lit_ch
1149                                 else do setInput i2; lit_error
1150
1151         Just (c, i2@(AI end2 _ _))
1152                 | not (isAny c) -> lit_error
1153                 | otherwise ->
1154
1155                 -- We've seen 'x, where x is a valid character
1156                 --  (i.e. not newline etc) but not a quote or backslash
1157            case alexGetChar' i2 of      -- Look ahead one more character
1158                 Nothing -> lit_error
1159                 Just ('\'', i3) -> do   -- We've seen 'x'
1160                         setInput i3
1161                         finish_char_tok loc c
1162                 _other -> do            -- We've seen 'x not followed by quote
1163                                         -- If TH is on, just parse the quote only
1164                         th_exts <- extension thEnabled
1165                         let (AI end _ _) = i1
1166                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1167                                    else do setInput i2; lit_error
1168
1169 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1170 finish_char_tok loc ch  -- We've already seen the closing quote
1171                         -- Just need to check for trailing #
1172   = do  glaexts <- extension glaExtsEnabled
1173         i@(AI end _ _) <- getInput
1174         if glaexts then do
1175                 case alexGetChar' i of
1176                         Just ('#',i@(AI end _ _)) -> do
1177                                 setInput i
1178                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1179                         _other ->
1180                                 return (L (mkSrcSpan loc end) (ITchar ch))
1181                 else do
1182                    return (L (mkSrcSpan loc end) (ITchar ch))
1183
1184 lex_char :: Char -> AlexInput -> P Char
1185 lex_char c inp = do
1186   case c of
1187       '\\' -> do setInput inp; lex_escape
1188       c | isAny c -> do setInput inp; return c
1189       _other -> lit_error
1190
1191 isAny c | c > '\xff' = isPrint c
1192         | otherwise  = is_any c
1193
1194 lex_escape :: P Char
1195 lex_escape = do
1196   c <- getCharOrFail
1197   case c of
1198         'a'   -> return '\a'
1199         'b'   -> return '\b'
1200         'f'   -> return '\f'
1201         'n'   -> return '\n'
1202         'r'   -> return '\r'
1203         't'   -> return '\t'
1204         'v'   -> return '\v'
1205         '\\'  -> return '\\'
1206         '"'   -> return '\"'
1207         '\''  -> return '\''
1208         '^'   -> do c <- getCharOrFail
1209                     if c >= '@' && c <= '_'
1210                         then return (chr (ord c - ord '@'))
1211                         else lit_error
1212
1213         'x'   -> readNum is_hexdigit 16 hexDigit
1214         'o'   -> readNum is_octdigit  8 octDecDigit
1215         x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
1216
1217         c1 ->  do
1218            i <- getInput
1219            case alexGetChar' i of
1220             Nothing -> lit_error
1221             Just (c2,i2) ->
1222               case alexGetChar' i2 of
1223                 Nothing -> do setInput i2; lit_error
1224                 Just (c3,i3) ->
1225                    let str = [c1,c2,c3] in
1226                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1227                                      Just rest <- [maybePrefixMatch p str] ] of
1228                           (escape_char,[]):_ -> do
1229                                 setInput i3
1230                                 return escape_char
1231                           (escape_char,_:_):_ -> do
1232                                 setInput i2
1233                                 return escape_char
1234                           [] -> lit_error
1235
1236 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1237 readNum is_digit base conv = do
1238   i <- getInput
1239   c <- getCharOrFail
1240   if is_digit c
1241         then readNum2 is_digit base conv (conv c)
1242         else do setInput i; lit_error
1243
1244 readNum2 is_digit base conv i = do
1245   input <- getInput
1246   read i input
1247   where read i input = do
1248           case alexGetChar' input of
1249             Just (c,input') | is_digit c -> do
1250                 read (i*base + conv c) input'
1251             _other -> do
1252                 if i >= 0 && i <= 0x10FFFF
1253                    then do setInput input; return (chr i)
1254                    else lit_error
1255
1256 silly_escape_chars = [
1257         ("NUL", '\NUL'),
1258         ("SOH", '\SOH'),
1259         ("STX", '\STX'),
1260         ("ETX", '\ETX'),
1261         ("EOT", '\EOT'),
1262         ("ENQ", '\ENQ'),
1263         ("ACK", '\ACK'),
1264         ("BEL", '\BEL'),
1265         ("BS", '\BS'),
1266         ("HT", '\HT'),
1267         ("LF", '\LF'),
1268         ("VT", '\VT'),
1269         ("FF", '\FF'),
1270         ("CR", '\CR'),
1271         ("SO", '\SO'),
1272         ("SI", '\SI'),
1273         ("DLE", '\DLE'),
1274         ("DC1", '\DC1'),
1275         ("DC2", '\DC2'),
1276         ("DC3", '\DC3'),
1277         ("DC4", '\DC4'),
1278         ("NAK", '\NAK'),
1279         ("SYN", '\SYN'),
1280         ("ETB", '\ETB'),
1281         ("CAN", '\CAN'),
1282         ("EM", '\EM'),
1283         ("SUB", '\SUB'),
1284         ("ESC", '\ESC'),
1285         ("FS", '\FS'),
1286         ("GS", '\GS'),
1287         ("RS", '\RS'),
1288         ("US", '\US'),
1289         ("SP", '\SP'),
1290         ("DEL", '\DEL')
1291         ]
1292
1293 -- before calling lit_error, ensure that the current input is pointing to
1294 -- the position of the error in the buffer.  This is so that we can report
1295 -- a correct location to the user, but also so we can detect UTF-8 decoding
1296 -- errors if they occur.
1297 lit_error = lexError "lexical error in string/character literal"
1298
1299 getCharOrFail :: P Char
1300 getCharOrFail =  do
1301   i <- getInput
1302   case alexGetChar' i of
1303         Nothing -> lexError "unexpected end-of-file in string/character literal"
1304         Just (c,i)  -> do setInput i; return c
1305
1306 -- -----------------------------------------------------------------------------
1307 -- Warnings
1308
1309 warn :: DynFlag -> SDoc -> Action
1310 warn option warning span _buf _len = do
1311     addWarning option (mkWarnMsg span alwaysQualify warning)
1312     lexToken
1313
1314 -- -----------------------------------------------------------------------------
1315 -- The Parse Monad
1316
1317 data LayoutContext
1318   = NoLayout
1319   | Layout !Int
1320   deriving Show
1321
1322 data ParseResult a
1323   = POk PState a
1324   | PFailed
1325         SrcSpan         -- The start and end of the text span related to
1326                         -- the error.  Might be used in environments which can
1327                         -- show this span, e.g. by highlighting it.
1328         Message         -- The error message
1329
1330 data PState = PState {
1331         buffer     :: StringBuffer,
1332     dflags     :: DynFlags,
1333     messages   :: Messages,
1334         last_loc   :: SrcSpan,  -- pos of previous token
1335         last_offs  :: !Int,     -- offset of the previous token from the
1336                                 -- beginning of  the current line.
1337                                 -- \t is equal to 8 spaces.
1338         last_len   :: !Int,     -- len of previous token
1339   last_line_len :: !Int,
1340         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1341         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1342         context    :: [LayoutContext],
1343         lex_state  :: [Int]
1344      }
1345         -- last_loc and last_len are used when generating error messages,
1346         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1347         -- current token to happyError, we could at least get rid of last_len.
1348         -- Getting rid of last_loc would require finding another way to
1349         -- implement pushCurrentContext (which is only called from one place).
1350
1351 newtype P a = P { unP :: PState -> ParseResult a }
1352
1353 instance Monad P where
1354   return = returnP
1355   (>>=) = thenP
1356   fail = failP
1357
1358 returnP :: a -> P a
1359 returnP a = P $ \s -> POk s a
1360
1361 thenP :: P a -> (a -> P b) -> P b
1362 (P m) `thenP` k = P $ \ s ->
1363         case m s of
1364                 POk s1 a         -> (unP (k a)) s1
1365                 PFailed span err -> PFailed span err
1366
1367 failP :: String -> P a
1368 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1369
1370 failMsgP :: String -> P a
1371 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1372
1373 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1374 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1375
1376 failSpanMsgP :: SrcSpan -> String -> P a
1377 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1378
1379 extension :: (Int -> Bool) -> P Bool
1380 extension p = P $ \s -> POk s (p $! extsBitmap s)
1381
1382 getExts :: P Int
1383 getExts = P $ \s -> POk s (extsBitmap s)
1384
1385 setSrcLoc :: SrcLoc -> P ()
1386 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1387
1388 getSrcLoc :: P SrcLoc
1389 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1390
1391 setLastToken :: SrcSpan -> Int -> Int -> P ()
1392 setLastToken loc len line_len = P $ \s -> POk s {
1393   last_loc=loc,
1394   last_len=len,
1395   last_line_len=line_len
1396 } ()
1397
1398 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1399
1400 alexInputPrevChar :: AlexInput -> Char
1401 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1402
1403 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1404 alexGetChar (AI loc ofs s)
1405   | atEnd s   = Nothing
1406   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1407                 --trace (show (ord c)) $
1408                 Just (adj_c, (AI loc' ofs' s'))
1409   where (c,s') = nextChar s
1410         loc'   = advanceSrcLoc loc c
1411         ofs'   = advanceOffs c ofs
1412
1413         non_graphic     = '\x0'
1414         upper           = '\x1'
1415         lower           = '\x2'
1416         digit           = '\x3'
1417         symbol          = '\x4'
1418         space           = '\x5'
1419         other_graphic   = '\x6'
1420
1421         adj_c
1422           | c <= '\x06' = non_graphic
1423           | c <= '\xff' = c
1424           | otherwise =
1425                 case generalCategory c of
1426                   UppercaseLetter       -> upper
1427                   LowercaseLetter       -> lower
1428                   TitlecaseLetter       -> upper
1429                   ModifierLetter        -> other_graphic
1430                   OtherLetter           -> other_graphic
1431                   NonSpacingMark        -> other_graphic
1432                   SpacingCombiningMark  -> other_graphic
1433                   EnclosingMark         -> other_graphic
1434                   DecimalNumber         -> digit
1435                   LetterNumber          -> other_graphic
1436                   OtherNumber           -> other_graphic
1437                   ConnectorPunctuation  -> other_graphic
1438                   DashPunctuation       -> other_graphic
1439                   OpenPunctuation       -> other_graphic
1440                   ClosePunctuation      -> other_graphic
1441                   InitialQuote          -> other_graphic
1442                   FinalQuote            -> other_graphic
1443                   OtherPunctuation      -> other_graphic
1444                   MathSymbol            -> symbol
1445                   CurrencySymbol        -> symbol
1446                   ModifierSymbol        -> symbol
1447                   OtherSymbol           -> symbol
1448                   Space                 -> space
1449                   _other                -> non_graphic
1450
1451 -- This version does not squash unicode characters, it is used when
1452 -- lexing strings.
1453 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1454 alexGetChar' (AI loc ofs s)
1455   | atEnd s   = Nothing
1456   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1457                 --trace (show (ord c)) $
1458                 Just (c, (AI loc' ofs' s'))
1459   where (c,s') = nextChar s
1460         loc'   = advanceSrcLoc loc c
1461         ofs'   = advanceOffs c ofs
1462
1463 advanceOffs :: Char -> Int -> Int
1464 advanceOffs '\n' offs = 0
1465 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1466 advanceOffs _    offs = offs + 1
1467
1468 getInput :: P AlexInput
1469 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1470
1471 setInput :: AlexInput -> P ()
1472 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1473
1474 pushLexState :: Int -> P ()
1475 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1476
1477 popLexState :: P Int
1478 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1479
1480 getLexState :: P Int
1481 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1482
1483 -- for reasons of efficiency, flags indicating language extensions (eg,
1484 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1485 -- integer
1486
1487 glaExtsBit, ffiBit, parrBit :: Int
1488 glaExtsBit = 0
1489 ffiBit     = 1
1490 parrBit    = 2
1491 arrowsBit  = 4
1492 thBit      = 5
1493 ipBit      = 6
1494 tvBit      = 7  -- Scoped type variables enables 'forall' keyword
1495 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1496                 -- (doesn't affect the lexer)
1497 idxTysBit  = 9  -- indexed type families: 'family' keyword and kind sigs
1498 haddockBit = 10 -- Lex and parse Haddock comments
1499
1500 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1501 glaExtsEnabled flags = testBit flags glaExtsBit
1502 ffiEnabled     flags = testBit flags ffiBit
1503 parrEnabled    flags = testBit flags parrBit
1504 arrowsEnabled  flags = testBit flags arrowsBit
1505 thEnabled      flags = testBit flags thBit
1506 ipEnabled      flags = testBit flags ipBit
1507 tvEnabled      flags = testBit flags tvBit
1508 bangPatEnabled flags = testBit flags bangPatBit
1509 idxTysEnabled  flags = testBit flags idxTysBit
1510 haddockEnabled flags = testBit flags haddockBit
1511
1512 -- PState for parsing options pragmas
1513 --
1514 pragState :: StringBuffer -> SrcLoc -> PState
1515 pragState buf loc  =
1516   PState {
1517       buffer          = buf,
1518       messages      = emptyMessages,
1519       -- XXX defaultDynFlags is not right, but we don't have a real
1520       -- dflags handy
1521       dflags        = defaultDynFlags,
1522       last_loc      = mkSrcSpan loc loc,
1523       last_offs     = 0,
1524       last_len      = 0,
1525       last_line_len = 0,
1526       loc           = loc,
1527       extsBitmap    = 0,
1528       context       = [],
1529       lex_state     = [bol, option_prags, 0]
1530     }
1531
1532
1533 -- create a parse state
1534 --
1535 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1536 mkPState buf loc flags  =
1537   PState {
1538       buffer          = buf,
1539       dflags        = flags,
1540       messages      = emptyMessages,
1541       last_loc      = mkSrcSpan loc loc,
1542       last_offs     = 0,
1543       last_len      = 0,
1544       last_line_len = 0,
1545       loc           = loc,
1546       extsBitmap    = fromIntegral bitmap,
1547       context       = [],
1548       lex_state     = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1549         -- we begin in the layout state if toplev_layout is set
1550     }
1551     where
1552       bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
1553                .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
1554                .|. parrBit    `setBitIf` dopt Opt_PArr        flags
1555                .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
1556                .|. thBit      `setBitIf` dopt Opt_TH          flags
1557                .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
1558                .|. tvBit      `setBitIf` dopt Opt_ScopedTypeVariables flags
1559                .|. bangPatBit `setBitIf` dopt Opt_BangPatterns flags
1560                .|. idxTysBit  `setBitIf` dopt Opt_IndexedTypes flags
1561                .|. haddockBit `setBitIf` dopt Opt_Haddock     flags
1562       --
1563       setBitIf :: Int -> Bool -> Int
1564       b `setBitIf` cond | cond      = bit b
1565                         | otherwise = 0
1566
1567 addWarning :: DynFlag -> WarnMsg -> P ()
1568 addWarning option w
1569  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1570        let ws' = if dopt option d then ws `snocBag` w else ws
1571        in POk s{messages=(ws', es)} ()
1572
1573 getMessages :: PState -> Messages
1574 getMessages PState{messages=ms} = ms
1575
1576 getContext :: P [LayoutContext]
1577 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1578
1579 setContext :: [LayoutContext] -> P ()
1580 setContext ctx = P $ \s -> POk s{context=ctx} ()
1581
1582 popContext :: P ()
1583 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1584                            loc = loc, last_len = len, last_loc = last_loc }) ->
1585   case ctx of
1586         (_:tl) -> POk s{ context = tl } ()
1587         []     -> PFailed last_loc (srcParseErr buf len)
1588
1589 -- Push a new layout context at the indentation of the last token read.
1590 -- This is only used at the outer level of a module when the 'module'
1591 -- keyword is missing.
1592 pushCurrentContext :: P ()
1593 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1594     POk s{context = Layout (offs-len) : ctx} ()
1595 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1596
1597 getOffside :: P Ordering
1598 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1599                 let ord = case stk of
1600                         (Layout n:_) -> compare offs n
1601                         _            -> GT
1602                 in POk s ord
1603
1604 -- ---------------------------------------------------------------------------
1605 -- Construct a parse error
1606
1607 srcParseErr
1608   :: StringBuffer       -- current buffer (placed just after the last token)
1609   -> Int                -- length of the previous token
1610   -> Message
1611 srcParseErr buf len
1612   = hcat [ if null token
1613              then ptext SLIT("parse error (possibly incorrect indentation)")
1614              else hcat [ptext SLIT("parse error on input "),
1615                         char '`', text token, char '\'']
1616     ]
1617   where token = lexemeToString (offsetBytes (-len) buf) len
1618
1619 -- Report a parse failure, giving the span of the previous token as
1620 -- the location of the error.  This is the entry point for errors
1621 -- detected during parsing.
1622 srcParseFail :: P a
1623 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1624                             last_loc = last_loc } ->
1625     PFailed last_loc (srcParseErr buf len)
1626
1627 -- A lexical error is reported at a particular position in the source file,
1628 -- not over a token range.
1629 lexError :: String -> P a
1630 lexError str = do
1631   loc <- getSrcLoc
1632   i@(AI end _ buf) <- getInput
1633   reportLexError loc end buf str
1634
1635 -- -----------------------------------------------------------------------------
1636 -- This is the top-level function: called from the parser each time a
1637 -- new token is to be read from the input.
1638
1639 lexer :: (Located Token -> P a) -> P a
1640 lexer cont = do
1641   tok@(L span tok__) <- lexToken
1642 --  trace ("token: " ++ show tok__) $ do
1643   cont tok
1644
1645 lexToken :: P (Located Token)
1646 lexToken = do
1647   inp@(AI loc1 _ buf) <- getInput
1648   sc <- getLexState
1649   exts <- getExts
1650   case alexScanUser exts inp sc of
1651     AlexEOF -> do let span = mkSrcSpan loc1 loc1
1652                   setLastToken span 0 0
1653                   return (L span ITeof)
1654     AlexError (AI loc2 _ buf) -> do
1655         reportLexError loc1 loc2 buf "lexical error"
1656     AlexSkip inp2 _ -> do
1657         setInput inp2
1658         lexToken
1659     AlexToken inp2@(AI end _ buf2) len t -> do
1660     setInput inp2
1661     let span = mkSrcSpan loc1 end
1662     let bytes = byteDiff buf buf2
1663     span `seq` setLastToken span bytes bytes
1664     t span buf bytes
1665
1666 reportLexError loc1 loc2 buf str
1667   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1668   | otherwise =
1669   let
1670         c = fst (nextChar buf)
1671   in
1672   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1673     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1674     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1675 }