compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), lexer, pragState, mkPState, PState(..),
  26    P(..), ParseResult(..), getSrcLoc,
  27    failLocMsgP, failSpanMsgP, srcParseFail,
  28    getMessages,
  29    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  30    getLexState, popLexState, pushLexState,
  31    extension, glaExtsEnabled, bangPatEnabled
  32   ) where
  33
  34 #include "HsVersions.h"
  35
  36 import Bag
  37 import ErrUtils
  38 import Outputable
  39 import StringBuffer
  40 import FastString
  41 import FastTypes
  42 import SrcLoc
  43 import UniqFM
  44 import DynFlags
  45 import Ctype
  46 import Util             ( maybePrefixMatch, readRational )
  47
  48 import Control.Monad
  49 import Data.Bits
  50 import Data.Char        ( chr, isSpace )
  51 import Data.Ratio
  52 import Debug.Trace
  53
  54 #if __GLASGOW_HASKELL__ >= 605
  55 import Data.Char        ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  56 #else
  57 import Compat.Unicode   ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  58 #endif
  59 }
  60
  61 $unispace    = \x05
  62 $whitechar   = [\ \n\r\f\v\xa0 $unispace]
  63 $white_no_nl = $whitechar # \n
  64 $tab         = \t
  65
  66 $ascdigit  = 0-9
  67 $unidigit  = \x03
  68 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  69 $digit     = [$ascdigit $unidigit]
  70
  71 $special   = [\(\)\,\;\[\]\`\{\}]
  72 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7]
  73 $unisymbol = \x04
  74 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  75
  76 $unilarge  = \x01
  77 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  78 $large     = [$asclarge $unilarge]
  79
  80 $unismall  = \x02
  81 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  82 $small     = [$ascsmall $unismall \_]
  83
  84 $unigraphic = \x06
  85 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  86
  87 $octit     = 0-7
  88 $hexit     = [$decdigit A-F a-f]
  89 $symchar   = [$symbol \:]
  90 $nl        = [\n\r]
  91 $idchar    = [$small $large $digit \']
  92
  93 $docsym    = [\| \^ \* \$]
  94
  95 @varid     = $small $idchar*
  96 @conid     = $large $idchar*
  97
  98 @varsym    = $symbol $symchar*
  99 @consym    = \: $symchar*
 100
 101 @decimal     = $decdigit+
 102 @octal       = $octit+
 103 @hexadecimal = $hexit+
 104 @exponent    = [eE] [\-\+]? @decimal
 105
 106 -- we support the hierarchical module name extension:
 107 @qual = (@conid \.)+
 108
 109 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 110
 111 haskell :-
 112
 113 -- everywhere: skip whitespace and comments
 114 $white_no_nl+                           ;
 115 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 116
 117 -- Everywhere: deal with nested comments.  We explicitly rule out
 118 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 119 -- (this can happen even though pragmas will normally take precedence due to
 120 -- longest-match, because pragmas aren't valid in every state, but comments
 121 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 122 -- set.
 123
 124 "{-" / { isNormalComment } { nested_comment lexToken }
 125
 126 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 127 -- more dashes followed by a symbol should be parsed as a varsym, so we
 128 -- have to exclude those.
 129
 130 -- Since Haddock comments aren't valid in every state, we need to rule them
 131 -- out here.
 132
 133 -- The following two rules match comments that begin with two dashes, but
 134 -- continue with a different character. The rules test that this character
 135 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 136 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 137 -- have a Haddock comment). The rules then munch the rest of the line.
 138
 139 "-- " ~$docsym .* ;
 140 "--" [^$symbol : \ ] .* ;
 141
 142 -- Next, match Haddock comments if no -haddock flag
 143
 144 "-- " $docsym .* / { ifExtension (not . haddockEnabled) } ;
 145
 146 -- Now, when we've matched comments that begin with 2 dashes and continue
 147 -- with a different character, we need to match comments that begin with three
 148 -- or more dashes (which clearly can't be Haddock comments). We only need to
 149 -- make sure that the first non-dash character isn't a symbol, and munch the
 150 -- rest of the line.
 151
 152 "---"\-* [^$symbol :] .* ;
 153
 154 -- Since the previous rules all match dashes followed by at least one
 155 -- character, we also need to match a whole line filled with just dashes.
 156
 157 "--"\-* / { atEOL } ;
 158
 159 -- We need this rule since none of the other single line comment rules
 160 -- actually match this case.
 161
 162 "-- " / { atEOL } ;
 163
 164 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 165 -- blank lines) until we find a non-whitespace character, then do layout
 166 -- processing.
 167 --
 168 -- One slight wibble here: what if the line begins with {-#? In
 169 -- theory, we have to lex the pragma to see if it's one we recognise,
 170 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 171 -- as a nested comment.  We don't bother with this: if the line begins
 172 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 173 <bol> {
 174   \n                                    ;
 175   ^\# (line)?                           { begin line_prag1 }
 176   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 177   ^\# \! .* \n                          ; -- #!, for scripts
 178   ()                                    { do_bol }
 179 }
 180
 181 -- after a layout keyword (let, where, do, of), we begin a new layout
 182 -- context if the curly brace is missing.
 183 -- Careful! This stuff is quite delicate.
 184 <layout, layout_do> {
 185   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 186         -- we might encounter {-# here, but {- has been handled already
 187   \n                                    ;
 188   ^\# (line)?                           { begin line_prag1 }
 189 }
 190
 191 -- do is treated in a subtly different way, see new_layout_context
 192 <layout>    ()                          { new_layout_context True }
 193 <layout_do> ()                          { new_layout_context False }
 194
 195 -- after a new layout context which was found to be to the left of the
 196 -- previous context, we have generated a '{' token, and we now need to
 197 -- generate a matching '}' token.
 198 <layout_left>  ()                       { do_layout_left }
 199
 200 <0,option_prags,glaexts> \n                             { begin bol }
 201
 202 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 203
 204 -- single-line line pragmas, of the form
 205 --    # <line> "<file>" <extra-stuff> \n
 206 <line_prag1> $decdigit+                 { setLine line_prag1a }
 207 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 208 <line_prag1b> .*                        { pop }
 209
 210 -- Haskell-style line pragmas, of the form
 211 --    {-# LINE <line> "<file>" #-}
 212 <line_prag2> $decdigit+                 { setLine line_prag2a }
 213 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 214 <line_prag2b> "#-}"|"-}"                { pop }
 215    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 216    -- with older versions of GHC which generated these.
 217
 218 -- We only want RULES pragmas to be picked up when -fglasgow-exts
 219 -- is on, because the contents of the pragma is always written using
 220 -- glasgow-exts syntax (using forall etc.), so if glasgow exts are not
 221 -- enabled, we're sure to get a parse error.
 222 -- (ToDo: we should really emit a warning when ignoring pragmas)
 223 <glaexts>
 224   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 225
 226 <0,option_prags,glaexts> {
 227   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 228   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 229                                         { token (ITinline_prag False) }
 230   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 231                                         { token ITspec_prag }
 232   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 233         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 234   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 235         $whitechar* (NO(T?)INLINE|no(t?)inline)
 236                                         { token (ITspec_inline_prag False) }
 237   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 238   "{-#" $whitechar* (DEPRECATED|deprecated)
 239                                         { token ITdeprecated_prag }
 240   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 241   "{-#" $whitechar* (GENERATED|generated)
 242                                         { token ITgenerated_prag }
 243   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 244   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 245
 246   "{-#" $whitechar* (DOCOPTIONS|docoptions)
 247   / { ifExtension haddockEnabled }     { lex_string_prag ITdocOptions }
 248
 249  "{-#"                                 { nested_comment lexToken }
 250
 251   -- ToDo: should only be valid inside a pragma:
 252   "#-}"                                 { token ITclose_prag}
 253 }
 254
 255 <option_prags> {
 256   "{-#" $whitechar* (OPTIONS|options)   { lex_string_prag IToptions_prag }
 257   "{-#" $whitechar* (OPTIONS_GHC|options_ghc)
 258                                         { lex_string_prag IToptions_prag }
 259   "{-#" $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
 260   "{-#" $whitechar* (INCLUDE|include)   { lex_string_prag ITinclude_prag }
 261 }
 262
 263 <0,option_prags,glaexts> {
 264         -- This is to catch things like {-# OPTIONS OPTIONS_HUGS ...
 265   "{-#" $whitechar* $idchar+            { nested_comment lexToken }
 266 }
 267
 268 -- '0' state: ordinary lexemes
 269 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 270
 271 -- Haddock comments
 272
 273 <0,glaexts> {
 274   "-- " / $docsym    { multiline_doc_comment }
 275   "{-" \ ? / $docsym { nested_doc_comment }
 276 }
 277
 278 -- "special" symbols
 279
 280 <0,glaexts> {
 281   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 282   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 283 }
 284
 285 <0,glaexts> {
 286   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 287   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 288   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 289   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 290   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 291   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 292   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 293   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 294 }
 295
 296 <0,glaexts> {
 297   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 298                                         { special IToparenbar }
 299   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 300 }
 301
 302 <0,glaexts> {
 303   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 304 }
 305
 306 <glaexts> {
 307   "(#" / { notFollowedBySymbol }        { token IToubxparen }
 308   "#)"                                  { token ITcubxparen }
 309   "{|"                                  { token ITocurlybar }
 310   "|}"                                  { token ITccurlybar }
 311 }
 312
 313 <0,option_prags,glaexts> {
 314   \(                                    { special IToparen }
 315   \)                                    { special ITcparen }
 316   \[                                    { special ITobrack }
 317   \]                                    { special ITcbrack }
 318   \,                                    { special ITcomma }
 319   \;                                    { special ITsemi }
 320   \`                                    { special ITbackquote }
 321
 322   \{                                    { open_brace }
 323   \}                                    { close_brace }
 324 }
 325
 326 <0,option_prags,glaexts> {
 327   @qual @varid                  { check_qvarid }
 328   @qual @conid                  { idtoken qconid }
 329   @varid                        { varid }
 330   @conid                        { idtoken conid }
 331 }
 332
 333 -- after an illegal qvarid, such as 'M.let',
 334 -- we back up and try again in the bad_qvarid state:
 335 <bad_qvarid> {
 336   @conid                        { pop_and (idtoken conid) }
 337   @qual @conid                  { pop_and (idtoken qconid) }
 338 }
 339
 340 <glaexts> {
 341   @qual @varid "#"+             { idtoken qvarid }
 342   @qual @conid "#"+             { idtoken qconid }
 343   @varid "#"+                   { varid }
 344   @conid "#"+                   { idtoken conid }
 345 }
 346
 347 -- ToDo: M.(,,,)
 348
 349 <0,glaexts> {
 350   @qual @varsym                 { idtoken qvarsym }
 351   @qual @consym                 { idtoken qconsym }
 352   @varsym                       { varsym }
 353   @consym                       { consym }
 354 }
 355
 356 <0,glaexts> {
 357   @decimal                      { tok_decimal }
 358   0[oO] @octal                  { tok_octal }
 359   0[xX] @hexadecimal            { tok_hexadecimal }
 360 }
 361
 362 <glaexts> {
 363   @decimal \#                   { prim_decimal }
 364   0[oO] @octal \#               { prim_octal }
 365   0[xX] @hexadecimal \#         { prim_hexadecimal }
 366 }
 367
 368 <0,glaexts> @floating_point             { strtoken tok_float }
 369 <glaexts>   @floating_point \#          { init_strtoken 1 prim_float }
 370 <glaexts>   @floating_point \# \#       { init_strtoken 2 prim_double }
 371
 372 -- Strings and chars are lexed by hand-written code.  The reason is
 373 -- that even if we recognise the string or char here in the regex
 374 -- lexer, we would still have to parse the string afterward in order
 375 -- to convert it to a String.
 376 <0,glaexts> {
 377   \'                            { lex_char_tok }
 378   \"                            { lex_string_tok }
 379 }
 380
 381 {
 382 -- work around bug in Alex 2.0
 383 #if __GLASGOW_HASKELL__ < 503
 384 unsafeAt arr i = arr ! i
 385 #endif
 386
 387 -- -----------------------------------------------------------------------------
 388 -- The token type
 389
 390 data Token
 391   = ITas                        -- Haskell keywords
 392   | ITcase
 393   | ITclass
 394   | ITdata
 395   | ITdefault
 396   | ITderiving
 397   | ITderive
 398   | ITdo
 399   | ITelse
 400   | IThiding
 401   | ITif
 402   | ITimport
 403   | ITin
 404   | ITinfix
 405   | ITinfixl
 406   | ITinfixr
 407   | ITinstance
 408   | ITlet
 409   | ITmodule
 410   | ITnewtype
 411   | ITof
 412   | ITqualified
 413   | ITthen
 414   | ITtype
 415   | ITwhere
 416   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 417
 418   | ITforall                    -- GHC extension keywords
 419   | ITforeign
 420   | ITexport
 421   | ITlabel
 422   | ITdynamic
 423   | ITsafe
 424   | ITthreadsafe
 425   | ITunsafe
 426   | ITstdcallconv
 427   | ITccallconv
 428   | ITdotnet
 429   | ITmdo
 430   | ITfamily
 431
 432         -- Pragmas
 433   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 434   | ITspec_prag                 -- SPECIALISE
 435   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 436   | ITsource_prag
 437   | ITrules_prag
 438   | ITdeprecated_prag
 439   | ITline_prag
 440   | ITscc_prag
 441   | ITgenerated_prag
 442   | ITcore_prag                 -- hdaume: core annotations
 443   | ITunpack_prag
 444   | ITclose_prag
 445   | IToptions_prag String
 446   | ITinclude_prag String
 447   | ITlanguage_prag
 448
 449   | ITdotdot                    -- reserved symbols
 450   | ITcolon
 451   | ITdcolon
 452   | ITequal
 453   | ITlam
 454   | ITvbar
 455   | ITlarrow
 456   | ITrarrow
 457   | ITat
 458   | ITtilde
 459   | ITdarrow
 460   | ITminus
 461   | ITbang
 462   | ITstar
 463   | ITdot
 464
 465   | ITbiglam                    -- GHC-extension symbols
 466
 467   | ITocurly                    -- special symbols
 468   | ITccurly
 469   | ITocurlybar                 -- {|, for type applications
 470   | ITccurlybar                 -- |}, for type applications
 471   | ITvocurly
 472   | ITvccurly
 473   | ITobrack
 474   | ITopabrack                  -- [:, for parallel arrays with -fparr
 475   | ITcpabrack                  -- :], for parallel arrays with -fparr
 476   | ITcbrack
 477   | IToparen
 478   | ITcparen
 479   | IToubxparen
 480   | ITcubxparen
 481   | ITsemi
 482   | ITcomma
 483   | ITunderscore
 484   | ITbackquote
 485
 486   | ITvarid   FastString        -- identifiers
 487   | ITconid   FastString
 488   | ITvarsym  FastString
 489   | ITconsym  FastString
 490   | ITqvarid  (FastString,FastString)
 491   | ITqconid  (FastString,FastString)
 492   | ITqvarsym (FastString,FastString)
 493   | ITqconsym (FastString,FastString)
 494
 495   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 496
 497   | ITpragma StringBuffer
 498
 499   | ITchar       Char
 500   | ITstring     FastString
 501   | ITinteger    Integer
 502   | ITrational   Rational
 503
 504   | ITprimchar   Char
 505   | ITprimstring FastString
 506   | ITprimint    Integer
 507   | ITprimfloat  Rational
 508   | ITprimdouble Rational
 509
 510   -- MetaHaskell extension tokens
 511   | ITopenExpQuote              --  [| or [e|
 512   | ITopenPatQuote              --  [p|
 513   | ITopenDecQuote              --  [d|
 514   | ITopenTypQuote              --  [t|
 515   | ITcloseQuote                --  |]
 516   | ITidEscape   FastString     --  $x
 517   | ITparenEscape               --  $(
 518   | ITvarQuote                  --  '
 519   | ITtyQuote                   --  ''
 520
 521   -- Arrow notation extension
 522   | ITproc
 523   | ITrec
 524   | IToparenbar                 --  (|
 525   | ITcparenbar                 --  |)
 526   | ITlarrowtail                --  -<
 527   | ITrarrowtail                --  >-
 528   | ITLarrowtail                --  -<<
 529   | ITRarrowtail                --  >>-
 530
 531   | ITunknown String            -- Used when the lexer can't make sense of it
 532   | ITeof                       -- end of file token
 533
 534   -- Documentation annotations
 535   | ITdocCommentNext  String     -- something beginning '-- |'
 536   | ITdocCommentPrev  String     -- something beginning '-- ^'
 537   | ITdocCommentNamed String     -- something beginning '-- $'
 538   | ITdocSection      Int String -- a section heading
 539   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 540
 541 #ifdef DEBUG
 542   deriving Show -- debugging
 543 #endif
 544
 545 isSpecial :: Token -> Bool
 546 -- If we see M.x, where x is a keyword, but
 547 -- is special, we treat is as just plain M.x,
 548 -- not as a keyword.
 549 isSpecial ITas          = True
 550 isSpecial IThiding      = True
 551 isSpecial ITderive      = True
 552 isSpecial ITqualified   = True
 553 isSpecial ITforall      = True
 554 isSpecial ITexport      = True
 555 isSpecial ITlabel       = True
 556 isSpecial ITdynamic     = True
 557 isSpecial ITsafe        = True
 558 isSpecial ITthreadsafe  = True
 559 isSpecial ITunsafe      = True
 560 isSpecial ITccallconv   = True
 561 isSpecial ITstdcallconv = True
 562 isSpecial ITmdo         = True
 563 isSpecial ITfamily      = True
 564 isSpecial _             = False
 565
 566 -- the bitmap provided as the third component indicates whether the
 567 -- corresponding extension keyword is valid under the extension options
 568 -- provided to the compiler; if the extension corresponding to *any* of the
 569 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 570 -- facilitates using a keyword in two different extensions that can be
 571 -- activated independently)
 572 --
 573 reservedWordsFM = listToUFM $
 574         map (\(x, y, z) -> (mkFastString x, (y, z)))
 575        [( "_",          ITunderscore,   0 ),
 576         ( "as",         ITas,           0 ),
 577         ( "case",       ITcase,         0 ),
 578         ( "class",      ITclass,        0 ),
 579         ( "data",       ITdata,         0 ),
 580         ( "default",    ITdefault,      0 ),
 581         ( "deriving",   ITderiving,     0 ),
 582         ( "derive",     ITderive,       0 ),
 583         ( "do",         ITdo,           0 ),
 584         ( "else",       ITelse,         0 ),
 585         ( "hiding",     IThiding,       0 ),
 586         ( "if",         ITif,           0 ),
 587         ( "import",     ITimport,       0 ),
 588         ( "in",         ITin,           0 ),
 589         ( "infix",      ITinfix,        0 ),
 590         ( "infixl",     ITinfixl,       0 ),
 591         ( "infixr",     ITinfixr,       0 ),
 592         ( "instance",   ITinstance,     0 ),
 593         ( "let",        ITlet,          0 ),
 594         ( "module",     ITmodule,       0 ),
 595         ( "newtype",    ITnewtype,      0 ),
 596         ( "of",         ITof,           0 ),
 597         ( "qualified",  ITqualified,    0 ),
 598         ( "then",       ITthen,         0 ),
 599         ( "type",       ITtype,         0 ),
 600         ( "where",      ITwhere,        0 ),
 601         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 602
 603         ( "forall",     ITforall,        bit tvBit),
 604         ( "mdo",        ITmdo,           bit glaExtsBit),
 605         ( "family",     ITfamily,        bit idxTysBit),
 606
 607         ( "foreign",    ITforeign,       bit ffiBit),
 608         ( "export",     ITexport,        bit ffiBit),
 609         ( "label",      ITlabel,         bit ffiBit),
 610         ( "dynamic",    ITdynamic,       bit ffiBit),
 611         ( "safe",       ITsafe,          bit ffiBit),
 612         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 613         ( "unsafe",     ITunsafe,        bit ffiBit),
 614         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 615         ( "ccall",      ITccallconv,     bit ffiBit),
 616         ( "dotnet",     ITdotnet,        bit ffiBit),
 617
 618         ( "rec",        ITrec,           bit arrowsBit),
 619         ( "proc",       ITproc,          bit arrowsBit)
 620      ]
 621
 622 reservedSymsFM = listToUFM $
 623         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 624       [ ("..",  ITdotdot,       0)
 625        ,(":",   ITcolon,        0)      -- (:) is a reserved op,
 626                                                 -- meaning only list cons
 627        ,("::",  ITdcolon,       0)
 628        ,("=",   ITequal,        0)
 629        ,("\\",  ITlam,          0)
 630        ,("|",   ITvbar,         0)
 631        ,("<-",  ITlarrow,       0)
 632        ,("->",  ITrarrow,       0)
 633        ,("@",   ITat,           0)
 634        ,("~",   ITtilde,        0)
 635        ,("=>",  ITdarrow,       0)
 636        ,("-",   ITminus,        0)
 637        ,("!",   ITbang,         0)
 638
 639        ,("*",   ITstar,         bit glaExtsBit .|.
 640                                 bit idxTysBit)      -- For data T (a::*) = MkT
 641        ,(".",   ITdot,          bit tvBit)          -- For 'forall a . t'
 642
 643        ,("-<",  ITlarrowtail,   bit arrowsBit)
 644        ,(">-",  ITrarrowtail,   bit arrowsBit)
 645        ,("-<<", ITLarrowtail,   bit arrowsBit)
 646        ,(">>-", ITRarrowtail,   bit arrowsBit)
 647
 648 #if __GLASGOW_HASKELL__ >= 605
 649        ,("∷",   ITdcolon,       bit glaExtsBit)
 650        ,("⇒",   ITdarrow,     bit glaExtsBit)
 651        ,("∀", ITforall,       bit glaExtsBit)
 652        ,("→",   ITrarrow,     bit glaExtsBit)
 653        ,("←",   ITlarrow,     bit glaExtsBit)
 654        ,("?",   ITdotdot,       bit glaExtsBit)
 655         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 656         -- form part of a large operator.  This would let us have a better
 657         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 658 #endif
 659        ]
 660
 661 -- -----------------------------------------------------------------------------
 662 -- Lexer actions
 663
 664 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 665
 666 special :: Token -> Action
 667 special tok span _buf len = return (L span tok)
 668
 669 token, layout_token :: Token -> Action
 670 token t span buf len = return (L span t)
 671 layout_token t span buf len = pushLexState layout >> return (L span t)
 672
 673 idtoken :: (StringBuffer -> Int -> Token) -> Action
 674 idtoken f span buf len = return (L span $! (f buf len))
 675
 676 skip_one_varid :: (FastString -> Token) -> Action
 677 skip_one_varid f span buf len
 678   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 679
 680 strtoken :: (String -> Token) -> Action
 681 strtoken f span buf len =
 682   return (L span $! (f $! lexemeToString buf len))
 683
 684 init_strtoken :: Int -> (String -> Token) -> Action
 685 -- like strtoken, but drops the last N character(s)
 686 init_strtoken drop f span buf len =
 687   return (L span $! (f $! lexemeToString buf (len-drop)))
 688
 689 begin :: Int -> Action
 690 begin code _span _str _len = do pushLexState code; lexToken
 691
 692 pop :: Action
 693 pop _span _buf _len = do popLexState; lexToken
 694
 695 pop_and :: Action -> Action
 696 pop_and act span buf len = do popLexState; act span buf len
 697
 698 {-# INLINE nextCharIs #-}
 699 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 700
 701 notFollowedBy char _ _ _ (AI _ _ buf)
 702   = nextCharIs buf (/=char)
 703
 704 notFollowedBySymbol _ _ _ (AI _ _ buf)
 705   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 706
 707 isNormalComment bits _ _ (AI _ _ buf)
 708   | haddockEnabled bits = notFollowedByDocOrPragma
 709   | otherwise           = nextCharIs buf (/='#')
 710   where
 711     notFollowedByDocOrPragma
 712         = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 713
 714 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 715
 716 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 717   = if haddockEnabled bits then False else (p buf)
 718
 719 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 720
 721 ifExtension pred bits _ _ _ = pred bits
 722
 723 multiline_doc_comment :: Action
 724 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 725   where
 726     worker commentAcc input docType oneLine = case alexGetChar input of
 727       Just ('\n', input')
 728         | oneLine -> docCommentEnd input commentAcc docType buf span
 729         | otherwise -> case checkIfCommentLine input' of
 730           Just input -> worker ('\n':commentAcc) input docType False
 731           Nothing -> docCommentEnd input commentAcc docType buf span
 732       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 733       Nothing -> docCommentEnd input commentAcc docType buf span
 734
 735     checkIfCommentLine input = check (dropNonNewlineSpace input)
 736       where
 737         check input = case alexGetChar input of
 738           Just ('-', input) -> case alexGetChar input of
 739             Just ('-', input) -> case alexGetChar input of
 740               Just (c, _) | c /= '-' -> Just input
 741               _ -> Nothing
 742             _ -> Nothing
 743           _ -> Nothing
 744
 745         dropNonNewlineSpace input = case alexGetChar input of
 746           Just (c, input')
 747             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 748             | otherwise -> input
 749           Nothing -> input
 750
 751 {-
 752   nested comments require traversing by hand, they can't be parsed
 753   using regular expressions.
 754 -}
 755 nested_comment :: P (Located Token) -> Action
 756 nested_comment cont span _str _len = do
 757   input <- getInput
 758   go 1 input
 759   where
 760     go 0 input = do setInput input; cont
 761     go n input = case alexGetChar input of
 762       Nothing -> errBrace input span
 763       Just ('-',input) -> case alexGetChar input of
 764         Nothing  -> errBrace input span
 765         Just ('\125',input) -> go (n-1) input
 766         Just (c,_)          -> go n input
 767       Just ('\123',input) -> case alexGetChar input of
 768         Nothing  -> errBrace input span
 769         Just ('-',input) -> go (n+1) input
 770         Just (c,_)       -> go n input
 771       Just (c,input) -> go n input
 772
 773 nested_doc_comment :: Action
 774 nested_doc_comment span buf _len = withLexedDocType (go "")
 775   where
 776     go commentAcc input docType _ = case alexGetChar input of
 777       Nothing -> errBrace input span
 778       Just ('-',input) -> case alexGetChar input of
 779         Nothing -> errBrace input span
 780         Just ('\125',input@(AI end _ buf2)) ->
 781           docCommentEnd input commentAcc docType buf span
 782         Just (c,_) -> go ('-':commentAcc) input docType False
 783       Just ('\123', input) -> case alexGetChar input of
 784         Nothing  -> errBrace input span
 785         Just ('-',input) -> do
 786           setInput input
 787           let cont = do input <- getInput; go commentAcc input docType False
 788           nested_comment cont span buf _len
 789         Just (c,_) -> go ('\123':commentAcc) input docType False
 790       Just (c,input) -> go (c:commentAcc) input docType False
 791
 792 withLexedDocType lexDocComment = do
 793   input <- getInput
 794   case alexGetChar input of
 795     Nothing -> error "Can't happen"
 796     Just ('|', input) -> lexDocComment input ITdocCommentNext False
 797     Just ('^', input) -> lexDocComment input ITdocCommentPrev False
 798     Just ('$', input) -> lexDocComment input ITdocCommentNamed False
 799     Just ('*', input) -> lexDocSection 1 input
 800  where
 801     lexDocSection n input = case alexGetChar input of
 802       Just ('*', input) -> lexDocSection (n+1) input
 803       Just (c, _) -> lexDocComment input (ITdocSection n) True
 804       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 805
 806 -- docCommentEnd
 807 -------------------------------------------------------------------------------
 808 -- This function is quite tricky. We can't just return a new token, we also
 809 -- need to update the state of the parser. Why? Because the token is longer
 810 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 811 -- it writes the wrong token length to the parser state. This function is
 812 -- called afterwards, so it can just update the state.
 813
 814 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 815 -- which is something that the original lexer didn't account for.
 816 -- I have added last_line_len in the parser state which represents the length
 817 -- of the part of the token that is on the last line. It is now used for layout
 818 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 819 -- was before, the full length of the token, and it is now only used for error
 820 -- messages. /Waern
 821
 822 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 823                  SrcSpan -> P (Located Token)
 824 docCommentEnd input commentAcc docType buf span = do
 825   setInput input
 826   let (AI loc last_offs nextBuf) = input
 827       comment = reverse commentAcc
 828       span' = mkSrcSpan (srcSpanStart span) loc
 829       last_len = byteDiff buf nextBuf
 830
 831       last_line_len = if (last_offs - last_len < 0)
 832         then last_offs
 833         else last_len
 834
 835   span `seq` setLastToken span' last_len last_line_len
 836   return (L span' (docType comment))
 837
 838 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 839
 840 open_brace, close_brace :: Action
 841 open_brace span _str _len = do
 842   ctx <- getContext
 843   setContext (NoLayout:ctx)
 844   return (L span ITocurly)
 845 close_brace span _str _len = do
 846   popContext
 847   return (L span ITccurly)
 848
 849 -- We have to be careful not to count M.<varid> as a qualified name
 850 -- when <varid> is a keyword.  We hack around this by catching
 851 -- the offending tokens afterward, and re-lexing in a different state.
 852 check_qvarid span buf len = do
 853   case lookupUFM reservedWordsFM var of
 854         Just (keyword,exts)
 855           | not (isSpecial keyword) ->
 856           if exts == 0
 857              then try_again
 858              else do
 859                 b <- extension (\i -> exts .&. i /= 0)
 860                 if b then try_again
 861                      else return token
 862         _other -> return token
 863   where
 864         (mod,var) = splitQualName buf len
 865         token     = L span (ITqvarid (mod,var))
 866
 867         try_again = do
 868                 (AI _ offs _) <- getInput
 869                 setInput (AI (srcSpanStart span) (offs-len) buf)
 870                 pushLexState bad_qvarid
 871                 lexToken
 872
 873 qvarid buf len = ITqvarid $! splitQualName buf len
 874 qconid buf len = ITqconid $! splitQualName buf len
 875
 876 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 877 -- takes a StringBuffer and a length, and returns the module name
 878 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 879 -- because of hierarchical module names.
 880 splitQualName orig_buf len = split orig_buf orig_buf
 881   where
 882     split buf dot_buf
 883         | orig_buf `byteDiff` buf >= len  = done dot_buf
 884         | c == '.'                        = found_dot buf'
 885         | otherwise                       = split buf' dot_buf
 886       where
 887        (c,buf') = nextChar buf
 888
 889     -- careful, we might get names like M....
 890     -- so, if the character after the dot is not upper-case, this is
 891     -- the end of the qualifier part.
 892     found_dot buf -- buf points after the '.'
 893         | isUpper c    = split buf' buf
 894         | otherwise    = done buf
 895       where
 896        (c,buf') = nextChar buf
 897
 898     done dot_buf =
 899         (lexemeToFastString orig_buf (qual_size - 1),
 900          lexemeToFastString dot_buf (len - qual_size))
 901       where
 902         qual_size = orig_buf `byteDiff` dot_buf
 903
 904 varid span buf len =
 905   case lookupUFM reservedWordsFM fs of
 906         Just (keyword,0)    -> do
 907                 maybe_layout keyword
 908                 return (L span keyword)
 909         Just (keyword,exts) -> do
 910                 b <- extension (\i -> exts .&. i /= 0)
 911                 if b then do maybe_layout keyword
 912                              return (L span keyword)
 913                      else return (L span (ITvarid fs))
 914         _other -> return (L span (ITvarid fs))
 915   where
 916         fs = lexemeToFastString buf len
 917
 918 conid buf len = ITconid fs
 919   where fs = lexemeToFastString buf len
 920
 921 qvarsym buf len = ITqvarsym $! splitQualName buf len
 922 qconsym buf len = ITqconsym $! splitQualName buf len
 923
 924 varsym = sym ITvarsym
 925 consym = sym ITconsym
 926
 927 sym con span buf len =
 928   case lookupUFM reservedSymsFM fs of
 929         Just (keyword,0)    -> return (L span keyword)
 930         Just (keyword,exts) -> do
 931                 b <- extension (\i -> exts .&. i /= 0)
 932                 if b then return (L span keyword)
 933                      else return (L span $! con fs)
 934         _other -> return (L span $! con fs)
 935   where
 936         fs = lexemeToFastString buf len
 937
 938 tok_decimal span buf len
 939   = return (L span (ITinteger  $! parseInteger buf len 10 octDecDigit))
 940
 941 tok_octal span buf len
 942   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 8 octDecDigit))
 943
 944 tok_hexadecimal span buf len
 945   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 16 hexDigit))
 946
 947 prim_decimal span buf len
 948   = return (L span (ITprimint  $! parseInteger buf (len-1) 10 octDecDigit))
 949
 950 prim_octal span buf len
 951   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 8 octDecDigit))
 952
 953 prim_hexadecimal span buf len
 954   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 16 hexDigit))
 955
 956 tok_float        str = ITrational   $! readRational str
 957 prim_float       str = ITprimfloat  $! readRational str
 958 prim_double      str = ITprimdouble $! readRational str
 959
 960 -- -----------------------------------------------------------------------------
 961 -- Layout processing
 962
 963 -- we're at the first token on a line, insert layout tokens if necessary
 964 do_bol :: Action
 965 do_bol span _str _len = do
 966         pos <- getOffside
 967         case pos of
 968             LT -> do
 969                 --trace "layout: inserting '}'" $ do
 970                 popContext
 971                 -- do NOT pop the lex state, we might have a ';' to insert
 972                 return (L span ITvccurly)
 973             EQ -> do
 974                 --trace "layout: inserting ';'" $ do
 975                 popLexState
 976                 return (L span ITsemi)
 977             GT -> do
 978                 popLexState
 979                 lexToken
 980
 981 -- certain keywords put us in the "layout" state, where we might
 982 -- add an opening curly brace.
 983 maybe_layout ITdo       = pushLexState layout_do
 984 maybe_layout ITmdo      = pushLexState layout_do
 985 maybe_layout ITof       = pushLexState layout
 986 maybe_layout ITlet      = pushLexState layout
 987 maybe_layout ITwhere    = pushLexState layout
 988 maybe_layout ITrec      = pushLexState layout
 989 maybe_layout _          = return ()
 990
 991 -- Pushing a new implicit layout context.  If the indentation of the
 992 -- next token is not greater than the previous layout context, then
 993 -- Haskell 98 says that the new layout context should be empty; that is
 994 -- the lexer must generate {}.
 995 --
 996 -- We are slightly more lenient than this: when the new context is started
 997 -- by a 'do', then we allow the new context to be at the same indentation as
 998 -- the previous context.  This is what the 'strict' argument is for.
 999 --
1000 new_layout_context strict span _buf _len = do
1001     popLexState
1002     (AI _ offset _) <- getInput
1003     ctx <- getContext
1004     case ctx of
1005         Layout prev_off : _  |
1006            (strict     && prev_off >= offset  ||
1007             not strict && prev_off > offset) -> do
1008                 -- token is indented to the left of the previous context.
1009                 -- we must generate a {} sequence now.
1010                 pushLexState layout_left
1011                 return (L span ITvocurly)
1012         other -> do
1013                 setContext (Layout offset : ctx)
1014                 return (L span ITvocurly)
1015
1016 do_layout_left span _buf _len = do
1017     popLexState
1018     pushLexState bol  -- we must be at the start of a line
1019     return (L span ITvccurly)
1020
1021 -- -----------------------------------------------------------------------------
1022 -- LINE pragmas
1023
1024 setLine :: Int -> Action
1025 setLine code span buf len = do
1026   let line = parseInteger buf len 10 octDecDigit
1027   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
1028         -- subtract one: the line number refers to the *following* line
1029   popLexState
1030   pushLexState code
1031   lexToken
1032
1033 setFile :: Int -> Action
1034 setFile code span buf len = do
1035   let file = lexemeToFastString (stepOn buf) (len-2)
1036   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1037   popLexState
1038   pushLexState code
1039   lexToken
1040
1041
1042 -- -----------------------------------------------------------------------------
1043 -- Options, includes and language pragmas.
1044
1045 lex_string_prag :: (String -> Token) -> Action
1046 lex_string_prag mkTok span buf len
1047     = do input <- getInput
1048          start <- getSrcLoc
1049          tok <- go [] input
1050          end <- getSrcLoc
1051          return (L (mkSrcSpan start end) tok)
1052     where go acc input
1053               = if isString input "#-}"
1054                    then do setInput input
1055                            return (mkTok (reverse acc))
1056                    else case alexGetChar input of
1057                           Just (c,i) -> go (c:acc) i
1058                           Nothing -> err input
1059           isString i [] = True
1060           isString i (x:xs)
1061               = case alexGetChar i of
1062                   Just (c,i') | c == x    -> isString i' xs
1063                   _other -> False
1064           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1065
1066
1067 -- -----------------------------------------------------------------------------
1068 -- Strings & Chars
1069
1070 -- This stuff is horrible.  I hates it.
1071
1072 lex_string_tok :: Action
1073 lex_string_tok span buf len = do
1074   tok <- lex_string ""
1075   end <- getSrcLoc
1076   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1077
1078 lex_string :: String -> P Token
1079 lex_string s = do
1080   i <- getInput
1081   case alexGetChar' i of
1082     Nothing -> lit_error
1083
1084     Just ('"',i)  -> do
1085         setInput i
1086         glaexts <- extension glaExtsEnabled
1087         if glaexts
1088           then do
1089             i <- getInput
1090             case alexGetChar' i of
1091               Just ('#',i) -> do
1092                    setInput i
1093                    if any (> '\xFF') s
1094                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1095                     else let s' = mkZFastString (reverse s) in
1096                          return (ITprimstring s')
1097                         -- mkZFastString is a hack to avoid encoding the
1098                         -- string in UTF-8.  We just want the exact bytes.
1099               _other ->
1100                 return (ITstring (mkFastString (reverse s)))
1101           else
1102                 return (ITstring (mkFastString (reverse s)))
1103
1104     Just ('\\',i)
1105         | Just ('&',i) <- next -> do
1106                 setInput i; lex_string s
1107         | Just (c,i) <- next, is_space c -> do
1108                 setInput i; lex_stringgap s
1109         where next = alexGetChar' i
1110
1111     Just (c, i) -> do
1112         c' <- lex_char c i
1113         lex_string (c':s)
1114
1115 lex_stringgap s = do
1116   c <- getCharOrFail
1117   case c of
1118     '\\' -> lex_string s
1119     c | is_space c -> lex_stringgap s
1120     _other -> lit_error
1121
1122
1123 lex_char_tok :: Action
1124 -- Here we are basically parsing character literals, such as 'x' or '\n'
1125 -- but, when Template Haskell is on, we additionally spot
1126 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1127 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
1128 -- So we have to do two characters of lookahead: when we see 'x we need to
1129 -- see if there's a trailing quote
1130 lex_char_tok span buf len = do  -- We've seen '
1131    i1 <- getInput       -- Look ahead to first character
1132    let loc = srcSpanStart span
1133    case alexGetChar' i1 of
1134         Nothing -> lit_error
1135
1136         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
1137                   th_exts <- extension thEnabled
1138                   if th_exts then do
1139                         setInput i2
1140                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1141                    else lit_error
1142
1143         Just ('\\', i2@(AI end2 _ _)) -> do     -- We've seen 'backslash
1144                   setInput i2
1145                   lit_ch <- lex_escape
1146                   mc <- getCharOrFail   -- Trailing quote
1147                   if mc == '\'' then finish_char_tok loc lit_ch
1148                                 else do setInput i2; lit_error
1149
1150         Just (c, i2@(AI end2 _ _))
1151                 | not (isAny c) -> lit_error
1152                 | otherwise ->
1153
1154                 -- We've seen 'x, where x is a valid character
1155                 --  (i.e. not newline etc) but not a quote or backslash
1156            case alexGetChar' i2 of      -- Look ahead one more character
1157                 Nothing -> lit_error
1158                 Just ('\'', i3) -> do   -- We've seen 'x'
1159                         setInput i3
1160                         finish_char_tok loc c
1161                 _other -> do            -- We've seen 'x not followed by quote
1162                                         -- If TH is on, just parse the quote only
1163                         th_exts <- extension thEnabled
1164                         let (AI end _ _) = i1
1165                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1166                                    else do setInput i2; lit_error
1167
1168 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1169 finish_char_tok loc ch  -- We've already seen the closing quote
1170                         -- Just need to check for trailing #
1171   = do  glaexts <- extension glaExtsEnabled
1172         i@(AI end _ _) <- getInput
1173         if glaexts then do
1174                 case alexGetChar' i of
1175                         Just ('#',i@(AI end _ _)) -> do
1176                                 setInput i
1177                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1178                         _other ->
1179                                 return (L (mkSrcSpan loc end) (ITchar ch))
1180                 else do
1181                    return (L (mkSrcSpan loc end) (ITchar ch))
1182
1183 lex_char :: Char -> AlexInput -> P Char
1184 lex_char c inp = do
1185   case c of
1186       '\\' -> do setInput inp; lex_escape
1187       c | isAny c -> do setInput inp; return c
1188       _other -> lit_error
1189
1190 isAny c | c > '\xff' = isPrint c
1191         | otherwise  = is_any c
1192
1193 lex_escape :: P Char
1194 lex_escape = do
1195   c <- getCharOrFail
1196   case c of
1197         'a'   -> return '\a'
1198         'b'   -> return '\b'
1199         'f'   -> return '\f'
1200         'n'   -> return '\n'
1201         'r'   -> return '\r'
1202         't'   -> return '\t'
1203         'v'   -> return '\v'
1204         '\\'  -> return '\\'
1205         '"'   -> return '\"'
1206         '\''  -> return '\''
1207         '^'   -> do c <- getCharOrFail
1208                     if c >= '@' && c <= '_'
1209                         then return (chr (ord c - ord '@'))
1210                         else lit_error
1211
1212         'x'   -> readNum is_hexdigit 16 hexDigit
1213         'o'   -> readNum is_octdigit  8 octDecDigit
1214         x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
1215
1216         c1 ->  do
1217            i <- getInput
1218            case alexGetChar' i of
1219             Nothing -> lit_error
1220             Just (c2,i2) ->
1221               case alexGetChar' i2 of
1222                 Nothing -> do setInput i2; lit_error
1223                 Just (c3,i3) ->
1224                    let str = [c1,c2,c3] in
1225                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1226                                      Just rest <- [maybePrefixMatch p str] ] of
1227                           (escape_char,[]):_ -> do
1228                                 setInput i3
1229                                 return escape_char
1230                           (escape_char,_:_):_ -> do
1231                                 setInput i2
1232                                 return escape_char
1233                           [] -> lit_error
1234
1235 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1236 readNum is_digit base conv = do
1237   i <- getInput
1238   c <- getCharOrFail
1239   if is_digit c
1240         then readNum2 is_digit base conv (conv c)
1241         else do setInput i; lit_error
1242
1243 readNum2 is_digit base conv i = do
1244   input <- getInput
1245   read i input
1246   where read i input = do
1247           case alexGetChar' input of
1248             Just (c,input') | is_digit c -> do
1249                 read (i*base + conv c) input'
1250             _other -> do
1251                 if i >= 0 && i <= 0x10FFFF
1252                    then do setInput input; return (chr i)
1253                    else lit_error
1254
1255 silly_escape_chars = [
1256         ("NUL", '\NUL'),
1257         ("SOH", '\SOH'),
1258         ("STX", '\STX'),
1259         ("ETX", '\ETX'),
1260         ("EOT", '\EOT'),
1261         ("ENQ", '\ENQ'),
1262         ("ACK", '\ACK'),
1263         ("BEL", '\BEL'),
1264         ("BS", '\BS'),
1265         ("HT", '\HT'),
1266         ("LF", '\LF'),
1267         ("VT", '\VT'),
1268         ("FF", '\FF'),
1269         ("CR", '\CR'),
1270         ("SO", '\SO'),
1271         ("SI", '\SI'),
1272         ("DLE", '\DLE'),
1273         ("DC1", '\DC1'),
1274         ("DC2", '\DC2'),
1275         ("DC3", '\DC3'),
1276         ("DC4", '\DC4'),
1277         ("NAK", '\NAK'),
1278         ("SYN", '\SYN'),
1279         ("ETB", '\ETB'),
1280         ("CAN", '\CAN'),
1281         ("EM", '\EM'),
1282         ("SUB", '\SUB'),
1283         ("ESC", '\ESC'),
1284         ("FS", '\FS'),
1285         ("GS", '\GS'),
1286         ("RS", '\RS'),
1287         ("US", '\US'),
1288         ("SP", '\SP'),
1289         ("DEL", '\DEL')
1290         ]
1291
1292 -- before calling lit_error, ensure that the current input is pointing to
1293 -- the position of the error in the buffer.  This is so that we can report
1294 -- a correct location to the user, but also so we can detect UTF-8 decoding
1295 -- errors if they occur.
1296 lit_error = lexError "lexical error in string/character literal"
1297
1298 getCharOrFail :: P Char
1299 getCharOrFail =  do
1300   i <- getInput
1301   case alexGetChar' i of
1302         Nothing -> lexError "unexpected end-of-file in string/character literal"
1303         Just (c,i)  -> do setInput i; return c
1304
1305 -- -----------------------------------------------------------------------------
1306 -- Warnings
1307
1308 warn :: DynFlag -> SDoc -> Action
1309 warn option warning span _buf _len = do
1310     addWarning option (mkWarnMsg span alwaysQualify warning)
1311     lexToken
1312
1313 -- -----------------------------------------------------------------------------
1314 -- The Parse Monad
1315
1316 data LayoutContext
1317   = NoLayout
1318   | Layout !Int
1319   deriving Show
1320
1321 data ParseResult a
1322   = POk PState a
1323   | PFailed
1324         SrcSpan         -- The start and end of the text span related to
1325                         -- the error.  Might be used in environments which can
1326                         -- show this span, e.g. by highlighting it.
1327         Message         -- The error message
1328
1329 data PState = PState {
1330         buffer     :: StringBuffer,
1331     dflags     :: DynFlags,
1332     messages   :: Messages,
1333         last_loc   :: SrcSpan,  -- pos of previous token
1334         last_offs  :: !Int,     -- offset of the previous token from the
1335                                 -- beginning of  the current line.
1336                                 -- \t is equal to 8 spaces.
1337         last_len   :: !Int,     -- len of previous token
1338   last_line_len :: !Int,
1339         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1340         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1341         context    :: [LayoutContext],
1342         lex_state  :: [Int]
1343      }
1344         -- last_loc and last_len are used when generating error messages,
1345         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1346         -- current token to happyError, we could at least get rid of last_len.
1347         -- Getting rid of last_loc would require finding another way to
1348         -- implement pushCurrentContext (which is only called from one place).
1349
1350 newtype P a = P { unP :: PState -> ParseResult a }
1351
1352 instance Monad P where
1353   return = returnP
1354   (>>=) = thenP
1355   fail = failP
1356
1357 returnP :: a -> P a
1358 returnP a = P $ \s -> POk s a
1359
1360 thenP :: P a -> (a -> P b) -> P b
1361 (P m) `thenP` k = P $ \ s ->
1362         case m s of
1363                 POk s1 a         -> (unP (k a)) s1
1364                 PFailed span err -> PFailed span err
1365
1366 failP :: String -> P a
1367 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1368
1369 failMsgP :: String -> P a
1370 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1371
1372 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1373 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1374
1375 failSpanMsgP :: SrcSpan -> String -> P a
1376 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1377
1378 extension :: (Int -> Bool) -> P Bool
1379 extension p = P $ \s -> POk s (p $! extsBitmap s)
1380
1381 getExts :: P Int
1382 getExts = P $ \s -> POk s (extsBitmap s)
1383
1384 setSrcLoc :: SrcLoc -> P ()
1385 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1386
1387 getSrcLoc :: P SrcLoc
1388 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1389
1390 setLastToken :: SrcSpan -> Int -> Int -> P ()
1391 setLastToken loc len line_len = P $ \s -> POk s {
1392   last_loc=loc,
1393   last_len=len,
1394   last_line_len=line_len
1395 } ()
1396
1397 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1398
1399 alexInputPrevChar :: AlexInput -> Char
1400 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1401
1402 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1403 alexGetChar (AI loc ofs s)
1404   | atEnd s   = Nothing
1405   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1406                 --trace (show (ord c)) $
1407                 Just (adj_c, (AI loc' ofs' s'))
1408   where (c,s') = nextChar s
1409         loc'   = advanceSrcLoc loc c
1410         ofs'   = advanceOffs c ofs
1411
1412         non_graphic     = '\x0'
1413         upper           = '\x1'
1414         lower           = '\x2'
1415         digit           = '\x3'
1416         symbol          = '\x4'
1417         space           = '\x5'
1418         other_graphic   = '\x6'
1419
1420         adj_c
1421           | c <= '\x06' = non_graphic
1422           | c <= '\xff' = c
1423           | otherwise =
1424                 case generalCategory c of
1425                   UppercaseLetter       -> upper
1426                   LowercaseLetter       -> lower
1427                   TitlecaseLetter       -> upper
1428                   ModifierLetter        -> other_graphic
1429                   OtherLetter           -> other_graphic
1430                   NonSpacingMark        -> other_graphic
1431                   SpacingCombiningMark  -> other_graphic
1432                   EnclosingMark         -> other_graphic
1433                   DecimalNumber         -> digit
1434                   LetterNumber          -> other_graphic
1435                   OtherNumber           -> other_graphic
1436                   ConnectorPunctuation  -> other_graphic
1437                   DashPunctuation       -> other_graphic
1438                   OpenPunctuation       -> other_graphic
1439                   ClosePunctuation      -> other_graphic
1440                   InitialQuote          -> other_graphic
1441                   FinalQuote            -> other_graphic
1442                   OtherPunctuation      -> other_graphic
1443                   MathSymbol            -> symbol
1444                   CurrencySymbol        -> symbol
1445                   ModifierSymbol        -> symbol
1446                   OtherSymbol           -> symbol
1447                   Space                 -> space
1448                   _other                -> non_graphic
1449
1450 -- This version does not squash unicode characters, it is used when
1451 -- lexing strings.
1452 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1453 alexGetChar' (AI loc ofs s)
1454   | atEnd s   = Nothing
1455   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1456                 --trace (show (ord c)) $
1457                 Just (c, (AI loc' ofs' s'))
1458   where (c,s') = nextChar s
1459         loc'   = advanceSrcLoc loc c
1460         ofs'   = advanceOffs c ofs
1461
1462 advanceOffs :: Char -> Int -> Int
1463 advanceOffs '\n' offs = 0
1464 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1465 advanceOffs _    offs = offs + 1
1466
1467 getInput :: P AlexInput
1468 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1469
1470 setInput :: AlexInput -> P ()
1471 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1472
1473 pushLexState :: Int -> P ()
1474 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1475
1476 popLexState :: P Int
1477 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1478
1479 getLexState :: P Int
1480 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1481
1482 -- for reasons of efficiency, flags indicating language extensions (eg,
1483 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1484 -- integer
1485
1486 glaExtsBit, ffiBit, parrBit :: Int
1487 glaExtsBit = 0
1488 ffiBit     = 1
1489 parrBit    = 2
1490 arrowsBit  = 4
1491 thBit      = 5
1492 ipBit      = 6
1493 tvBit      = 7  -- Scoped type variables enables 'forall' keyword
1494 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1495                 -- (doesn't affect the lexer)
1496 idxTysBit  = 9  -- indexed type families: 'family' keyword and kind sigs
1497 haddockBit = 10 -- Lex and parse Haddock comments
1498
1499 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1500 glaExtsEnabled flags = testBit flags glaExtsBit
1501 ffiEnabled     flags = testBit flags ffiBit
1502 parrEnabled    flags = testBit flags parrBit
1503 arrowsEnabled  flags = testBit flags arrowsBit
1504 thEnabled      flags = testBit flags thBit
1505 ipEnabled      flags = testBit flags ipBit
1506 tvEnabled      flags = testBit flags tvBit
1507 bangPatEnabled flags = testBit flags bangPatBit
1508 idxTysEnabled  flags = testBit flags idxTysBit
1509 haddockEnabled flags = testBit flags haddockBit
1510
1511 -- PState for parsing options pragmas
1512 --
1513 pragState :: StringBuffer -> SrcLoc -> PState
1514 pragState buf loc  =
1515   PState {
1516       buffer          = buf,
1517       messages      = emptyMessages,
1518       -- XXX defaultDynFlags is not right, but we don't have a real
1519       -- dflags handy
1520       dflags        = defaultDynFlags,
1521       last_loc      = mkSrcSpan loc loc,
1522       last_offs     = 0,
1523       last_len      = 0,
1524       last_line_len = 0,
1525       loc           = loc,
1526       extsBitmap    = 0,
1527       context       = [],
1528       lex_state     = [bol, option_prags, 0]
1529     }
1530
1531
1532 -- create a parse state
1533 --
1534 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1535 mkPState buf loc flags  =
1536   PState {
1537       buffer          = buf,
1538       dflags        = flags,
1539       messages      = emptyMessages,
1540       last_loc      = mkSrcSpan loc loc,
1541       last_offs     = 0,
1542       last_len      = 0,
1543       last_line_len = 0,
1544       loc           = loc,
1545       extsBitmap    = fromIntegral bitmap,
1546       context       = [],
1547       lex_state     = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1548         -- we begin in the layout state if toplev_layout is set
1549     }
1550     where
1551       bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
1552                .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
1553                .|. parrBit    `setBitIf` dopt Opt_PArr        flags
1554                .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
1555                .|. thBit      `setBitIf` dopt Opt_TH          flags
1556                .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
1557                .|. tvBit      `setBitIf` dopt Opt_ScopedTypeVariables flags
1558                .|. bangPatBit `setBitIf` dopt Opt_BangPatterns flags
1559                .|. idxTysBit  `setBitIf` dopt Opt_IndexedTypes flags
1560                .|. haddockBit `setBitIf` dopt Opt_Haddock     flags
1561       --
1562       setBitIf :: Int -> Bool -> Int
1563       b `setBitIf` cond | cond      = bit b
1564                         | otherwise = 0
1565
1566 addWarning :: DynFlag -> WarnMsg -> P ()
1567 addWarning option w
1568  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1569        let ws' = if dopt option d then ws `snocBag` w else ws
1570        in POk s{messages=(ws', es)} ()
1571
1572 getMessages :: PState -> Messages
1573 getMessages PState{messages=ms} = ms
1574
1575 getContext :: P [LayoutContext]
1576 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1577
1578 setContext :: [LayoutContext] -> P ()
1579 setContext ctx = P $ \s -> POk s{context=ctx} ()
1580
1581 popContext :: P ()
1582 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1583                            loc = loc, last_len = len, last_loc = last_loc }) ->
1584   case ctx of
1585         (_:tl) -> POk s{ context = tl } ()
1586         []     -> PFailed last_loc (srcParseErr buf len)
1587
1588 -- Push a new layout context at the indentation of the last token read.
1589 -- This is only used at the outer level of a module when the 'module'
1590 -- keyword is missing.
1591 pushCurrentContext :: P ()
1592 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1593     POk s{context = Layout (offs-len) : ctx} ()
1594 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1595
1596 getOffside :: P Ordering
1597 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1598                 let ord = case stk of
1599                         (Layout n:_) -> compare offs n
1600                         _            -> GT
1601                 in POk s ord
1602
1603 -- ---------------------------------------------------------------------------
1604 -- Construct a parse error
1605
1606 srcParseErr
1607   :: StringBuffer       -- current buffer (placed just after the last token)
1608   -> Int                -- length of the previous token
1609   -> Message
1610 srcParseErr buf len
1611   = hcat [ if null token
1612              then ptext SLIT("parse error (possibly incorrect indentation)")
1613              else hcat [ptext SLIT("parse error on input "),
1614                         char '`', text token, char '\'']
1615     ]
1616   where token = lexemeToString (offsetBytes (-len) buf) len
1617
1618 -- Report a parse failure, giving the span of the previous token as
1619 -- the location of the error.  This is the entry point for errors
1620 -- detected during parsing.
1621 srcParseFail :: P a
1622 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1623                             last_loc = last_loc } ->
1624     PFailed last_loc (srcParseErr buf len)
1625
1626 -- A lexical error is reported at a particular position in the source file,
1627 -- not over a token range.
1628 lexError :: String -> P a
1629 lexError str = do
1630   loc <- getSrcLoc
1631   i@(AI end _ buf) <- getInput
1632   reportLexError loc end buf str
1633
1634 -- -----------------------------------------------------------------------------
1635 -- This is the top-level function: called from the parser each time a
1636 -- new token is to be read from the input.
1637
1638 lexer :: (Located Token -> P a) -> P a
1639 lexer cont = do
1640   tok@(L span tok__) <- lexToken
1641 --  trace ("token: " ++ show tok__) $ do
1642   cont tok
1643
1644 lexToken :: P (Located Token)
1645 lexToken = do
1646   inp@(AI loc1 _ buf) <- getInput
1647   sc <- getLexState
1648   exts <- getExts
1649   case alexScanUser exts inp sc of
1650     AlexEOF -> do let span = mkSrcSpan loc1 loc1
1651                   setLastToken span 0 0
1652                   return (L span ITeof)
1653     AlexError (AI loc2 _ buf) -> do
1654         reportLexError loc1 loc2 buf "lexical error"
1655     AlexSkip inp2 _ -> do
1656         setInput inp2
1657         lexToken
1658     AlexToken inp2@(AI end _ buf2) len t -> do
1659     setInput inp2
1660     let span = mkSrcSpan loc1 end
1661     let bytes = byteDiff buf buf2
1662     span `seq` setLastToken span bytes bytes
1663     t span buf bytes
1664
1665 reportLexError loc1 loc2 buf str
1666   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1667   | otherwise =
1668   let
1669         c = fst (nextChar buf)
1670   in
1671   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1672     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1673     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1674 }