compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), lexer, pragState, mkPState, PState(..),
  26    P(..), ParseResult(..), getSrcLoc,
  27    failLocMsgP, failSpanMsgP, srcParseFail,
  28    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  29    getLexState, popLexState, pushLexState,
  30    extension, glaExtsEnabled, bangPatEnabled
  31   ) where
  32
  33 #include "HsVersions.h"
  34
  35 import ErrUtils         ( Message )
  36 import Outputable
  37 import StringBuffer
  38 import FastString
  39 import FastTypes
  40 import SrcLoc
  41 import UniqFM
  42 import DynFlags
  43 import Ctype
  44 import Util             ( maybePrefixMatch, readRational )
  45
  46 import Data.Bits
  47 import Data.Char        ( chr, isSpace )
  48 import Data.Ratio
  49 import Debug.Trace
  50
  51 #if __GLASGOW_HASKELL__ >= 605
  52 import Data.Char        ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  53 #else
  54 import Compat.Unicode   ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  55 #endif
  56 }
  57
  58 $unispace    = \x05
  59 $whitechar   = [\ \t\n\r\f\v\xa0 $unispace]
  60 $white_no_nl = $whitechar # \n
  61
  62 $ascdigit  = 0-9
  63 $unidigit  = \x03
  64 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  65 $digit     = [$ascdigit $unidigit]
  66
  67 $special   = [\(\)\,\;\[\]\`\{\}]
  68 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7]
  69 $unisymbol = \x04
  70 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  71
  72 $unilarge  = \x01
  73 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  74 $large     = [$asclarge $unilarge]
  75
  76 $unismall  = \x02
  77 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  78 $small     = [$ascsmall $unismall \_]
  79
  80 $unigraphic = \x06
  81 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  82
  83 $octit     = 0-7
  84 $hexit     = [$decdigit A-F a-f]
  85 $symchar   = [$symbol \:]
  86 $nl        = [\n\r]
  87 $idchar    = [$small $large $digit \']
  88
  89 $docsym    = [\| \^ \* \$]
  90
  91 @varid     = $small $idchar*
  92 @conid     = $large $idchar*
  93
  94 @varsym    = $symbol $symchar*
  95 @consym    = \: $symchar*
  96
  97 @decimal     = $decdigit+
  98 @octal       = $octit+
  99 @hexadecimal = $hexit+
 100 @exponent    = [eE] [\-\+]? @decimal
 101
 102 -- we support the hierarchical module name extension:
 103 @qual = (@conid \.)+
 104
 105 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 106
 107 haskell :-
 108
 109 -- everywhere: skip whitespace and comments
 110 $white_no_nl+                           ;
 111
 112 -- Everywhere: deal with nested comments.  We explicitly rule out
 113 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 114 -- (this can happen even though pragmas will normally take precedence due to
 115 -- longest-match, because pragmas aren't valid in every state, but comments
 116 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 117 -- set.
 118
 119 "{-" / { isNormalComment } { nested_comment lexToken }
 120
 121 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 122 -- more dashes followed by a symbol should be parsed as a varsym, so we
 123 -- have to exclude those.
 124
 125 -- Since Haddock comments aren't valid in every state, we need to rule them
 126 -- out here.
 127
 128 -- The following two rules match comments that begin with two dashes, but
 129 -- continue with a different character. The rules test that this character
 130 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 131 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 132 -- have a Haddock comment). The rules then munch the rest of the line.
 133
 134 "-- " ~$docsym .* ;
 135 "--" [^$symbol : \ ] .* ;
 136
 137 -- Next, match Haddock comments if no -haddock flag
 138
 139 "-- " $docsym .* / { ifExtension (not . haddockEnabled) } ;
 140
 141 -- Now, when we've matched comments that begin with 2 dashes and continue
 142 -- with a different character, we need to match comments that begin with three
 143 -- or more dashes (which clearly can't be Haddock comments). We only need to
 144 -- make sure that the first non-dash character isn't a symbol, and munch the
 145 -- rest of the line.
 146
 147 "---"\-* [^$symbol :] .* ;
 148
 149 -- Since the previous rules all match dashes followed by at least one
 150 -- character, we also need to match a whole line filled with just dashes.
 151
 152 "--"\-* / { atEOL } ;
 153
 154 -- We need this rule since none of the other single line comment rules
 155 -- actually match this case.
 156
 157 "-- " / { atEOL } ;
 158
 159 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 160 -- blank lines) until we find a non-whitespace character, then do layout
 161 -- processing.
 162 --
 163 -- One slight wibble here: what if the line begins with {-#? In
 164 -- theory, we have to lex the pragma to see if it's one we recognise,
 165 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 166 -- as a nested comment.  We don't bother with this: if the line begins
 167 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 168 <bol> {
 169   \n                                    ;
 170   ^\# (line)?                           { begin line_prag1 }
 171   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 172   ^\# \! .* \n                          ; -- #!, for scripts
 173   ()                                    { do_bol }
 174 }
 175
 176 -- after a layout keyword (let, where, do, of), we begin a new layout
 177 -- context if the curly brace is missing.
 178 -- Careful! This stuff is quite delicate.
 179 <layout, layout_do> {
 180   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 181         -- we might encounter {-# here, but {- has been handled already
 182   \n                                    ;
 183   ^\# (line)?                           { begin line_prag1 }
 184 }
 185
 186 -- do is treated in a subtly different way, see new_layout_context
 187 <layout>    ()                          { new_layout_context True }
 188 <layout_do> ()                          { new_layout_context False }
 189
 190 -- after a new layout context which was found to be to the left of the
 191 -- previous context, we have generated a '{' token, and we now need to
 192 -- generate a matching '}' token.
 193 <layout_left>  ()                       { do_layout_left }
 194
 195 <0,option_prags,glaexts> \n                             { begin bol }
 196
 197 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 198
 199 -- single-line line pragmas, of the form
 200 --    # <line> "<file>" <extra-stuff> \n
 201 <line_prag1> $decdigit+                 { setLine line_prag1a }
 202 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 203 <line_prag1b> .*                        { pop }
 204
 205 -- Haskell-style line pragmas, of the form
 206 --    {-# LINE <line> "<file>" #-}
 207 <line_prag2> $decdigit+                 { setLine line_prag2a }
 208 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 209 <line_prag2b> "#-}"|"-}"                { pop }
 210    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 211    -- with older versions of GHC which generated these.
 212
 213 -- We only want RULES pragmas to be picked up when -fglasgow-exts
 214 -- is on, because the contents of the pragma is always written using
 215 -- glasgow-exts syntax (using forall etc.), so if glasgow exts are not
 216 -- enabled, we're sure to get a parse error.
 217 -- (ToDo: we should really emit a warning when ignoring pragmas)
 218 <glaexts>
 219   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 220
 221 <0,option_prags,glaexts> {
 222   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 223   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 224                                         { token (ITinline_prag False) }
 225   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 226                                         { token ITspec_prag }
 227   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 228         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 229   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 230         $whitechar* (NO(T?)INLINE|no(t?)inline)
 231                                         { token (ITspec_inline_prag False) }
 232   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 233   "{-#" $whitechar* (DEPRECATED|deprecated)
 234                                         { token ITdeprecated_prag }
 235   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 236   "{-#" $whitechar* (GENERATED|generated)
 237                                         { token ITgenerated_prag }
 238   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 239   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 240
 241   "{-#" $whitechar* (DOCOPTIONS|docoptions)
 242   / { ifExtension haddockEnabled }     { lex_string_prag ITdocOptions }
 243
 244  "{-#"                                 { nested_comment lexToken }
 245
 246   -- ToDo: should only be valid inside a pragma:
 247   "#-}"                                 { token ITclose_prag}
 248 }
 249
 250 <option_prags> {
 251   "{-#" $whitechar* (OPTIONS|options)   { lex_string_prag IToptions_prag }
 252   "{-#" $whitechar* (OPTIONS_GHC|options_ghc)
 253                                         { lex_string_prag IToptions_prag }
 254   "{-#" $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
 255   "{-#" $whitechar* (INCLUDE|include)   { lex_string_prag ITinclude_prag }
 256 }
 257
 258 <0,option_prags,glaexts> {
 259         -- This is to catch things like {-# OPTIONS OPTIONS_HUGS ...
 260   "{-#" $whitechar* $idchar+            { nested_comment lexToken }
 261 }
 262
 263 -- '0' state: ordinary lexemes
 264 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 265
 266 -- Haddock comments
 267
 268 <0,glaexts> {
 269   "-- " / $docsym    { multiline_doc_comment }
 270   "{-" \ ? / $docsym { nested_doc_comment }
 271 }
 272
 273 -- "special" symbols
 274
 275 <0,glaexts> {
 276   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 277   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 278 }
 279
 280 <0,glaexts> {
 281   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 282   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 283   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 284   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 285   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 286   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 287   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 288   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 289 }
 290
 291 <0,glaexts> {
 292   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 293                                         { special IToparenbar }
 294   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 295 }
 296
 297 <0,glaexts> {
 298   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 299 }
 300
 301 <glaexts> {
 302   "(#" / { notFollowedBySymbol }        { token IToubxparen }
 303   "#)"                                  { token ITcubxparen }
 304   "{|"                                  { token ITocurlybar }
 305   "|}"                                  { token ITccurlybar }
 306 }
 307
 308 <0,option_prags,glaexts> {
 309   \(                                    { special IToparen }
 310   \)                                    { special ITcparen }
 311   \[                                    { special ITobrack }
 312   \]                                    { special ITcbrack }
 313   \,                                    { special ITcomma }
 314   \;                                    { special ITsemi }
 315   \`                                    { special ITbackquote }
 316
 317   \{                                    { open_brace }
 318   \}                                    { close_brace }
 319 }
 320
 321 <0,option_prags,glaexts> {
 322   @qual @varid                  { check_qvarid }
 323   @qual @conid                  { idtoken qconid }
 324   @varid                        { varid }
 325   @conid                        { idtoken conid }
 326 }
 327
 328 -- after an illegal qvarid, such as 'M.let',
 329 -- we back up and try again in the bad_qvarid state:
 330 <bad_qvarid> {
 331   @conid                        { pop_and (idtoken conid) }
 332   @qual @conid                  { pop_and (idtoken qconid) }
 333 }
 334
 335 <glaexts> {
 336   @qual @varid "#"+             { idtoken qvarid }
 337   @qual @conid "#"+             { idtoken qconid }
 338   @varid "#"+                   { varid }
 339   @conid "#"+                   { idtoken conid }
 340 }
 341
 342 -- ToDo: M.(,,,)
 343
 344 <0,glaexts> {
 345   @qual @varsym                 { idtoken qvarsym }
 346   @qual @consym                 { idtoken qconsym }
 347   @varsym                       { varsym }
 348   @consym                       { consym }
 349 }
 350
 351 <0,glaexts> {
 352   @decimal                      { tok_decimal }
 353   0[oO] @octal                  { tok_octal }
 354   0[xX] @hexadecimal            { tok_hexadecimal }
 355 }
 356
 357 <glaexts> {
 358   @decimal \#                   { prim_decimal }
 359   0[oO] @octal \#               { prim_octal }
 360   0[xX] @hexadecimal \#         { prim_hexadecimal }
 361 }
 362
 363 <0,glaexts> @floating_point             { strtoken tok_float }
 364 <glaexts>   @floating_point \#          { init_strtoken 1 prim_float }
 365 <glaexts>   @floating_point \# \#       { init_strtoken 2 prim_double }
 366
 367 -- Strings and chars are lexed by hand-written code.  The reason is
 368 -- that even if we recognise the string or char here in the regex
 369 -- lexer, we would still have to parse the string afterward in order
 370 -- to convert it to a String.
 371 <0,glaexts> {
 372   \'                            { lex_char_tok }
 373   \"                            { lex_string_tok }
 374 }
 375
 376 {
 377 -- work around bug in Alex 2.0
 378 #if __GLASGOW_HASKELL__ < 503
 379 unsafeAt arr i = arr ! i
 380 #endif
 381
 382 -- -----------------------------------------------------------------------------
 383 -- The token type
 384
 385 data Token
 386   = ITas                        -- Haskell keywords
 387   | ITcase
 388   | ITclass
 389   | ITdata
 390   | ITdefault
 391   | ITderiving
 392   | ITderived
 393   | ITdo
 394   | ITelse
 395   | IThiding
 396   | ITif
 397   | ITimport
 398   | ITin
 399   | ITinfix
 400   | ITinfixl
 401   | ITinfixr
 402   | ITinstance
 403   | ITlet
 404   | ITmodule
 405   | ITnewtype
 406   | ITof
 407   | ITqualified
 408   | ITthen
 409   | ITtype
 410   | ITwhere
 411   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 412
 413   | ITforall                    -- GHC extension keywords
 414   | ITforeign
 415   | ITexport
 416   | ITlabel
 417   | ITdynamic
 418   | ITsafe
 419   | ITthreadsafe
 420   | ITunsafe
 421   | ITstdcallconv
 422   | ITccallconv
 423   | ITdotnet
 424   | ITmdo
 425   | ITiso
 426   | ITfamily
 427
 428         -- Pragmas
 429   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 430   | ITspec_prag                 -- SPECIALISE
 431   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 432   | ITsource_prag
 433   | ITrules_prag
 434   | ITdeprecated_prag
 435   | ITline_prag
 436   | ITscc_prag
 437   | ITgenerated_prag
 438   | ITcore_prag                 -- hdaume: core annotations
 439   | ITunpack_prag
 440   | ITclose_prag
 441   | IToptions_prag String
 442   | ITinclude_prag String
 443   | ITlanguage_prag
 444
 445   | ITdotdot                    -- reserved symbols
 446   | ITcolon
 447   | ITdcolon
 448   | ITequal
 449   | ITlam
 450   | ITvbar
 451   | ITlarrow
 452   | ITrarrow
 453   | ITat
 454   | ITtilde
 455   | ITdarrow
 456   | ITminus
 457   | ITbang
 458   | ITstar
 459   | ITdot
 460
 461   | ITbiglam                    -- GHC-extension symbols
 462
 463   | ITocurly                    -- special symbols
 464   | ITccurly
 465   | ITocurlybar                 -- {|, for type applications
 466   | ITccurlybar                 -- |}, for type applications
 467   | ITvocurly
 468   | ITvccurly
 469   | ITobrack
 470   | ITopabrack                  -- [:, for parallel arrays with -fparr
 471   | ITcpabrack                  -- :], for parallel arrays with -fparr
 472   | ITcbrack
 473   | IToparen
 474   | ITcparen
 475   | IToubxparen
 476   | ITcubxparen
 477   | ITsemi
 478   | ITcomma
 479   | ITunderscore
 480   | ITbackquote
 481
 482   | ITvarid   FastString        -- identifiers
 483   | ITconid   FastString
 484   | ITvarsym  FastString
 485   | ITconsym  FastString
 486   | ITqvarid  (FastString,FastString)
 487   | ITqconid  (FastString,FastString)
 488   | ITqvarsym (FastString,FastString)
 489   | ITqconsym (FastString,FastString)
 490
 491   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 492
 493   | ITpragma StringBuffer
 494
 495   | ITchar       Char
 496   | ITstring     FastString
 497   | ITinteger    Integer
 498   | ITrational   Rational
 499
 500   | ITprimchar   Char
 501   | ITprimstring FastString
 502   | ITprimint    Integer
 503   | ITprimfloat  Rational
 504   | ITprimdouble Rational
 505
 506   -- MetaHaskell extension tokens
 507   | ITopenExpQuote              --  [| or [e|
 508   | ITopenPatQuote              --  [p|
 509   | ITopenDecQuote              --  [d|
 510   | ITopenTypQuote              --  [t|
 511   | ITcloseQuote                --  |]
 512   | ITidEscape   FastString     --  $x
 513   | ITparenEscape               --  $(
 514   | ITvarQuote                  --  '
 515   | ITtyQuote                   --  ''
 516
 517   -- Arrow notation extension
 518   | ITproc
 519   | ITrec
 520   | IToparenbar                 --  (|
 521   | ITcparenbar                 --  |)
 522   | ITlarrowtail                --  -<
 523   | ITrarrowtail                --  >-
 524   | ITLarrowtail                --  -<<
 525   | ITRarrowtail                --  >>-
 526
 527   | ITunknown String            -- Used when the lexer can't make sense of it
 528   | ITeof                       -- end of file token
 529
 530   -- Documentation annotations
 531   | ITdocCommentNext  String     -- something beginning '-- |'
 532   | ITdocCommentPrev  String     -- something beginning '-- ^'
 533   | ITdocCommentNamed String     -- something beginning '-- $'
 534   | ITdocSection      Int String -- a section heading
 535   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 536
 537 #ifdef DEBUG
 538   deriving Show -- debugging
 539 #endif
 540
 541 isSpecial :: Token -> Bool
 542 -- If we see M.x, where x is a keyword, but
 543 -- is special, we treat is as just plain M.x,
 544 -- not as a keyword.
 545 isSpecial ITas          = True
 546 isSpecial IThiding      = True
 547 isSpecial ITderived     = True
 548 isSpecial ITqualified   = True
 549 isSpecial ITforall      = True
 550 isSpecial ITexport      = True
 551 isSpecial ITlabel       = True
 552 isSpecial ITdynamic     = True
 553 isSpecial ITsafe        = True
 554 isSpecial ITthreadsafe  = True
 555 isSpecial ITunsafe      = True
 556 isSpecial ITccallconv   = True
 557 isSpecial ITstdcallconv = True
 558 isSpecial ITmdo         = True
 559 isSpecial ITiso         = True
 560 isSpecial ITfamily      = True
 561 isSpecial _             = False
 562
 563 -- the bitmap provided as the third component indicates whether the
 564 -- corresponding extension keyword is valid under the extension options
 565 -- provided to the compiler; if the extension corresponding to *any* of the
 566 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 567 -- facilitates using a keyword in two different extensions that can be
 568 -- activated independently)
 569 --
 570 reservedWordsFM = listToUFM $
 571         map (\(x, y, z) -> (mkFastString x, (y, z)))
 572        [( "_",          ITunderscore,   0 ),
 573         ( "as",         ITas,           0 ),
 574         ( "case",       ITcase,         0 ),
 575         ( "class",      ITclass,        0 ),
 576         ( "data",       ITdata,         0 ),
 577         ( "default",    ITdefault,      0 ),
 578         ( "deriving",   ITderiving,     0 ),
 579         ( "derived",    ITderived,      0 ),
 580         ( "do",         ITdo,           0 ),
 581         ( "else",       ITelse,         0 ),
 582         ( "hiding",     IThiding,       0 ),
 583         ( "if",         ITif,           0 ),
 584         ( "import",     ITimport,       0 ),
 585         ( "in",         ITin,           0 ),
 586         ( "infix",      ITinfix,        0 ),
 587         ( "infixl",     ITinfixl,       0 ),
 588         ( "infixr",     ITinfixr,       0 ),
 589         ( "instance",   ITinstance,     0 ),
 590         ( "let",        ITlet,          0 ),
 591         ( "module",     ITmodule,       0 ),
 592         ( "newtype",    ITnewtype,      0 ),
 593         ( "of",         ITof,           0 ),
 594         ( "qualified",  ITqualified,    0 ),
 595         ( "then",       ITthen,         0 ),
 596         ( "type",       ITtype,         0 ),
 597         ( "where",      ITwhere,        0 ),
 598         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 599
 600         ( "forall",     ITforall,        bit tvBit),
 601         ( "mdo",        ITmdo,           bit glaExtsBit),
 602         ( "family",     ITfamily,        bit idxTysBit),
 603
 604         ( "foreign",    ITforeign,       bit ffiBit),
 605         ( "export",     ITexport,        bit ffiBit),
 606         ( "label",      ITlabel,         bit ffiBit),
 607         ( "dynamic",    ITdynamic,       bit ffiBit),
 608         ( "safe",       ITsafe,          bit ffiBit),
 609         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 610         ( "unsafe",     ITunsafe,        bit ffiBit),
 611         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 612         ( "ccall",      ITccallconv,     bit ffiBit),
 613         ( "dotnet",     ITdotnet,        bit ffiBit),
 614
 615         ( "rec",        ITrec,           bit arrowsBit),
 616         ( "proc",       ITproc,          bit arrowsBit)
 617      ]
 618
 619 reservedSymsFM = listToUFM $
 620         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 621       [ ("..",  ITdotdot,       0)
 622        ,(":",   ITcolon,        0)      -- (:) is a reserved op,
 623                                                 -- meaning only list cons
 624        ,("::",  ITdcolon,       0)
 625        ,("=",   ITequal,        0)
 626        ,("\\",  ITlam,          0)
 627        ,("|",   ITvbar,         0)
 628        ,("<-",  ITlarrow,       0)
 629        ,("->",  ITrarrow,       0)
 630        ,("@",   ITat,           0)
 631        ,("~",   ITtilde,        0)
 632        ,("=>",  ITdarrow,       0)
 633        ,("-",   ITminus,        0)
 634        ,("!",   ITbang,         0)
 635
 636        ,("*",   ITstar,         bit glaExtsBit .|.
 637                                 bit idxTysBit)      -- For data T (a::*) = MkT
 638        ,(".",   ITdot,          bit tvBit)          -- For 'forall a . t'
 639
 640        ,("-<",  ITlarrowtail,   bit arrowsBit)
 641        ,(">-",  ITrarrowtail,   bit arrowsBit)
 642        ,("-<<", ITLarrowtail,   bit arrowsBit)
 643        ,(">>-", ITRarrowtail,   bit arrowsBit)
 644
 645 #if __GLASGOW_HASKELL__ >= 605
 646        ,("λ",  ITlam,          bit glaExtsBit)
 647        ,("∷",   ITdcolon,       bit glaExtsBit)
 648        ,("⇒",   ITdarrow,     bit glaExtsBit)
 649        ,("∀", ITforall,       bit glaExtsBit)
 650        ,("→",   ITrarrow,     bit glaExtsBit)
 651        ,("←",   ITlarrow,     bit glaExtsBit)
 652        ,("⋯",         ITdotdot,       bit glaExtsBit)
 653         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 654         -- form part of a large operator.  This would let us have a better
 655         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 656 #endif
 657        ]
 658
 659 -- -----------------------------------------------------------------------------
 660 -- Lexer actions
 661
 662 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 663
 664 special :: Token -> Action
 665 special tok span _buf len = return (L span tok)
 666
 667 token, layout_token :: Token -> Action
 668 token t span buf len = return (L span t)
 669 layout_token t span buf len = pushLexState layout >> return (L span t)
 670
 671 idtoken :: (StringBuffer -> Int -> Token) -> Action
 672 idtoken f span buf len = return (L span $! (f buf len))
 673
 674 skip_one_varid :: (FastString -> Token) -> Action
 675 skip_one_varid f span buf len
 676   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 677
 678 strtoken :: (String -> Token) -> Action
 679 strtoken f span buf len =
 680   return (L span $! (f $! lexemeToString buf len))
 681
 682 init_strtoken :: Int -> (String -> Token) -> Action
 683 -- like strtoken, but drops the last N character(s)
 684 init_strtoken drop f span buf len =
 685   return (L span $! (f $! lexemeToString buf (len-drop)))
 686
 687 begin :: Int -> Action
 688 begin code _span _str _len = do pushLexState code; lexToken
 689
 690 pop :: Action
 691 pop _span _buf _len = do popLexState; lexToken
 692
 693 pop_and :: Action -> Action
 694 pop_and act span buf len = do popLexState; act span buf len
 695
 696 {-# INLINE nextCharIs #-}
 697 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 698
 699 notFollowedBy char _ _ _ (AI _ _ buf)
 700   = nextCharIs buf (/=char)
 701
 702 notFollowedBySymbol _ _ _ (AI _ _ buf)
 703   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 704
 705 isNormalComment bits _ _ (AI _ _ buf)
 706   | haddockEnabled bits = notFollowedByDocOrPragma
 707   | otherwise           = nextCharIs buf (/='#')
 708   where
 709     notFollowedByDocOrPragma
 710         = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 711
 712 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 713
 714 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 715   = if haddockEnabled bits then False else (p buf)
 716
 717 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 718
 719 ifExtension pred bits _ _ _ = pred bits
 720
 721 multiline_doc_comment :: Action
 722 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 723   where
 724     worker commentAcc input docType oneLine = case alexGetChar input of
 725       Just ('\n', input')
 726         | oneLine -> docCommentEnd input commentAcc docType buf span
 727         | otherwise -> case checkIfCommentLine input' of
 728           Just input -> worker ('\n':commentAcc) input docType False
 729           Nothing -> docCommentEnd input commentAcc docType buf span
 730       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 731       Nothing -> docCommentEnd input commentAcc docType buf span
 732
 733     checkIfCommentLine input = check (dropNonNewlineSpace input)
 734       where
 735         check input = case alexGetChar input of
 736           Just ('-', input) -> case alexGetChar input of
 737             Just ('-', input) -> case alexGetChar input of
 738               Just (c, _) | c /= '-' -> Just input
 739               _ -> Nothing
 740             _ -> Nothing
 741           _ -> Nothing
 742
 743         dropNonNewlineSpace input = case alexGetChar input of
 744           Just (c, input')
 745             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 746             | otherwise -> input
 747           Nothing -> input
 748
 749 {-
 750   nested comments require traversing by hand, they can't be parsed
 751   using regular expressions.
 752 -}
 753 nested_comment :: P (Located Token) -> Action
 754 nested_comment cont span _str _len = do
 755   input <- getInput
 756   go 1 input
 757   where
 758     go 0 input = do setInput input; cont
 759     go n input = case alexGetChar input of
 760       Nothing -> errBrace input span
 761       Just ('-',input) -> case alexGetChar input of
 762         Nothing  -> errBrace input span
 763         Just ('\125',input) -> go (n-1) input
 764         Just (c,_)          -> go n input
 765       Just ('\123',input) -> case alexGetChar input of
 766         Nothing  -> errBrace input span
 767         Just ('-',input) -> go (n+1) input
 768         Just (c,_)       -> go n input
 769       Just (c,input) -> go n input
 770
 771 nested_doc_comment :: Action
 772 nested_doc_comment span buf _len = withLexedDocType (go "")
 773   where
 774     go commentAcc input docType _ = case alexGetChar input of
 775       Nothing -> errBrace input span
 776       Just ('-',input) -> case alexGetChar input of
 777         Nothing -> errBrace input span
 778         Just ('\125',input@(AI end _ buf2)) ->
 779           docCommentEnd input commentAcc docType buf span
 780         Just (c,_) -> go ('-':commentAcc) input docType False
 781       Just ('\123', input) -> case alexGetChar input of
 782         Nothing  -> errBrace input span
 783         Just ('-',input) -> do
 784           setInput input
 785           let cont = do input <- getInput; go commentAcc input docType False
 786           nested_comment cont span buf _len
 787         Just (c,_) -> go ('\123':commentAcc) input docType False
 788       Just (c,input) -> go (c:commentAcc) input docType False
 789
 790 withLexedDocType lexDocComment = do
 791   input <- getInput
 792   case alexGetChar input of
 793     Nothing -> error "Can't happen"
 794     Just ('|', input) -> lexDocComment input ITdocCommentNext False
 795     Just ('^', input) -> lexDocComment input ITdocCommentPrev False
 796     Just ('$', input) -> lexDocComment input ITdocCommentNamed False
 797     Just ('*', input) -> lexDocSection 1 input
 798  where
 799     lexDocSection n input = case alexGetChar input of
 800       Just ('*', input) -> lexDocSection (n+1) input
 801       Just (c, _) -> lexDocComment input (ITdocSection n) True
 802       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 803
 804 -- docCommentEnd
 805 -------------------------------------------------------------------------------
 806 -- This function is quite tricky. We can't just return a new token, we also
 807 -- need to update the state of the parser. Why? Because the token is longer
 808 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 809 -- it writes the wrong token length to the parser state. This function is
 810 -- called afterwards, so it can just update the state.
 811
 812 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 813 -- which is something that the original lexer didn't account for.
 814 -- I have added last_line_len in the parser state which represents the length
 815 -- of the part of the token that is on the last line. It is now used for layout
 816 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 817 -- was before, the full length of the token, and it is now only used for error
 818 -- messages. /Waern
 819
 820 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 821                  SrcSpan -> P (Located Token)
 822 docCommentEnd input commentAcc docType buf span = do
 823   setInput input
 824   let (AI loc last_offs nextBuf) = input
 825       comment = reverse commentAcc
 826       span' = mkSrcSpan (srcSpanStart span) loc
 827       last_len = byteDiff buf nextBuf
 828
 829       last_line_len = if (last_offs - last_len < 0)
 830         then last_offs
 831         else last_len
 832
 833   span `seq` setLastToken span' last_len last_line_len
 834   return (L span' (docType comment))
 835
 836 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 837
 838 open_brace, close_brace :: Action
 839 open_brace span _str _len = do
 840   ctx <- getContext
 841   setContext (NoLayout:ctx)
 842   return (L span ITocurly)
 843 close_brace span _str _len = do
 844   popContext
 845   return (L span ITccurly)
 846
 847 -- We have to be careful not to count M.<varid> as a qualified name
 848 -- when <varid> is a keyword.  We hack around this by catching
 849 -- the offending tokens afterward, and re-lexing in a different state.
 850 check_qvarid span buf len = do
 851   case lookupUFM reservedWordsFM var of
 852         Just (keyword,exts)
 853           | not (isSpecial keyword) ->
 854           if exts == 0
 855              then try_again
 856              else do
 857                 b <- extension (\i -> exts .&. i /= 0)
 858                 if b then try_again
 859                      else return token
 860         _other -> return token
 861   where
 862         (mod,var) = splitQualName buf len
 863         token     = L span (ITqvarid (mod,var))
 864
 865         try_again = do
 866                 (AI _ offs _) <- getInput
 867                 setInput (AI (srcSpanStart span) (offs-len) buf)
 868                 pushLexState bad_qvarid
 869                 lexToken
 870
 871 qvarid buf len = ITqvarid $! splitQualName buf len
 872 qconid buf len = ITqconid $! splitQualName buf len
 873
 874 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 875 -- takes a StringBuffer and a length, and returns the module name
 876 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 877 -- because of hierarchical module names.
 878 splitQualName orig_buf len = split orig_buf orig_buf
 879   where
 880     split buf dot_buf
 881         | orig_buf `byteDiff` buf >= len  = done dot_buf
 882         | c == '.'                        = found_dot buf'
 883         | otherwise                       = split buf' dot_buf
 884       where
 885        (c,buf') = nextChar buf
 886
 887     -- careful, we might get names like M....
 888     -- so, if the character after the dot is not upper-case, this is
 889     -- the end of the qualifier part.
 890     found_dot buf -- buf points after the '.'
 891         | isUpper c    = split buf' buf
 892         | otherwise    = done buf
 893       where
 894        (c,buf') = nextChar buf
 895
 896     done dot_buf =
 897         (lexemeToFastString orig_buf (qual_size - 1),
 898          lexemeToFastString dot_buf (len - qual_size))
 899       where
 900         qual_size = orig_buf `byteDiff` dot_buf
 901
 902 varid span buf len =
 903   case lookupUFM reservedWordsFM fs of
 904         Just (keyword,0)    -> do
 905                 maybe_layout keyword
 906                 return (L span keyword)
 907         Just (keyword,exts) -> do
 908                 b <- extension (\i -> exts .&. i /= 0)
 909                 if b then do maybe_layout keyword
 910                              return (L span keyword)
 911                      else return (L span (ITvarid fs))
 912         _other -> return (L span (ITvarid fs))
 913   where
 914         fs = lexemeToFastString buf len
 915
 916 conid buf len = ITconid fs
 917   where fs = lexemeToFastString buf len
 918
 919 qvarsym buf len = ITqvarsym $! splitQualName buf len
 920 qconsym buf len = ITqconsym $! splitQualName buf len
 921
 922 varsym = sym ITvarsym
 923 consym = sym ITconsym
 924
 925 sym con span buf len =
 926   case lookupUFM reservedSymsFM fs of
 927         Just (keyword,0)    -> return (L span keyword)
 928         Just (keyword,exts) -> do
 929                 b <- extension (\i -> exts .&. i /= 0)
 930                 if b then return (L span keyword)
 931                      else return (L span $! con fs)
 932         _other -> return (L span $! con fs)
 933   where
 934         fs = lexemeToFastString buf len
 935
 936 tok_decimal span buf len
 937   = return (L span (ITinteger  $! parseInteger buf len 10 octDecDigit))
 938
 939 tok_octal span buf len
 940   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 8 octDecDigit))
 941
 942 tok_hexadecimal span buf len
 943   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 16 hexDigit))
 944
 945 prim_decimal span buf len
 946   = return (L span (ITprimint  $! parseInteger buf (len-1) 10 octDecDigit))
 947
 948 prim_octal span buf len
 949   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 8 octDecDigit))
 950
 951 prim_hexadecimal span buf len
 952   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 16 hexDigit))
 953
 954 tok_float        str = ITrational   $! readRational str
 955 prim_float       str = ITprimfloat  $! readRational str
 956 prim_double      str = ITprimdouble $! readRational str
 957
 958 -- -----------------------------------------------------------------------------
 959 -- Layout processing
 960
 961 -- we're at the first token on a line, insert layout tokens if necessary
 962 do_bol :: Action
 963 do_bol span _str _len = do
 964         pos <- getOffside
 965         case pos of
 966             LT -> do
 967                 --trace "layout: inserting '}'" $ do
 968                 popContext
 969                 -- do NOT pop the lex state, we might have a ';' to insert
 970                 return (L span ITvccurly)
 971             EQ -> do
 972                 --trace "layout: inserting ';'" $ do
 973                 popLexState
 974                 return (L span ITsemi)
 975             GT -> do
 976                 popLexState
 977                 lexToken
 978
 979 -- certain keywords put us in the "layout" state, where we might
 980 -- add an opening curly brace.
 981 maybe_layout ITdo       = pushLexState layout_do
 982 maybe_layout ITmdo      = pushLexState layout_do
 983 maybe_layout ITof       = pushLexState layout
 984 maybe_layout ITlet      = pushLexState layout
 985 maybe_layout ITwhere    = pushLexState layout
 986 maybe_layout ITrec      = pushLexState layout
 987 maybe_layout _          = return ()
 988
 989 -- Pushing a new implicit layout context.  If the indentation of the
 990 -- next token is not greater than the previous layout context, then
 991 -- Haskell 98 says that the new layout context should be empty; that is
 992 -- the lexer must generate {}.
 993 --
 994 -- We are slightly more lenient than this: when the new context is started
 995 -- by a 'do', then we allow the new context to be at the same indentation as
 996 -- the previous context.  This is what the 'strict' argument is for.
 997 --
 998 new_layout_context strict span _buf _len = do
 999     popLexState
1000     (AI _ offset _) <- getInput
1001     ctx <- getContext
1002     case ctx of
1003         Layout prev_off : _  |
1004            (strict     && prev_off >= offset  ||
1005             not strict && prev_off > offset) -> do
1006                 -- token is indented to the left of the previous context.
1007                 -- we must generate a {} sequence now.
1008                 pushLexState layout_left
1009                 return (L span ITvocurly)
1010         other -> do
1011                 setContext (Layout offset : ctx)
1012                 return (L span ITvocurly)
1013
1014 do_layout_left span _buf _len = do
1015     popLexState
1016     pushLexState bol  -- we must be at the start of a line
1017     return (L span ITvccurly)
1018
1019 -- -----------------------------------------------------------------------------
1020 -- LINE pragmas
1021
1022 setLine :: Int -> Action
1023 setLine code span buf len = do
1024   let line = parseInteger buf len 10 octDecDigit
1025   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
1026         -- subtract one: the line number refers to the *following* line
1027   popLexState
1028   pushLexState code
1029   lexToken
1030
1031 setFile :: Int -> Action
1032 setFile code span buf len = do
1033   let file = lexemeToFastString (stepOn buf) (len-2)
1034   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1035   popLexState
1036   pushLexState code
1037   lexToken
1038
1039
1040 -- -----------------------------------------------------------------------------
1041 -- Options, includes and language pragmas.
1042
1043 lex_string_prag :: (String -> Token) -> Action
1044 lex_string_prag mkTok span buf len
1045     = do input <- getInput
1046          start <- getSrcLoc
1047          tok <- go [] input
1048          end <- getSrcLoc
1049          return (L (mkSrcSpan start end) tok)
1050     where go acc input
1051               = if isString input "#-}"
1052                    then do setInput input
1053                            return (mkTok (reverse acc))
1054                    else case alexGetChar input of
1055                           Just (c,i) -> go (c:acc) i
1056                           Nothing -> err input
1057           isString i [] = True
1058           isString i (x:xs)
1059               = case alexGetChar i of
1060                   Just (c,i') | c == x    -> isString i' xs
1061                   _other -> False
1062           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1063
1064
1065 -- -----------------------------------------------------------------------------
1066 -- Strings & Chars
1067
1068 -- This stuff is horrible.  I hates it.
1069
1070 lex_string_tok :: Action
1071 lex_string_tok span buf len = do
1072   tok <- lex_string ""
1073   end <- getSrcLoc
1074   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1075
1076 lex_string :: String -> P Token
1077 lex_string s = do
1078   i <- getInput
1079   case alexGetChar' i of
1080     Nothing -> lit_error
1081
1082     Just ('"',i)  -> do
1083         setInput i
1084         glaexts <- extension glaExtsEnabled
1085         if glaexts
1086           then do
1087             i <- getInput
1088             case alexGetChar' i of
1089               Just ('#',i) -> do
1090                    setInput i
1091                    if any (> '\xFF') s
1092                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1093                     else let s' = mkZFastString (reverse s) in
1094                          return (ITprimstring s')
1095                         -- mkZFastString is a hack to avoid encoding the
1096                         -- string in UTF-8.  We just want the exact bytes.
1097               _other ->
1098                 return (ITstring (mkFastString (reverse s)))
1099           else
1100                 return (ITstring (mkFastString (reverse s)))
1101
1102     Just ('\\',i)
1103         | Just ('&',i) <- next -> do
1104                 setInput i; lex_string s
1105         | Just (c,i) <- next, is_space c -> do
1106                 setInput i; lex_stringgap s
1107         where next = alexGetChar' i
1108
1109     Just (c, i) -> do
1110         c' <- lex_char c i
1111         lex_string (c':s)
1112
1113 lex_stringgap s = do
1114   c <- getCharOrFail
1115   case c of
1116     '\\' -> lex_string s
1117     c | is_space c -> lex_stringgap s
1118     _other -> lit_error
1119
1120
1121 lex_char_tok :: Action
1122 -- Here we are basically parsing character literals, such as 'x' or '\n'
1123 -- but, when Template Haskell is on, we additionally spot
1124 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1125 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
1126 -- So we have to do two characters of lookahead: when we see 'x we need to
1127 -- see if there's a trailing quote
1128 lex_char_tok span buf len = do  -- We've seen '
1129    i1 <- getInput       -- Look ahead to first character
1130    let loc = srcSpanStart span
1131    case alexGetChar' i1 of
1132         Nothing -> lit_error
1133
1134         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
1135                   th_exts <- extension thEnabled
1136                   if th_exts then do
1137                         setInput i2
1138                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1139                    else lit_error
1140
1141         Just ('\\', i2@(AI end2 _ _)) -> do     -- We've seen 'backslash
1142                   setInput i2
1143                   lit_ch <- lex_escape
1144                   mc <- getCharOrFail   -- Trailing quote
1145                   if mc == '\'' then finish_char_tok loc lit_ch
1146                                 else do setInput i2; lit_error
1147
1148         Just (c, i2@(AI end2 _ _))
1149                 | not (isAny c) -> lit_error
1150                 | otherwise ->
1151
1152                 -- We've seen 'x, where x is a valid character
1153                 --  (i.e. not newline etc) but not a quote or backslash
1154            case alexGetChar' i2 of      -- Look ahead one more character
1155                 Nothing -> lit_error
1156                 Just ('\'', i3) -> do   -- We've seen 'x'
1157                         setInput i3
1158                         finish_char_tok loc c
1159                 _other -> do            -- We've seen 'x not followed by quote
1160                                         -- If TH is on, just parse the quote only
1161                         th_exts <- extension thEnabled
1162                         let (AI end _ _) = i1
1163                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1164                                    else do setInput i2; lit_error
1165
1166 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1167 finish_char_tok loc ch  -- We've already seen the closing quote
1168                         -- Just need to check for trailing #
1169   = do  glaexts <- extension glaExtsEnabled
1170         i@(AI end _ _) <- getInput
1171         if glaexts then do
1172                 case alexGetChar' i of
1173                         Just ('#',i@(AI end _ _)) -> do
1174                                 setInput i
1175                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1176                         _other ->
1177                                 return (L (mkSrcSpan loc end) (ITchar ch))
1178                 else do
1179                    return (L (mkSrcSpan loc end) (ITchar ch))
1180
1181 lex_char :: Char -> AlexInput -> P Char
1182 lex_char c inp = do
1183   case c of
1184       '\\' -> do setInput inp; lex_escape
1185       c | isAny c -> do setInput inp; return c
1186       _other -> lit_error
1187
1188 isAny c | c > '\xff' = isPrint c
1189         | otherwise  = is_any c
1190
1191 lex_escape :: P Char
1192 lex_escape = do
1193   c <- getCharOrFail
1194   case c of
1195         'a'   -> return '\a'
1196         'b'   -> return '\b'
1197         'f'   -> return '\f'
1198         'n'   -> return '\n'
1199         'r'   -> return '\r'
1200         't'   -> return '\t'
1201         'v'   -> return '\v'
1202         '\\'  -> return '\\'
1203         '"'   -> return '\"'
1204         '\''  -> return '\''
1205         '^'   -> do c <- getCharOrFail
1206                     if c >= '@' && c <= '_'
1207                         then return (chr (ord c - ord '@'))
1208                         else lit_error
1209
1210         'x'   -> readNum is_hexdigit 16 hexDigit
1211         'o'   -> readNum is_octdigit  8 octDecDigit
1212         x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
1213
1214         c1 ->  do
1215            i <- getInput
1216            case alexGetChar' i of
1217             Nothing -> lit_error
1218             Just (c2,i2) ->
1219               case alexGetChar' i2 of
1220                 Nothing -> do setInput i2; lit_error
1221                 Just (c3,i3) ->
1222                    let str = [c1,c2,c3] in
1223                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1224                                      Just rest <- [maybePrefixMatch p str] ] of
1225                           (escape_char,[]):_ -> do
1226                                 setInput i3
1227                                 return escape_char
1228                           (escape_char,_:_):_ -> do
1229                                 setInput i2
1230                                 return escape_char
1231                           [] -> lit_error
1232
1233 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1234 readNum is_digit base conv = do
1235   i <- getInput
1236   c <- getCharOrFail
1237   if is_digit c
1238         then readNum2 is_digit base conv (conv c)
1239         else do setInput i; lit_error
1240
1241 readNum2 is_digit base conv i = do
1242   input <- getInput
1243   read i input
1244   where read i input = do
1245           case alexGetChar' input of
1246             Just (c,input') | is_digit c -> do
1247                 read (i*base + conv c) input'
1248             _other -> do
1249                 if i >= 0 && i <= 0x10FFFF
1250                    then do setInput input; return (chr i)
1251                    else lit_error
1252
1253 silly_escape_chars = [
1254         ("NUL", '\NUL'),
1255         ("SOH", '\SOH'),
1256         ("STX", '\STX'),
1257         ("ETX", '\ETX'),
1258         ("EOT", '\EOT'),
1259         ("ENQ", '\ENQ'),
1260         ("ACK", '\ACK'),
1261         ("BEL", '\BEL'),
1262         ("BS", '\BS'),
1263         ("HT", '\HT'),
1264         ("LF", '\LF'),
1265         ("VT", '\VT'),
1266         ("FF", '\FF'),
1267         ("CR", '\CR'),
1268         ("SO", '\SO'),
1269         ("SI", '\SI'),
1270         ("DLE", '\DLE'),
1271         ("DC1", '\DC1'),
1272         ("DC2", '\DC2'),
1273         ("DC3", '\DC3'),
1274         ("DC4", '\DC4'),
1275         ("NAK", '\NAK'),
1276         ("SYN", '\SYN'),
1277         ("ETB", '\ETB'),
1278         ("CAN", '\CAN'),
1279         ("EM", '\EM'),
1280         ("SUB", '\SUB'),
1281         ("ESC", '\ESC'),
1282         ("FS", '\FS'),
1283         ("GS", '\GS'),
1284         ("RS", '\RS'),
1285         ("US", '\US'),
1286         ("SP", '\SP'),
1287         ("DEL", '\DEL')
1288         ]
1289
1290 -- before calling lit_error, ensure that the current input is pointing to
1291 -- the position of the error in the buffer.  This is so that we can report
1292 -- a correct location to the user, but also so we can detect UTF-8 decoding
1293 -- errors if they occur.
1294 lit_error = lexError "lexical error in string/character literal"
1295
1296 getCharOrFail :: P Char
1297 getCharOrFail =  do
1298   i <- getInput
1299   case alexGetChar' i of
1300         Nothing -> lexError "unexpected end-of-file in string/character literal"
1301         Just (c,i)  -> do setInput i; return c
1302
1303 -- -----------------------------------------------------------------------------
1304 -- The Parse Monad
1305
1306 data LayoutContext
1307   = NoLayout
1308   | Layout !Int
1309   deriving Show
1310
1311 data ParseResult a
1312   = POk PState a
1313   | PFailed
1314         SrcSpan         -- The start and end of the text span related to
1315                         -- the error.  Might be used in environments which can
1316                         -- show this span, e.g. by highlighting it.
1317         Message         -- The error message
1318
1319 data PState = PState {
1320         buffer     :: StringBuffer,
1321         last_loc   :: SrcSpan,  -- pos of previous token
1322         last_offs  :: !Int,     -- offset of the previous token from the
1323                                 -- beginning of  the current line.
1324                                 -- \t is equal to 8 spaces.
1325         last_len   :: !Int,     -- len of previous token
1326   last_line_len :: !Int,
1327         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1328         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1329         context    :: [LayoutContext],
1330         lex_state  :: [Int]
1331      }
1332         -- last_loc and last_len are used when generating error messages,
1333         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1334         -- current token to happyError, we could at least get rid of last_len.
1335         -- Getting rid of last_loc would require finding another way to
1336         -- implement pushCurrentContext (which is only called from one place).
1337
1338 newtype P a = P { unP :: PState -> ParseResult a }
1339
1340 instance Monad P where
1341   return = returnP
1342   (>>=) = thenP
1343   fail = failP
1344
1345 returnP :: a -> P a
1346 returnP a = P $ \s -> POk s a
1347
1348 thenP :: P a -> (a -> P b) -> P b
1349 (P m) `thenP` k = P $ \ s ->
1350         case m s of
1351                 POk s1 a         -> (unP (k a)) s1
1352                 PFailed span err -> PFailed span err
1353
1354 failP :: String -> P a
1355 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1356
1357 failMsgP :: String -> P a
1358 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1359
1360 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1361 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1362
1363 failSpanMsgP :: SrcSpan -> String -> P a
1364 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1365
1366 extension :: (Int -> Bool) -> P Bool
1367 extension p = P $ \s -> POk s (p $! extsBitmap s)
1368
1369 getExts :: P Int
1370 getExts = P $ \s -> POk s (extsBitmap s)
1371
1372 setSrcLoc :: SrcLoc -> P ()
1373 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1374
1375 getSrcLoc :: P SrcLoc
1376 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1377
1378 setLastToken :: SrcSpan -> Int -> Int -> P ()
1379 setLastToken loc len line_len = P $ \s -> POk s {
1380   last_loc=loc,
1381   last_len=len,
1382   last_line_len=line_len
1383 } ()
1384
1385 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1386
1387 alexInputPrevChar :: AlexInput -> Char
1388 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1389
1390 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1391 alexGetChar (AI loc ofs s)
1392   | atEnd s   = Nothing
1393   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1394                 --trace (show (ord c)) $
1395                 Just (adj_c, (AI loc' ofs' s'))
1396   where (c,s') = nextChar s
1397         loc'   = advanceSrcLoc loc c
1398         ofs'   = advanceOffs c ofs
1399
1400         non_graphic     = '\x0'
1401         upper           = '\x1'
1402         lower           = '\x2'
1403         digit           = '\x3'
1404         symbol          = '\x4'
1405         space           = '\x5'
1406         other_graphic   = '\x6'
1407
1408         adj_c
1409           | c <= '\x06' = non_graphic
1410           | c <= '\xff' = c
1411           | otherwise =
1412                 case generalCategory c of
1413                   UppercaseLetter       -> upper
1414                   LowercaseLetter       -> lower
1415                   TitlecaseLetter       -> upper
1416                   ModifierLetter        -> other_graphic
1417                   OtherLetter           -> other_graphic
1418                   NonSpacingMark        -> other_graphic
1419                   SpacingCombiningMark  -> other_graphic
1420                   EnclosingMark         -> other_graphic
1421                   DecimalNumber         -> digit
1422                   LetterNumber          -> other_graphic
1423                   OtherNumber           -> other_graphic
1424                   ConnectorPunctuation  -> other_graphic
1425                   DashPunctuation       -> other_graphic
1426                   OpenPunctuation       -> other_graphic
1427                   ClosePunctuation      -> other_graphic
1428                   InitialQuote          -> other_graphic
1429                   FinalQuote            -> other_graphic
1430                   OtherPunctuation      -> other_graphic
1431                   MathSymbol            -> symbol
1432                   CurrencySymbol        -> symbol
1433                   ModifierSymbol        -> symbol
1434                   OtherSymbol           -> symbol
1435                   Space                 -> space
1436                   _other                -> non_graphic
1437
1438 -- This version does not squash unicode characters, it is used when
1439 -- lexing strings.
1440 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1441 alexGetChar' (AI loc ofs s)
1442   | atEnd s   = Nothing
1443   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1444                 --trace (show (ord c)) $
1445                 Just (c, (AI loc' ofs' s'))
1446   where (c,s') = nextChar s
1447         loc'   = advanceSrcLoc loc c
1448         ofs'   = advanceOffs c ofs
1449
1450 advanceOffs :: Char -> Int -> Int
1451 advanceOffs '\n' offs = 0
1452 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1453 advanceOffs _    offs = offs + 1
1454
1455 getInput :: P AlexInput
1456 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1457
1458 setInput :: AlexInput -> P ()
1459 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1460
1461 pushLexState :: Int -> P ()
1462 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1463
1464 popLexState :: P Int
1465 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1466
1467 getLexState :: P Int
1468 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1469
1470 -- for reasons of efficiency, flags indicating language extensions (eg,
1471 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1472 -- integer
1473
1474 glaExtsBit, ffiBit, parrBit :: Int
1475 glaExtsBit = 0
1476 ffiBit     = 1
1477 parrBit    = 2
1478 arrowsBit  = 4
1479 thBit      = 5
1480 ipBit      = 6
1481 tvBit      = 7  -- Scoped type variables enables 'forall' keyword
1482 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1483                 -- (doesn't affect the lexer)
1484 idxTysBit  = 9  -- indexed type families: 'family' keyword and kind sigs
1485 haddockBit = 10 -- Lex and parse Haddock comments
1486
1487 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1488 glaExtsEnabled flags = testBit flags glaExtsBit
1489 ffiEnabled     flags = testBit flags ffiBit
1490 parrEnabled    flags = testBit flags parrBit
1491 arrowsEnabled  flags = testBit flags arrowsBit
1492 thEnabled      flags = testBit flags thBit
1493 ipEnabled      flags = testBit flags ipBit
1494 tvEnabled      flags = testBit flags tvBit
1495 bangPatEnabled flags = testBit flags bangPatBit
1496 idxTysEnabled  flags = testBit flags idxTysBit
1497 haddockEnabled flags = testBit flags haddockBit
1498
1499 -- PState for parsing options pragmas
1500 --
1501 pragState :: StringBuffer -> SrcLoc -> PState
1502 pragState buf loc  =
1503   PState {
1504       buffer          = buf,
1505       last_loc      = mkSrcSpan loc loc,
1506       last_offs     = 0,
1507       last_len      = 0,
1508       last_line_len = 0,
1509       loc           = loc,
1510       extsBitmap    = 0,
1511       context       = [],
1512       lex_state     = [bol, option_prags, 0]
1513     }
1514
1515
1516 -- create a parse state
1517 --
1518 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1519 mkPState buf loc flags  =
1520   PState {
1521       buffer          = buf,
1522       last_loc      = mkSrcSpan loc loc,
1523       last_offs     = 0,
1524       last_len      = 0,
1525       last_line_len = 0,
1526       loc           = loc,
1527       extsBitmap    = fromIntegral bitmap,
1528       context       = [],
1529       lex_state     = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1530         -- we begin in the layout state if toplev_layout is set
1531     }
1532     where
1533       bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
1534                .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
1535                .|. parrBit    `setBitIf` dopt Opt_PArr        flags
1536                .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
1537                .|. thBit      `setBitIf` dopt Opt_TH          flags
1538                .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
1539                .|. tvBit      `setBitIf` dopt Opt_ScopedTypeVariables flags
1540                .|. bangPatBit `setBitIf` dopt Opt_BangPatterns flags
1541                .|. idxTysBit  `setBitIf` dopt Opt_IndexedTypes flags
1542                .|. haddockBit `setBitIf` dopt Opt_Haddock     flags
1543       --
1544       setBitIf :: Int -> Bool -> Int
1545       b `setBitIf` cond | cond      = bit b
1546                         | otherwise = 0
1547
1548 getContext :: P [LayoutContext]
1549 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1550
1551 setContext :: [LayoutContext] -> P ()
1552 setContext ctx = P $ \s -> POk s{context=ctx} ()
1553
1554 popContext :: P ()
1555 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1556                            loc = loc, last_len = len, last_loc = last_loc }) ->
1557   case ctx of
1558         (_:tl) -> POk s{ context = tl } ()
1559         []     -> PFailed last_loc (srcParseErr buf len)
1560
1561 -- Push a new layout context at the indentation of the last token read.
1562 -- This is only used at the outer level of a module when the 'module'
1563 -- keyword is missing.
1564 pushCurrentContext :: P ()
1565 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1566     POk s{context = Layout (offs-len) : ctx} ()
1567 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1568
1569 getOffside :: P Ordering
1570 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1571                 let ord = case stk of
1572                         (Layout n:_) -> compare offs n
1573                         _            -> GT
1574                 in POk s ord
1575
1576 -- ---------------------------------------------------------------------------
1577 -- Construct a parse error
1578
1579 srcParseErr
1580   :: StringBuffer       -- current buffer (placed just after the last token)
1581   -> Int                -- length of the previous token
1582   -> Message
1583 srcParseErr buf len
1584   = hcat [ if null token
1585              then ptext SLIT("parse error (possibly incorrect indentation)")
1586              else hcat [ptext SLIT("parse error on input "),
1587                         char '`', text token, char '\'']
1588     ]
1589   where token = lexemeToString (offsetBytes (-len) buf) len
1590
1591 -- Report a parse failure, giving the span of the previous token as
1592 -- the location of the error.  This is the entry point for errors
1593 -- detected during parsing.
1594 srcParseFail :: P a
1595 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1596                             last_loc = last_loc } ->
1597     PFailed last_loc (srcParseErr buf len)
1598
1599 -- A lexical error is reported at a particular position in the source file,
1600 -- not over a token range.
1601 lexError :: String -> P a
1602 lexError str = do
1603   loc <- getSrcLoc
1604   i@(AI end _ buf) <- getInput
1605   reportLexError loc end buf str
1606
1607 -- -----------------------------------------------------------------------------
1608 -- This is the top-level function: called from the parser each time a
1609 -- new token is to be read from the input.
1610
1611 lexer :: (Located Token -> P a) -> P a
1612 lexer cont = do
1613   tok@(L span tok__) <- lexToken
1614 --  trace ("token: " ++ show tok__) $ do
1615   cont tok
1616
1617 lexToken :: P (Located Token)
1618 lexToken = do
1619   inp@(AI loc1 _ buf) <- getInput
1620   sc <- getLexState
1621   exts <- getExts
1622   case alexScanUser exts inp sc of
1623     AlexEOF -> do let span = mkSrcSpan loc1 loc1
1624                   setLastToken span 0 0
1625                   return (L span ITeof)
1626     AlexError (AI loc2 _ buf) -> do
1627         reportLexError loc1 loc2 buf "lexical error"
1628     AlexSkip inp2 _ -> do
1629         setInput inp2
1630         lexToken
1631     AlexToken inp2@(AI end _ buf2) len t -> do
1632     setInput inp2
1633     let span = mkSrcSpan loc1 end
1634     let bytes = byteDiff buf buf2
1635     span `seq` setLastToken span bytes bytes
1636     t span buf bytes
1637
1638 reportLexError loc1 loc2 buf str
1639   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1640   | otherwise =
1641   let
1642         c = fst (nextChar buf)
1643   in
1644   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1645     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1646     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1647 }