compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), lexer, pragState, mkPState, PState(..),
  26    P(..), ParseResult(..), getSrcLoc,
  27    failLocMsgP, failSpanMsgP, srcParseFail,
  28    getMessages,
  29    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  30    getLexState, popLexState, pushLexState,
  31    extension, glaExtsEnabled, bangPatEnabled
  32   ) where
  33
  34 #include "HsVersions.h"
  35
  36 import Bag
  37 import ErrUtils
  38 import Outputable
  39 import StringBuffer
  40 import FastString
  41 import FastTypes
  42 import SrcLoc
  43 import UniqFM
  44 import DynFlags
  45 import Ctype
  46 import Util             ( maybePrefixMatch, readRational )
  47
  48 import Control.Monad
  49 import Data.Bits
  50 import Data.Char        ( chr, isSpace )
  51 import Data.Ratio
  52 import Debug.Trace
  53
  54 #if __GLASGOW_HASKELL__ >= 605
  55 import Data.Char        ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  56 #else
  57 import Compat.Unicode   ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  58 #endif
  59 }
  60
  61 $unispace    = \x05
  62 $whitechar   = [\ \n\r\f\v\xa0 $unispace]
  63 $white_no_nl = $whitechar # \n
  64 $tab         = \t
  65
  66 $ascdigit  = 0-9
  67 $unidigit  = \x03
  68 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  69 $digit     = [$ascdigit $unidigit]
  70
  71 $special   = [\(\)\,\;\[\]\`\{\}]
  72 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7]
  73 $unisymbol = \x04
  74 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  75
  76 $unilarge  = \x01
  77 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  78 $large     = [$asclarge $unilarge]
  79
  80 $unismall  = \x02
  81 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  82 $small     = [$ascsmall $unismall \_]
  83
  84 $unigraphic = \x06
  85 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  86
  87 $octit     = 0-7
  88 $hexit     = [$decdigit A-F a-f]
  89 $symchar   = [$symbol \:]
  90 $nl        = [\n\r]
  91 $idchar    = [$small $large $digit \']
  92
  93 $docsym    = [\| \^ \* \$]
  94
  95 @varid     = $small $idchar*
  96 @conid     = $large $idchar*
  97
  98 @varsym    = $symbol $symchar*
  99 @consym    = \: $symchar*
 100
 101 @decimal     = $decdigit+
 102 @octal       = $octit+
 103 @hexadecimal = $hexit+
 104 @exponent    = [eE] [\-\+]? @decimal
 105
 106 -- we support the hierarchical module name extension:
 107 @qual = (@conid \.)+
 108
 109 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 110
 111 haskell :-
 112
 113 -- everywhere: skip whitespace and comments
 114 $white_no_nl+                           ;
 115 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 116
 117 -- Everywhere: deal with nested comments.  We explicitly rule out
 118 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 119 -- (this can happen even though pragmas will normally take precedence due to
 120 -- longest-match, because pragmas aren't valid in every state, but comments
 121 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 122 -- set.
 123
 124 "{-" / { isNormalComment } { nested_comment lexToken }
 125
 126 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 127 -- more dashes followed by a symbol should be parsed as a varsym, so we
 128 -- have to exclude those.
 129
 130 -- Since Haddock comments aren't valid in every state, we need to rule them
 131 -- out here.
 132
 133 -- The following two rules match comments that begin with two dashes, but
 134 -- continue with a different character. The rules test that this character
 135 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 136 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 137 -- have a Haddock comment). The rules then munch the rest of the line.
 138
 139 "-- " ~$docsym .* ;
 140 "--" [^$symbol : \ ] .* ;
 141
 142 -- Next, match Haddock comments if no -haddock flag
 143
 144 "-- " $docsym .* / { ifExtension (not . haddockEnabled) } ;
 145
 146 -- Now, when we've matched comments that begin with 2 dashes and continue
 147 -- with a different character, we need to match comments that begin with three
 148 -- or more dashes (which clearly can't be Haddock comments). We only need to
 149 -- make sure that the first non-dash character isn't a symbol, and munch the
 150 -- rest of the line.
 151
 152 "---"\-* [^$symbol :] .* ;
 153
 154 -- Since the previous rules all match dashes followed by at least one
 155 -- character, we also need to match a whole line filled with just dashes.
 156
 157 "--"\-* / { atEOL } ;
 158
 159 -- We need this rule since none of the other single line comment rules
 160 -- actually match this case.
 161
 162 "-- " / { atEOL } ;
 163
 164 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 165 -- blank lines) until we find a non-whitespace character, then do layout
 166 -- processing.
 167 --
 168 -- One slight wibble here: what if the line begins with {-#? In
 169 -- theory, we have to lex the pragma to see if it's one we recognise,
 170 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 171 -- as a nested comment.  We don't bother with this: if the line begins
 172 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 173 <bol> {
 174   \n                                    ;
 175   ^\# (line)?                           { begin line_prag1 }
 176   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 177   ^\# \! .* \n                          ; -- #!, for scripts
 178   ()                                    { do_bol }
 179 }
 180
 181 -- after a layout keyword (let, where, do, of), we begin a new layout
 182 -- context if the curly brace is missing.
 183 -- Careful! This stuff is quite delicate.
 184 <layout, layout_do> {
 185   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 186         -- we might encounter {-# here, but {- has been handled already
 187   \n                                    ;
 188   ^\# (line)?                           { begin line_prag1 }
 189 }
 190
 191 -- do is treated in a subtly different way, see new_layout_context
 192 <layout>    ()                          { new_layout_context True }
 193 <layout_do> ()                          { new_layout_context False }
 194
 195 -- after a new layout context which was found to be to the left of the
 196 -- previous context, we have generated a '{' token, and we now need to
 197 -- generate a matching '}' token.
 198 <layout_left>  ()                       { do_layout_left }
 199
 200 <0,option_prags,glaexts> \n                             { begin bol }
 201
 202 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 203
 204 -- single-line line pragmas, of the form
 205 --    # <line> "<file>" <extra-stuff> \n
 206 <line_prag1> $decdigit+                 { setLine line_prag1a }
 207 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 208 <line_prag1b> .*                        { pop }
 209
 210 -- Haskell-style line pragmas, of the form
 211 --    {-# LINE <line> "<file>" #-}
 212 <line_prag2> $decdigit+                 { setLine line_prag2a }
 213 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 214 <line_prag2b> "#-}"|"-}"                { pop }
 215    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 216    -- with older versions of GHC which generated these.
 217
 218 -- We only want RULES pragmas to be picked up when -fglasgow-exts
 219 -- is on, because the contents of the pragma is always written using
 220 -- glasgow-exts syntax (using forall etc.), so if glasgow exts are not
 221 -- enabled, we're sure to get a parse error.
 222 -- (ToDo: we should really emit a warning when ignoring pragmas)
 223 <glaexts>
 224   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 225
 226 <0,option_prags,glaexts> {
 227   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 228   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 229                                         { token (ITinline_prag False) }
 230   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 231                                         { token ITspec_prag }
 232   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 233         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 234   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 235         $whitechar* (NO(T?)INLINE|no(t?)inline)
 236                                         { token (ITspec_inline_prag False) }
 237   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 238   "{-#" $whitechar* (DEPRECATED|deprecated)
 239                                         { token ITdeprecated_prag }
 240   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 241   "{-#" $whitechar* (GENERATED|generated)
 242                                         { token ITgenerated_prag }
 243   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 244   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 245
 246   "{-#" $whitechar* (DOCOPTIONS|docoptions)
 247   / { ifExtension haddockEnabled }     { lex_string_prag ITdocOptions }
 248
 249  "{-#"                                 { nested_comment lexToken }
 250
 251   -- ToDo: should only be valid inside a pragma:
 252   "#-}"                                 { token ITclose_prag}
 253 }
 254
 255 <option_prags> {
 256   "{-#" $whitechar* (OPTIONS|options)   { lex_string_prag IToptions_prag }
 257   "{-#" $whitechar* (OPTIONS_GHC|options_ghc)
 258                                         { lex_string_prag IToptions_prag }
 259   "{-#" $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
 260   "{-#" $whitechar* (INCLUDE|include)   { lex_string_prag ITinclude_prag }
 261 }
 262
 263 <0,option_prags,glaexts> {
 264         -- This is to catch things like {-# OPTIONS OPTIONS_HUGS ...
 265   "{-#" $whitechar* $idchar+            { nested_comment lexToken }
 266 }
 267
 268 -- '0' state: ordinary lexemes
 269 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 270
 271 -- Haddock comments
 272
 273 <0,glaexts> {
 274   "-- " / $docsym    { multiline_doc_comment }
 275   "{-" \ ? / $docsym { nested_doc_comment }
 276 }
 277
 278 -- "special" symbols
 279
 280 <0,glaexts> {
 281   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 282   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 283 }
 284
 285 <0,glaexts> {
 286   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 287   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 288   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 289   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 290   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 291   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 292   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 293   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 294 }
 295
 296 <0,glaexts> {
 297   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 298                                         { special IToparenbar }
 299   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 300 }
 301
 302 <0,glaexts> {
 303   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 304 }
 305
 306 <glaexts> {
 307   "(#" / { notFollowedBySymbol }        { token IToubxparen }
 308   "#)"                                  { token ITcubxparen }
 309   "{|"                                  { token ITocurlybar }
 310   "|}"                                  { token ITccurlybar }
 311 }
 312
 313 <0,option_prags,glaexts> {
 314   \(                                    { special IToparen }
 315   \)                                    { special ITcparen }
 316   \[                                    { special ITobrack }
 317   \]                                    { special ITcbrack }
 318   \,                                    { special ITcomma }
 319   \;                                    { special ITsemi }
 320   \`                                    { special ITbackquote }
 321
 322   \{                                    { open_brace }
 323   \}                                    { close_brace }
 324 }
 325
 326 <0,option_prags,glaexts> {
 327   @qual @varid                  { check_qvarid }
 328   @qual @conid                  { idtoken qconid }
 329   @varid                        { varid }
 330   @conid                        { idtoken conid }
 331 }
 332
 333 -- after an illegal qvarid, such as 'M.let',
 334 -- we back up and try again in the bad_qvarid state:
 335 <bad_qvarid> {
 336   @conid                        { pop_and (idtoken conid) }
 337   @qual @conid                  { pop_and (idtoken qconid) }
 338 }
 339
 340 <glaexts> {
 341   @qual @varid "#"+             { idtoken qvarid }
 342   @qual @conid "#"+             { idtoken qconid }
 343   @varid "#"+                   { varid }
 344   @conid "#"+                   { idtoken conid }
 345 }
 346
 347 -- ToDo: M.(,,,)
 348
 349 <0,glaexts> {
 350   @qual @varsym                 { idtoken qvarsym }
 351   @qual @consym                 { idtoken qconsym }
 352   @varsym                       { varsym }
 353   @consym                       { consym }
 354 }
 355
 356 <0,glaexts> {
 357   @decimal                      { tok_decimal }
 358   0[oO] @octal                  { tok_octal }
 359   0[xX] @hexadecimal            { tok_hexadecimal }
 360 }
 361
 362 <glaexts> {
 363   @decimal \#                   { prim_decimal }
 364   0[oO] @octal \#               { prim_octal }
 365   0[xX] @hexadecimal \#         { prim_hexadecimal }
 366 }
 367
 368 <0,glaexts> @floating_point             { strtoken tok_float }
 369 <glaexts>   @floating_point \#          { init_strtoken 1 prim_float }
 370 <glaexts>   @floating_point \# \#       { init_strtoken 2 prim_double }
 371
 372 -- Strings and chars are lexed by hand-written code.  The reason is
 373 -- that even if we recognise the string or char here in the regex
 374 -- lexer, we would still have to parse the string afterward in order
 375 -- to convert it to a String.
 376 <0,glaexts> {
 377   \'                            { lex_char_tok }
 378   \"                            { lex_string_tok }
 379 }
 380
 381 {
 382 -- -----------------------------------------------------------------------------
 383 -- The token type
 384
 385 data Token
 386   = ITas                        -- Haskell keywords
 387   | ITcase
 388   | ITclass
 389   | ITdata
 390   | ITdefault
 391   | ITderiving
 392   | ITderive
 393   | ITdo
 394   | ITelse
 395   | IThiding
 396   | ITif
 397   | ITimport
 398   | ITin
 399   | ITinfix
 400   | ITinfixl
 401   | ITinfixr
 402   | ITinstance
 403   | ITlet
 404   | ITmodule
 405   | ITnewtype
 406   | ITof
 407   | ITqualified
 408   | ITthen
 409   | ITtype
 410   | ITwhere
 411   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 412
 413   | ITforall                    -- GHC extension keywords
 414   | ITforeign
 415   | ITexport
 416   | ITlabel
 417   | ITdynamic
 418   | ITsafe
 419   | ITthreadsafe
 420   | ITunsafe
 421   | ITstdcallconv
 422   | ITccallconv
 423   | ITdotnet
 424   | ITmdo
 425   | ITfamily
 426
 427         -- Pragmas
 428   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 429   | ITspec_prag                 -- SPECIALISE
 430   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 431   | ITsource_prag
 432   | ITrules_prag
 433   | ITdeprecated_prag
 434   | ITline_prag
 435   | ITscc_prag
 436   | ITgenerated_prag
 437   | ITcore_prag                 -- hdaume: core annotations
 438   | ITunpack_prag
 439   | ITclose_prag
 440   | IToptions_prag String
 441   | ITinclude_prag String
 442   | ITlanguage_prag
 443
 444   | ITdotdot                    -- reserved symbols
 445   | ITcolon
 446   | ITdcolon
 447   | ITequal
 448   | ITlam
 449   | ITvbar
 450   | ITlarrow
 451   | ITrarrow
 452   | ITat
 453   | ITtilde
 454   | ITdarrow
 455   | ITminus
 456   | ITbang
 457   | ITstar
 458   | ITdot
 459
 460   | ITbiglam                    -- GHC-extension symbols
 461
 462   | ITocurly                    -- special symbols
 463   | ITccurly
 464   | ITocurlybar                 -- {|, for type applications
 465   | ITccurlybar                 -- |}, for type applications
 466   | ITvocurly
 467   | ITvccurly
 468   | ITobrack
 469   | ITopabrack                  -- [:, for parallel arrays with -fparr
 470   | ITcpabrack                  -- :], for parallel arrays with -fparr
 471   | ITcbrack
 472   | IToparen
 473   | ITcparen
 474   | IToubxparen
 475   | ITcubxparen
 476   | ITsemi
 477   | ITcomma
 478   | ITunderscore
 479   | ITbackquote
 480
 481   | ITvarid   FastString        -- identifiers
 482   | ITconid   FastString
 483   | ITvarsym  FastString
 484   | ITconsym  FastString
 485   | ITqvarid  (FastString,FastString)
 486   | ITqconid  (FastString,FastString)
 487   | ITqvarsym (FastString,FastString)
 488   | ITqconsym (FastString,FastString)
 489
 490   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 491
 492   | ITpragma StringBuffer
 493
 494   | ITchar       Char
 495   | ITstring     FastString
 496   | ITinteger    Integer
 497   | ITrational   Rational
 498
 499   | ITprimchar   Char
 500   | ITprimstring FastString
 501   | ITprimint    Integer
 502   | ITprimfloat  Rational
 503   | ITprimdouble Rational
 504
 505   -- MetaHaskell extension tokens
 506   | ITopenExpQuote              --  [| or [e|
 507   | ITopenPatQuote              --  [p|
 508   | ITopenDecQuote              --  [d|
 509   | ITopenTypQuote              --  [t|
 510   | ITcloseQuote                --  |]
 511   | ITidEscape   FastString     --  $x
 512   | ITparenEscape               --  $(
 513   | ITvarQuote                  --  '
 514   | ITtyQuote                   --  ''
 515
 516   -- Arrow notation extension
 517   | ITproc
 518   | ITrec
 519   | IToparenbar                 --  (|
 520   | ITcparenbar                 --  |)
 521   | ITlarrowtail                --  -<
 522   | ITrarrowtail                --  >-
 523   | ITLarrowtail                --  -<<
 524   | ITRarrowtail                --  >>-
 525
 526   | ITunknown String            -- Used when the lexer can't make sense of it
 527   | ITeof                       -- end of file token
 528
 529   -- Documentation annotations
 530   | ITdocCommentNext  String     -- something beginning '-- |'
 531   | ITdocCommentPrev  String     -- something beginning '-- ^'
 532   | ITdocCommentNamed String     -- something beginning '-- $'
 533   | ITdocSection      Int String -- a section heading
 534   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 535
 536 #ifdef DEBUG
 537   deriving Show -- debugging
 538 #endif
 539
 540 isSpecial :: Token -> Bool
 541 -- If we see M.x, where x is a keyword, but
 542 -- is special, we treat is as just plain M.x,
 543 -- not as a keyword.
 544 isSpecial ITas          = True
 545 isSpecial IThiding      = True
 546 isSpecial ITderive      = True
 547 isSpecial ITqualified   = True
 548 isSpecial ITforall      = True
 549 isSpecial ITexport      = True
 550 isSpecial ITlabel       = True
 551 isSpecial ITdynamic     = True
 552 isSpecial ITsafe        = True
 553 isSpecial ITthreadsafe  = True
 554 isSpecial ITunsafe      = True
 555 isSpecial ITccallconv   = True
 556 isSpecial ITstdcallconv = True
 557 isSpecial ITmdo         = True
 558 isSpecial ITfamily      = True
 559 isSpecial _             = False
 560
 561 -- the bitmap provided as the third component indicates whether the
 562 -- corresponding extension keyword is valid under the extension options
 563 -- provided to the compiler; if the extension corresponding to *any* of the
 564 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 565 -- facilitates using a keyword in two different extensions that can be
 566 -- activated independently)
 567 --
 568 reservedWordsFM = listToUFM $
 569         map (\(x, y, z) -> (mkFastString x, (y, z)))
 570        [( "_",          ITunderscore,   0 ),
 571         ( "as",         ITas,           0 ),
 572         ( "case",       ITcase,         0 ),
 573         ( "class",      ITclass,        0 ),
 574         ( "data",       ITdata,         0 ),
 575         ( "default",    ITdefault,      0 ),
 576         ( "deriving",   ITderiving,     0 ),
 577         ( "derive",     ITderive,       0 ),
 578         ( "do",         ITdo,           0 ),
 579         ( "else",       ITelse,         0 ),
 580         ( "hiding",     IThiding,       0 ),
 581         ( "if",         ITif,           0 ),
 582         ( "import",     ITimport,       0 ),
 583         ( "in",         ITin,           0 ),
 584         ( "infix",      ITinfix,        0 ),
 585         ( "infixl",     ITinfixl,       0 ),
 586         ( "infixr",     ITinfixr,       0 ),
 587         ( "instance",   ITinstance,     0 ),
 588         ( "let",        ITlet,          0 ),
 589         ( "module",     ITmodule,       0 ),
 590         ( "newtype",    ITnewtype,      0 ),
 591         ( "of",         ITof,           0 ),
 592         ( "qualified",  ITqualified,    0 ),
 593         ( "then",       ITthen,         0 ),
 594         ( "type",       ITtype,         0 ),
 595         ( "where",      ITwhere,        0 ),
 596         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 597
 598         ( "forall",     ITforall,        bit tvBit),
 599         ( "mdo",        ITmdo,           bit glaExtsBit),
 600         ( "family",     ITfamily,        bit idxTysBit),
 601
 602         ( "foreign",    ITforeign,       bit ffiBit),
 603         ( "export",     ITexport,        bit ffiBit),
 604         ( "label",      ITlabel,         bit ffiBit),
 605         ( "dynamic",    ITdynamic,       bit ffiBit),
 606         ( "safe",       ITsafe,          bit ffiBit),
 607         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 608         ( "unsafe",     ITunsafe,        bit ffiBit),
 609         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 610         ( "ccall",      ITccallconv,     bit ffiBit),
 611         ( "dotnet",     ITdotnet,        bit ffiBit),
 612
 613         ( "rec",        ITrec,           bit arrowsBit),
 614         ( "proc",       ITproc,          bit arrowsBit)
 615      ]
 616
 617 reservedSymsFM = listToUFM $
 618         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 619       [ ("..",  ITdotdot,       0)
 620        ,(":",   ITcolon,        0)      -- (:) is a reserved op,
 621                                                 -- meaning only list cons
 622        ,("::",  ITdcolon,       0)
 623        ,("=",   ITequal,        0)
 624        ,("\\",  ITlam,          0)
 625        ,("|",   ITvbar,         0)
 626        ,("<-",  ITlarrow,       0)
 627        ,("->",  ITrarrow,       0)
 628        ,("@",   ITat,           0)
 629        ,("~",   ITtilde,        0)
 630        ,("=>",  ITdarrow,       0)
 631        ,("-",   ITminus,        0)
 632        ,("!",   ITbang,         0)
 633
 634        ,("*",   ITstar,         bit glaExtsBit .|.
 635                                 bit idxTysBit)      -- For data T (a::*) = MkT
 636        ,(".",   ITdot,          bit tvBit)          -- For 'forall a . t'
 637
 638        ,("-<",  ITlarrowtail,   bit arrowsBit)
 639        ,(">-",  ITrarrowtail,   bit arrowsBit)
 640        ,("-<<", ITLarrowtail,   bit arrowsBit)
 641        ,(">>-", ITRarrowtail,   bit arrowsBit)
 642
 643 #if __GLASGOW_HASKELL__ >= 605
 644        ,("∷",   ITdcolon,       bit glaExtsBit)
 645        ,("⇒",   ITdarrow,     bit glaExtsBit)
 646        ,("∀", ITforall,       bit glaExtsBit)
 647        ,("→",   ITrarrow,     bit glaExtsBit)
 648        ,("←",   ITlarrow,     bit glaExtsBit)
 649        ,("⋯",         ITdotdot,       bit glaExtsBit)
 650         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 651         -- form part of a large operator.  This would let us have a better
 652         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 653 #endif
 654        ]
 655
 656 -- -----------------------------------------------------------------------------
 657 -- Lexer actions
 658
 659 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 660
 661 special :: Token -> Action
 662 special tok span _buf len = return (L span tok)
 663
 664 token, layout_token :: Token -> Action
 665 token t span buf len = return (L span t)
 666 layout_token t span buf len = pushLexState layout >> return (L span t)
 667
 668 idtoken :: (StringBuffer -> Int -> Token) -> Action
 669 idtoken f span buf len = return (L span $! (f buf len))
 670
 671 skip_one_varid :: (FastString -> Token) -> Action
 672 skip_one_varid f span buf len
 673   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 674
 675 strtoken :: (String -> Token) -> Action
 676 strtoken f span buf len =
 677   return (L span $! (f $! lexemeToString buf len))
 678
 679 init_strtoken :: Int -> (String -> Token) -> Action
 680 -- like strtoken, but drops the last N character(s)
 681 init_strtoken drop f span buf len =
 682   return (L span $! (f $! lexemeToString buf (len-drop)))
 683
 684 begin :: Int -> Action
 685 begin code _span _str _len = do pushLexState code; lexToken
 686
 687 pop :: Action
 688 pop _span _buf _len = do popLexState; lexToken
 689
 690 pop_and :: Action -> Action
 691 pop_and act span buf len = do popLexState; act span buf len
 692
 693 {-# INLINE nextCharIs #-}
 694 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 695
 696 notFollowedBy char _ _ _ (AI _ _ buf)
 697   = nextCharIs buf (/=char)
 698
 699 notFollowedBySymbol _ _ _ (AI _ _ buf)
 700   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 701
 702 isNormalComment bits _ _ (AI _ _ buf)
 703   | haddockEnabled bits = notFollowedByDocOrPragma
 704   | otherwise           = nextCharIs buf (/='#')
 705   where
 706     notFollowedByDocOrPragma
 707         = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 708
 709 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 710
 711 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 712   = if haddockEnabled bits then False else (p buf)
 713
 714 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 715
 716 ifExtension pred bits _ _ _ = pred bits
 717
 718 multiline_doc_comment :: Action
 719 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 720   where
 721     worker commentAcc input docType oneLine = case alexGetChar input of
 722       Just ('\n', input')
 723         | oneLine -> docCommentEnd input commentAcc docType buf span
 724         | otherwise -> case checkIfCommentLine input' of
 725           Just input -> worker ('\n':commentAcc) input docType False
 726           Nothing -> docCommentEnd input commentAcc docType buf span
 727       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 728       Nothing -> docCommentEnd input commentAcc docType buf span
 729
 730     checkIfCommentLine input = check (dropNonNewlineSpace input)
 731       where
 732         check input = case alexGetChar input of
 733           Just ('-', input) -> case alexGetChar input of
 734             Just ('-', input) -> case alexGetChar input of
 735               Just (c, _) | c /= '-' -> Just input
 736               _ -> Nothing
 737             _ -> Nothing
 738           _ -> Nothing
 739
 740         dropNonNewlineSpace input = case alexGetChar input of
 741           Just (c, input')
 742             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 743             | otherwise -> input
 744           Nothing -> input
 745
 746 {-
 747   nested comments require traversing by hand, they can't be parsed
 748   using regular expressions.
 749 -}
 750 nested_comment :: P (Located Token) -> Action
 751 nested_comment cont span _str _len = do
 752   input <- getInput
 753   go 1 input
 754   where
 755     go 0 input = do setInput input; cont
 756     go n input = case alexGetChar input of
 757       Nothing -> errBrace input span
 758       Just ('-',input) -> case alexGetChar input of
 759         Nothing  -> errBrace input span
 760         Just ('\125',input) -> go (n-1) input
 761         Just (c,_)          -> go n input
 762       Just ('\123',input) -> case alexGetChar input of
 763         Nothing  -> errBrace input span
 764         Just ('-',input) -> go (n+1) input
 765         Just (c,_)       -> go n input
 766       Just (c,input) -> go n input
 767
 768 nested_doc_comment :: Action
 769 nested_doc_comment span buf _len = withLexedDocType (go "")
 770   where
 771     go commentAcc input docType _ = case alexGetChar input of
 772       Nothing -> errBrace input span
 773       Just ('-',input) -> case alexGetChar input of
 774         Nothing -> errBrace input span
 775         Just ('\125',input@(AI end _ buf2)) ->
 776           docCommentEnd input commentAcc docType buf span
 777         Just (c,_) -> go ('-':commentAcc) input docType False
 778       Just ('\123', input) -> case alexGetChar input of
 779         Nothing  -> errBrace input span
 780         Just ('-',input) -> do
 781           setInput input
 782           let cont = do input <- getInput; go commentAcc input docType False
 783           nested_comment cont span buf _len
 784         Just (c,_) -> go ('\123':commentAcc) input docType False
 785       Just (c,input) -> go (c:commentAcc) input docType False
 786
 787 withLexedDocType lexDocComment = do
 788   input <- getInput
 789   case alexGetChar input of
 790     Nothing -> error "Can't happen"
 791     Just ('|', input) -> lexDocComment input ITdocCommentNext False
 792     Just ('^', input) -> lexDocComment input ITdocCommentPrev False
 793     Just ('$', input) -> lexDocComment input ITdocCommentNamed False
 794     Just ('*', input) -> lexDocSection 1 input
 795  where
 796     lexDocSection n input = case alexGetChar input of
 797       Just ('*', input) -> lexDocSection (n+1) input
 798       Just (c, _) -> lexDocComment input (ITdocSection n) True
 799       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 800
 801 -- docCommentEnd
 802 -------------------------------------------------------------------------------
 803 -- This function is quite tricky. We can't just return a new token, we also
 804 -- need to update the state of the parser. Why? Because the token is longer
 805 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 806 -- it writes the wrong token length to the parser state. This function is
 807 -- called afterwards, so it can just update the state.
 808
 809 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 810 -- which is something that the original lexer didn't account for.
 811 -- I have added last_line_len in the parser state which represents the length
 812 -- of the part of the token that is on the last line. It is now used for layout
 813 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 814 -- was before, the full length of the token, and it is now only used for error
 815 -- messages. /Waern
 816
 817 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 818                  SrcSpan -> P (Located Token)
 819 docCommentEnd input commentAcc docType buf span = do
 820   setInput input
 821   let (AI loc last_offs nextBuf) = input
 822       comment = reverse commentAcc
 823       span' = mkSrcSpan (srcSpanStart span) loc
 824       last_len = byteDiff buf nextBuf
 825
 826       last_line_len = if (last_offs - last_len < 0)
 827         then last_offs
 828         else last_len
 829
 830   span `seq` setLastToken span' last_len last_line_len
 831   return (L span' (docType comment))
 832
 833 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 834
 835 open_brace, close_brace :: Action
 836 open_brace span _str _len = do
 837   ctx <- getContext
 838   setContext (NoLayout:ctx)
 839   return (L span ITocurly)
 840 close_brace span _str _len = do
 841   popContext
 842   return (L span ITccurly)
 843
 844 -- We have to be careful not to count M.<varid> as a qualified name
 845 -- when <varid> is a keyword.  We hack around this by catching
 846 -- the offending tokens afterward, and re-lexing in a different state.
 847 check_qvarid span buf len = do
 848   case lookupUFM reservedWordsFM var of
 849         Just (keyword,exts)
 850           | not (isSpecial keyword) ->
 851           if exts == 0
 852              then try_again
 853              else do
 854                 b <- extension (\i -> exts .&. i /= 0)
 855                 if b then try_again
 856                      else return token
 857         _other -> return token
 858   where
 859         (mod,var) = splitQualName buf len
 860         token     = L span (ITqvarid (mod,var))
 861
 862         try_again = do
 863                 (AI _ offs _) <- getInput
 864                 setInput (AI (srcSpanStart span) (offs-len) buf)
 865                 pushLexState bad_qvarid
 866                 lexToken
 867
 868 qvarid buf len = ITqvarid $! splitQualName buf len
 869 qconid buf len = ITqconid $! splitQualName buf len
 870
 871 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 872 -- takes a StringBuffer and a length, and returns the module name
 873 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 874 -- because of hierarchical module names.
 875 splitQualName orig_buf len = split orig_buf orig_buf
 876   where
 877     split buf dot_buf
 878         | orig_buf `byteDiff` buf >= len  = done dot_buf
 879         | c == '.'                        = found_dot buf'
 880         | otherwise                       = split buf' dot_buf
 881       where
 882        (c,buf') = nextChar buf
 883
 884     -- careful, we might get names like M....
 885     -- so, if the character after the dot is not upper-case, this is
 886     -- the end of the qualifier part.
 887     found_dot buf -- buf points after the '.'
 888         | isUpper c    = split buf' buf
 889         | otherwise    = done buf
 890       where
 891        (c,buf') = nextChar buf
 892
 893     done dot_buf =
 894         (lexemeToFastString orig_buf (qual_size - 1),
 895          lexemeToFastString dot_buf (len - qual_size))
 896       where
 897         qual_size = orig_buf `byteDiff` dot_buf
 898
 899 varid span buf len =
 900   case lookupUFM reservedWordsFM fs of
 901         Just (keyword,0)    -> do
 902                 maybe_layout keyword
 903                 return (L span keyword)
 904         Just (keyword,exts) -> do
 905                 b <- extension (\i -> exts .&. i /= 0)
 906                 if b then do maybe_layout keyword
 907                              return (L span keyword)
 908                      else return (L span (ITvarid fs))
 909         _other -> return (L span (ITvarid fs))
 910   where
 911         fs = lexemeToFastString buf len
 912
 913 conid buf len = ITconid fs
 914   where fs = lexemeToFastString buf len
 915
 916 qvarsym buf len = ITqvarsym $! splitQualName buf len
 917 qconsym buf len = ITqconsym $! splitQualName buf len
 918
 919 varsym = sym ITvarsym
 920 consym = sym ITconsym
 921
 922 sym con span buf len =
 923   case lookupUFM reservedSymsFM fs of
 924         Just (keyword,0)    -> return (L span keyword)
 925         Just (keyword,exts) -> do
 926                 b <- extension (\i -> exts .&. i /= 0)
 927                 if b then return (L span keyword)
 928                      else return (L span $! con fs)
 929         _other -> return (L span $! con fs)
 930   where
 931         fs = lexemeToFastString buf len
 932
 933 tok_decimal span buf len
 934   = return (L span (ITinteger  $! parseInteger buf len 10 octDecDigit))
 935
 936 tok_octal span buf len
 937   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 8 octDecDigit))
 938
 939 tok_hexadecimal span buf len
 940   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 16 hexDigit))
 941
 942 prim_decimal span buf len
 943   = return (L span (ITprimint  $! parseInteger buf (len-1) 10 octDecDigit))
 944
 945 prim_octal span buf len
 946   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 8 octDecDigit))
 947
 948 prim_hexadecimal span buf len
 949   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 16 hexDigit))
 950
 951 tok_float        str = ITrational   $! readRational str
 952 prim_float       str = ITprimfloat  $! readRational str
 953 prim_double      str = ITprimdouble $! readRational str
 954
 955 -- -----------------------------------------------------------------------------
 956 -- Layout processing
 957
 958 -- we're at the first token on a line, insert layout tokens if necessary
 959 do_bol :: Action
 960 do_bol span _str _len = do
 961         pos <- getOffside
 962         case pos of
 963             LT -> do
 964                 --trace "layout: inserting '}'" $ do
 965                 popContext
 966                 -- do NOT pop the lex state, we might have a ';' to insert
 967                 return (L span ITvccurly)
 968             EQ -> do
 969                 --trace "layout: inserting ';'" $ do
 970                 popLexState
 971                 return (L span ITsemi)
 972             GT -> do
 973                 popLexState
 974                 lexToken
 975
 976 -- certain keywords put us in the "layout" state, where we might
 977 -- add an opening curly brace.
 978 maybe_layout ITdo       = pushLexState layout_do
 979 maybe_layout ITmdo      = pushLexState layout_do
 980 maybe_layout ITof       = pushLexState layout
 981 maybe_layout ITlet      = pushLexState layout
 982 maybe_layout ITwhere    = pushLexState layout
 983 maybe_layout ITrec      = pushLexState layout
 984 maybe_layout _          = return ()
 985
 986 -- Pushing a new implicit layout context.  If the indentation of the
 987 -- next token is not greater than the previous layout context, then
 988 -- Haskell 98 says that the new layout context should be empty; that is
 989 -- the lexer must generate {}.
 990 --
 991 -- We are slightly more lenient than this: when the new context is started
 992 -- by a 'do', then we allow the new context to be at the same indentation as
 993 -- the previous context.  This is what the 'strict' argument is for.
 994 --
 995 new_layout_context strict span _buf _len = do
 996     popLexState
 997     (AI _ offset _) <- getInput
 998     ctx <- getContext
 999     case ctx of
1000         Layout prev_off : _  |
1001            (strict     && prev_off >= offset  ||
1002             not strict && prev_off > offset) -> do
1003                 -- token is indented to the left of the previous context.
1004                 -- we must generate a {} sequence now.
1005                 pushLexState layout_left
1006                 return (L span ITvocurly)
1007         other -> do
1008                 setContext (Layout offset : ctx)
1009                 return (L span ITvocurly)
1010
1011 do_layout_left span _buf _len = do
1012     popLexState
1013     pushLexState bol  -- we must be at the start of a line
1014     return (L span ITvccurly)
1015
1016 -- -----------------------------------------------------------------------------
1017 -- LINE pragmas
1018
1019 setLine :: Int -> Action
1020 setLine code span buf len = do
1021   let line = parseInteger buf len 10 octDecDigit
1022   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
1023         -- subtract one: the line number refers to the *following* line
1024   popLexState
1025   pushLexState code
1026   lexToken
1027
1028 setFile :: Int -> Action
1029 setFile code span buf len = do
1030   let file = lexemeToFastString (stepOn buf) (len-2)
1031   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1032   popLexState
1033   pushLexState code
1034   lexToken
1035
1036
1037 -- -----------------------------------------------------------------------------
1038 -- Options, includes and language pragmas.
1039
1040 lex_string_prag :: (String -> Token) -> Action
1041 lex_string_prag mkTok span buf len
1042     = do input <- getInput
1043          start <- getSrcLoc
1044          tok <- go [] input
1045          end <- getSrcLoc
1046          return (L (mkSrcSpan start end) tok)
1047     where go acc input
1048               = if isString input "#-}"
1049                    then do setInput input
1050                            return (mkTok (reverse acc))
1051                    else case alexGetChar input of
1052                           Just (c,i) -> go (c:acc) i
1053                           Nothing -> err input
1054           isString i [] = True
1055           isString i (x:xs)
1056               = case alexGetChar i of
1057                   Just (c,i') | c == x    -> isString i' xs
1058                   _other -> False
1059           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1060
1061
1062 -- -----------------------------------------------------------------------------
1063 -- Strings & Chars
1064
1065 -- This stuff is horrible.  I hates it.
1066
1067 lex_string_tok :: Action
1068 lex_string_tok span buf len = do
1069   tok <- lex_string ""
1070   end <- getSrcLoc
1071   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1072
1073 lex_string :: String -> P Token
1074 lex_string s = do
1075   i <- getInput
1076   case alexGetChar' i of
1077     Nothing -> lit_error
1078
1079     Just ('"',i)  -> do
1080         setInput i
1081         glaexts <- extension glaExtsEnabled
1082         if glaexts
1083           then do
1084             i <- getInput
1085             case alexGetChar' i of
1086               Just ('#',i) -> do
1087                    setInput i
1088                    if any (> '\xFF') s
1089                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1090                     else let s' = mkZFastString (reverse s) in
1091                          return (ITprimstring s')
1092                         -- mkZFastString is a hack to avoid encoding the
1093                         -- string in UTF-8.  We just want the exact bytes.
1094               _other ->
1095                 return (ITstring (mkFastString (reverse s)))
1096           else
1097                 return (ITstring (mkFastString (reverse s)))
1098
1099     Just ('\\',i)
1100         | Just ('&',i) <- next -> do
1101                 setInput i; lex_string s
1102         | Just (c,i) <- next, is_space c -> do
1103                 setInput i; lex_stringgap s
1104         where next = alexGetChar' i
1105
1106     Just (c, i) -> do
1107         c' <- lex_char c i
1108         lex_string (c':s)
1109
1110 lex_stringgap s = do
1111   c <- getCharOrFail
1112   case c of
1113     '\\' -> lex_string s
1114     c | is_space c -> lex_stringgap s
1115     _other -> lit_error
1116
1117
1118 lex_char_tok :: Action
1119 -- Here we are basically parsing character literals, such as 'x' or '\n'
1120 -- but, when Template Haskell is on, we additionally spot
1121 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1122 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
1123 -- So we have to do two characters of lookahead: when we see 'x we need to
1124 -- see if there's a trailing quote
1125 lex_char_tok span buf len = do  -- We've seen '
1126    i1 <- getInput       -- Look ahead to first character
1127    let loc = srcSpanStart span
1128    case alexGetChar' i1 of
1129         Nothing -> lit_error
1130
1131         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
1132                   th_exts <- extension thEnabled
1133                   if th_exts then do
1134                         setInput i2
1135                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1136                    else lit_error
1137
1138         Just ('\\', i2@(AI end2 _ _)) -> do     -- We've seen 'backslash
1139                   setInput i2
1140                   lit_ch <- lex_escape
1141                   mc <- getCharOrFail   -- Trailing quote
1142                   if mc == '\'' then finish_char_tok loc lit_ch
1143                                 else do setInput i2; lit_error
1144
1145         Just (c, i2@(AI end2 _ _))
1146                 | not (isAny c) -> lit_error
1147                 | otherwise ->
1148
1149                 -- We've seen 'x, where x is a valid character
1150                 --  (i.e. not newline etc) but not a quote or backslash
1151            case alexGetChar' i2 of      -- Look ahead one more character
1152                 Nothing -> lit_error
1153                 Just ('\'', i3) -> do   -- We've seen 'x'
1154                         setInput i3
1155                         finish_char_tok loc c
1156                 _other -> do            -- We've seen 'x not followed by quote
1157                                         -- If TH is on, just parse the quote only
1158                         th_exts <- extension thEnabled
1159                         let (AI end _ _) = i1
1160                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1161                                    else do setInput i2; lit_error
1162
1163 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1164 finish_char_tok loc ch  -- We've already seen the closing quote
1165                         -- Just need to check for trailing #
1166   = do  glaexts <- extension glaExtsEnabled
1167         i@(AI end _ _) <- getInput
1168         if glaexts then do
1169                 case alexGetChar' i of
1170                         Just ('#',i@(AI end _ _)) -> do
1171                                 setInput i
1172                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1173                         _other ->
1174                                 return (L (mkSrcSpan loc end) (ITchar ch))
1175                 else do
1176                    return (L (mkSrcSpan loc end) (ITchar ch))
1177
1178 lex_char :: Char -> AlexInput -> P Char
1179 lex_char c inp = do
1180   case c of
1181       '\\' -> do setInput inp; lex_escape
1182       c | isAny c -> do setInput inp; return c
1183       _other -> lit_error
1184
1185 isAny c | c > '\xff' = isPrint c
1186         | otherwise  = is_any c
1187
1188 lex_escape :: P Char
1189 lex_escape = do
1190   c <- getCharOrFail
1191   case c of
1192         'a'   -> return '\a'
1193         'b'   -> return '\b'
1194         'f'   -> return '\f'
1195         'n'   -> return '\n'
1196         'r'   -> return '\r'
1197         't'   -> return '\t'
1198         'v'   -> return '\v'
1199         '\\'  -> return '\\'
1200         '"'   -> return '\"'
1201         '\''  -> return '\''
1202         '^'   -> do c <- getCharOrFail
1203                     if c >= '@' && c <= '_'
1204                         then return (chr (ord c - ord '@'))
1205                         else lit_error
1206
1207         'x'   -> readNum is_hexdigit 16 hexDigit
1208         'o'   -> readNum is_octdigit  8 octDecDigit
1209         x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
1210
1211         c1 ->  do
1212            i <- getInput
1213            case alexGetChar' i of
1214             Nothing -> lit_error
1215             Just (c2,i2) ->
1216               case alexGetChar' i2 of
1217                 Nothing -> do setInput i2; lit_error
1218                 Just (c3,i3) ->
1219                    let str = [c1,c2,c3] in
1220                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1221                                      Just rest <- [maybePrefixMatch p str] ] of
1222                           (escape_char,[]):_ -> do
1223                                 setInput i3
1224                                 return escape_char
1225                           (escape_char,_:_):_ -> do
1226                                 setInput i2
1227                                 return escape_char
1228                           [] -> lit_error
1229
1230 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1231 readNum is_digit base conv = do
1232   i <- getInput
1233   c <- getCharOrFail
1234   if is_digit c
1235         then readNum2 is_digit base conv (conv c)
1236         else do setInput i; lit_error
1237
1238 readNum2 is_digit base conv i = do
1239   input <- getInput
1240   read i input
1241   where read i input = do
1242           case alexGetChar' input of
1243             Just (c,input') | is_digit c -> do
1244                 read (i*base + conv c) input'
1245             _other -> do
1246                 if i >= 0 && i <= 0x10FFFF
1247                    then do setInput input; return (chr i)
1248                    else lit_error
1249
1250 silly_escape_chars = [
1251         ("NUL", '\NUL'),
1252         ("SOH", '\SOH'),
1253         ("STX", '\STX'),
1254         ("ETX", '\ETX'),
1255         ("EOT", '\EOT'),
1256         ("ENQ", '\ENQ'),
1257         ("ACK", '\ACK'),
1258         ("BEL", '\BEL'),
1259         ("BS", '\BS'),
1260         ("HT", '\HT'),
1261         ("LF", '\LF'),
1262         ("VT", '\VT'),
1263         ("FF", '\FF'),
1264         ("CR", '\CR'),
1265         ("SO", '\SO'),
1266         ("SI", '\SI'),
1267         ("DLE", '\DLE'),
1268         ("DC1", '\DC1'),
1269         ("DC2", '\DC2'),
1270         ("DC3", '\DC3'),
1271         ("DC4", '\DC4'),
1272         ("NAK", '\NAK'),
1273         ("SYN", '\SYN'),
1274         ("ETB", '\ETB'),
1275         ("CAN", '\CAN'),
1276         ("EM", '\EM'),
1277         ("SUB", '\SUB'),
1278         ("ESC", '\ESC'),
1279         ("FS", '\FS'),
1280         ("GS", '\GS'),
1281         ("RS", '\RS'),
1282         ("US", '\US'),
1283         ("SP", '\SP'),
1284         ("DEL", '\DEL')
1285         ]
1286
1287 -- before calling lit_error, ensure that the current input is pointing to
1288 -- the position of the error in the buffer.  This is so that we can report
1289 -- a correct location to the user, but also so we can detect UTF-8 decoding
1290 -- errors if they occur.
1291 lit_error = lexError "lexical error in string/character literal"
1292
1293 getCharOrFail :: P Char
1294 getCharOrFail =  do
1295   i <- getInput
1296   case alexGetChar' i of
1297         Nothing -> lexError "unexpected end-of-file in string/character literal"
1298         Just (c,i)  -> do setInput i; return c
1299
1300 -- -----------------------------------------------------------------------------
1301 -- Warnings
1302
1303 warn :: DynFlag -> SDoc -> Action
1304 warn option warning span _buf _len = do
1305     addWarning option (mkWarnMsg span alwaysQualify warning)
1306     lexToken
1307
1308 -- -----------------------------------------------------------------------------
1309 -- The Parse Monad
1310
1311 data LayoutContext
1312   = NoLayout
1313   | Layout !Int
1314   deriving Show
1315
1316 data ParseResult a
1317   = POk PState a
1318   | PFailed
1319         SrcSpan         -- The start and end of the text span related to
1320                         -- the error.  Might be used in environments which can
1321                         -- show this span, e.g. by highlighting it.
1322         Message         -- The error message
1323
1324 data PState = PState {
1325         buffer     :: StringBuffer,
1326     dflags     :: DynFlags,
1327     messages   :: Messages,
1328         last_loc   :: SrcSpan,  -- pos of previous token
1329         last_offs  :: !Int,     -- offset of the previous token from the
1330                                 -- beginning of  the current line.
1331                                 -- \t is equal to 8 spaces.
1332         last_len   :: !Int,     -- len of previous token
1333   last_line_len :: !Int,
1334         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1335         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1336         context    :: [LayoutContext],
1337         lex_state  :: [Int]
1338      }
1339         -- last_loc and last_len are used when generating error messages,
1340         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1341         -- current token to happyError, we could at least get rid of last_len.
1342         -- Getting rid of last_loc would require finding another way to
1343         -- implement pushCurrentContext (which is only called from one place).
1344
1345 newtype P a = P { unP :: PState -> ParseResult a }
1346
1347 instance Monad P where
1348   return = returnP
1349   (>>=) = thenP
1350   fail = failP
1351
1352 returnP :: a -> P a
1353 returnP a = P $ \s -> POk s a
1354
1355 thenP :: P a -> (a -> P b) -> P b
1356 (P m) `thenP` k = P $ \ s ->
1357         case m s of
1358                 POk s1 a         -> (unP (k a)) s1
1359                 PFailed span err -> PFailed span err
1360
1361 failP :: String -> P a
1362 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1363
1364 failMsgP :: String -> P a
1365 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1366
1367 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1368 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1369
1370 failSpanMsgP :: SrcSpan -> String -> P a
1371 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1372
1373 extension :: (Int -> Bool) -> P Bool
1374 extension p = P $ \s -> POk s (p $! extsBitmap s)
1375
1376 getExts :: P Int
1377 getExts = P $ \s -> POk s (extsBitmap s)
1378
1379 setSrcLoc :: SrcLoc -> P ()
1380 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1381
1382 getSrcLoc :: P SrcLoc
1383 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1384
1385 setLastToken :: SrcSpan -> Int -> Int -> P ()
1386 setLastToken loc len line_len = P $ \s -> POk s {
1387   last_loc=loc,
1388   last_len=len,
1389   last_line_len=line_len
1390 } ()
1391
1392 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1393
1394 alexInputPrevChar :: AlexInput -> Char
1395 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1396
1397 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1398 alexGetChar (AI loc ofs s)
1399   | atEnd s   = Nothing
1400   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1401                 --trace (show (ord c)) $
1402                 Just (adj_c, (AI loc' ofs' s'))
1403   where (c,s') = nextChar s
1404         loc'   = advanceSrcLoc loc c
1405         ofs'   = advanceOffs c ofs
1406
1407         non_graphic     = '\x0'
1408         upper           = '\x1'
1409         lower           = '\x2'
1410         digit           = '\x3'
1411         symbol          = '\x4'
1412         space           = '\x5'
1413         other_graphic   = '\x6'
1414
1415         adj_c
1416           | c <= '\x06' = non_graphic
1417           | c <= '\xff' = c
1418           | otherwise =
1419                 case generalCategory c of
1420                   UppercaseLetter       -> upper
1421                   LowercaseLetter       -> lower
1422                   TitlecaseLetter       -> upper
1423                   ModifierLetter        -> other_graphic
1424                   OtherLetter           -> other_graphic
1425                   NonSpacingMark        -> other_graphic
1426                   SpacingCombiningMark  -> other_graphic
1427                   EnclosingMark         -> other_graphic
1428                   DecimalNumber         -> digit
1429                   LetterNumber          -> other_graphic
1430                   OtherNumber           -> other_graphic
1431                   ConnectorPunctuation  -> other_graphic
1432                   DashPunctuation       -> other_graphic
1433                   OpenPunctuation       -> other_graphic
1434                   ClosePunctuation      -> other_graphic
1435                   InitialQuote          -> other_graphic
1436                   FinalQuote            -> other_graphic
1437                   OtherPunctuation      -> other_graphic
1438                   MathSymbol            -> symbol
1439                   CurrencySymbol        -> symbol
1440                   ModifierSymbol        -> symbol
1441                   OtherSymbol           -> symbol
1442                   Space                 -> space
1443                   _other                -> non_graphic
1444
1445 -- This version does not squash unicode characters, it is used when
1446 -- lexing strings.
1447 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1448 alexGetChar' (AI loc ofs s)
1449   | atEnd s   = Nothing
1450   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1451                 --trace (show (ord c)) $
1452                 Just (c, (AI loc' ofs' s'))
1453   where (c,s') = nextChar s
1454         loc'   = advanceSrcLoc loc c
1455         ofs'   = advanceOffs c ofs
1456
1457 advanceOffs :: Char -> Int -> Int
1458 advanceOffs '\n' offs = 0
1459 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1460 advanceOffs _    offs = offs + 1
1461
1462 getInput :: P AlexInput
1463 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1464
1465 setInput :: AlexInput -> P ()
1466 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1467
1468 pushLexState :: Int -> P ()
1469 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1470
1471 popLexState :: P Int
1472 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1473
1474 getLexState :: P Int
1475 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1476
1477 -- for reasons of efficiency, flags indicating language extensions (eg,
1478 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1479 -- integer
1480
1481 glaExtsBit, ffiBit, parrBit :: Int
1482 glaExtsBit = 0
1483 ffiBit     = 1
1484 parrBit    = 2
1485 arrowsBit  = 4
1486 thBit      = 5
1487 ipBit      = 6
1488 tvBit      = 7  -- Scoped type variables enables 'forall' keyword
1489 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1490                 -- (doesn't affect the lexer)
1491 idxTysBit  = 9  -- indexed type families: 'family' keyword and kind sigs
1492 haddockBit = 10 -- Lex and parse Haddock comments
1493
1494 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1495 glaExtsEnabled flags = testBit flags glaExtsBit
1496 ffiEnabled     flags = testBit flags ffiBit
1497 parrEnabled    flags = testBit flags parrBit
1498 arrowsEnabled  flags = testBit flags arrowsBit
1499 thEnabled      flags = testBit flags thBit
1500 ipEnabled      flags = testBit flags ipBit
1501 tvEnabled      flags = testBit flags tvBit
1502 bangPatEnabled flags = testBit flags bangPatBit
1503 idxTysEnabled  flags = testBit flags idxTysBit
1504 haddockEnabled flags = testBit flags haddockBit
1505
1506 -- PState for parsing options pragmas
1507 --
1508 pragState :: StringBuffer -> SrcLoc -> PState
1509 pragState buf loc  =
1510   PState {
1511       buffer          = buf,
1512       messages      = emptyMessages,
1513       -- XXX defaultDynFlags is not right, but we don't have a real
1514       -- dflags handy
1515       dflags        = defaultDynFlags,
1516       last_loc      = mkSrcSpan loc loc,
1517       last_offs     = 0,
1518       last_len      = 0,
1519       last_line_len = 0,
1520       loc           = loc,
1521       extsBitmap    = 0,
1522       context       = [],
1523       lex_state     = [bol, option_prags, 0]
1524     }
1525
1526
1527 -- create a parse state
1528 --
1529 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1530 mkPState buf loc flags  =
1531   PState {
1532       buffer          = buf,
1533       dflags        = flags,
1534       messages      = emptyMessages,
1535       last_loc      = mkSrcSpan loc loc,
1536       last_offs     = 0,
1537       last_len      = 0,
1538       last_line_len = 0,
1539       loc           = loc,
1540       extsBitmap    = fromIntegral bitmap,
1541       context       = [],
1542       lex_state     = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1543         -- we begin in the layout state if toplev_layout is set
1544     }
1545     where
1546       bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
1547                .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
1548                .|. parrBit    `setBitIf` dopt Opt_PArr        flags
1549                .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
1550                .|. thBit      `setBitIf` dopt Opt_TH          flags
1551                .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
1552                .|. tvBit      `setBitIf` dopt Opt_ScopedTypeVariables flags
1553                .|. bangPatBit `setBitIf` dopt Opt_BangPatterns flags
1554                .|. idxTysBit  `setBitIf` dopt Opt_IndexedTypes flags
1555                .|. haddockBit `setBitIf` dopt Opt_Haddock     flags
1556       --
1557       setBitIf :: Int -> Bool -> Int
1558       b `setBitIf` cond | cond      = bit b
1559                         | otherwise = 0
1560
1561 addWarning :: DynFlag -> WarnMsg -> P ()
1562 addWarning option w
1563  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1564        let ws' = if dopt option d then ws `snocBag` w else ws
1565        in POk s{messages=(ws', es)} ()
1566
1567 getMessages :: PState -> Messages
1568 getMessages PState{messages=ms} = ms
1569
1570 getContext :: P [LayoutContext]
1571 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1572
1573 setContext :: [LayoutContext] -> P ()
1574 setContext ctx = P $ \s -> POk s{context=ctx} ()
1575
1576 popContext :: P ()
1577 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1578                            loc = loc, last_len = len, last_loc = last_loc }) ->
1579   case ctx of
1580         (_:tl) -> POk s{ context = tl } ()
1581         []     -> PFailed last_loc (srcParseErr buf len)
1582
1583 -- Push a new layout context at the indentation of the last token read.
1584 -- This is only used at the outer level of a module when the 'module'
1585 -- keyword is missing.
1586 pushCurrentContext :: P ()
1587 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1588     POk s{context = Layout (offs-len) : ctx} ()
1589 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1590
1591 getOffside :: P Ordering
1592 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1593                 let ord = case stk of
1594                         (Layout n:_) -> compare offs n
1595                         _            -> GT
1596                 in POk s ord
1597
1598 -- ---------------------------------------------------------------------------
1599 -- Construct a parse error
1600
1601 srcParseErr
1602   :: StringBuffer       -- current buffer (placed just after the last token)
1603   -> Int                -- length of the previous token
1604   -> Message
1605 srcParseErr buf len
1606   = hcat [ if null token
1607              then ptext SLIT("parse error (possibly incorrect indentation)")
1608              else hcat [ptext SLIT("parse error on input "),
1609                         char '`', text token, char '\'']
1610     ]
1611   where token = lexemeToString (offsetBytes (-len) buf) len
1612
1613 -- Report a parse failure, giving the span of the previous token as
1614 -- the location of the error.  This is the entry point for errors
1615 -- detected during parsing.
1616 srcParseFail :: P a
1617 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1618                             last_loc = last_loc } ->
1619     PFailed last_loc (srcParseErr buf len)
1620
1621 -- A lexical error is reported at a particular position in the source file,
1622 -- not over a token range.
1623 lexError :: String -> P a
1624 lexError str = do
1625   loc <- getSrcLoc
1626   i@(AI end _ buf) <- getInput
1627   reportLexError loc end buf str
1628
1629 -- -----------------------------------------------------------------------------
1630 -- This is the top-level function: called from the parser each time a
1631 -- new token is to be read from the input.
1632
1633 lexer :: (Located Token -> P a) -> P a
1634 lexer cont = do
1635   tok@(L span tok__) <- lexToken
1636 --  trace ("token: " ++ show tok__) $ do
1637   cont tok
1638
1639 lexToken :: P (Located Token)
1640 lexToken = do
1641   inp@(AI loc1 _ buf) <- getInput
1642   sc <- getLexState
1643   exts <- getExts
1644   case alexScanUser exts inp sc of
1645     AlexEOF -> do let span = mkSrcSpan loc1 loc1
1646                   setLastToken span 0 0
1647                   return (L span ITeof)
1648     AlexError (AI loc2 _ buf) -> do
1649         reportLexError loc1 loc2 buf "lexical error"
1650     AlexSkip inp2 _ -> do
1651         setInput inp2
1652         lexToken
1653     AlexToken inp2@(AI end _ buf2) len t -> do
1654     setInput inp2
1655     let span = mkSrcSpan loc1 end
1656     let bytes = byteDiff buf buf2
1657     span `seq` setLastToken span bytes bytes
1658     t span buf bytes
1659
1660 reportLexError loc1 loc2 buf str
1661   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1662   | otherwise =
1663   let
1664         c = fst (nextChar buf)
1665   in
1666   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1667     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1668     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1669 }