compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), lexer, pragState, mkPState, PState(..),
  26    P(..), ParseResult(..), getSrcLoc,
  27    failLocMsgP, failSpanMsgP, srcParseFail,
  28    getMessages,
  29    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  30    getLexState, popLexState, pushLexState,
  31    extension, standaloneDerivingEnabled, bangPatEnabled,
  32    addWarning
  33   ) where
  34
  35 #include "HsVersions.h"
  36
  37 import Bag
  38 import ErrUtils
  39 import Outputable
  40 import StringBuffer
  41 import FastString
  42 import FastTypes
  43 import SrcLoc
  44 import UniqFM
  45 import DynFlags
  46 import Ctype
  47 import Util             ( maybePrefixMatch, readRational )
  48
  49 import Control.Monad
  50 import Data.Bits
  51 import Data.Char        ( chr, isSpace )
  52 import Data.Ratio
  53 import Debug.Trace
  54
  55 #if __GLASGOW_HASKELL__ >= 605
  56 import Data.Char        ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  57 #else
  58 import Compat.Unicode   ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  59 #endif
  60 }
  61
  62 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  63 $whitechar   = [\ \n\r\f\v\xa0 $unispace]
  64 $white_no_nl = $whitechar # \n
  65 $tab         = \t
  66
  67 $ascdigit  = 0-9
  68 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  69 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  70 $digit     = [$ascdigit $unidigit]
  71
  72 $special   = [\(\)\,\;\[\]\`\{\}]
  73 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7]
  74 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  75 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  76
  77 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
  78 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  79 $large     = [$asclarge $unilarge]
  80
  81 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
  82 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  83 $small     = [$ascsmall $unismall \_]
  84
  85 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
  86 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  87
  88 $octit     = 0-7
  89 $hexit     = [$decdigit A-F a-f]
  90 $symchar   = [$symbol \:]
  91 $nl        = [\n\r]
  92 $idchar    = [$small $large $digit \']
  93
  94 $docsym    = [\| \^ \* \$]
  95
  96 @varid     = $small $idchar*
  97 @conid     = $large $idchar*
  98
  99 @varsym    = $symbol $symchar*
 100 @consym    = \: $symchar*
 101
 102 @decimal     = $decdigit+
 103 @octal       = $octit+
 104 @hexadecimal = $hexit+
 105 @exponent    = [eE] [\-\+]? @decimal
 106
 107 -- we support the hierarchical module name extension:
 108 @qual = (@conid \.)+
 109
 110 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 111
 112 -- normal signed numerical literals can only be explicitly negative,
 113 -- not explicitly positive (contrast @exponent)
 114 @negative = \-
 115 @signed = @negative ?
 116
 117 haskell :-
 118
 119 -- everywhere: skip whitespace and comments
 120 $white_no_nl+                           ;
 121 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 122
 123 -- Everywhere: deal with nested comments.  We explicitly rule out
 124 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 125 -- (this can happen even though pragmas will normally take precedence due to
 126 -- longest-match, because pragmas aren't valid in every state, but comments
 127 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 128 -- set.
 129
 130 "{-" / { isNormalComment } { nested_comment lexToken }
 131
 132 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 133 -- more dashes followed by a symbol should be parsed as a varsym, so we
 134 -- have to exclude those.
 135
 136 -- Since Haddock comments aren't valid in every state, we need to rule them
 137 -- out here.
 138
 139 -- The following two rules match comments that begin with two dashes, but
 140 -- continue with a different character. The rules test that this character
 141 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 142 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 143 -- have a Haddock comment). The rules then munch the rest of the line.
 144
 145 "-- " ~$docsym .* ;
 146 "--" [^$symbol : \ ] .* ;
 147
 148 -- Next, match Haddock comments if no -haddock flag
 149
 150 "-- " $docsym .* / { ifExtension (not . haddockEnabled) } ;
 151
 152 -- Now, when we've matched comments that begin with 2 dashes and continue
 153 -- with a different character, we need to match comments that begin with three
 154 -- or more dashes (which clearly can't be Haddock comments). We only need to
 155 -- make sure that the first non-dash character isn't a symbol, and munch the
 156 -- rest of the line.
 157
 158 "---"\-* [^$symbol :] .* ;
 159
 160 -- Since the previous rules all match dashes followed by at least one
 161 -- character, we also need to match a whole line filled with just dashes.
 162
 163 "--"\-* / { atEOL } ;
 164
 165 -- We need this rule since none of the other single line comment rules
 166 -- actually match this case.
 167
 168 "-- " / { atEOL } ;
 169
 170 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 171 -- blank lines) until we find a non-whitespace character, then do layout
 172 -- processing.
 173 --
 174 -- One slight wibble here: what if the line begins with {-#? In
 175 -- theory, we have to lex the pragma to see if it's one we recognise,
 176 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 177 -- as a nested comment.  We don't bother with this: if the line begins
 178 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 179 <bol> {
 180   \n                                    ;
 181   ^\# (line)?                           { begin line_prag1 }
 182   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 183   ^\# \! .* \n                          ; -- #!, for scripts
 184   ()                                    { do_bol }
 185 }
 186
 187 -- after a layout keyword (let, where, do, of), we begin a new layout
 188 -- context if the curly brace is missing.
 189 -- Careful! This stuff is quite delicate.
 190 <layout, layout_do> {
 191   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 192         -- we might encounter {-# here, but {- has been handled already
 193   \n                                    ;
 194   ^\# (line)?                           { begin line_prag1 }
 195 }
 196
 197 -- do is treated in a subtly different way, see new_layout_context
 198 <layout>    ()                          { new_layout_context True }
 199 <layout_do> ()                          { new_layout_context False }
 200
 201 -- after a new layout context which was found to be to the left of the
 202 -- previous context, we have generated a '{' token, and we now need to
 203 -- generate a matching '}' token.
 204 <layout_left>  ()                       { do_layout_left }
 205
 206 <0,option_prags> \n                             { begin bol }
 207
 208 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 209
 210 -- single-line line pragmas, of the form
 211 --    # <line> "<file>" <extra-stuff> \n
 212 <line_prag1> $decdigit+                 { setLine line_prag1a }
 213 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 214 <line_prag1b> .*                        { pop }
 215
 216 -- Haskell-style line pragmas, of the form
 217 --    {-# LINE <line> "<file>" #-}
 218 <line_prag2> $decdigit+                 { setLine line_prag2a }
 219 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 220 <line_prag2b> "#-}"|"-}"                { pop }
 221    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 222    -- with older versions of GHC which generated these.
 223
 224 -- We only want RULES pragmas to be picked up when explicit forall
 225 -- syntax is enabled is on, because the contents of the pragma always
 226 -- uses it. If it's not on then we're sure to get a parse error.
 227 -- (ToDo: we should really emit a warning when ignoring pragmas)
 228 -- XXX Now that we can enable this without the -fglasgow-exts hammer,
 229 -- is it better just to let the parse error happen?
 230 <0>
 231   "{-#" $whitechar* (RULES|rules) / { ifExtension explicitForallEnabled } { token ITrules_prag }
 232
 233 <0,option_prags> {
 234   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 235   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 236                                         { token (ITinline_prag False) }
 237   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 238                                         { token ITspec_prag }
 239   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 240         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 241   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 242         $whitechar* (NO(T?)INLINE|no(t?)inline)
 243                                         { token (ITspec_inline_prag False) }
 244   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 245   "{-#" $whitechar* (DEPRECATED|deprecated)
 246                                         { token ITdeprecated_prag }
 247   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 248   "{-#" $whitechar* (GENERATED|generated)
 249                                         { token ITgenerated_prag }
 250   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 251   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 252
 253   "{-#" $whitechar* (DOCOPTIONS|docoptions)
 254   / { ifExtension haddockEnabled }     { lex_string_prag ITdocOptions }
 255
 256  "{-#"                                 { nested_comment lexToken }
 257
 258   -- ToDo: should only be valid inside a pragma:
 259   "#-}"                                 { token ITclose_prag}
 260 }
 261
 262 <option_prags> {
 263   "{-#" $whitechar* (OPTIONS|options)   { lex_string_prag IToptions_prag }
 264   "{-#" $whitechar* (OPTIONS_GHC|options_ghc)
 265                                         { lex_string_prag IToptions_prag }
 266   "{-#" $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
 267   "{-#" $whitechar* (INCLUDE|include)   { lex_string_prag ITinclude_prag }
 268 }
 269
 270 <0,option_prags> {
 271         -- This is to catch things like {-# OPTIONS OPTIONS_HUGS ...
 272   "{-#" $whitechar* $idchar+            { nested_comment lexToken }
 273 }
 274
 275 -- '0' state: ordinary lexemes
 276
 277 -- Haddock comments
 278
 279 <0> {
 280   "-- " $docsym    / { ifExtension haddockEnabled } { multiline_doc_comment }
 281   "{-" \ ? $docsym / { ifExtension haddockEnabled } { nested_doc_comment }
 282 }
 283
 284 -- "special" symbols
 285
 286 <0> {
 287   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 288   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 289 }
 290
 291 <0> {
 292   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 293   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 294   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 295   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 296   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 297   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 298   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 299   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 300 }
 301
 302 <0> {
 303   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 304                                         { special IToparenbar }
 305   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 306 }
 307
 308 <0> {
 309   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 310 }
 311
 312 <0> {
 313   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 314          { token IToubxparen }
 315   "#)" / { ifExtension unboxedTuplesEnabled }
 316          { token ITcubxparen }
 317 }
 318
 319 <0> {
 320   "{|" / { ifExtension genericsEnabled } { token ITocurlybar }
 321   "|}" / { ifExtension genericsEnabled } { token ITccurlybar }
 322 }
 323
 324 <0,option_prags> {
 325   \(                                    { special IToparen }
 326   \)                                    { special ITcparen }
 327   \[                                    { special ITobrack }
 328   \]                                    { special ITcbrack }
 329   \,                                    { special ITcomma }
 330   \;                                    { special ITsemi }
 331   \`                                    { special ITbackquote }
 332
 333   \{                                    { open_brace }
 334   \}                                    { close_brace }
 335 }
 336
 337 <0,option_prags> {
 338   @qual @varid                  { idtoken qvarid }
 339   @qual @conid                  { idtoken qconid }
 340   @varid                        { varid }
 341   @conid                        { idtoken conid }
 342 }
 343
 344 <0> {
 345   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 346   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 347   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 348   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 349 }
 350
 351 -- ToDo: M.(,,,)
 352
 353 <0> {
 354   @qual @varsym                 { idtoken qvarsym }
 355   @qual @consym                 { idtoken qconsym }
 356   @varsym                       { varsym }
 357   @consym                       { consym }
 358 }
 359
 360 -- For the normal boxed literals we need to be careful
 361 -- when trying to be close to Haskell98
 362 <0> {
 363   -- Normal integral literals (:: Num a => a, from Integer)
 364   @decimal                      { tok_num positive 0 0 decimal }
 365   0[oO] @octal                  { tok_num positive 2 2 octal }
 366   0[xX] @hexadecimal            { tok_num positive 2 2 hexadecimal }
 367
 368   -- Normal rational literals (:: Fractional a => a, from Rational)
 369   @floating_point               { strtoken tok_float }
 370 }
 371
 372 <0> {
 373   -- Unboxed ints (:: Int#)
 374   -- It's simpler (and faster?) to give separate cases to the negatives,
 375   -- especially considering octal/hexadecimal prefixes.
 376   @decimal \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 377   0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 378   0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 379   @negative @decimal \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 380   @negative 0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 381   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 382
 383   -- Unboxed floats and doubles (:: Float#, :: Double#)
 384   -- prim_{float,double} work with signed literals
 385   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 386   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 387 }
 388
 389 -- Strings and chars are lexed by hand-written code.  The reason is
 390 -- that even if we recognise the string or char here in the regex
 391 -- lexer, we would still have to parse the string afterward in order
 392 -- to convert it to a String.
 393 <0> {
 394   \'                            { lex_char_tok }
 395   \"                            { lex_string_tok }
 396 }
 397
 398 {
 399 -- -----------------------------------------------------------------------------
 400 -- The token type
 401
 402 data Token
 403   = ITas                        -- Haskell keywords
 404   | ITcase
 405   | ITclass
 406   | ITdata
 407   | ITdefault
 408   | ITderiving
 409   | ITdo
 410   | ITelse
 411   | IThiding
 412   | ITif
 413   | ITimport
 414   | ITin
 415   | ITinfix
 416   | ITinfixl
 417   | ITinfixr
 418   | ITinstance
 419   | ITlet
 420   | ITmodule
 421   | ITnewtype
 422   | ITof
 423   | ITqualified
 424   | ITthen
 425   | ITtype
 426   | ITwhere
 427   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 428
 429   | ITforall                    -- GHC extension keywords
 430   | ITforeign
 431   | ITexport
 432   | ITlabel
 433   | ITdynamic
 434   | ITsafe
 435   | ITthreadsafe
 436   | ITunsafe
 437   | ITstdcallconv
 438   | ITccallconv
 439   | ITdotnet
 440   | ITmdo
 441   | ITfamily
 442
 443         -- Pragmas
 444   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 445   | ITspec_prag                 -- SPECIALISE
 446   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 447   | ITsource_prag
 448   | ITrules_prag
 449   | ITdeprecated_prag
 450   | ITline_prag
 451   | ITscc_prag
 452   | ITgenerated_prag
 453   | ITcore_prag                 -- hdaume: core annotations
 454   | ITunpack_prag
 455   | ITclose_prag
 456   | IToptions_prag String
 457   | ITinclude_prag String
 458   | ITlanguage_prag
 459
 460   | ITdotdot                    -- reserved symbols
 461   | ITcolon
 462   | ITdcolon
 463   | ITequal
 464   | ITlam
 465   | ITvbar
 466   | ITlarrow
 467   | ITrarrow
 468   | ITat
 469   | ITtilde
 470   | ITdarrow
 471   | ITminus
 472   | ITbang
 473   | ITstar
 474   | ITdot
 475
 476   | ITbiglam                    -- GHC-extension symbols
 477
 478   | ITocurly                    -- special symbols
 479   | ITccurly
 480   | ITocurlybar                 -- {|, for type applications
 481   | ITccurlybar                 -- |}, for type applications
 482   | ITvocurly
 483   | ITvccurly
 484   | ITobrack
 485   | ITopabrack                  -- [:, for parallel arrays with -fparr
 486   | ITcpabrack                  -- :], for parallel arrays with -fparr
 487   | ITcbrack
 488   | IToparen
 489   | ITcparen
 490   | IToubxparen
 491   | ITcubxparen
 492   | ITsemi
 493   | ITcomma
 494   | ITunderscore
 495   | ITbackquote
 496
 497   | ITvarid   FastString        -- identifiers
 498   | ITconid   FastString
 499   | ITvarsym  FastString
 500   | ITconsym  FastString
 501   | ITqvarid  (FastString,FastString)
 502   | ITqconid  (FastString,FastString)
 503   | ITqvarsym (FastString,FastString)
 504   | ITqconsym (FastString,FastString)
 505
 506   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 507
 508   | ITpragma StringBuffer
 509
 510   | ITchar       Char
 511   | ITstring     FastString
 512   | ITinteger    Integer
 513   | ITrational   Rational
 514
 515   | ITprimchar   Char
 516   | ITprimstring FastString
 517   | ITprimint    Integer
 518   | ITprimfloat  Rational
 519   | ITprimdouble Rational
 520
 521   -- MetaHaskell extension tokens
 522   | ITopenExpQuote              --  [| or [e|
 523   | ITopenPatQuote              --  [p|
 524   | ITopenDecQuote              --  [d|
 525   | ITopenTypQuote              --  [t|
 526   | ITcloseQuote                --  |]
 527   | ITidEscape   FastString     --  $x
 528   | ITparenEscape               --  $(
 529   | ITvarQuote                  --  '
 530   | ITtyQuote                   --  ''
 531
 532   -- Arrow notation extension
 533   | ITproc
 534   | ITrec
 535   | IToparenbar                 --  (|
 536   | ITcparenbar                 --  |)
 537   | ITlarrowtail                --  -<
 538   | ITrarrowtail                --  >-
 539   | ITLarrowtail                --  -<<
 540   | ITRarrowtail                --  >>-
 541
 542   | ITunknown String            -- Used when the lexer can't make sense of it
 543   | ITeof                       -- end of file token
 544
 545   -- Documentation annotations
 546   | ITdocCommentNext  String     -- something beginning '-- |'
 547   | ITdocCommentPrev  String     -- something beginning '-- ^'
 548   | ITdocCommentNamed String     -- something beginning '-- $'
 549   | ITdocSection      Int String -- a section heading
 550   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 551
 552 #ifdef DEBUG
 553   deriving Show -- debugging
 554 #endif
 555
 556 isSpecial :: Token -> Bool
 557 -- If we see M.x, where x is a keyword, but
 558 -- is special, we treat is as just plain M.x,
 559 -- not as a keyword.
 560 isSpecial ITas          = True
 561 isSpecial IThiding      = True
 562 isSpecial ITqualified   = True
 563 isSpecial ITforall      = True
 564 isSpecial ITexport      = True
 565 isSpecial ITlabel       = True
 566 isSpecial ITdynamic     = True
 567 isSpecial ITsafe        = True
 568 isSpecial ITthreadsafe  = True
 569 isSpecial ITunsafe      = True
 570 isSpecial ITccallconv   = True
 571 isSpecial ITstdcallconv = True
 572 isSpecial ITmdo         = True
 573 isSpecial ITfamily      = True
 574 isSpecial _             = False
 575
 576 -- the bitmap provided as the third component indicates whether the
 577 -- corresponding extension keyword is valid under the extension options
 578 -- provided to the compiler; if the extension corresponding to *any* of the
 579 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 580 -- facilitates using a keyword in two different extensions that can be
 581 -- activated independently)
 582 --
 583 reservedWordsFM = listToUFM $
 584         map (\(x, y, z) -> (mkFastString x, (y, z)))
 585        [( "_",          ITunderscore,   0 ),
 586         ( "as",         ITas,           0 ),
 587         ( "case",       ITcase,         0 ),
 588         ( "class",      ITclass,        0 ),
 589         ( "data",       ITdata,         0 ),
 590         ( "default",    ITdefault,      0 ),
 591         ( "deriving",   ITderiving,     0 ),
 592         ( "do",         ITdo,           0 ),
 593         ( "else",       ITelse,         0 ),
 594         ( "hiding",     IThiding,       0 ),
 595         ( "if",         ITif,           0 ),
 596         ( "import",     ITimport,       0 ),
 597         ( "in",         ITin,           0 ),
 598         ( "infix",      ITinfix,        0 ),
 599         ( "infixl",     ITinfixl,       0 ),
 600         ( "infixr",     ITinfixr,       0 ),
 601         ( "instance",   ITinstance,     0 ),
 602         ( "let",        ITlet,          0 ),
 603         ( "module",     ITmodule,       0 ),
 604         ( "newtype",    ITnewtype,      0 ),
 605         ( "of",         ITof,           0 ),
 606         ( "qualified",  ITqualified,    0 ),
 607         ( "then",       ITthen,         0 ),
 608         ( "type",       ITtype,         0 ),
 609         ( "where",      ITwhere,        0 ),
 610         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 611
 612         ( "forall",     ITforall,        bit explicitForallBit),
 613         ( "mdo",        ITmdo,           bit recursiveDoBit),
 614         ( "family",     ITfamily,        bit tyFamBit),
 615
 616         ( "foreign",    ITforeign,       bit ffiBit),
 617         ( "export",     ITexport,        bit ffiBit),
 618         ( "label",      ITlabel,         bit ffiBit),
 619         ( "dynamic",    ITdynamic,       bit ffiBit),
 620         ( "safe",       ITsafe,          bit ffiBit),
 621         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 622         ( "unsafe",     ITunsafe,        bit ffiBit),
 623         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 624         ( "ccall",      ITccallconv,     bit ffiBit),
 625         ( "dotnet",     ITdotnet,        bit ffiBit),
 626
 627         ( "rec",        ITrec,           bit arrowsBit),
 628         ( "proc",       ITproc,          bit arrowsBit)
 629      ]
 630
 631 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 632 reservedSymsFM = listToUFM $
 633     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 634       [ ("..",  ITdotdot,   always)
 635         -- (:) is a reserved op, meaning only list cons
 636        ,(":",   ITcolon,    always)
 637        ,("::",  ITdcolon,   always)
 638        ,("=",   ITequal,    always)
 639        ,("\\",  ITlam,      always)
 640        ,("|",   ITvbar,     always)
 641        ,("<-",  ITlarrow,   always)
 642        ,("->",  ITrarrow,   always)
 643        ,("@",   ITat,       always)
 644        ,("~",   ITtilde,    always)
 645        ,("=>",  ITdarrow,   always)
 646        ,("-",   ITminus,    always)
 647        ,("!",   ITbang,     always)
 648
 649         -- For data T (a::*) = MkT
 650        ,("*", ITstar, \i -> kindSigsEnabled i || tyFamEnabled i)
 651         -- For 'forall a . t'
 652        ,(".", ITdot, explicitForallEnabled)
 653
 654        ,("-<",  ITlarrowtail, arrowsEnabled)
 655        ,(">-",  ITrarrowtail, arrowsEnabled)
 656        ,("-<<", ITLarrowtail, arrowsEnabled)
 657        ,(">>-", ITRarrowtail, arrowsEnabled)
 658
 659 #if __GLASGOW_HASKELL__ >= 605
 660        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 661        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 662        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 663                                 explicitForallEnabled i)
 664        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 665        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 666        ,("⋯",   ITdotdot, unicodeSyntaxEnabled)
 667         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 668         -- form part of a large operator.  This would let us have a better
 669         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 670 #endif
 671        ]
 672
 673 -- -----------------------------------------------------------------------------
 674 -- Lexer actions
 675
 676 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 677
 678 special :: Token -> Action
 679 special tok span _buf len = return (L span tok)
 680
 681 token, layout_token :: Token -> Action
 682 token t span buf len = return (L span t)
 683 layout_token t span buf len = pushLexState layout >> return (L span t)
 684
 685 idtoken :: (StringBuffer -> Int -> Token) -> Action
 686 idtoken f span buf len = return (L span $! (f buf len))
 687
 688 skip_one_varid :: (FastString -> Token) -> Action
 689 skip_one_varid f span buf len
 690   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 691
 692 strtoken :: (String -> Token) -> Action
 693 strtoken f span buf len =
 694   return (L span $! (f $! lexemeToString buf len))
 695
 696 init_strtoken :: Int -> (String -> Token) -> Action
 697 -- like strtoken, but drops the last N character(s)
 698 init_strtoken drop f span buf len =
 699   return (L span $! (f $! lexemeToString buf (len-drop)))
 700
 701 begin :: Int -> Action
 702 begin code _span _str _len = do pushLexState code; lexToken
 703
 704 pop :: Action
 705 pop _span _buf _len = do popLexState; lexToken
 706
 707 pop_and :: Action -> Action
 708 pop_and act span buf len = do popLexState; act span buf len
 709
 710 {-# INLINE nextCharIs #-}
 711 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 712
 713 notFollowedBy char _ _ _ (AI _ _ buf)
 714   = nextCharIs buf (/=char)
 715
 716 notFollowedBySymbol _ _ _ (AI _ _ buf)
 717   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 718
 719 -- We must reject doc comments as being ordinary comments everywhere.
 720 -- In some cases the doc comment will be selected as the lexeme due to
 721 -- maximal munch, but not always, because the nested comment rule is
 722 -- valid in all states, but the doc-comment rules are only valid in
 723 -- the non-layout states.
 724 isNormalComment bits _ _ (AI _ _ buf)
 725   | haddockEnabled bits = notFollowedByDocOrPragma
 726   | otherwise           = nextCharIs buf (/='#')
 727   where
 728     notFollowedByDocOrPragma
 729        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 730
 731 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 732
 733 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 734   = if haddockEnabled bits then False else (p buf)
 735
 736 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 737
 738 ifExtension pred bits _ _ _ = pred bits
 739
 740 multiline_doc_comment :: Action
 741 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 742   where
 743     worker commentAcc input docType oneLine = case alexGetChar input of
 744       Just ('\n', input')
 745         | oneLine -> docCommentEnd input commentAcc docType buf span
 746         | otherwise -> case checkIfCommentLine input' of
 747           Just input -> worker ('\n':commentAcc) input docType False
 748           Nothing -> docCommentEnd input commentAcc docType buf span
 749       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 750       Nothing -> docCommentEnd input commentAcc docType buf span
 751
 752     checkIfCommentLine input = check (dropNonNewlineSpace input)
 753       where
 754         check input = case alexGetChar input of
 755           Just ('-', input) -> case alexGetChar input of
 756             Just ('-', input) -> case alexGetChar input of
 757               Just (c, _) | c /= '-' -> Just input
 758               _ -> Nothing
 759             _ -> Nothing
 760           _ -> Nothing
 761
 762         dropNonNewlineSpace input = case alexGetChar input of
 763           Just (c, input')
 764             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 765             | otherwise -> input
 766           Nothing -> input
 767
 768 {-
 769   nested comments require traversing by hand, they can't be parsed
 770   using regular expressions.
 771 -}
 772 nested_comment :: P (Located Token) -> Action
 773 nested_comment cont span _str _len = do
 774   input <- getInput
 775   go (1::Int) input
 776   where
 777     go 0 input = do setInput input; cont
 778     go n input = case alexGetChar input of
 779       Nothing -> errBrace input span
 780       Just ('-',input) -> case alexGetChar input of
 781         Nothing  -> errBrace input span
 782         Just ('\125',input) -> go (n-1) input
 783         Just (c,_)          -> go n input
 784       Just ('\123',input) -> case alexGetChar input of
 785         Nothing  -> errBrace input span
 786         Just ('-',input) -> go (n+1) input
 787         Just (c,_)       -> go n input
 788       Just (c,input) -> go n input
 789
 790 nested_doc_comment :: Action
 791 nested_doc_comment span buf _len = withLexedDocType (go "")
 792   where
 793     go commentAcc input docType _ = case alexGetChar input of
 794       Nothing -> errBrace input span
 795       Just ('-',input) -> case alexGetChar input of
 796         Nothing -> errBrace input span
 797         Just ('\125',input@(AI end _ buf2)) ->
 798           docCommentEnd input commentAcc docType buf span
 799         Just (c,_) -> go ('-':commentAcc) input docType False
 800       Just ('\123', input) -> case alexGetChar input of
 801         Nothing  -> errBrace input span
 802         Just ('-',input) -> do
 803           setInput input
 804           let cont = do input <- getInput; go commentAcc input docType False
 805           nested_comment cont span buf _len
 806         Just (c,_) -> go ('\123':commentAcc) input docType False
 807       Just (c,input) -> go (c:commentAcc) input docType False
 808
 809 withLexedDocType lexDocComment = do
 810   input@(AI _ _ buf) <- getInput
 811   case prevChar buf ' ' of
 812     '|' -> lexDocComment input ITdocCommentNext False
 813     '^' -> lexDocComment input ITdocCommentPrev False
 814     '$' -> lexDocComment input ITdocCommentNamed False
 815     '*' -> lexDocSection 1 input
 816  where
 817     lexDocSection n input = case alexGetChar input of
 818       Just ('*', input) -> lexDocSection (n+1) input
 819       Just (c, _) -> lexDocComment input (ITdocSection n) True
 820       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 821
 822 -- docCommentEnd
 823 -------------------------------------------------------------------------------
 824 -- This function is quite tricky. We can't just return a new token, we also
 825 -- need to update the state of the parser. Why? Because the token is longer
 826 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 827 -- it writes the wrong token length to the parser state. This function is
 828 -- called afterwards, so it can just update the state.
 829
 830 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 831 -- which is something that the original lexer didn't account for.
 832 -- I have added last_line_len in the parser state which represents the length
 833 -- of the part of the token that is on the last line. It is now used for layout
 834 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 835 -- was before, the full length of the token, and it is now only used for error
 836 -- messages. /Waern
 837
 838 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 839                  SrcSpan -> P (Located Token)
 840 docCommentEnd input commentAcc docType buf span = do
 841   setInput input
 842   let (AI loc last_offs nextBuf) = input
 843       comment = reverse commentAcc
 844       span' = mkSrcSpan (srcSpanStart span) loc
 845       last_len = byteDiff buf nextBuf
 846
 847       last_line_len = if (last_offs - last_len < 0)
 848         then last_offs
 849         else last_len
 850
 851   span `seq` setLastToken span' last_len last_line_len
 852   return (L span' (docType comment))
 853
 854 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 855
 856 open_brace, close_brace :: Action
 857 open_brace span _str _len = do
 858   ctx <- getContext
 859   setContext (NoLayout:ctx)
 860   return (L span ITocurly)
 861 close_brace span _str _len = do
 862   popContext
 863   return (L span ITccurly)
 864
 865 qvarid buf len = ITqvarid $! splitQualName buf len
 866 qconid buf len = ITqconid $! splitQualName buf len
 867
 868 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 869 -- takes a StringBuffer and a length, and returns the module name
 870 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 871 -- because of hierarchical module names.
 872 splitQualName orig_buf len = split orig_buf orig_buf
 873   where
 874     split buf dot_buf
 875         | orig_buf `byteDiff` buf >= len  = done dot_buf
 876         | c == '.'                        = found_dot buf'
 877         | otherwise                       = split buf' dot_buf
 878       where
 879        (c,buf') = nextChar buf
 880
 881     -- careful, we might get names like M....
 882     -- so, if the character after the dot is not upper-case, this is
 883     -- the end of the qualifier part.
 884     found_dot buf -- buf points after the '.'
 885         | isUpper c    = split buf' buf
 886         | otherwise    = done buf
 887       where
 888        (c,buf') = nextChar buf
 889
 890     done dot_buf =
 891         (lexemeToFastString orig_buf (qual_size - 1),
 892          lexemeToFastString dot_buf (len - qual_size))
 893       where
 894         qual_size = orig_buf `byteDiff` dot_buf
 895
 896 varid span buf len =
 897   case lookupUFM reservedWordsFM fs of
 898         Just (keyword,0)    -> do
 899                 maybe_layout keyword
 900                 return (L span keyword)
 901         Just (keyword,exts) -> do
 902                 b <- extension (\i -> exts .&. i /= 0)
 903                 if b then do maybe_layout keyword
 904                              return (L span keyword)
 905                      else return (L span (ITvarid fs))
 906         _other -> return (L span (ITvarid fs))
 907   where
 908         fs = lexemeToFastString buf len
 909
 910 conid buf len = ITconid fs
 911   where fs = lexemeToFastString buf len
 912
 913 qvarsym buf len = ITqvarsym $! splitQualName buf len
 914 qconsym buf len = ITqconsym $! splitQualName buf len
 915
 916 varsym = sym ITvarsym
 917 consym = sym ITconsym
 918
 919 sym con span buf len =
 920   case lookupUFM reservedSymsFM fs of
 921         Just (keyword,exts) -> do
 922                 b <- extension exts
 923                 if b then return (L span keyword)
 924                      else return (L span $! con fs)
 925         _other -> return (L span $! con fs)
 926   where
 927         fs = lexemeToFastString buf len
 928
 929 -- Variations on the integral numeric literal.
 930 tok_integral :: (Integer -> Token)
 931      -> (Integer -> Integer)
 932  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
 933      -> Int -> Int
 934      -> (Integer, (Char->Int)) -> Action
 935 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
 936   return $ L span $ itint $! transint $ parseUnsignedInteger
 937      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
 938
 939 -- some conveniences for use with tok_integral
 940 tok_num = tok_integral ITinteger
 941 tok_primint = tok_integral ITprimint
 942 positive = id
 943 negative = negate
 944 decimal = (10,octDecDigit)
 945 octal = (8,octDecDigit)
 946 hexadecimal = (16,hexDigit)
 947
 948 -- readRational can understand negative rationals, exponents, everything.
 949 tok_float        str = ITrational   $! readRational str
 950 tok_primfloat    str = ITprimfloat  $! readRational str
 951 tok_primdouble   str = ITprimdouble $! readRational str
 952
 953 -- -----------------------------------------------------------------------------
 954 -- Layout processing
 955
 956 -- we're at the first token on a line, insert layout tokens if necessary
 957 do_bol :: Action
 958 do_bol span _str _len = do
 959         pos <- getOffside
 960         case pos of
 961             LT -> do
 962                 --trace "layout: inserting '}'" $ do
 963                 popContext
 964                 -- do NOT pop the lex state, we might have a ';' to insert
 965                 return (L span ITvccurly)
 966             EQ -> do
 967                 --trace "layout: inserting ';'" $ do
 968                 popLexState
 969                 return (L span ITsemi)
 970             GT -> do
 971                 popLexState
 972                 lexToken
 973
 974 -- certain keywords put us in the "layout" state, where we might
 975 -- add an opening curly brace.
 976 maybe_layout ITdo       = pushLexState layout_do
 977 maybe_layout ITmdo      = pushLexState layout_do
 978 maybe_layout ITof       = pushLexState layout
 979 maybe_layout ITlet      = pushLexState layout
 980 maybe_layout ITwhere    = pushLexState layout
 981 maybe_layout ITrec      = pushLexState layout
 982 maybe_layout _          = return ()
 983
 984 -- Pushing a new implicit layout context.  If the indentation of the
 985 -- next token is not greater than the previous layout context, then
 986 -- Haskell 98 says that the new layout context should be empty; that is
 987 -- the lexer must generate {}.
 988 --
 989 -- We are slightly more lenient than this: when the new context is started
 990 -- by a 'do', then we allow the new context to be at the same indentation as
 991 -- the previous context.  This is what the 'strict' argument is for.
 992 --
 993 new_layout_context strict span _buf _len = do
 994     popLexState
 995     (AI _ offset _) <- getInput
 996     ctx <- getContext
 997     case ctx of
 998         Layout prev_off : _  |
 999            (strict     && prev_off >= offset  ||
1000             not strict && prev_off > offset) -> do
1001                 -- token is indented to the left of the previous context.
1002                 -- we must generate a {} sequence now.
1003                 pushLexState layout_left
1004                 return (L span ITvocurly)
1005         other -> do
1006                 setContext (Layout offset : ctx)
1007                 return (L span ITvocurly)
1008
1009 do_layout_left span _buf _len = do
1010     popLexState
1011     pushLexState bol  -- we must be at the start of a line
1012     return (L span ITvccurly)
1013
1014 -- -----------------------------------------------------------------------------
1015 -- LINE pragmas
1016
1017 setLine :: Int -> Action
1018 setLine code span buf len = do
1019   let line = parseUnsignedInteger buf len 10 octDecDigit
1020   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
1021         -- subtract one: the line number refers to the *following* line
1022   popLexState
1023   pushLexState code
1024   lexToken
1025
1026 setFile :: Int -> Action
1027 setFile code span buf len = do
1028   let file = lexemeToFastString (stepOn buf) (len-2)
1029   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1030   popLexState
1031   pushLexState code
1032   lexToken
1033
1034
1035 -- -----------------------------------------------------------------------------
1036 -- Options, includes and language pragmas.
1037
1038 lex_string_prag :: (String -> Token) -> Action
1039 lex_string_prag mkTok span buf len
1040     = do input <- getInput
1041          start <- getSrcLoc
1042          tok <- go [] input
1043          end <- getSrcLoc
1044          return (L (mkSrcSpan start end) tok)
1045     where go acc input
1046               = if isString input "#-}"
1047                    then do setInput input
1048                            return (mkTok (reverse acc))
1049                    else case alexGetChar input of
1050                           Just (c,i) -> go (c:acc) i
1051                           Nothing -> err input
1052           isString i [] = True
1053           isString i (x:xs)
1054               = case alexGetChar i of
1055                   Just (c,i') | c == x    -> isString i' xs
1056                   _other -> False
1057           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1058
1059
1060 -- -----------------------------------------------------------------------------
1061 -- Strings & Chars
1062
1063 -- This stuff is horrible.  I hates it.
1064
1065 lex_string_tok :: Action
1066 lex_string_tok span buf len = do
1067   tok <- lex_string ""
1068   end <- getSrcLoc
1069   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1070
1071 lex_string :: String -> P Token
1072 lex_string s = do
1073   i <- getInput
1074   case alexGetChar' i of
1075     Nothing -> lit_error
1076
1077     Just ('"',i)  -> do
1078         setInput i
1079         magicHash <- extension magicHashEnabled
1080         if magicHash
1081           then do
1082             i <- getInput
1083             case alexGetChar' i of
1084               Just ('#',i) -> do
1085                    setInput i
1086                    if any (> '\xFF') s
1087                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1088                     else let s' = mkZFastString (reverse s) in
1089                          return (ITprimstring s')
1090                         -- mkZFastString is a hack to avoid encoding the
1091                         -- string in UTF-8.  We just want the exact bytes.
1092               _other ->
1093                 return (ITstring (mkFastString (reverse s)))
1094           else
1095                 return (ITstring (mkFastString (reverse s)))
1096
1097     Just ('\\',i)
1098         | Just ('&',i) <- next -> do
1099                 setInput i; lex_string s
1100         | Just (c,i) <- next, is_space c -> do
1101                 setInput i; lex_stringgap s
1102         where next = alexGetChar' i
1103
1104     Just (c, i) -> do
1105         c' <- lex_char c i
1106         lex_string (c':s)
1107
1108 lex_stringgap s = do
1109   c <- getCharOrFail
1110   case c of
1111     '\\' -> lex_string s
1112     c | is_space c -> lex_stringgap s
1113     _other -> lit_error
1114
1115
1116 lex_char_tok :: Action
1117 -- Here we are basically parsing character literals, such as 'x' or '\n'
1118 -- but, when Template Haskell is on, we additionally spot
1119 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1120 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
1121 -- So we have to do two characters of lookahead: when we see 'x we need to
1122 -- see if there's a trailing quote
1123 lex_char_tok span buf len = do  -- We've seen '
1124    i1 <- getInput       -- Look ahead to first character
1125    let loc = srcSpanStart span
1126    case alexGetChar' i1 of
1127         Nothing -> lit_error
1128
1129         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
1130                   th_exts <- extension thEnabled
1131                   if th_exts then do
1132                         setInput i2
1133                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1134                    else lit_error
1135
1136         Just ('\\', i2@(AI end2 _ _)) -> do     -- We've seen 'backslash
1137                   setInput i2
1138                   lit_ch <- lex_escape
1139                   mc <- getCharOrFail   -- Trailing quote
1140                   if mc == '\'' then finish_char_tok loc lit_ch
1141                                 else do setInput i2; lit_error
1142
1143         Just (c, i2@(AI end2 _ _))
1144                 | not (isAny c) -> lit_error
1145                 | otherwise ->
1146
1147                 -- We've seen 'x, where x is a valid character
1148                 --  (i.e. not newline etc) but not a quote or backslash
1149            case alexGetChar' i2 of      -- Look ahead one more character
1150                 Nothing -> lit_error
1151                 Just ('\'', i3) -> do   -- We've seen 'x'
1152                         setInput i3
1153                         finish_char_tok loc c
1154                 _other -> do            -- We've seen 'x not followed by quote
1155                                         -- If TH is on, just parse the quote only
1156                         th_exts <- extension thEnabled
1157                         let (AI end _ _) = i1
1158                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1159                                    else do setInput i2; lit_error
1160
1161 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1162 finish_char_tok loc ch  -- We've already seen the closing quote
1163                         -- Just need to check for trailing #
1164   = do  magicHash <- extension magicHashEnabled
1165         i@(AI end _ _) <- getInput
1166         if magicHash then do
1167                 case alexGetChar' i of
1168                         Just ('#',i@(AI end _ _)) -> do
1169                                 setInput i
1170                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1171                         _other ->
1172                                 return (L (mkSrcSpan loc end) (ITchar ch))
1173                 else do
1174                    return (L (mkSrcSpan loc end) (ITchar ch))
1175
1176 lex_char :: Char -> AlexInput -> P Char
1177 lex_char c inp = do
1178   case c of
1179       '\\' -> do setInput inp; lex_escape
1180       c | isAny c -> do setInput inp; return c
1181       _other -> lit_error
1182
1183 isAny c | c > '\xff' = isPrint c
1184         | otherwise  = is_any c
1185
1186 lex_escape :: P Char
1187 lex_escape = do
1188   c <- getCharOrFail
1189   case c of
1190         'a'   -> return '\a'
1191         'b'   -> return '\b'
1192         'f'   -> return '\f'
1193         'n'   -> return '\n'
1194         'r'   -> return '\r'
1195         't'   -> return '\t'
1196         'v'   -> return '\v'
1197         '\\'  -> return '\\'
1198         '"'   -> return '\"'
1199         '\''  -> return '\''
1200         '^'   -> do c <- getCharOrFail
1201                     if c >= '@' && c <= '_'
1202                         then return (chr (ord c - ord '@'))
1203                         else lit_error
1204
1205         'x'   -> readNum is_hexdigit 16 hexDigit
1206         'o'   -> readNum is_octdigit  8 octDecDigit
1207         x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
1208
1209         c1 ->  do
1210            i <- getInput
1211            case alexGetChar' i of
1212             Nothing -> lit_error
1213             Just (c2,i2) ->
1214               case alexGetChar' i2 of
1215                 Nothing -> do setInput i2; lit_error
1216                 Just (c3,i3) ->
1217                    let str = [c1,c2,c3] in
1218                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1219                                      Just rest <- [maybePrefixMatch p str] ] of
1220                           (escape_char,[]):_ -> do
1221                                 setInput i3
1222                                 return escape_char
1223                           (escape_char,_:_):_ -> do
1224                                 setInput i2
1225                                 return escape_char
1226                           [] -> lit_error
1227
1228 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1229 readNum is_digit base conv = do
1230   i <- getInput
1231   c <- getCharOrFail
1232   if is_digit c
1233         then readNum2 is_digit base conv (conv c)
1234         else do setInput i; lit_error
1235
1236 readNum2 is_digit base conv i = do
1237   input <- getInput
1238   read i input
1239   where read i input = do
1240           case alexGetChar' input of
1241             Just (c,input') | is_digit c -> do
1242                 read (i*base + conv c) input'
1243             _other -> do
1244                 if i >= 0 && i <= 0x10FFFF
1245                    then do setInput input; return (chr i)
1246                    else lit_error
1247
1248 silly_escape_chars = [
1249         ("NUL", '\NUL'),
1250         ("SOH", '\SOH'),
1251         ("STX", '\STX'),
1252         ("ETX", '\ETX'),
1253         ("EOT", '\EOT'),
1254         ("ENQ", '\ENQ'),
1255         ("ACK", '\ACK'),
1256         ("BEL", '\BEL'),
1257         ("BS", '\BS'),
1258         ("HT", '\HT'),
1259         ("LF", '\LF'),
1260         ("VT", '\VT'),
1261         ("FF", '\FF'),
1262         ("CR", '\CR'),
1263         ("SO", '\SO'),
1264         ("SI", '\SI'),
1265         ("DLE", '\DLE'),
1266         ("DC1", '\DC1'),
1267         ("DC2", '\DC2'),
1268         ("DC3", '\DC3'),
1269         ("DC4", '\DC4'),
1270         ("NAK", '\NAK'),
1271         ("SYN", '\SYN'),
1272         ("ETB", '\ETB'),
1273         ("CAN", '\CAN'),
1274         ("EM", '\EM'),
1275         ("SUB", '\SUB'),
1276         ("ESC", '\ESC'),
1277         ("FS", '\FS'),
1278         ("GS", '\GS'),
1279         ("RS", '\RS'),
1280         ("US", '\US'),
1281         ("SP", '\SP'),
1282         ("DEL", '\DEL')
1283         ]
1284
1285 -- before calling lit_error, ensure that the current input is pointing to
1286 -- the position of the error in the buffer.  This is so that we can report
1287 -- a correct location to the user, but also so we can detect UTF-8 decoding
1288 -- errors if they occur.
1289 lit_error = lexError "lexical error in string/character literal"
1290
1291 getCharOrFail :: P Char
1292 getCharOrFail =  do
1293   i <- getInput
1294   case alexGetChar' i of
1295         Nothing -> lexError "unexpected end-of-file in string/character literal"
1296         Just (c,i)  -> do setInput i; return c
1297
1298 -- -----------------------------------------------------------------------------
1299 -- Warnings
1300
1301 warn :: DynFlag -> SDoc -> Action
1302 warn option warning srcspan _buf _len = do
1303     addWarning option srcspan warning
1304     lexToken
1305
1306 -- -----------------------------------------------------------------------------
1307 -- The Parse Monad
1308
1309 data LayoutContext
1310   = NoLayout
1311   | Layout !Int
1312   deriving Show
1313
1314 data ParseResult a
1315   = POk PState a
1316   | PFailed
1317         SrcSpan         -- The start and end of the text span related to
1318                         -- the error.  Might be used in environments which can
1319                         -- show this span, e.g. by highlighting it.
1320         Message         -- The error message
1321
1322 data PState = PState {
1323         buffer     :: StringBuffer,
1324     dflags     :: DynFlags,
1325     messages   :: Messages,
1326         last_loc   :: SrcSpan,  -- pos of previous token
1327         last_offs  :: !Int,     -- offset of the previous token from the
1328                                 -- beginning of  the current line.
1329                                 -- \t is equal to 8 spaces.
1330         last_len   :: !Int,     -- len of previous token
1331   last_line_len :: !Int,
1332         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1333         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1334         context    :: [LayoutContext],
1335         lex_state  :: [Int]
1336      }
1337         -- last_loc and last_len are used when generating error messages,
1338         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1339         -- current token to happyError, we could at least get rid of last_len.
1340         -- Getting rid of last_loc would require finding another way to
1341         -- implement pushCurrentContext (which is only called from one place).
1342
1343 newtype P a = P { unP :: PState -> ParseResult a }
1344
1345 instance Monad P where
1346   return = returnP
1347   (>>=) = thenP
1348   fail = failP
1349
1350 returnP :: a -> P a
1351 returnP a = P $ \s -> POk s a
1352
1353 thenP :: P a -> (a -> P b) -> P b
1354 (P m) `thenP` k = P $ \ s ->
1355         case m s of
1356                 POk s1 a         -> (unP (k a)) s1
1357                 PFailed span err -> PFailed span err
1358
1359 failP :: String -> P a
1360 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1361
1362 failMsgP :: String -> P a
1363 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1364
1365 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1366 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1367
1368 failSpanMsgP :: SrcSpan -> String -> P a
1369 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1370
1371 extension :: (Int -> Bool) -> P Bool
1372 extension p = P $ \s -> POk s (p $! extsBitmap s)
1373
1374 getExts :: P Int
1375 getExts = P $ \s -> POk s (extsBitmap s)
1376
1377 setSrcLoc :: SrcLoc -> P ()
1378 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1379
1380 getSrcLoc :: P SrcLoc
1381 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1382
1383 setLastToken :: SrcSpan -> Int -> Int -> P ()
1384 setLastToken loc len line_len = P $ \s -> POk s {
1385   last_loc=loc,
1386   last_len=len,
1387   last_line_len=line_len
1388 } ()
1389
1390 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1391
1392 alexInputPrevChar :: AlexInput -> Char
1393 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1394
1395 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1396 alexGetChar (AI loc ofs s)
1397   | atEnd s   = Nothing
1398   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1399                 --trace (show (ord c)) $
1400                 Just (adj_c, (AI loc' ofs' s'))
1401   where (c,s') = nextChar s
1402         loc'   = advanceSrcLoc loc c
1403         ofs'   = advanceOffs c ofs
1404
1405         non_graphic     = '\x0'
1406         upper           = '\x1'
1407         lower           = '\x2'
1408         digit           = '\x3'
1409         symbol          = '\x4'
1410         space           = '\x5'
1411         other_graphic   = '\x6'
1412
1413         adj_c
1414           | c <= '\x06' = non_graphic
1415           | c <= '\xff' = c
1416           -- Alex doesn't handle Unicode, so when Unicode
1417           -- character is encoutered we output these values
1418           -- with the actual character value hidden in the state.
1419           | otherwise =
1420                 case generalCategory c of
1421                   UppercaseLetter       -> upper
1422                   LowercaseLetter       -> lower
1423                   TitlecaseLetter       -> upper
1424                   ModifierLetter        -> other_graphic
1425                   OtherLetter           -> other_graphic
1426                   NonSpacingMark        -> other_graphic
1427                   SpacingCombiningMark  -> other_graphic
1428                   EnclosingMark         -> other_graphic
1429                   DecimalNumber         -> digit
1430                   LetterNumber          -> other_graphic
1431                   OtherNumber           -> other_graphic
1432                   ConnectorPunctuation  -> other_graphic
1433                   DashPunctuation       -> other_graphic
1434                   OpenPunctuation       -> other_graphic
1435                   ClosePunctuation      -> other_graphic
1436                   InitialQuote          -> other_graphic
1437                   FinalQuote            -> other_graphic
1438                   OtherPunctuation      -> other_graphic
1439                   MathSymbol            -> symbol
1440                   CurrencySymbol        -> symbol
1441                   ModifierSymbol        -> symbol
1442                   OtherSymbol           -> symbol
1443                   Space                 -> space
1444                   _other                -> non_graphic
1445
1446 -- This version does not squash unicode characters, it is used when
1447 -- lexing strings.
1448 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1449 alexGetChar' (AI loc ofs s)
1450   | atEnd s   = Nothing
1451   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1452                 --trace (show (ord c)) $
1453                 Just (c, (AI loc' ofs' s'))
1454   where (c,s') = nextChar s
1455         loc'   = advanceSrcLoc loc c
1456         ofs'   = advanceOffs c ofs
1457
1458 advanceOffs :: Char -> Int -> Int
1459 advanceOffs '\n' offs = 0
1460 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1461 advanceOffs _    offs = offs + 1
1462
1463 getInput :: P AlexInput
1464 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1465
1466 setInput :: AlexInput -> P ()
1467 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1468
1469 pushLexState :: Int -> P ()
1470 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1471
1472 popLexState :: P Int
1473 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1474
1475 getLexState :: P Int
1476 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1477
1478 -- for reasons of efficiency, flags indicating language extensions (eg,
1479 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1480 -- integer
1481
1482 genericsBit, ffiBit, parrBit :: Int
1483 genericsBit = 0 -- {| and |}
1484 ffiBit     = 1
1485 parrBit    = 2
1486 arrowsBit  = 4
1487 thBit      = 5
1488 ipBit      = 6
1489 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1490 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1491                 -- (doesn't affect the lexer)
1492 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1493 haddockBit = 10 -- Lex and parse Haddock comments
1494 magicHashBit = 11 -- # in both functions and operators
1495 kindSigsBit = 12 -- Kind signatures on type variables
1496 recursiveDoBit = 13 -- mdo
1497 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1498 unboxedTuplesBit = 15 -- (# and #)
1499 standaloneDerivingBit = 16 -- standalone instance deriving declarations
1500
1501 genericsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1502 always           _     = True
1503 genericsEnabled  flags = testBit flags genericsBit
1504 ffiEnabled       flags = testBit flags ffiBit
1505 parrEnabled      flags = testBit flags parrBit
1506 arrowsEnabled    flags = testBit flags arrowsBit
1507 thEnabled        flags = testBit flags thBit
1508 ipEnabled        flags = testBit flags ipBit
1509 explicitForallEnabled flags = testBit flags explicitForallBit
1510 bangPatEnabled   flags = testBit flags bangPatBit
1511 tyFamEnabled     flags = testBit flags tyFamBit
1512 haddockEnabled   flags = testBit flags haddockBit
1513 magicHashEnabled flags = testBit flags magicHashBit
1514 kindSigsEnabled  flags = testBit flags kindSigsBit
1515 recursiveDoEnabled flags = testBit flags recursiveDoBit
1516 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1517 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1518 standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
1519
1520 -- PState for parsing options pragmas
1521 --
1522 pragState :: StringBuffer -> SrcLoc -> PState
1523 pragState buf loc  =
1524   PState {
1525       buffer          = buf,
1526       messages      = emptyMessages,
1527       -- XXX defaultDynFlags is not right, but we don't have a real
1528       -- dflags handy
1529       dflags        = defaultDynFlags,
1530       last_loc      = mkSrcSpan loc loc,
1531       last_offs     = 0,
1532       last_len      = 0,
1533       last_line_len = 0,
1534       loc           = loc,
1535       extsBitmap    = 0,
1536       context       = [],
1537       lex_state     = [bol, option_prags, 0]
1538     }
1539
1540
1541 -- create a parse state
1542 --
1543 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1544 mkPState buf loc flags  =
1545   PState {
1546       buffer          = buf,
1547       dflags        = flags,
1548       messages      = emptyMessages,
1549       last_loc      = mkSrcSpan loc loc,
1550       last_offs     = 0,
1551       last_len      = 0,
1552       last_line_len = 0,
1553       loc           = loc,
1554       extsBitmap    = fromIntegral bitmap,
1555       context       = [],
1556       lex_state     = [bol, 0]
1557         -- we begin in the layout state if toplev_layout is set
1558     }
1559     where
1560       bitmap = genericsBit `setBitIf` dopt Opt_Generics flags
1561                .|. ffiBit       `setBitIf` dopt Opt_ForeignFunctionInterface flags
1562                .|. parrBit      `setBitIf` dopt Opt_PArr         flags
1563                .|. arrowsBit    `setBitIf` dopt Opt_Arrows       flags
1564                .|. thBit        `setBitIf` dopt Opt_TemplateHaskell flags
1565                .|. ipBit        `setBitIf` dopt Opt_ImplicitParams flags
1566                .|. explicitForallBit `setBitIf` dopt Opt_ScopedTypeVariables flags
1567                .|. explicitForallBit `setBitIf` dopt Opt_PolymorphicComponents flags
1568                .|. explicitForallBit `setBitIf` dopt Opt_ExistentialQuantification flags
1569                .|. explicitForallBit `setBitIf` dopt Opt_Rank2Types flags
1570                .|. explicitForallBit `setBitIf` dopt Opt_RankNTypes flags
1571                .|. bangPatBit   `setBitIf` dopt Opt_BangPatterns flags
1572                .|. tyFamBit     `setBitIf` dopt Opt_TypeFamilies flags
1573                .|. haddockBit   `setBitIf` dopt Opt_Haddock      flags
1574                .|. magicHashBit `setBitIf` dopt Opt_MagicHash    flags
1575                .|. kindSigsBit  `setBitIf` dopt Opt_KindSignatures flags
1576                .|. recursiveDoBit `setBitIf` dopt Opt_RecursiveDo flags
1577                .|. unicodeSyntaxBit `setBitIf` dopt Opt_UnicodeSyntax flags
1578                .|. unboxedTuplesBit `setBitIf` dopt Opt_UnboxedTuples flags
1579                .|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags
1580       --
1581       setBitIf :: Int -> Bool -> Int
1582       b `setBitIf` cond | cond      = bit b
1583                         | otherwise = 0
1584
1585 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1586 addWarning option srcspan warning
1587  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1588        let warning' = mkWarnMsg srcspan alwaysQualify warning
1589            ws' = if dopt option d then ws `snocBag` warning' else ws
1590        in POk s{messages=(ws', es)} ()
1591
1592 getMessages :: PState -> Messages
1593 getMessages PState{messages=ms} = ms
1594
1595 getContext :: P [LayoutContext]
1596 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1597
1598 setContext :: [LayoutContext] -> P ()
1599 setContext ctx = P $ \s -> POk s{context=ctx} ()
1600
1601 popContext :: P ()
1602 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1603                            loc = loc, last_len = len, last_loc = last_loc }) ->
1604   case ctx of
1605         (_:tl) -> POk s{ context = tl } ()
1606         []     -> PFailed last_loc (srcParseErr buf len)
1607
1608 -- Push a new layout context at the indentation of the last token read.
1609 -- This is only used at the outer level of a module when the 'module'
1610 -- keyword is missing.
1611 pushCurrentContext :: P ()
1612 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1613     POk s{context = Layout (offs-len) : ctx} ()
1614 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1615
1616 getOffside :: P Ordering
1617 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1618                 let ord = case stk of
1619                         (Layout n:_) -> compare offs n
1620                         _            -> GT
1621                 in POk s ord
1622
1623 -- ---------------------------------------------------------------------------
1624 -- Construct a parse error
1625
1626 srcParseErr
1627   :: StringBuffer       -- current buffer (placed just after the last token)
1628   -> Int                -- length of the previous token
1629   -> Message
1630 srcParseErr buf len
1631   = hcat [ if null token
1632              then ptext SLIT("parse error (possibly incorrect indentation)")
1633              else hcat [ptext SLIT("parse error on input "),
1634                         char '`', text token, char '\'']
1635     ]
1636   where token = lexemeToString (offsetBytes (-len) buf) len
1637
1638 -- Report a parse failure, giving the span of the previous token as
1639 -- the location of the error.  This is the entry point for errors
1640 -- detected during parsing.
1641 srcParseFail :: P a
1642 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1643                             last_loc = last_loc } ->
1644     PFailed last_loc (srcParseErr buf len)
1645
1646 -- A lexical error is reported at a particular position in the source file,
1647 -- not over a token range.
1648 lexError :: String -> P a
1649 lexError str = do
1650   loc <- getSrcLoc
1651   i@(AI end _ buf) <- getInput
1652   reportLexError loc end buf str
1653
1654 -- -----------------------------------------------------------------------------
1655 -- This is the top-level function: called from the parser each time a
1656 -- new token is to be read from the input.
1657
1658 lexer :: (Located Token -> P a) -> P a
1659 lexer cont = do
1660   tok@(L span tok__) <- lexToken
1661 --  trace ("token: " ++ show tok__) $ do
1662   cont tok
1663
1664 lexToken :: P (Located Token)
1665 lexToken = do
1666   inp@(AI loc1 _ buf) <- getInput
1667   sc <- getLexState
1668   exts <- getExts
1669   case alexScanUser exts inp sc of
1670     AlexEOF -> do let span = mkSrcSpan loc1 loc1
1671                   setLastToken span 0 0
1672                   return (L span ITeof)
1673     AlexError (AI loc2 _ buf) -> do
1674         reportLexError loc1 loc2 buf "lexical error"
1675     AlexSkip inp2 _ -> do
1676         setInput inp2
1677         lexToken
1678     AlexToken inp2@(AI end _ buf2) len t -> do
1679     setInput inp2
1680     let span = mkSrcSpan loc1 end
1681     let bytes = byteDiff buf buf2
1682     span `seq` setLastToken span bytes bytes
1683     t span buf bytes
1684
1685 reportLexError loc1 loc2 buf str
1686   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1687   | otherwise =
1688   let
1689         c = fst (nextChar buf)
1690   in
1691   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1692     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1693     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1694 }