compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 {-# OPTIONS -w #-}
  25 -- The above warning supression flag is a temporary kludge.
  26 -- While working on this module you are encouraged to remove it and fix
  27 -- any warnings in the module. See
  28 --     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
  29 -- for details
  30
  31 module Lexer (
  32    Token(..), lexer, pragState, mkPState, PState(..),
  33    P(..), ParseResult(..), getSrcLoc,
  34    failLocMsgP, failSpanMsgP, srcParseFail,
  35    getMessages,
  36    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  37    getLexState, popLexState, pushLexState,
  38    extension, standaloneDerivingEnabled, bangPatEnabled,
  39    addWarning
  40   ) where
  41
  42 #include "HsVersions.h"
  43
  44 import Bag
  45 import ErrUtils
  46 import Outputable
  47 import StringBuffer
  48 import FastString
  49 import FastTypes
  50 import SrcLoc
  51 import UniqFM
  52 import DynFlags
  53 import Ctype
  54 import Util             ( maybePrefixMatch, readRational )
  55
  56 import Control.Monad
  57 import Data.Bits
  58 import Data.Char        ( chr, isSpace )
  59 import Data.Ratio
  60 import Debug.Trace
  61
  62 #if __GLASGOW_HASKELL__ >= 605
  63 import Data.Char        ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  64 #else
  65 import Compat.Unicode   ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  66 #endif
  67 }
  68
  69 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  70 $whitechar   = [\ \n\r\f\v\xa0 $unispace]
  71 $white_no_nl = $whitechar # \n
  72 $tab         = \t
  73
  74 $ascdigit  = 0-9
  75 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  76 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  77 $digit     = [$ascdigit $unidigit]
  78
  79 $special   = [\(\)\,\;\[\]\`\{\}]
  80 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7]
  81 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  82 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  83
  84 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
  85 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  86 $large     = [$asclarge $unilarge]
  87
  88 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
  89 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  90 $small     = [$ascsmall $unismall \_]
  91
  92 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
  93 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  94
  95 $octit     = 0-7
  96 $hexit     = [$decdigit A-F a-f]
  97 $symchar   = [$symbol \:]
  98 $nl        = [\n\r]
  99 $idchar    = [$small $large $digit \']
 100
 101 $docsym    = [\| \^ \* \$]
 102
 103 @varid     = $small $idchar*
 104 @conid     = $large $idchar*
 105
 106 @varsym    = $symbol $symchar*
 107 @consym    = \: $symchar*
 108
 109 @decimal     = $decdigit+
 110 @octal       = $octit+
 111 @hexadecimal = $hexit+
 112 @exponent    = [eE] [\-\+]? @decimal
 113
 114 -- we support the hierarchical module name extension:
 115 @qual = (@conid \.)+
 116
 117 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 118
 119 -- normal signed numerical literals can only be explicitly negative,
 120 -- not explicitly positive (contrast @exponent)
 121 @negative = \-
 122 @signed = @negative ?
 123
 124 haskell :-
 125
 126 -- everywhere: skip whitespace and comments
 127 $white_no_nl+                           ;
 128 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 129
 130 -- Everywhere: deal with nested comments.  We explicitly rule out
 131 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 132 -- (this can happen even though pragmas will normally take precedence due to
 133 -- longest-match, because pragmas aren't valid in every state, but comments
 134 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 135 -- set.
 136
 137 "{-" / { isNormalComment } { nested_comment lexToken }
 138
 139 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 140 -- more dashes followed by a symbol should be parsed as a varsym, so we
 141 -- have to exclude those.
 142
 143 -- Since Haddock comments aren't valid in every state, we need to rule them
 144 -- out here.
 145
 146 -- The following two rules match comments that begin with two dashes, but
 147 -- continue with a different character. The rules test that this character
 148 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 149 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 150 -- have a Haddock comment). The rules then munch the rest of the line.
 151
 152 "-- " ~[$docsym \#] .* ;
 153 "--" [^$symbol : \ ] .* ;
 154
 155 -- Next, match Haddock comments if no -haddock flag
 156
 157 "-- " $docsym .* / { ifExtension (not . haddockEnabled) } ;
 158
 159 -- Now, when we've matched comments that begin with 2 dashes and continue
 160 -- with a different character, we need to match comments that begin with three
 161 -- or more dashes (which clearly can't be Haddock comments). We only need to
 162 -- make sure that the first non-dash character isn't a symbol, and munch the
 163 -- rest of the line.
 164
 165 "---"\-* [^$symbol :] .* ;
 166
 167 -- Since the previous rules all match dashes followed by at least one
 168 -- character, we also need to match a whole line filled with just dashes.
 169
 170 "--"\-* / { atEOL } ;
 171
 172 -- We need this rule since none of the other single line comment rules
 173 -- actually match this case.
 174
 175 "-- " / { atEOL } ;
 176
 177 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 178 -- blank lines) until we find a non-whitespace character, then do layout
 179 -- processing.
 180 --
 181 -- One slight wibble here: what if the line begins with {-#? In
 182 -- theory, we have to lex the pragma to see if it's one we recognise,
 183 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 184 -- as a nested comment.  We don't bother with this: if the line begins
 185 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 186 <bol> {
 187   \n                                    ;
 188   ^\# (line)?                           { begin line_prag1 }
 189   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 190   ^\# \! .* \n                          ; -- #!, for scripts
 191   ()                                    { do_bol }
 192 }
 193
 194 -- after a layout keyword (let, where, do, of), we begin a new layout
 195 -- context if the curly brace is missing.
 196 -- Careful! This stuff is quite delicate.
 197 <layout, layout_do> {
 198   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 199         -- we might encounter {-# here, but {- has been handled already
 200   \n                                    ;
 201   ^\# (line)?                           { begin line_prag1 }
 202 }
 203
 204 -- do is treated in a subtly different way, see new_layout_context
 205 <layout>    ()                          { new_layout_context True }
 206 <layout_do> ()                          { new_layout_context False }
 207
 208 -- after a new layout context which was found to be to the left of the
 209 -- previous context, we have generated a '{' token, and we now need to
 210 -- generate a matching '}' token.
 211 <layout_left>  ()                       { do_layout_left }
 212
 213 <0,option_prags> \n                             { begin bol }
 214
 215 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 216
 217 -- single-line line pragmas, of the form
 218 --    # <line> "<file>" <extra-stuff> \n
 219 <line_prag1> $decdigit+                 { setLine line_prag1a }
 220 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 221 <line_prag1b> .*                        { pop }
 222
 223 -- Haskell-style line pragmas, of the form
 224 --    {-# LINE <line> "<file>" #-}
 225 <line_prag2> $decdigit+                 { setLine line_prag2a }
 226 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 227 <line_prag2b> "#-}"|"-}"                { pop }
 228    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 229    -- with older versions of GHC which generated these.
 230
 231 -- We only want RULES pragmas to be picked up when explicit forall
 232 -- syntax is enabled is on, because the contents of the pragma always
 233 -- uses it. If it's not on then we're sure to get a parse error.
 234 -- (ToDo: we should really emit a warning when ignoring pragmas)
 235 -- XXX Now that we can enable this without the -fglasgow-exts hammer,
 236 -- is it better just to let the parse error happen?
 237 <0>
 238   "{-#" $whitechar* (RULES|rules) / { ifExtension explicitForallEnabled } { token ITrules_prag }
 239
 240 <0,option_prags> {
 241   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 242   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 243                                         { token (ITinline_prag False) }
 244   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 245                                         { token ITspec_prag }
 246   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 247         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 248   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 249         $whitechar* (NO(T?)INLINE|no(t?)inline)
 250                                         { token (ITspec_inline_prag False) }
 251   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 252   "{-#" $whitechar* (DEPRECATED|deprecated)
 253                                         { token ITdeprecated_prag }
 254   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 255   "{-#" $whitechar* (GENERATED|generated)
 256                                         { token ITgenerated_prag }
 257   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 258   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 259
 260  "{-#"                                 { nested_comment lexToken }
 261
 262   -- ToDo: should only be valid inside a pragma:
 263   "#-}"                                 { token ITclose_prag}
 264 }
 265
 266 <option_prags> {
 267   "{-#"  $whitechar* (OPTIONS|options)   { lex_string_prag IToptions_prag }
 268   "{-#"  $whitechar* (OPTIONS_GHC|options_ghc)
 269                                         { lex_string_prag IToptions_prag }
 270   "{-#"  $whitechar* (OPTIONS_HADDOCK|options_haddock)
 271                                          { lex_string_prag ITdocOptions }
 272   "-- #"                                 { multiline_doc_comment }
 273   "{-#"  $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
 274   "{-#"  $whitechar* (INCLUDE|include)   { lex_string_prag ITinclude_prag }
 275 }
 276
 277 <0> {
 278   "-- #" .* ;
 279 }
 280
 281 <0,option_prags> {
 282         -- This is to catch things like {-# OPTIONS OPTIONS_HUGS ...
 283   "{-#" $whitechar* $idchar+            { nested_comment lexToken }
 284 }
 285
 286 -- '0' state: ordinary lexemes
 287
 288 -- Haddock comments
 289
 290 <0> {
 291   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 292   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 293 }
 294
 295 -- "special" symbols
 296
 297 <0> {
 298   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 299   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 300 }
 301
 302 <0> {
 303   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 304   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 305   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 306   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 307   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 308   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 309   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 310   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 311 }
 312
 313 <0> {
 314   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 315                                         { special IToparenbar }
 316   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 317 }
 318
 319 <0> {
 320   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 321 }
 322
 323 <0> {
 324   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 325          { token IToubxparen }
 326   "#)" / { ifExtension unboxedTuplesEnabled }
 327          { token ITcubxparen }
 328 }
 329
 330 <0> {
 331   "{|" / { ifExtension genericsEnabled } { token ITocurlybar }
 332   "|}" / { ifExtension genericsEnabled } { token ITccurlybar }
 333 }
 334
 335 <0,option_prags> {
 336   \(                                    { special IToparen }
 337   \)                                    { special ITcparen }
 338   \[                                    { special ITobrack }
 339   \]                                    { special ITcbrack }
 340   \,                                    { special ITcomma }
 341   \;                                    { special ITsemi }
 342   \`                                    { special ITbackquote }
 343
 344   \{                                    { open_brace }
 345   \}                                    { close_brace }
 346 }
 347
 348 <0,option_prags> {
 349   @qual @varid                  { idtoken qvarid }
 350   @qual @conid                  { idtoken qconid }
 351   @varid                        { varid }
 352   @conid                        { idtoken conid }
 353 }
 354
 355 <0> {
 356   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 357   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 358   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 359   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 360 }
 361
 362 -- ToDo: M.(,,,)
 363
 364 <0> {
 365   @qual @varsym                 { idtoken qvarsym }
 366   @qual @consym                 { idtoken qconsym }
 367   @varsym                       { varsym }
 368   @consym                       { consym }
 369 }
 370
 371 -- For the normal boxed literals we need to be careful
 372 -- when trying to be close to Haskell98
 373 <0> {
 374   -- Normal integral literals (:: Num a => a, from Integer)
 375   @decimal                      { tok_num positive 0 0 decimal }
 376   0[oO] @octal                  { tok_num positive 2 2 octal }
 377   0[xX] @hexadecimal            { tok_num positive 2 2 hexadecimal }
 378
 379   -- Normal rational literals (:: Fractional a => a, from Rational)
 380   @floating_point               { strtoken tok_float }
 381 }
 382
 383 <0> {
 384   -- Unboxed ints (:: Int#)
 385   -- It's simpler (and faster?) to give separate cases to the negatives,
 386   -- especially considering octal/hexadecimal prefixes.
 387   @decimal \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 388   0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 389   0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 390   @negative @decimal \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 391   @negative 0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 392   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 393
 394   -- Unboxed floats and doubles (:: Float#, :: Double#)
 395   -- prim_{float,double} work with signed literals
 396   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 397   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 398 }
 399
 400 -- Strings and chars are lexed by hand-written code.  The reason is
 401 -- that even if we recognise the string or char here in the regex
 402 -- lexer, we would still have to parse the string afterward in order
 403 -- to convert it to a String.
 404 <0> {
 405   \'                            { lex_char_tok }
 406   \"                            { lex_string_tok }
 407 }
 408
 409 {
 410 -- -----------------------------------------------------------------------------
 411 -- The token type
 412
 413 data Token
 414   = ITas                        -- Haskell keywords
 415   | ITcase
 416   | ITclass
 417   | ITdata
 418   | ITdefault
 419   | ITderiving
 420   | ITdo
 421   | ITelse
 422   | IThiding
 423   | ITif
 424   | ITimport
 425   | ITin
 426   | ITinfix
 427   | ITinfixl
 428   | ITinfixr
 429   | ITinstance
 430   | ITlet
 431   | ITmodule
 432   | ITnewtype
 433   | ITof
 434   | ITqualified
 435   | ITthen
 436   | ITtype
 437   | ITwhere
 438   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 439
 440   | ITforall                    -- GHC extension keywords
 441   | ITforeign
 442   | ITexport
 443   | ITlabel
 444   | ITdynamic
 445   | ITsafe
 446   | ITthreadsafe
 447   | ITunsafe
 448   | ITstdcallconv
 449   | ITccallconv
 450   | ITdotnet
 451   | ITmdo
 452   | ITfamily
 453   | ITgroup
 454   | ITby
 455   | ITusing
 456
 457         -- Pragmas
 458   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 459   | ITspec_prag                 -- SPECIALISE
 460   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 461   | ITsource_prag
 462   | ITrules_prag
 463   | ITdeprecated_prag
 464   | ITline_prag
 465   | ITscc_prag
 466   | ITgenerated_prag
 467   | ITcore_prag                 -- hdaume: core annotations
 468   | ITunpack_prag
 469   | ITclose_prag
 470   | IToptions_prag String
 471   | ITinclude_prag String
 472   | ITlanguage_prag
 473
 474   | ITdotdot                    -- reserved symbols
 475   | ITcolon
 476   | ITdcolon
 477   | ITequal
 478   | ITlam
 479   | ITvbar
 480   | ITlarrow
 481   | ITrarrow
 482   | ITat
 483   | ITtilde
 484   | ITdarrow
 485   | ITminus
 486   | ITbang
 487   | ITstar
 488   | ITdot
 489
 490   | ITbiglam                    -- GHC-extension symbols
 491
 492   | ITocurly                    -- special symbols
 493   | ITccurly
 494   | ITocurlybar                 -- {|, for type applications
 495   | ITccurlybar                 -- |}, for type applications
 496   | ITvocurly
 497   | ITvccurly
 498   | ITobrack
 499   | ITopabrack                  -- [:, for parallel arrays with -fparr
 500   | ITcpabrack                  -- :], for parallel arrays with -fparr
 501   | ITcbrack
 502   | IToparen
 503   | ITcparen
 504   | IToubxparen
 505   | ITcubxparen
 506   | ITsemi
 507   | ITcomma
 508   | ITunderscore
 509   | ITbackquote
 510
 511   | ITvarid   FastString        -- identifiers
 512   | ITconid   FastString
 513   | ITvarsym  FastString
 514   | ITconsym  FastString
 515   | ITqvarid  (FastString,FastString)
 516   | ITqconid  (FastString,FastString)
 517   | ITqvarsym (FastString,FastString)
 518   | ITqconsym (FastString,FastString)
 519
 520   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 521
 522   | ITpragma StringBuffer
 523
 524   | ITchar       Char
 525   | ITstring     FastString
 526   | ITinteger    Integer
 527   | ITrational   Rational
 528
 529   | ITprimchar   Char
 530   | ITprimstring FastString
 531   | ITprimint    Integer
 532   | ITprimfloat  Rational
 533   | ITprimdouble Rational
 534
 535   -- MetaHaskell extension tokens
 536   | ITopenExpQuote              --  [| or [e|
 537   | ITopenPatQuote              --  [p|
 538   | ITopenDecQuote              --  [d|
 539   | ITopenTypQuote              --  [t|
 540   | ITcloseQuote                --  |]
 541   | ITidEscape   FastString     --  $x
 542   | ITparenEscape               --  $(
 543   | ITvarQuote                  --  '
 544   | ITtyQuote                   --  ''
 545
 546   -- Arrow notation extension
 547   | ITproc
 548   | ITrec
 549   | IToparenbar                 --  (|
 550   | ITcparenbar                 --  |)
 551   | ITlarrowtail                --  -<
 552   | ITrarrowtail                --  >-
 553   | ITLarrowtail                --  -<<
 554   | ITRarrowtail                --  >>-
 555
 556   | ITunknown String            -- Used when the lexer can't make sense of it
 557   | ITeof                       -- end of file token
 558
 559   -- Documentation annotations
 560   | ITdocCommentNext  String     -- something beginning '-- |'
 561   | ITdocCommentPrev  String     -- something beginning '-- ^'
 562   | ITdocCommentNamed String     -- something beginning '-- $'
 563   | ITdocSection      Int String -- a section heading
 564   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 565   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 566
 567 #ifdef DEBUG
 568   deriving Show -- debugging
 569 #endif
 570
 571 isSpecial :: Token -> Bool
 572 -- If we see M.x, where x is a keyword, but
 573 -- is special, we treat is as just plain M.x,
 574 -- not as a keyword.
 575 isSpecial ITas          = True
 576 isSpecial IThiding      = True
 577 isSpecial ITqualified   = True
 578 isSpecial ITforall      = True
 579 isSpecial ITexport      = True
 580 isSpecial ITlabel       = True
 581 isSpecial ITdynamic     = True
 582 isSpecial ITsafe        = True
 583 isSpecial ITthreadsafe  = True
 584 isSpecial ITunsafe      = True
 585 isSpecial ITccallconv   = True
 586 isSpecial ITstdcallconv = True
 587 isSpecial ITmdo         = True
 588 isSpecial ITfamily      = True
 589 isSpecial ITgroup   = True
 590 isSpecial ITby      = True
 591 isSpecial ITusing   = True
 592 isSpecial _             = False
 593
 594 -- the bitmap provided as the third component indicates whether the
 595 -- corresponding extension keyword is valid under the extension options
 596 -- provided to the compiler; if the extension corresponding to *any* of the
 597 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 598 -- facilitates using a keyword in two different extensions that can be
 599 -- activated independently)
 600 --
 601 reservedWordsFM = listToUFM $
 602         map (\(x, y, z) -> (mkFastString x, (y, z)))
 603        [( "_",          ITunderscore,   0 ),
 604         ( "as",         ITas,           0 ),
 605         ( "case",       ITcase,         0 ),
 606         ( "class",      ITclass,        0 ),
 607         ( "data",       ITdata,         0 ),
 608         ( "default",    ITdefault,      0 ),
 609         ( "deriving",   ITderiving,     0 ),
 610         ( "do",         ITdo,           0 ),
 611         ( "else",       ITelse,         0 ),
 612         ( "hiding",     IThiding,       0 ),
 613         ( "if",         ITif,           0 ),
 614         ( "import",     ITimport,       0 ),
 615         ( "in",         ITin,           0 ),
 616         ( "infix",      ITinfix,        0 ),
 617         ( "infixl",     ITinfixl,       0 ),
 618         ( "infixr",     ITinfixr,       0 ),
 619         ( "instance",   ITinstance,     0 ),
 620         ( "let",        ITlet,          0 ),
 621         ( "module",     ITmodule,       0 ),
 622         ( "newtype",    ITnewtype,      0 ),
 623         ( "of",         ITof,           0 ),
 624         ( "qualified",  ITqualified,    0 ),
 625         ( "then",       ITthen,         0 ),
 626         ( "type",       ITtype,         0 ),
 627         ( "where",      ITwhere,        0 ),
 628         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 629
 630     ( "forall", ITforall,        bit explicitForallBit),
 631         ( "mdo",        ITmdo,           bit recursiveDoBit),
 632         ( "family",     ITfamily,        bit tyFamBit),
 633     ( "group",  ITgroup,     bit transformComprehensionsBit),
 634     ( "by",     ITby,        bit transformComprehensionsBit),
 635     ( "using",  ITusing,     bit transformComprehensionsBit),
 636
 637         ( "foreign",    ITforeign,       bit ffiBit),
 638         ( "export",     ITexport,        bit ffiBit),
 639         ( "label",      ITlabel,         bit ffiBit),
 640         ( "dynamic",    ITdynamic,       bit ffiBit),
 641         ( "safe",       ITsafe,          bit ffiBit),
 642         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 643         ( "unsafe",     ITunsafe,        bit ffiBit),
 644         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 645         ( "ccall",      ITccallconv,     bit ffiBit),
 646         ( "dotnet",     ITdotnet,        bit ffiBit),
 647
 648         ( "rec",        ITrec,           bit arrowsBit),
 649         ( "proc",       ITproc,          bit arrowsBit)
 650      ]
 651
 652 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 653 reservedSymsFM = listToUFM $
 654     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 655       [ ("..",  ITdotdot,   always)
 656         -- (:) is a reserved op, meaning only list cons
 657        ,(":",   ITcolon,    always)
 658        ,("::",  ITdcolon,   always)
 659        ,("=",   ITequal,    always)
 660        ,("\\",  ITlam,      always)
 661        ,("|",   ITvbar,     always)
 662        ,("<-",  ITlarrow,   always)
 663        ,("->",  ITrarrow,   always)
 664        ,("@",   ITat,       always)
 665        ,("~",   ITtilde,    always)
 666        ,("=>",  ITdarrow,   always)
 667        ,("-",   ITminus,    always)
 668        ,("!",   ITbang,     always)
 669
 670         -- For data T (a::*) = MkT
 671        ,("*", ITstar, \i -> kindSigsEnabled i || tyFamEnabled i)
 672         -- For 'forall a . t'
 673        ,(".", ITdot, explicitForallEnabled)
 674
 675        ,("-<",  ITlarrowtail, arrowsEnabled)
 676        ,(">-",  ITrarrowtail, arrowsEnabled)
 677        ,("-<<", ITLarrowtail, arrowsEnabled)
 678        ,(">>-", ITRarrowtail, arrowsEnabled)
 679
 680 #if __GLASGOW_HASKELL__ >= 605
 681        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 682        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 683        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 684                                 explicitForallEnabled i)
 685        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 686        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 687        ,("⋯",   ITdotdot, unicodeSyntaxEnabled)
 688         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 689         -- form part of a large operator.  This would let us have a better
 690         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 691 #endif
 692        ]
 693
 694 -- -----------------------------------------------------------------------------
 695 -- Lexer actions
 696
 697 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 698
 699 special :: Token -> Action
 700 special tok span _buf len = return (L span tok)
 701
 702 token, layout_token :: Token -> Action
 703 token t span buf len = return (L span t)
 704 layout_token t span buf len = pushLexState layout >> return (L span t)
 705
 706 idtoken :: (StringBuffer -> Int -> Token) -> Action
 707 idtoken f span buf len = return (L span $! (f buf len))
 708
 709 skip_one_varid :: (FastString -> Token) -> Action
 710 skip_one_varid f span buf len
 711   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 712
 713 strtoken :: (String -> Token) -> Action
 714 strtoken f span buf len =
 715   return (L span $! (f $! lexemeToString buf len))
 716
 717 init_strtoken :: Int -> (String -> Token) -> Action
 718 -- like strtoken, but drops the last N character(s)
 719 init_strtoken drop f span buf len =
 720   return (L span $! (f $! lexemeToString buf (len-drop)))
 721
 722 begin :: Int -> Action
 723 begin code _span _str _len = do pushLexState code; lexToken
 724
 725 pop :: Action
 726 pop _span _buf _len = do popLexState; lexToken
 727
 728 pop_and :: Action -> Action
 729 pop_and act span buf len = do popLexState; act span buf len
 730
 731 {-# INLINE nextCharIs #-}
 732 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 733
 734 notFollowedBy char _ _ _ (AI _ _ buf)
 735   = nextCharIs buf (/=char)
 736
 737 notFollowedBySymbol _ _ _ (AI _ _ buf)
 738   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 739
 740 -- We must reject doc comments as being ordinary comments everywhere.
 741 -- In some cases the doc comment will be selected as the lexeme due to
 742 -- maximal munch, but not always, because the nested comment rule is
 743 -- valid in all states, but the doc-comment rules are only valid in
 744 -- the non-layout states.
 745 isNormalComment bits _ _ (AI _ _ buf)
 746   | haddockEnabled bits = notFollowedByDocOrPragma
 747   | otherwise           = nextCharIs buf (/='#')
 748   where
 749     notFollowedByDocOrPragma
 750        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 751
 752 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 753
 754 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 755   = if haddockEnabled bits then False else (p buf)
 756
 757 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 758
 759 ifExtension pred bits _ _ _ = pred bits
 760
 761 multiline_doc_comment :: Action
 762 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 763   where
 764     worker commentAcc input docType oneLine = case alexGetChar input of
 765       Just ('\n', input')
 766         | oneLine -> docCommentEnd input commentAcc docType buf span
 767         | otherwise -> case checkIfCommentLine input' of
 768           Just input -> worker ('\n':commentAcc) input docType False
 769           Nothing -> docCommentEnd input commentAcc docType buf span
 770       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 771       Nothing -> docCommentEnd input commentAcc docType buf span
 772
 773     checkIfCommentLine input = check (dropNonNewlineSpace input)
 774       where
 775         check input = case alexGetChar input of
 776           Just ('-', input) -> case alexGetChar input of
 777             Just ('-', input) -> case alexGetChar input of
 778               Just (c, _) | c /= '-' -> Just input
 779               _ -> Nothing
 780             _ -> Nothing
 781           _ -> Nothing
 782
 783         dropNonNewlineSpace input = case alexGetChar input of
 784           Just (c, input')
 785             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 786             | otherwise -> input
 787           Nothing -> input
 788
 789 {-
 790   nested comments require traversing by hand, they can't be parsed
 791   using regular expressions.
 792 -}
 793 nested_comment :: P (Located Token) -> Action
 794 nested_comment cont span _str _len = do
 795   input <- getInput
 796   go (1::Int) input
 797   where
 798     go 0 input = do setInput input; cont
 799     go n input = case alexGetChar input of
 800       Nothing -> errBrace input span
 801       Just ('-',input) -> case alexGetChar input of
 802         Nothing  -> errBrace input span
 803         Just ('\125',input) -> go (n-1) input
 804         Just (c,_)          -> go n input
 805       Just ('\123',input) -> case alexGetChar input of
 806         Nothing  -> errBrace input span
 807         Just ('-',input) -> go (n+1) input
 808         Just (c,_)       -> go n input
 809       Just (c,input) -> go n input
 810
 811 nested_doc_comment :: Action
 812 nested_doc_comment span buf _len = withLexedDocType (go "")
 813   where
 814     go commentAcc input docType _ = case alexGetChar input of
 815       Nothing -> errBrace input span
 816       Just ('-',input) -> case alexGetChar input of
 817         Nothing -> errBrace input span
 818         Just ('\125',input@(AI end _ buf2)) ->
 819           docCommentEnd input commentAcc docType buf span
 820         Just (c,_) -> go ('-':commentAcc) input docType False
 821       Just ('\123', input) -> case alexGetChar input of
 822         Nothing  -> errBrace input span
 823         Just ('-',input) -> do
 824           setInput input
 825           let cont = do input <- getInput; go commentAcc input docType False
 826           nested_comment cont span buf _len
 827         Just (c,_) -> go ('\123':commentAcc) input docType False
 828       Just (c,input) -> go (c:commentAcc) input docType False
 829
 830 withLexedDocType lexDocComment = do
 831   input@(AI _ _ buf) <- getInput
 832   case prevChar buf ' ' of
 833     '|' -> lexDocComment input ITdocCommentNext False
 834     '^' -> lexDocComment input ITdocCommentPrev False
 835     '$' -> lexDocComment input ITdocCommentNamed False
 836     '*' -> lexDocSection 1 input
 837     '#' -> lexDocComment input ITdocOptionsOld False
 838  where
 839     lexDocSection n input = case alexGetChar input of
 840       Just ('*', input) -> lexDocSection (n+1) input
 841       Just (c, _) -> lexDocComment input (ITdocSection n) True
 842       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 843
 844 -- docCommentEnd
 845 -------------------------------------------------------------------------------
 846 -- This function is quite tricky. We can't just return a new token, we also
 847 -- need to update the state of the parser. Why? Because the token is longer
 848 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 849 -- it writes the wrong token length to the parser state. This function is
 850 -- called afterwards, so it can just update the state.
 851
 852 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 853 -- which is something that the original lexer didn't account for.
 854 -- I have added last_line_len in the parser state which represents the length
 855 -- of the part of the token that is on the last line. It is now used for layout
 856 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 857 -- was before, the full length of the token, and it is now only used for error
 858 -- messages. /Waern
 859
 860 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 861                  SrcSpan -> P (Located Token)
 862 docCommentEnd input commentAcc docType buf span = do
 863   setInput input
 864   let (AI loc last_offs nextBuf) = input
 865       comment = reverse commentAcc
 866       span' = mkSrcSpan (srcSpanStart span) loc
 867       last_len = byteDiff buf nextBuf
 868
 869       last_line_len = if (last_offs - last_len < 0)
 870         then last_offs
 871         else last_len
 872
 873   span `seq` setLastToken span' last_len last_line_len
 874   return (L span' (docType comment))
 875
 876 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 877
 878 open_brace, close_brace :: Action
 879 open_brace span _str _len = do
 880   ctx <- getContext
 881   setContext (NoLayout:ctx)
 882   return (L span ITocurly)
 883 close_brace span _str _len = do
 884   popContext
 885   return (L span ITccurly)
 886
 887 qvarid buf len = ITqvarid $! splitQualName buf len
 888 qconid buf len = ITqconid $! splitQualName buf len
 889
 890 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 891 -- takes a StringBuffer and a length, and returns the module name
 892 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 893 -- because of hierarchical module names.
 894 splitQualName orig_buf len = split orig_buf orig_buf
 895   where
 896     split buf dot_buf
 897         | orig_buf `byteDiff` buf >= len  = done dot_buf
 898         | c == '.'                        = found_dot buf'
 899         | otherwise                       = split buf' dot_buf
 900       where
 901        (c,buf') = nextChar buf
 902
 903     -- careful, we might get names like M....
 904     -- so, if the character after the dot is not upper-case, this is
 905     -- the end of the qualifier part.
 906     found_dot buf -- buf points after the '.'
 907         | isUpper c    = split buf' buf
 908         | otherwise    = done buf
 909       where
 910        (c,buf') = nextChar buf
 911
 912     done dot_buf =
 913         (lexemeToFastString orig_buf (qual_size - 1),
 914          lexemeToFastString dot_buf (len - qual_size))
 915       where
 916         qual_size = orig_buf `byteDiff` dot_buf
 917
 918 varid span buf len =
 919   case lookupUFM reservedWordsFM fs of
 920         Just (keyword,0)    -> do
 921                 maybe_layout keyword
 922                 return (L span keyword)
 923         Just (keyword,exts) -> do
 924                 b <- extension (\i -> exts .&. i /= 0)
 925                 if b then do maybe_layout keyword
 926                              return (L span keyword)
 927                      else return (L span (ITvarid fs))
 928         _other -> return (L span (ITvarid fs))
 929   where
 930         fs = lexemeToFastString buf len
 931
 932 conid buf len = ITconid fs
 933   where fs = lexemeToFastString buf len
 934
 935 qvarsym buf len = ITqvarsym $! splitQualName buf len
 936 qconsym buf len = ITqconsym $! splitQualName buf len
 937
 938 varsym = sym ITvarsym
 939 consym = sym ITconsym
 940
 941 sym con span buf len =
 942   case lookupUFM reservedSymsFM fs of
 943         Just (keyword,exts) -> do
 944                 b <- extension exts
 945                 if b then return (L span keyword)
 946                      else return (L span $! con fs)
 947         _other -> return (L span $! con fs)
 948   where
 949         fs = lexemeToFastString buf len
 950
 951 -- Variations on the integral numeric literal.
 952 tok_integral :: (Integer -> Token)
 953      -> (Integer -> Integer)
 954  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
 955      -> Int -> Int
 956      -> (Integer, (Char->Int)) -> Action
 957 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
 958   return $ L span $ itint $! transint $ parseUnsignedInteger
 959      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
 960
 961 -- some conveniences for use with tok_integral
 962 tok_num = tok_integral ITinteger
 963 tok_primint = tok_integral ITprimint
 964 positive = id
 965 negative = negate
 966 decimal = (10,octDecDigit)
 967 octal = (8,octDecDigit)
 968 hexadecimal = (16,hexDigit)
 969
 970 -- readRational can understand negative rationals, exponents, everything.
 971 tok_float        str = ITrational   $! readRational str
 972 tok_primfloat    str = ITprimfloat  $! readRational str
 973 tok_primdouble   str = ITprimdouble $! readRational str
 974
 975 -- -----------------------------------------------------------------------------
 976 -- Layout processing
 977
 978 -- we're at the first token on a line, insert layout tokens if necessary
 979 do_bol :: Action
 980 do_bol span _str _len = do
 981         pos <- getOffside
 982         case pos of
 983             LT -> do
 984                 --trace "layout: inserting '}'" $ do
 985                 popContext
 986                 -- do NOT pop the lex state, we might have a ';' to insert
 987                 return (L span ITvccurly)
 988             EQ -> do
 989                 --trace "layout: inserting ';'" $ do
 990                 popLexState
 991                 return (L span ITsemi)
 992             GT -> do
 993                 popLexState
 994                 lexToken
 995
 996 -- certain keywords put us in the "layout" state, where we might
 997 -- add an opening curly brace.
 998 maybe_layout ITdo       = pushLexState layout_do
 999 maybe_layout ITmdo      = pushLexState layout_do
1000 maybe_layout ITof       = pushLexState layout
1001 maybe_layout ITlet      = pushLexState layout
1002 maybe_layout ITwhere    = pushLexState layout
1003 maybe_layout ITrec      = pushLexState layout
1004 maybe_layout _          = return ()
1005
1006 -- Pushing a new implicit layout context.  If the indentation of the
1007 -- next token is not greater than the previous layout context, then
1008 -- Haskell 98 says that the new layout context should be empty; that is
1009 -- the lexer must generate {}.
1010 --
1011 -- We are slightly more lenient than this: when the new context is started
1012 -- by a 'do', then we allow the new context to be at the same indentation as
1013 -- the previous context.  This is what the 'strict' argument is for.
1014 --
1015 new_layout_context strict span _buf _len = do
1016     popLexState
1017     (AI _ offset _) <- getInput
1018     ctx <- getContext
1019     case ctx of
1020         Layout prev_off : _  |
1021            (strict     && prev_off >= offset  ||
1022             not strict && prev_off > offset) -> do
1023                 -- token is indented to the left of the previous context.
1024                 -- we must generate a {} sequence now.
1025                 pushLexState layout_left
1026                 return (L span ITvocurly)
1027         other -> do
1028                 setContext (Layout offset : ctx)
1029                 return (L span ITvocurly)
1030
1031 do_layout_left span _buf _len = do
1032     popLexState
1033     pushLexState bol  -- we must be at the start of a line
1034     return (L span ITvccurly)
1035
1036 -- -----------------------------------------------------------------------------
1037 -- LINE pragmas
1038
1039 setLine :: Int -> Action
1040 setLine code span buf len = do
1041   let line = parseUnsignedInteger buf len 10 octDecDigit
1042   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
1043         -- subtract one: the line number refers to the *following* line
1044   popLexState
1045   pushLexState code
1046   lexToken
1047
1048 setFile :: Int -> Action
1049 setFile code span buf len = do
1050   let file = lexemeToFastString (stepOn buf) (len-2)
1051   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1052   popLexState
1053   pushLexState code
1054   lexToken
1055
1056
1057 -- -----------------------------------------------------------------------------
1058 -- Options, includes and language pragmas.
1059
1060 lex_string_prag :: (String -> Token) -> Action
1061 lex_string_prag mkTok span buf len
1062     = do input <- getInput
1063          start <- getSrcLoc
1064          tok <- go [] input
1065          end <- getSrcLoc
1066          return (L (mkSrcSpan start end) tok)
1067     where go acc input
1068               = if isString input "#-}"
1069                    then do setInput input
1070                            return (mkTok (reverse acc))
1071                    else case alexGetChar input of
1072                           Just (c,i) -> go (c:acc) i
1073                           Nothing -> err input
1074           isString i [] = True
1075           isString i (x:xs)
1076               = case alexGetChar i of
1077                   Just (c,i') | c == x    -> isString i' xs
1078                   _other -> False
1079           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1080
1081
1082 -- -----------------------------------------------------------------------------
1083 -- Strings & Chars
1084
1085 -- This stuff is horrible.  I hates it.
1086
1087 lex_string_tok :: Action
1088 lex_string_tok span buf len = do
1089   tok <- lex_string ""
1090   end <- getSrcLoc
1091   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1092
1093 lex_string :: String -> P Token
1094 lex_string s = do
1095   i <- getInput
1096   case alexGetChar' i of
1097     Nothing -> lit_error
1098
1099     Just ('"',i)  -> do
1100         setInput i
1101         magicHash <- extension magicHashEnabled
1102         if magicHash
1103           then do
1104             i <- getInput
1105             case alexGetChar' i of
1106               Just ('#',i) -> do
1107                    setInput i
1108                    if any (> '\xFF') s
1109                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1110                     else let s' = mkZFastString (reverse s) in
1111                          return (ITprimstring s')
1112                         -- mkZFastString is a hack to avoid encoding the
1113                         -- string in UTF-8.  We just want the exact bytes.
1114               _other ->
1115                 return (ITstring (mkFastString (reverse s)))
1116           else
1117                 return (ITstring (mkFastString (reverse s)))
1118
1119     Just ('\\',i)
1120         | Just ('&',i) <- next -> do
1121                 setInput i; lex_string s
1122         | Just (c,i) <- next, is_space c -> do
1123                 setInput i; lex_stringgap s
1124         where next = alexGetChar' i
1125
1126     Just (c, i) -> do
1127         c' <- lex_char c i
1128         lex_string (c':s)
1129
1130 lex_stringgap s = do
1131   c <- getCharOrFail
1132   case c of
1133     '\\' -> lex_string s
1134     c | is_space c -> lex_stringgap s
1135     _other -> lit_error
1136
1137
1138 lex_char_tok :: Action
1139 -- Here we are basically parsing character literals, such as 'x' or '\n'
1140 -- but, when Template Haskell is on, we additionally spot
1141 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1142 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
1143 -- So we have to do two characters of lookahead: when we see 'x we need to
1144 -- see if there's a trailing quote
1145 lex_char_tok span buf len = do  -- We've seen '
1146    i1 <- getInput       -- Look ahead to first character
1147    let loc = srcSpanStart span
1148    case alexGetChar' i1 of
1149         Nothing -> lit_error
1150
1151         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
1152                   th_exts <- extension thEnabled
1153                   if th_exts then do
1154                         setInput i2
1155                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1156                    else lit_error
1157
1158         Just ('\\', i2@(AI end2 _ _)) -> do     -- We've seen 'backslash
1159                   setInput i2
1160                   lit_ch <- lex_escape
1161                   mc <- getCharOrFail   -- Trailing quote
1162                   if mc == '\'' then finish_char_tok loc lit_ch
1163                                 else do setInput i2; lit_error
1164
1165         Just (c, i2@(AI end2 _ _))
1166                 | not (isAny c) -> lit_error
1167                 | otherwise ->
1168
1169                 -- We've seen 'x, where x is a valid character
1170                 --  (i.e. not newline etc) but not a quote or backslash
1171            case alexGetChar' i2 of      -- Look ahead one more character
1172                 Nothing -> lit_error
1173                 Just ('\'', i3) -> do   -- We've seen 'x'
1174                         setInput i3
1175                         finish_char_tok loc c
1176                 _other -> do            -- We've seen 'x not followed by quote
1177                                         -- If TH is on, just parse the quote only
1178                         th_exts <- extension thEnabled
1179                         let (AI end _ _) = i1
1180                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1181                                    else do setInput i2; lit_error
1182
1183 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1184 finish_char_tok loc ch  -- We've already seen the closing quote
1185                         -- Just need to check for trailing #
1186   = do  magicHash <- extension magicHashEnabled
1187         i@(AI end _ _) <- getInput
1188         if magicHash then do
1189                 case alexGetChar' i of
1190                         Just ('#',i@(AI end _ _)) -> do
1191                                 setInput i
1192                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1193                         _other ->
1194                                 return (L (mkSrcSpan loc end) (ITchar ch))
1195                 else do
1196                    return (L (mkSrcSpan loc end) (ITchar ch))
1197
1198 lex_char :: Char -> AlexInput -> P Char
1199 lex_char c inp = do
1200   case c of
1201       '\\' -> do setInput inp; lex_escape
1202       c | isAny c -> do setInput inp; return c
1203       _other -> lit_error
1204
1205 isAny c | c > '\xff' = isPrint c
1206         | otherwise  = is_any c
1207
1208 lex_escape :: P Char
1209 lex_escape = do
1210   c <- getCharOrFail
1211   case c of
1212         'a'   -> return '\a'
1213         'b'   -> return '\b'
1214         'f'   -> return '\f'
1215         'n'   -> return '\n'
1216         'r'   -> return '\r'
1217         't'   -> return '\t'
1218         'v'   -> return '\v'
1219         '\\'  -> return '\\'
1220         '"'   -> return '\"'
1221         '\''  -> return '\''
1222         '^'   -> do c <- getCharOrFail
1223                     if c >= '@' && c <= '_'
1224                         then return (chr (ord c - ord '@'))
1225                         else lit_error
1226
1227         'x'   -> readNum is_hexdigit 16 hexDigit
1228         'o'   -> readNum is_octdigit  8 octDecDigit
1229         x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
1230
1231         c1 ->  do
1232            i <- getInput
1233            case alexGetChar' i of
1234             Nothing -> lit_error
1235             Just (c2,i2) ->
1236               case alexGetChar' i2 of
1237                 Nothing -> do setInput i2; lit_error
1238                 Just (c3,i3) ->
1239                    let str = [c1,c2,c3] in
1240                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1241                                      Just rest <- [maybePrefixMatch p str] ] of
1242                           (escape_char,[]):_ -> do
1243                                 setInput i3
1244                                 return escape_char
1245                           (escape_char,_:_):_ -> do
1246                                 setInput i2
1247                                 return escape_char
1248                           [] -> lit_error
1249
1250 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1251 readNum is_digit base conv = do
1252   i <- getInput
1253   c <- getCharOrFail
1254   if is_digit c
1255         then readNum2 is_digit base conv (conv c)
1256         else do setInput i; lit_error
1257
1258 readNum2 is_digit base conv i = do
1259   input <- getInput
1260   read i input
1261   where read i input = do
1262           case alexGetChar' input of
1263             Just (c,input') | is_digit c -> do
1264                 read (i*base + conv c) input'
1265             _other -> do
1266                 if i >= 0 && i <= 0x10FFFF
1267                    then do setInput input; return (chr i)
1268                    else lit_error
1269
1270 silly_escape_chars = [
1271         ("NUL", '\NUL'),
1272         ("SOH", '\SOH'),
1273         ("STX", '\STX'),
1274         ("ETX", '\ETX'),
1275         ("EOT", '\EOT'),
1276         ("ENQ", '\ENQ'),
1277         ("ACK", '\ACK'),
1278         ("BEL", '\BEL'),
1279         ("BS", '\BS'),
1280         ("HT", '\HT'),
1281         ("LF", '\LF'),
1282         ("VT", '\VT'),
1283         ("FF", '\FF'),
1284         ("CR", '\CR'),
1285         ("SO", '\SO'),
1286         ("SI", '\SI'),
1287         ("DLE", '\DLE'),
1288         ("DC1", '\DC1'),
1289         ("DC2", '\DC2'),
1290         ("DC3", '\DC3'),
1291         ("DC4", '\DC4'),
1292         ("NAK", '\NAK'),
1293         ("SYN", '\SYN'),
1294         ("ETB", '\ETB'),
1295         ("CAN", '\CAN'),
1296         ("EM", '\EM'),
1297         ("SUB", '\SUB'),
1298         ("ESC", '\ESC'),
1299         ("FS", '\FS'),
1300         ("GS", '\GS'),
1301         ("RS", '\RS'),
1302         ("US", '\US'),
1303         ("SP", '\SP'),
1304         ("DEL", '\DEL')
1305         ]
1306
1307 -- before calling lit_error, ensure that the current input is pointing to
1308 -- the position of the error in the buffer.  This is so that we can report
1309 -- a correct location to the user, but also so we can detect UTF-8 decoding
1310 -- errors if they occur.
1311 lit_error = lexError "lexical error in string/character literal"
1312
1313 getCharOrFail :: P Char
1314 getCharOrFail =  do
1315   i <- getInput
1316   case alexGetChar' i of
1317         Nothing -> lexError "unexpected end-of-file in string/character literal"
1318         Just (c,i)  -> do setInput i; return c
1319
1320 -- -----------------------------------------------------------------------------
1321 -- Warnings
1322
1323 warn :: DynFlag -> SDoc -> Action
1324 warn option warning srcspan _buf _len = do
1325     addWarning option srcspan warning
1326     lexToken
1327
1328 -- -----------------------------------------------------------------------------
1329 -- The Parse Monad
1330
1331 data LayoutContext
1332   = NoLayout
1333   | Layout !Int
1334   deriving Show
1335
1336 data ParseResult a
1337   = POk PState a
1338   | PFailed
1339         SrcSpan         -- The start and end of the text span related to
1340                         -- the error.  Might be used in environments which can
1341                         -- show this span, e.g. by highlighting it.
1342         Message         -- The error message
1343
1344 data PState = PState {
1345         buffer     :: StringBuffer,
1346     dflags     :: DynFlags,
1347     messages   :: Messages,
1348         last_loc   :: SrcSpan,  -- pos of previous token
1349         last_offs  :: !Int,     -- offset of the previous token from the
1350                                 -- beginning of  the current line.
1351                                 -- \t is equal to 8 spaces.
1352         last_len   :: !Int,     -- len of previous token
1353   last_line_len :: !Int,
1354         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1355         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1356         context    :: [LayoutContext],
1357         lex_state  :: [Int]
1358      }
1359         -- last_loc and last_len are used when generating error messages,
1360         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1361         -- current token to happyError, we could at least get rid of last_len.
1362         -- Getting rid of last_loc would require finding another way to
1363         -- implement pushCurrentContext (which is only called from one place).
1364
1365 newtype P a = P { unP :: PState -> ParseResult a }
1366
1367 instance Monad P where
1368   return = returnP
1369   (>>=) = thenP
1370   fail = failP
1371
1372 returnP :: a -> P a
1373 returnP a = P $ \s -> POk s a
1374
1375 thenP :: P a -> (a -> P b) -> P b
1376 (P m) `thenP` k = P $ \ s ->
1377         case m s of
1378                 POk s1 a         -> (unP (k a)) s1
1379                 PFailed span err -> PFailed span err
1380
1381 failP :: String -> P a
1382 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1383
1384 failMsgP :: String -> P a
1385 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1386
1387 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1388 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1389
1390 failSpanMsgP :: SrcSpan -> String -> P a
1391 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1392
1393 extension :: (Int -> Bool) -> P Bool
1394 extension p = P $ \s -> POk s (p $! extsBitmap s)
1395
1396 getExts :: P Int
1397 getExts = P $ \s -> POk s (extsBitmap s)
1398
1399 setSrcLoc :: SrcLoc -> P ()
1400 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1401
1402 getSrcLoc :: P SrcLoc
1403 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1404
1405 setLastToken :: SrcSpan -> Int -> Int -> P ()
1406 setLastToken loc len line_len = P $ \s -> POk s {
1407   last_loc=loc,
1408   last_len=len,
1409   last_line_len=line_len
1410 } ()
1411
1412 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1413
1414 alexInputPrevChar :: AlexInput -> Char
1415 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1416
1417 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1418 alexGetChar (AI loc ofs s)
1419   | atEnd s   = Nothing
1420   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1421                 --trace (show (ord c)) $
1422                 Just (adj_c, (AI loc' ofs' s'))
1423   where (c,s') = nextChar s
1424         loc'   = advanceSrcLoc loc c
1425         ofs'   = advanceOffs c ofs
1426
1427         non_graphic     = '\x0'
1428         upper           = '\x1'
1429         lower           = '\x2'
1430         digit           = '\x3'
1431         symbol          = '\x4'
1432         space           = '\x5'
1433         other_graphic   = '\x6'
1434
1435         adj_c
1436           | c <= '\x06' = non_graphic
1437           | c <= '\xff' = c
1438           -- Alex doesn't handle Unicode, so when Unicode
1439           -- character is encoutered we output these values
1440           -- with the actual character value hidden in the state.
1441           | otherwise =
1442                 case generalCategory c of
1443                   UppercaseLetter       -> upper
1444                   LowercaseLetter       -> lower
1445                   TitlecaseLetter       -> upper
1446                   ModifierLetter        -> other_graphic
1447                   OtherLetter           -> other_graphic
1448                   NonSpacingMark        -> other_graphic
1449                   SpacingCombiningMark  -> other_graphic
1450                   EnclosingMark         -> other_graphic
1451                   DecimalNumber         -> digit
1452                   LetterNumber          -> other_graphic
1453                   OtherNumber           -> other_graphic
1454                   ConnectorPunctuation  -> other_graphic
1455                   DashPunctuation       -> other_graphic
1456                   OpenPunctuation       -> other_graphic
1457                   ClosePunctuation      -> other_graphic
1458                   InitialQuote          -> other_graphic
1459                   FinalQuote            -> other_graphic
1460                   OtherPunctuation      -> other_graphic
1461                   MathSymbol            -> symbol
1462                   CurrencySymbol        -> symbol
1463                   ModifierSymbol        -> symbol
1464                   OtherSymbol           -> symbol
1465                   Space                 -> space
1466                   _other                -> non_graphic
1467
1468 -- This version does not squash unicode characters, it is used when
1469 -- lexing strings.
1470 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1471 alexGetChar' (AI loc ofs s)
1472   | atEnd s   = Nothing
1473   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1474                 --trace (show (ord c)) $
1475                 Just (c, (AI loc' ofs' s'))
1476   where (c,s') = nextChar s
1477         loc'   = advanceSrcLoc loc c
1478         ofs'   = advanceOffs c ofs
1479
1480 advanceOffs :: Char -> Int -> Int
1481 advanceOffs '\n' offs = 0
1482 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1483 advanceOffs _    offs = offs + 1
1484
1485 getInput :: P AlexInput
1486 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1487
1488 setInput :: AlexInput -> P ()
1489 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1490
1491 pushLexState :: Int -> P ()
1492 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1493
1494 popLexState :: P Int
1495 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1496
1497 getLexState :: P Int
1498 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1499
1500 -- for reasons of efficiency, flags indicating language extensions (eg,
1501 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1502 -- integer
1503
1504 genericsBit, ffiBit, parrBit :: Int
1505 genericsBit = 0 -- {| and |}
1506 ffiBit     = 1
1507 parrBit    = 2
1508 arrowsBit  = 4
1509 thBit      = 5
1510 ipBit      = 6
1511 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1512 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1513                 -- (doesn't affect the lexer)
1514 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1515 haddockBit = 10 -- Lex and parse Haddock comments
1516 magicHashBit = 11 -- # in both functions and operators
1517 kindSigsBit = 12 -- Kind signatures on type variables
1518 recursiveDoBit = 13 -- mdo
1519 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1520 unboxedTuplesBit = 15 -- (# and #)
1521 standaloneDerivingBit = 16 -- standalone instance deriving declarations
1522 transformComprehensionsBit = 17
1523
1524 genericsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1525 always           _     = True
1526 genericsEnabled  flags = testBit flags genericsBit
1527 ffiEnabled       flags = testBit flags ffiBit
1528 parrEnabled      flags = testBit flags parrBit
1529 arrowsEnabled    flags = testBit flags arrowsBit
1530 thEnabled        flags = testBit flags thBit
1531 ipEnabled        flags = testBit flags ipBit
1532 explicitForallEnabled flags = testBit flags explicitForallBit
1533 bangPatEnabled   flags = testBit flags bangPatBit
1534 tyFamEnabled     flags = testBit flags tyFamBit
1535 haddockEnabled   flags = testBit flags haddockBit
1536 magicHashEnabled flags = testBit flags magicHashBit
1537 kindSigsEnabled  flags = testBit flags kindSigsBit
1538 recursiveDoEnabled flags = testBit flags recursiveDoBit
1539 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1540 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1541 standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
1542 transformComprehensionsEnabled flags = testBit flags transformComprehensionsBit
1543
1544 -- PState for parsing options pragmas
1545 --
1546 pragState :: StringBuffer -> SrcLoc -> PState
1547 pragState buf loc  =
1548   PState {
1549       buffer          = buf,
1550       messages      = emptyMessages,
1551       -- XXX defaultDynFlags is not right, but we don't have a real
1552       -- dflags handy
1553       dflags        = defaultDynFlags,
1554       last_loc      = mkSrcSpan loc loc,
1555       last_offs     = 0,
1556       last_len      = 0,
1557       last_line_len = 0,
1558       loc           = loc,
1559       extsBitmap    = 0,
1560       context       = [],
1561       lex_state     = [bol, option_prags, 0]
1562     }
1563
1564
1565 -- create a parse state
1566 --
1567 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1568 mkPState buf loc flags  =
1569   PState {
1570       buffer          = buf,
1571       dflags        = flags,
1572       messages      = emptyMessages,
1573       last_loc      = mkSrcSpan loc loc,
1574       last_offs     = 0,
1575       last_len      = 0,
1576       last_line_len = 0,
1577       loc           = loc,
1578       extsBitmap    = fromIntegral bitmap,
1579       context       = [],
1580       lex_state     = [bol, 0]
1581         -- we begin in the layout state if toplev_layout is set
1582     }
1583     where
1584       bitmap = genericsBit `setBitIf` dopt Opt_Generics flags
1585                .|. ffiBit       `setBitIf` dopt Opt_ForeignFunctionInterface flags
1586                .|. parrBit      `setBitIf` dopt Opt_PArr         flags
1587                .|. arrowsBit    `setBitIf` dopt Opt_Arrows       flags
1588                .|. thBit        `setBitIf` dopt Opt_TemplateHaskell flags
1589                .|. ipBit        `setBitIf` dopt Opt_ImplicitParams flags
1590                .|. explicitForallBit `setBitIf` dopt Opt_ScopedTypeVariables flags
1591                .|. explicitForallBit `setBitIf` dopt Opt_PolymorphicComponents flags
1592                .|. explicitForallBit `setBitIf` dopt Opt_ExistentialQuantification flags
1593                .|. explicitForallBit `setBitIf` dopt Opt_Rank2Types flags
1594                .|. explicitForallBit `setBitIf` dopt Opt_RankNTypes flags
1595                .|. bangPatBit   `setBitIf` dopt Opt_BangPatterns flags
1596                .|. tyFamBit     `setBitIf` dopt Opt_TypeFamilies flags
1597                .|. haddockBit   `setBitIf` dopt Opt_Haddock      flags
1598                .|. magicHashBit `setBitIf` dopt Opt_MagicHash    flags
1599                .|. kindSigsBit  `setBitIf` dopt Opt_KindSignatures flags
1600                .|. recursiveDoBit `setBitIf` dopt Opt_RecursiveDo flags
1601                .|. unicodeSyntaxBit `setBitIf` dopt Opt_UnicodeSyntax flags
1602                .|. unboxedTuplesBit `setBitIf` dopt Opt_UnboxedTuples flags
1603                .|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags
1604            .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags
1605       --
1606       setBitIf :: Int -> Bool -> Int
1607       b `setBitIf` cond | cond      = bit b
1608                         | otherwise = 0
1609
1610 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1611 addWarning option srcspan warning
1612  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1613        let warning' = mkWarnMsg srcspan alwaysQualify warning
1614            ws' = if dopt option d then ws `snocBag` warning' else ws
1615        in POk s{messages=(ws', es)} ()
1616
1617 getMessages :: PState -> Messages
1618 getMessages PState{messages=ms} = ms
1619
1620 getContext :: P [LayoutContext]
1621 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1622
1623 setContext :: [LayoutContext] -> P ()
1624 setContext ctx = P $ \s -> POk s{context=ctx} ()
1625
1626 popContext :: P ()
1627 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1628                            loc = loc, last_len = len, last_loc = last_loc }) ->
1629   case ctx of
1630         (_:tl) -> POk s{ context = tl } ()
1631         []     -> PFailed last_loc (srcParseErr buf len)
1632
1633 -- Push a new layout context at the indentation of the last token read.
1634 -- This is only used at the outer level of a module when the 'module'
1635 -- keyword is missing.
1636 pushCurrentContext :: P ()
1637 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1638     POk s{context = Layout (offs-len) : ctx} ()
1639 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1640
1641 getOffside :: P Ordering
1642 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1643                 let ord = case stk of
1644                         (Layout n:_) -> compare offs n
1645                         _            -> GT
1646                 in POk s ord
1647
1648 -- ---------------------------------------------------------------------------
1649 -- Construct a parse error
1650
1651 srcParseErr
1652   :: StringBuffer       -- current buffer (placed just after the last token)
1653   -> Int                -- length of the previous token
1654   -> Message
1655 srcParseErr buf len
1656   = hcat [ if null token
1657              then ptext SLIT("parse error (possibly incorrect indentation)")
1658              else hcat [ptext SLIT("parse error on input "),
1659                         char '`', text token, char '\'']
1660     ]
1661   where token = lexemeToString (offsetBytes (-len) buf) len
1662
1663 -- Report a parse failure, giving the span of the previous token as
1664 -- the location of the error.  This is the entry point for errors
1665 -- detected during parsing.
1666 srcParseFail :: P a
1667 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1668                             last_loc = last_loc } ->
1669     PFailed last_loc (srcParseErr buf len)
1670
1671 -- A lexical error is reported at a particular position in the source file,
1672 -- not over a token range.
1673 lexError :: String -> P a
1674 lexError str = do
1675   loc <- getSrcLoc
1676   i@(AI end _ buf) <- getInput
1677   reportLexError loc end buf str
1678
1679 -- -----------------------------------------------------------------------------
1680 -- This is the top-level function: called from the parser each time a
1681 -- new token is to be read from the input.
1682
1683 lexer :: (Located Token -> P a) -> P a
1684 lexer cont = do
1685   tok@(L span tok__) <- lexToken
1686 --  trace ("token: " ++ show tok__) $ do
1687   cont tok
1688
1689 lexToken :: P (Located Token)
1690 lexToken = do
1691   inp@(AI loc1 _ buf) <- getInput
1692   sc <- getLexState
1693   exts <- getExts
1694   case alexScanUser exts inp sc of
1695     AlexEOF -> do let span = mkSrcSpan loc1 loc1
1696                   setLastToken span 0 0
1697                   return (L span ITeof)
1698     AlexError (AI loc2 _ buf) -> do
1699         reportLexError loc1 loc2 buf "lexical error"
1700     AlexSkip inp2 _ -> do
1701         setInput inp2
1702         lexToken
1703     AlexToken inp2@(AI end _ buf2) len t -> do
1704     setInput inp2
1705     let span = mkSrcSpan loc1 end
1706     let bytes = byteDiff buf buf2
1707     span `seq` setLastToken span bytes bytes
1708     t span buf bytes
1709
1710 reportLexError loc1 loc2 buf str
1711   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1712   | otherwise =
1713   let
1714         c = fst (nextChar buf)
1715   in
1716   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1717     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1718     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1719 }