compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - parsing integers is a bit slow
  16 --    - readRational is a bit slow
  17 --
  18 --   Known bugs, that were also in the previous version:
  19 --    - M... should be 3 tokens, not 1.
  20 --    - pragma-end should be only valid in a pragma
  21
  22 --   qualified operator NOTES.
  23 --
  24 --   - If M.(+) is a single lexeme, then..
  25 --     - Probably (+) should be a single lexeme too, for consistency.
  26 --       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
  27 --     - But we have to rule out reserved operators, otherwise (..) becomes
  28 --       a different lexeme.
  29 --     - Should we therefore also rule out reserved operators in the qualified
  30 --       form?  This is quite difficult to achieve.  We don't do it for
  31 --       qualified varids.
  32
  33 {
  34 -- XXX The above flags turn off warnings in the generated code:
  35 {-# OPTIONS_GHC -fno-warn-unused-matches #-}
  36 {-# OPTIONS_GHC -fno-warn-unused-binds #-}
  37 {-# OPTIONS_GHC -fno-warn-unused-imports #-}
  38 {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
  39 -- But alex still generates some code that causes the "lazy unlifted bindings"
  40 -- warning, and old compilers don't know about it so we can't easily turn
  41 -- it off, so for now we use the sledge hammer:
  42 {-# OPTIONS_GHC -w #-}
  43
  44 {-# OPTIONS_GHC -funbox-strict-fields #-}
  45
  46 module Lexer (
  47    Token(..), lexer, pragState, mkPState, PState(..),
  48    P(..), ParseResult(..), getSrcLoc,
  49    getPState, getDynFlags, withThisPackage,
  50    failLocMsgP, failSpanMsgP, srcParseFail,
  51    getMessages,
  52    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  53    getLexState, popLexState, pushLexState,
  54    extension, standaloneDerivingEnabled, bangPatEnabled,
  55    addWarning,
  56    lexTokenStream
  57   ) where
  58
  59 import Bag
  60 import ErrUtils
  61 import Outputable
  62 import StringBuffer
  63 import FastString
  64 import SrcLoc
  65 import UniqFM
  66 import DynFlags
  67 import Module
  68 import Ctype
  69 import Util             ( readRational )
  70
  71 import Control.Monad
  72 import Data.Bits
  73 import Data.Char
  74 import Data.List
  75 import Data.Maybe
  76 import Data.Map (Map)
  77 import qualified Data.Map as Map
  78 import Data.Ratio
  79 }
  80
  81 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  82 $whitechar   = [\ \n\r\f\v $unispace]
  83 $white_no_nl = $whitechar # \n
  84 $tab         = \t
  85
  86 $ascdigit  = 0-9
  87 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  88 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  89 $digit     = [$ascdigit $unidigit]
  90
  91 $special   = [\(\)\,\;\[\]\`\{\}]
  92 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  93 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  94 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  95
  96 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
  97 $asclarge  = [A-Z]
  98 $large     = [$asclarge $unilarge]
  99
 100 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
 101 $ascsmall  = [a-z]
 102 $small     = [$ascsmall $unismall \_]
 103
 104 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
 105 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
 106
 107 $octit     = 0-7
 108 $hexit     = [$decdigit A-F a-f]
 109 $symchar   = [$symbol \:]
 110 $nl        = [\n\r]
 111 $idchar    = [$small $large $digit \']
 112
 113 $pragmachar = [$small $large $digit]
 114
 115 $docsym    = [\| \^ \* \$]
 116
 117 @varid     = $small $idchar*
 118 @conid     = $large $idchar*
 119
 120 @varsym    = $symbol $symchar*
 121 @consym    = \: $symchar*
 122
 123 @decimal     = $decdigit+
 124 @octal       = $octit+
 125 @hexadecimal = $hexit+
 126 @exponent    = [eE] [\-\+]? @decimal
 127
 128 -- we support the hierarchical module name extension:
 129 @qual = (@conid \.)+
 130
 131 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 132
 133 -- normal signed numerical literals can only be explicitly negative,
 134 -- not explicitly positive (contrast @exponent)
 135 @negative = \-
 136 @signed = @negative ?
 137
 138 haskell :-
 139
 140 -- everywhere: skip whitespace and comments
 141 $white_no_nl+                           ;
 142 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 143
 144 -- Everywhere: deal with nested comments.  We explicitly rule out
 145 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 146 -- (this can happen even though pragmas will normally take precedence due to
 147 -- longest-match, because pragmas aren't valid in every state, but comments
 148 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 149 -- set.
 150
 151 "{-" / { isNormalComment } { nested_comment lexToken }
 152
 153 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 154 -- more dashes followed by a symbol should be parsed as a varsym, so we
 155 -- have to exclude those.
 156
 157 -- Since Haddock comments aren't valid in every state, we need to rule them
 158 -- out here.
 159
 160 -- The following two rules match comments that begin with two dashes, but
 161 -- continue with a different character. The rules test that this character
 162 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 163 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 164 -- have a Haddock comment). The rules then munch the rest of the line.
 165
 166 "-- " ~[$docsym \#] .* { lineCommentToken }
 167 "--" [^$symbol : \ ] .* { lineCommentToken }
 168
 169 -- Next, match Haddock comments if no -haddock flag
 170
 171 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
 172
 173 -- Now, when we've matched comments that begin with 2 dashes and continue
 174 -- with a different character, we need to match comments that begin with three
 175 -- or more dashes (which clearly can't be Haddock comments). We only need to
 176 -- make sure that the first non-dash character isn't a symbol, and munch the
 177 -- rest of the line.
 178
 179 "---"\-* [^$symbol :] .* { lineCommentToken }
 180
 181 -- Since the previous rules all match dashes followed by at least one
 182 -- character, we also need to match a whole line filled with just dashes.
 183
 184 "--"\-* / { atEOL } { lineCommentToken }
 185
 186 -- We need this rule since none of the other single line comment rules
 187 -- actually match this case.
 188
 189 "-- " / { atEOL } { lineCommentToken }
 190
 191 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 192 -- blank lines) until we find a non-whitespace character, then do layout
 193 -- processing.
 194 --
 195 -- One slight wibble here: what if the line begins with {-#? In
 196 -- theory, we have to lex the pragma to see if it's one we recognise,
 197 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 198 -- as a nested comment.  We don't bother with this: if the line begins
 199 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 200 <bol> {
 201   \n                                    ;
 202   ^\# (line)?                           { begin line_prag1 }
 203   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 204   ^\# \! .* \n                          ; -- #!, for scripts
 205   ()                                    { do_bol }
 206 }
 207
 208 -- after a layout keyword (let, where, do, of), we begin a new layout
 209 -- context if the curly brace is missing.
 210 -- Careful! This stuff is quite delicate.
 211 <layout, layout_do> {
 212   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 213         -- we might encounter {-# here, but {- has been handled already
 214   \n                                    ;
 215   ^\# (line)?                           { begin line_prag1 }
 216 }
 217
 218 -- do is treated in a subtly different way, see new_layout_context
 219 <layout>    ()                          { new_layout_context True }
 220 <layout_do> ()                          { new_layout_context False }
 221
 222 -- after a new layout context which was found to be to the left of the
 223 -- previous context, we have generated a '{' token, and we now need to
 224 -- generate a matching '}' token.
 225 <layout_left>  ()                       { do_layout_left }
 226
 227 <0,option_prags> \n                             { begin bol }
 228
 229 "{-#" $whitechar* $pragmachar+ / { known_pragma linePrags }
 230                                 { dispatch_pragmas linePrags }
 231
 232 -- single-line line pragmas, of the form
 233 --    # <line> "<file>" <extra-stuff> \n
 234 <line_prag1> $decdigit+                 { setLine line_prag1a }
 235 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 236 <line_prag1b> .*                        { pop }
 237
 238 -- Haskell-style line pragmas, of the form
 239 --    {-# LINE <line> "<file>" #-}
 240 <line_prag2> $decdigit+                 { setLine line_prag2a }
 241 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 242 <line_prag2b> "#-}"|"-}"                { pop }
 243    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 244    -- with older versions of GHC which generated these.
 245
 246 <0,option_prags> {
 247   "{-#" $whitechar* $pragmachar+
 248         $whitechar+ $pragmachar+ / { known_pragma twoWordPrags }
 249                                  { dispatch_pragmas twoWordPrags }
 250
 251   "{-#" $whitechar* $pragmachar+ / { known_pragma oneWordPrags }
 252                                  { dispatch_pragmas oneWordPrags }
 253
 254   -- We ignore all these pragmas, but don't generate a warning for them
 255   "{-#" $whitechar* $pragmachar+ / { known_pragma ignoredPrags }
 256                                  { dispatch_pragmas ignoredPrags }
 257
 258   -- ToDo: should only be valid inside a pragma:
 259   "#-}"                                 { endPrag }
 260 }
 261
 262 <option_prags> {
 263   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 264                                    { dispatch_pragmas fileHeaderPrags }
 265
 266   "-- #"                                 { multiline_doc_comment }
 267 }
 268
 269 <0> {
 270   -- In the "0" mode we ignore these pragmas
 271   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 272                      { nested_comment lexToken }
 273 }
 274
 275 <0> {
 276   "-- #" .* { lineCommentToken }
 277 }
 278
 279 <0,option_prags> {
 280   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 281                     (nested_comment lexToken) }
 282 }
 283
 284 -- '0' state: ordinary lexemes
 285
 286 -- Haddock comments
 287
 288 <0> {
 289   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 290   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 291 }
 292
 293 -- "special" symbols
 294
 295 <0> {
 296   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 297   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 298 }
 299
 300 <0> {
 301   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 302   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 303   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 304   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 305   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 306   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 307   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 308   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 309
 310   "[$" @varid "|"  / { ifExtension qqEnabled }
 311                      { lex_quasiquote_tok }
 312 }
 313
 314 <0> {
 315   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 316                                         { special IToparenbar }
 317   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 318 }
 319
 320 <0> {
 321   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 322 }
 323
 324 <0> {
 325   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 326          { token IToubxparen }
 327   "#)" / { ifExtension unboxedTuplesEnabled }
 328          { token ITcubxparen }
 329 }
 330
 331 <0> {
 332   "{|" / { ifExtension genericsEnabled } { token ITocurlybar }
 333   "|}" / { ifExtension genericsEnabled } { token ITccurlybar }
 334 }
 335
 336 <0,option_prags> {
 337   \(                                    { special IToparen }
 338   \)                                    { special ITcparen }
 339   \[                                    { special ITobrack }
 340   \]                                    { special ITcbrack }
 341   \,                                    { special ITcomma }
 342   \;                                    { special ITsemi }
 343   \`                                    { special ITbackquote }
 344
 345   \{                                    { open_brace }
 346   \}                                    { close_brace }
 347 }
 348
 349 <0,option_prags> {
 350   @qual @varid                  { idtoken qvarid }
 351   @qual @conid                  { idtoken qconid }
 352   @varid                        { varid }
 353   @conid                        { idtoken conid }
 354 }
 355
 356 <0> {
 357   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 358   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 359   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 360   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 361 }
 362
 363 -- ToDo: - move `var` and (sym) into lexical syntax?
 364 --       - remove backquote from $special?
 365 <0> {
 366   @qual @varsym       / { ifExtension oldQualOps } { idtoken qvarsym }
 367   @qual @consym       / { ifExtension oldQualOps } { idtoken qconsym }
 368   @qual \( @varsym \) / { ifExtension newQualOps } { idtoken prefixqvarsym }
 369   @qual \( @consym \) / { ifExtension newQualOps } { idtoken prefixqconsym }
 370   @varsym                                          { varsym }
 371   @consym                                          { consym }
 372 }
 373
 374 -- For the normal boxed literals we need to be careful
 375 -- when trying to be close to Haskell98
 376 <0> {
 377   -- Normal integral literals (:: Num a => a, from Integer)
 378   @decimal           { tok_num positive 0 0 decimal }
 379   0[oO] @octal       { tok_num positive 2 2 octal }
 380   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 381
 382   -- Normal rational literals (:: Fractional a => a, from Rational)
 383   @floating_point    { strtoken tok_float }
 384 }
 385
 386 <0> {
 387   -- Unboxed ints (:: Int#) and words (:: Word#)
 388   -- It's simpler (and faster?) to give separate cases to the negatives,
 389   -- especially considering octal/hexadecimal prefixes.
 390   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 391   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 392   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 393   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 394   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 395   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 396
 397   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 398   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 399   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 400
 401   -- Unboxed floats and doubles (:: Float#, :: Double#)
 402   -- prim_{float,double} work with signed literals
 403   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 404   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 405 }
 406
 407 -- Strings and chars are lexed by hand-written code.  The reason is
 408 -- that even if we recognise the string or char here in the regex
 409 -- lexer, we would still have to parse the string afterward in order
 410 -- to convert it to a String.
 411 <0> {
 412   \'                            { lex_char_tok }
 413   \"                            { lex_string_tok }
 414 }
 415
 416 {
 417 -- -----------------------------------------------------------------------------
 418 -- The token type
 419
 420 data Token
 421   = ITas                        -- Haskell keywords
 422   | ITcase
 423   | ITclass
 424   | ITdata
 425   | ITdefault
 426   | ITderiving
 427   | ITdo
 428   | ITelse
 429   | IThiding
 430   | ITif
 431   | ITimport
 432   | ITin
 433   | ITinfix
 434   | ITinfixl
 435   | ITinfixr
 436   | ITinstance
 437   | ITlet
 438   | ITmodule
 439   | ITnewtype
 440   | ITof
 441   | ITqualified
 442   | ITthen
 443   | ITtype
 444   | ITwhere
 445   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 446
 447   | ITforall                    -- GHC extension keywords
 448   | ITforeign
 449   | ITexport
 450   | ITlabel
 451   | ITdynamic
 452   | ITsafe
 453   | ITthreadsafe
 454   | ITunsafe
 455   | ITstdcallconv
 456   | ITccallconv
 457   | ITprimcallconv
 458   | ITmdo
 459   | ITfamily
 460   | ITgroup
 461   | ITby
 462   | ITusing
 463
 464         -- Pragmas
 465   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 466   | ITinline_conlike_prag Bool  -- same
 467   | ITspec_prag                 -- SPECIALISE
 468   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 469   | ITsource_prag
 470   | ITrules_prag
 471   | ITwarning_prag
 472   | ITdeprecated_prag
 473   | ITline_prag
 474   | ITscc_prag
 475   | ITgenerated_prag
 476   | ITcore_prag                 -- hdaume: core annotations
 477   | ITunpack_prag
 478   | ITann_prag
 479   | ITclose_prag
 480   | IToptions_prag String
 481   | ITinclude_prag String
 482   | ITlanguage_prag
 483
 484   | ITdotdot                    -- reserved symbols
 485   | ITcolon
 486   | ITdcolon
 487   | ITequal
 488   | ITlam
 489   | ITvbar
 490   | ITlarrow
 491   | ITrarrow
 492   | ITat
 493   | ITtilde
 494   | ITdarrow
 495   | ITminus
 496   | ITbang
 497   | ITstar
 498   | ITdot
 499
 500   | ITbiglam                    -- GHC-extension symbols
 501
 502   | ITocurly                    -- special symbols
 503   | ITccurly
 504   | ITocurlybar                 -- {|, for type applications
 505   | ITccurlybar                 -- |}, for type applications
 506   | ITvocurly
 507   | ITvccurly
 508   | ITobrack
 509   | ITopabrack                  -- [:, for parallel arrays with -XParr
 510   | ITcpabrack                  -- :], for parallel arrays with -XParr
 511   | ITcbrack
 512   | IToparen
 513   | ITcparen
 514   | IToubxparen
 515   | ITcubxparen
 516   | ITsemi
 517   | ITcomma
 518   | ITunderscore
 519   | ITbackquote
 520
 521   | ITvarid   FastString        -- identifiers
 522   | ITconid   FastString
 523   | ITvarsym  FastString
 524   | ITconsym  FastString
 525   | ITqvarid  (FastString,FastString)
 526   | ITqconid  (FastString,FastString)
 527   | ITqvarsym (FastString,FastString)
 528   | ITqconsym (FastString,FastString)
 529   | ITprefixqvarsym (FastString,FastString)
 530   | ITprefixqconsym (FastString,FastString)
 531
 532   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 533
 534   | ITchar       Char
 535   | ITstring     FastString
 536   | ITinteger    Integer
 537   | ITrational   Rational
 538
 539   | ITprimchar   Char
 540   | ITprimstring FastString
 541   | ITprimint    Integer
 542   | ITprimword   Integer
 543   | ITprimfloat  Rational
 544   | ITprimdouble Rational
 545
 546   -- Template Haskell extension tokens
 547   | ITopenExpQuote              --  [| or [e|
 548   | ITopenPatQuote              --  [p|
 549   | ITopenDecQuote              --  [d|
 550   | ITopenTypQuote              --  [t|
 551   | ITcloseQuote                --  |]
 552   | ITidEscape   FastString     --  $x
 553   | ITparenEscape               --  $(
 554   | ITvarQuote                  --  '
 555   | ITtyQuote                   --  ''
 556   | ITquasiQuote (FastString,FastString,SrcSpan) --  [:...|...|]
 557
 558   -- Arrow notation extension
 559   | ITproc
 560   | ITrec
 561   | IToparenbar                 --  (|
 562   | ITcparenbar                 --  |)
 563   | ITlarrowtail                --  -<
 564   | ITrarrowtail                --  >-
 565   | ITLarrowtail                --  -<<
 566   | ITRarrowtail                --  >>-
 567
 568   | ITunknown String            -- Used when the lexer can't make sense of it
 569   | ITeof                       -- end of file token
 570
 571   -- Documentation annotations
 572   | ITdocCommentNext  String     -- something beginning '-- |'
 573   | ITdocCommentPrev  String     -- something beginning '-- ^'
 574   | ITdocCommentNamed String     -- something beginning '-- $'
 575   | ITdocSection      Int String -- a section heading
 576   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 577   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 578   | ITlineComment     String     -- comment starting by "--"
 579   | ITblockComment    String     -- comment in {- -}
 580
 581 #ifdef DEBUG
 582   deriving Show -- debugging
 583 #endif
 584
 585 {-
 586 isSpecial :: Token -> Bool
 587 -- If we see M.x, where x is a keyword, but
 588 -- is special, we treat is as just plain M.x,
 589 -- not as a keyword.
 590 isSpecial ITas          = True
 591 isSpecial IThiding      = True
 592 isSpecial ITqualified   = True
 593 isSpecial ITforall      = True
 594 isSpecial ITexport      = True
 595 isSpecial ITlabel       = True
 596 isSpecial ITdynamic     = True
 597 isSpecial ITsafe        = True
 598 isSpecial ITthreadsafe  = True
 599 isSpecial ITunsafe      = True
 600 isSpecial ITccallconv   = True
 601 isSpecial ITstdcallconv = True
 602 isSpecial ITprimcallconv = True
 603 isSpecial ITmdo         = True
 604 isSpecial ITfamily      = True
 605 isSpecial ITgroup   = True
 606 isSpecial ITby      = True
 607 isSpecial ITusing   = True
 608 isSpecial _             = False
 609 -}
 610
 611 -- the bitmap provided as the third component indicates whether the
 612 -- corresponding extension keyword is valid under the extension options
 613 -- provided to the compiler; if the extension corresponding to *any* of the
 614 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 615 -- facilitates using a keyword in two different extensions that can be
 616 -- activated independently)
 617 --
 618 reservedWordsFM :: UniqFM (Token, Int)
 619 reservedWordsFM = listToUFM $
 620         map (\(x, y, z) -> (mkFastString x, (y, z)))
 621        [( "_",          ITunderscore,   0 ),
 622         ( "as",         ITas,           0 ),
 623         ( "case",       ITcase,         0 ),
 624         ( "class",      ITclass,        0 ),
 625         ( "data",       ITdata,         0 ),
 626         ( "default",    ITdefault,      0 ),
 627         ( "deriving",   ITderiving,     0 ),
 628         ( "do",         ITdo,           0 ),
 629         ( "else",       ITelse,         0 ),
 630         ( "hiding",     IThiding,       0 ),
 631         ( "if",         ITif,           0 ),
 632         ( "import",     ITimport,       0 ),
 633         ( "in",         ITin,           0 ),
 634         ( "infix",      ITinfix,        0 ),
 635         ( "infixl",     ITinfixl,       0 ),
 636         ( "infixr",     ITinfixr,       0 ),
 637         ( "instance",   ITinstance,     0 ),
 638         ( "let",        ITlet,          0 ),
 639         ( "module",     ITmodule,       0 ),
 640         ( "newtype",    ITnewtype,      0 ),
 641         ( "of",         ITof,           0 ),
 642         ( "qualified",  ITqualified,    0 ),
 643         ( "then",       ITthen,         0 ),
 644         ( "type",       ITtype,         0 ),
 645         ( "where",      ITwhere,        0 ),
 646         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 647
 648     ( "forall", ITforall,        bit explicitForallBit .|. bit inRulePragBit),
 649         ( "mdo",        ITmdo,           bit recursiveDoBit),
 650         ( "family",     ITfamily,        bit tyFamBit),
 651     ( "group",  ITgroup,     bit transformComprehensionsBit),
 652     ( "by",     ITby,        bit transformComprehensionsBit),
 653     ( "using",  ITusing,     bit transformComprehensionsBit),
 654
 655         ( "foreign",    ITforeign,       bit ffiBit),
 656         ( "export",     ITexport,        bit ffiBit),
 657         ( "label",      ITlabel,         bit ffiBit),
 658         ( "dynamic",    ITdynamic,       bit ffiBit),
 659         ( "safe",       ITsafe,          bit ffiBit),
 660         ( "threadsafe", ITthreadsafe,    bit ffiBit),  -- ToDo: remove
 661         ( "unsafe",     ITunsafe,        bit ffiBit),
 662         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 663         ( "ccall",      ITccallconv,     bit ffiBit),
 664         ( "prim",       ITprimcallconv,  bit ffiBit),
 665
 666         ( "rec",        ITrec,           bit recBit),
 667         ( "proc",       ITproc,          bit arrowsBit)
 668      ]
 669
 670 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 671 reservedSymsFM = listToUFM $
 672     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 673       [ ("..",  ITdotdot,   always)
 674         -- (:) is a reserved op, meaning only list cons
 675        ,(":",   ITcolon,    always)
 676        ,("::",  ITdcolon,   always)
 677        ,("=",   ITequal,    always)
 678        ,("\\",  ITlam,      always)
 679        ,("|",   ITvbar,     always)
 680        ,("<-",  ITlarrow,   always)
 681        ,("->",  ITrarrow,   always)
 682        ,("@",   ITat,       always)
 683        ,("~",   ITtilde,    always)
 684        ,("=>",  ITdarrow,   always)
 685        ,("-",   ITminus,    always)
 686        ,("!",   ITbang,     always)
 687
 688         -- For data T (a::*) = MkT
 689        ,("*", ITstar, always) -- \i -> kindSigsEnabled i || tyFamEnabled i)
 690         -- For 'forall a . t'
 691        ,(".", ITdot,  always) -- \i -> explicitForallEnabled i || inRulePrag i)
 692
 693        ,("-<",  ITlarrowtail, arrowsEnabled)
 694        ,(">-",  ITrarrowtail, arrowsEnabled)
 695        ,("-<<", ITLarrowtail, arrowsEnabled)
 696        ,(">>-", ITRarrowtail, arrowsEnabled)
 697
 698        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 699        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 700        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 701                                 explicitForallEnabled i)
 702        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 703        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 704        ,("⋯",   ITdotdot, unicodeSyntaxEnabled)
 705
 706        ,("⤙",   ITlarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 707        ,("⤚",   ITrarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 708        ,("⤛",   ITLarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 709        ,("⤜",   ITRarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 710
 711        ,("★", ITstar, unicodeSyntaxEnabled)
 712
 713         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 714         -- form part of a large operator.  This would let us have a better
 715         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 716        ]
 717
 718 -- -----------------------------------------------------------------------------
 719 -- Lexer actions
 720
 721 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 722
 723 special :: Token -> Action
 724 special tok span _buf _len = return (L span tok)
 725
 726 token, layout_token :: Token -> Action
 727 token t span _buf _len = return (L span t)
 728 layout_token t span _buf _len = pushLexState layout >> return (L span t)
 729
 730 idtoken :: (StringBuffer -> Int -> Token) -> Action
 731 idtoken f span buf len = return (L span $! (f buf len))
 732
 733 skip_one_varid :: (FastString -> Token) -> Action
 734 skip_one_varid f span buf len
 735   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 736
 737 strtoken :: (String -> Token) -> Action
 738 strtoken f span buf len =
 739   return (L span $! (f $! lexemeToString buf len))
 740
 741 init_strtoken :: Int -> (String -> Token) -> Action
 742 -- like strtoken, but drops the last N character(s)
 743 init_strtoken drop f span buf len =
 744   return (L span $! (f $! lexemeToString buf (len-drop)))
 745
 746 begin :: Int -> Action
 747 begin code _span _str _len = do pushLexState code; lexToken
 748
 749 pop :: Action
 750 pop _span _buf _len = do _ <- popLexState
 751                          lexToken
 752
 753 pop_and :: Action -> Action
 754 pop_and act span buf len = do _ <- popLexState
 755                               act span buf len
 756
 757 {-# INLINE nextCharIs #-}
 758 nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
 759 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 760
 761 notFollowedBy :: Char -> AlexAccPred Int
 762 notFollowedBy char _ _ _ (AI _ _ buf)
 763   = nextCharIs buf (/=char)
 764
 765 notFollowedBySymbol :: AlexAccPred Int
 766 notFollowedBySymbol _ _ _ (AI _ _ buf)
 767   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 768
 769 -- We must reject doc comments as being ordinary comments everywhere.
 770 -- In some cases the doc comment will be selected as the lexeme due to
 771 -- maximal munch, but not always, because the nested comment rule is
 772 -- valid in all states, but the doc-comment rules are only valid in
 773 -- the non-layout states.
 774 isNormalComment :: AlexAccPred Int
 775 isNormalComment bits _ _ (AI _ _ buf)
 776   | haddockEnabled bits = notFollowedByDocOrPragma
 777   | otherwise           = nextCharIs buf (/='#')
 778   where
 779     notFollowedByDocOrPragma
 780        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 781
 782 spaceAndP :: StringBuffer -> (StringBuffer -> Bool) -> Bool
 783 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 784
 785 {-
 786 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 787   = if haddockEnabled bits then False else (p buf)
 788 -}
 789
 790 atEOL :: AlexAccPred Int
 791 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 792
 793 ifExtension :: (Int -> Bool) -> AlexAccPred Int
 794 ifExtension pred bits _ _ _ = pred bits
 795
 796 multiline_doc_comment :: Action
 797 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 798   where
 799     worker commentAcc input docType oneLine = case alexGetChar input of
 800       Just ('\n', input')
 801         | oneLine -> docCommentEnd input commentAcc docType buf span
 802         | otherwise -> case checkIfCommentLine input' of
 803           Just input -> worker ('\n':commentAcc) input docType False
 804           Nothing -> docCommentEnd input commentAcc docType buf span
 805       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 806       Nothing -> docCommentEnd input commentAcc docType buf span
 807
 808     checkIfCommentLine input = check (dropNonNewlineSpace input)
 809       where
 810         check input = case alexGetChar input of
 811           Just ('-', input) -> case alexGetChar input of
 812             Just ('-', input) -> case alexGetChar input of
 813               Just (c, _) | c /= '-' -> Just input
 814               _ -> Nothing
 815             _ -> Nothing
 816           _ -> Nothing
 817
 818         dropNonNewlineSpace input = case alexGetChar input of
 819           Just (c, input')
 820             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 821             | otherwise -> input
 822           Nothing -> input
 823
 824 lineCommentToken :: Action
 825 lineCommentToken span buf len = do
 826   b <- extension rawTokenStreamEnabled
 827   if b then strtoken ITlineComment span buf len else lexToken
 828
 829 {-
 830   nested comments require traversing by hand, they can't be parsed
 831   using regular expressions.
 832 -}
 833 nested_comment :: P (Located Token) -> Action
 834 nested_comment cont span _str _len = do
 835   input <- getInput
 836   go "" (1::Int) input
 837   where
 838     go commentAcc 0 input = do setInput input
 839                                b <- extension rawTokenStreamEnabled
 840                                if b
 841                                  then docCommentEnd input commentAcc ITblockComment _str span
 842                                  else cont
 843     go commentAcc n input = case alexGetChar input of
 844       Nothing -> errBrace input span
 845       Just ('-',input) -> case alexGetChar input of
 846         Nothing  -> errBrace input span
 847         Just ('\125',input) -> go commentAcc (n-1) input
 848         Just (_,_)          -> go ('-':commentAcc) n input
 849       Just ('\123',input) -> case alexGetChar input of
 850         Nothing  -> errBrace input span
 851         Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
 852         Just (_,_)       -> go ('\123':commentAcc) n input
 853       Just (c,input) -> go (c:commentAcc) n input
 854
 855 nested_doc_comment :: Action
 856 nested_doc_comment span buf _len = withLexedDocType (go "")
 857   where
 858     go commentAcc input docType _ = case alexGetChar input of
 859       Nothing -> errBrace input span
 860       Just ('-',input) -> case alexGetChar input of
 861         Nothing -> errBrace input span
 862         Just ('\125',input) ->
 863           docCommentEnd input commentAcc docType buf span
 864         Just (_,_) -> go ('-':commentAcc) input docType False
 865       Just ('\123', input) -> case alexGetChar input of
 866         Nothing  -> errBrace input span
 867         Just ('-',input) -> do
 868           setInput input
 869           let cont = do input <- getInput; go commentAcc input docType False
 870           nested_comment cont span buf _len
 871         Just (_,_) -> go ('\123':commentAcc) input docType False
 872       Just (c,input) -> go (c:commentAcc) input docType False
 873
 874 withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (Located Token))
 875                  -> P (Located Token)
 876 withLexedDocType lexDocComment = do
 877   input@(AI _ _ buf) <- getInput
 878   case prevChar buf ' ' of
 879     '|' -> lexDocComment input ITdocCommentNext False
 880     '^' -> lexDocComment input ITdocCommentPrev False
 881     '$' -> lexDocComment input ITdocCommentNamed False
 882     '*' -> lexDocSection 1 input
 883     '#' -> lexDocComment input ITdocOptionsOld False
 884     _ -> panic "withLexedDocType: Bad doc type"
 885  where
 886     lexDocSection n input = case alexGetChar input of
 887       Just ('*', input) -> lexDocSection (n+1) input
 888       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 889       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 890
 891 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 892 -- off again at the end of the pragma.
 893 rulePrag :: Action
 894 rulePrag span _buf _len = do
 895   setExts (.|. bit inRulePragBit)
 896   return (L span ITrules_prag)
 897
 898 endPrag :: Action
 899 endPrag span _buf _len = do
 900   setExts (.&. complement (bit inRulePragBit))
 901   return (L span ITclose_prag)
 902
 903 -- docCommentEnd
 904 -------------------------------------------------------------------------------
 905 -- This function is quite tricky. We can't just return a new token, we also
 906 -- need to update the state of the parser. Why? Because the token is longer
 907 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 908 -- it writes the wrong token length to the parser state. This function is
 909 -- called afterwards, so it can just update the state.
 910
 911 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 912 -- which is something that the original lexer didn't account for.
 913 -- I have added last_line_len in the parser state which represents the length
 914 -- of the part of the token that is on the last line. It is now used for layout
 915 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 916 -- was before, the full length of the token, and it is now only used for error
 917 -- messages. /Waern
 918
 919 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 920                  SrcSpan -> P (Located Token)
 921 docCommentEnd input commentAcc docType buf span = do
 922   setInput input
 923   let (AI loc last_offs nextBuf) = input
 924       comment = reverse commentAcc
 925       span' = mkSrcSpan (srcSpanStart span) loc
 926       last_len = byteDiff buf nextBuf
 927
 928       last_line_len = if (last_offs - last_len < 0)
 929         then last_offs
 930         else last_len
 931
 932   span `seq` setLastToken span' last_len last_line_len
 933   return (L span' (docType comment))
 934
 935 errBrace :: AlexInput -> SrcSpan -> P a
 936 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 937
 938 open_brace, close_brace :: Action
 939 open_brace span _str _len = do
 940   ctx <- getContext
 941   setContext (NoLayout:ctx)
 942   return (L span ITocurly)
 943 close_brace span _str _len = do
 944   popContext
 945   return (L span ITccurly)
 946
 947 qvarid, qconid :: StringBuffer -> Int -> Token
 948 qvarid buf len = ITqvarid $! splitQualName buf len False
 949 qconid buf len = ITqconid $! splitQualName buf len False
 950
 951 splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
 952 -- takes a StringBuffer and a length, and returns the module name
 953 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 954 -- because of hierarchical module names.
 955 splitQualName orig_buf len parens = split orig_buf orig_buf
 956   where
 957     split buf dot_buf
 958         | orig_buf `byteDiff` buf >= len  = done dot_buf
 959         | c == '.'                        = found_dot buf'
 960         | otherwise                       = split buf' dot_buf
 961       where
 962        (c,buf') = nextChar buf
 963
 964     -- careful, we might get names like M....
 965     -- so, if the character after the dot is not upper-case, this is
 966     -- the end of the qualifier part.
 967     found_dot buf -- buf points after the '.'
 968         | isUpper c    = split buf' buf
 969         | otherwise    = done buf
 970       where
 971        (c,buf') = nextChar buf
 972
 973     done dot_buf =
 974         (lexemeToFastString orig_buf (qual_size - 1),
 975          if parens -- Prelude.(+)
 976             then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
 977             else lexemeToFastString dot_buf (len - qual_size))
 978       where
 979         qual_size = orig_buf `byteDiff` dot_buf
 980
 981 varid :: Action
 982 varid span buf len =
 983   fs `seq`
 984   case lookupUFM reservedWordsFM fs of
 985         Just (keyword,0)    -> do
 986                 maybe_layout keyword
 987                 return (L span keyword)
 988         Just (keyword,exts) -> do
 989                 b <- extension (\i -> exts .&. i /= 0)
 990                 if b then do maybe_layout keyword
 991                              return (L span keyword)
 992                      else return (L span (ITvarid fs))
 993         _other -> return (L span (ITvarid fs))
 994   where
 995         fs = lexemeToFastString buf len
 996
 997 conid :: StringBuffer -> Int -> Token
 998 conid buf len = ITconid fs
 999   where fs = lexemeToFastString buf len
1000
1001 qvarsym, qconsym, prefixqvarsym, prefixqconsym :: StringBuffer -> Int -> Token
1002 qvarsym buf len = ITqvarsym $! splitQualName buf len False
1003 qconsym buf len = ITqconsym $! splitQualName buf len False
1004 prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
1005 prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
1006
1007 varsym, consym :: Action
1008 varsym = sym ITvarsym
1009 consym = sym ITconsym
1010
1011 sym :: (FastString -> Token) -> SrcSpan -> StringBuffer -> Int
1012     -> P (Located Token)
1013 sym con span buf len =
1014   case lookupUFM reservedSymsFM fs of
1015         Just (keyword,exts) -> do
1016                 b <- extension exts
1017                 if b then return (L span keyword)
1018                      else return (L span $! con fs)
1019         _other -> return (L span $! con fs)
1020   where
1021         fs = lexemeToFastString buf len
1022
1023 -- Variations on the integral numeric literal.
1024 tok_integral :: (Integer -> Token)
1025      -> (Integer -> Integer)
1026  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
1027      -> Int -> Int
1028      -> (Integer, (Char->Int)) -> Action
1029 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
1030   return $ L span $ itint $! transint $ parseUnsignedInteger
1031      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
1032
1033 -- some conveniences for use with tok_integral
1034 tok_num :: (Integer -> Integer)
1035         -> Int -> Int
1036         -> (Integer, (Char->Int)) -> Action
1037 tok_num = tok_integral ITinteger
1038 tok_primint :: (Integer -> Integer)
1039             -> Int -> Int
1040             -> (Integer, (Char->Int)) -> Action
1041 tok_primint = tok_integral ITprimint
1042 tok_primword :: Int -> Int
1043              -> (Integer, (Char->Int)) -> Action
1044 tok_primword = tok_integral ITprimword positive
1045 positive, negative :: (Integer -> Integer)
1046 positive = id
1047 negative = negate
1048 decimal, octal, hexadecimal :: (Integer, Char -> Int)
1049 decimal = (10,octDecDigit)
1050 octal = (8,octDecDigit)
1051 hexadecimal = (16,hexDigit)
1052
1053 -- readRational can understand negative rationals, exponents, everything.
1054 tok_float, tok_primfloat, tok_primdouble :: String -> Token
1055 tok_float        str = ITrational   $! readRational str
1056 tok_primfloat    str = ITprimfloat  $! readRational str
1057 tok_primdouble   str = ITprimdouble $! readRational str
1058
1059 -- -----------------------------------------------------------------------------
1060 -- Layout processing
1061
1062 -- we're at the first token on a line, insert layout tokens if necessary
1063 do_bol :: Action
1064 do_bol span _str _len = do
1065         pos <- getOffside
1066         case pos of
1067             LT -> do
1068                 --trace "layout: inserting '}'" $ do
1069                 popContext
1070                 -- do NOT pop the lex state, we might have a ';' to insert
1071                 return (L span ITvccurly)
1072             EQ -> do
1073                 --trace "layout: inserting ';'" $ do
1074                 _ <- popLexState
1075                 return (L span ITsemi)
1076             GT -> do
1077                 _ <- popLexState
1078                 lexToken
1079
1080 -- certain keywords put us in the "layout" state, where we might
1081 -- add an opening curly brace.
1082 maybe_layout :: Token -> P ()
1083 maybe_layout t = do -- If the alternative layout rule is enabled then
1084                     -- we never create an implicit layout context here.
1085                     -- Layout is handled XXX instead.
1086                     -- The code for closing implicit contexts, or
1087                     -- inserting implicit semi-colons, is therefore
1088                     -- irrelevant as it only applies in an implicit
1089                     -- context.
1090                     alr <- extension alternativeLayoutRule
1091                     unless alr $ f t
1092     where f ITdo    = pushLexState layout_do
1093           f ITmdo   = pushLexState layout_do
1094           f ITof    = pushLexState layout
1095           f ITlet   = pushLexState layout
1096           f ITwhere = pushLexState layout
1097           f ITrec   = pushLexState layout
1098           f _       = return ()
1099
1100 -- Pushing a new implicit layout context.  If the indentation of the
1101 -- next token is not greater than the previous layout context, then
1102 -- Haskell 98 says that the new layout context should be empty; that is
1103 -- the lexer must generate {}.
1104 --
1105 -- We are slightly more lenient than this: when the new context is started
1106 -- by a 'do', then we allow the new context to be at the same indentation as
1107 -- the previous context.  This is what the 'strict' argument is for.
1108 --
1109 new_layout_context :: Bool -> Action
1110 new_layout_context strict span _buf _len = do
1111     _ <- popLexState
1112     (AI _ offset _) <- getInput
1113     ctx <- getContext
1114     case ctx of
1115         Layout prev_off : _  |
1116            (strict     && prev_off >= offset  ||
1117             not strict && prev_off > offset) -> do
1118                 -- token is indented to the left of the previous context.
1119                 -- we must generate a {} sequence now.
1120                 pushLexState layout_left
1121                 return (L span ITvocurly)
1122         _ -> do
1123                 setContext (Layout offset : ctx)
1124                 return (L span ITvocurly)
1125
1126 do_layout_left :: Action
1127 do_layout_left span _buf _len = do
1128     _ <- popLexState
1129     pushLexState bol  -- we must be at the start of a line
1130     return (L span ITvccurly)
1131
1132 -- -----------------------------------------------------------------------------
1133 -- LINE pragmas
1134
1135 setLine :: Int -> Action
1136 setLine code span buf len = do
1137   let line = parseUnsignedInteger buf len 10 octDecDigit
1138   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 1)
1139         -- subtract one: the line number refers to the *following* line
1140   _ <- popLexState
1141   pushLexState code
1142   lexToken
1143
1144 setFile :: Int -> Action
1145 setFile code span buf len = do
1146   let file = lexemeToFastString (stepOn buf) (len-2)
1147   setAlrLastLoc noSrcSpan
1148   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1149   _ <- popLexState
1150   pushLexState code
1151   lexToken
1152
1153
1154 -- -----------------------------------------------------------------------------
1155 -- Options, includes and language pragmas.
1156
1157 lex_string_prag :: (String -> Token) -> Action
1158 lex_string_prag mkTok span _buf _len
1159     = do input <- getInput
1160          start <- getSrcLoc
1161          tok <- go [] input
1162          end <- getSrcLoc
1163          return (L (mkSrcSpan start end) tok)
1164     where go acc input
1165               = if isString input "#-}"
1166                    then do setInput input
1167                            return (mkTok (reverse acc))
1168                    else case alexGetChar input of
1169                           Just (c,i) -> go (c:acc) i
1170                           Nothing -> err input
1171           isString _ [] = True
1172           isString i (x:xs)
1173               = case alexGetChar i of
1174                   Just (c,i') | c == x    -> isString i' xs
1175                   _other -> False
1176           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1177
1178
1179 -- -----------------------------------------------------------------------------
1180 -- Strings & Chars
1181
1182 -- This stuff is horrible.  I hates it.
1183
1184 lex_string_tok :: Action
1185 lex_string_tok span _buf _len = do
1186   tok <- lex_string ""
1187   end <- getSrcLoc
1188   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1189
1190 lex_string :: String -> P Token
1191 lex_string s = do
1192   i <- getInput
1193   case alexGetChar' i of
1194     Nothing -> lit_error
1195
1196     Just ('"',i)  -> do
1197         setInput i
1198         magicHash <- extension magicHashEnabled
1199         if magicHash
1200           then do
1201             i <- getInput
1202             case alexGetChar' i of
1203               Just ('#',i) -> do
1204                    setInput i
1205                    if any (> '\xFF') s
1206                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1207                     else let s' = mkZFastString (reverse s) in
1208                          return (ITprimstring s')
1209                         -- mkZFastString is a hack to avoid encoding the
1210                         -- string in UTF-8.  We just want the exact bytes.
1211               _other ->
1212                 return (ITstring (mkFastString (reverse s)))
1213           else
1214                 return (ITstring (mkFastString (reverse s)))
1215
1216     Just ('\\',i)
1217         | Just ('&',i) <- next -> do
1218                 setInput i; lex_string s
1219         | Just (c,i) <- next, is_space c -> do
1220                 setInput i; lex_stringgap s
1221         where next = alexGetChar' i
1222
1223     Just (c, i) -> do
1224         c' <- lex_char c i
1225         lex_string (c':s)
1226
1227 lex_stringgap :: String -> P Token
1228 lex_stringgap s = do
1229   c <- getCharOrFail
1230   case c of
1231     '\\' -> lex_string s
1232     c | is_space c -> lex_stringgap s
1233     _other -> lit_error
1234
1235
1236 lex_char_tok :: Action
1237 -- Here we are basically parsing character literals, such as 'x' or '\n'
1238 -- but, when Template Haskell is on, we additionally spot
1239 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1240 -- but WITHOUT CONSUMING the x or T part  (the parser does that).
1241 -- So we have to do two characters of lookahead: when we see 'x we need to
1242 -- see if there's a trailing quote
1243 lex_char_tok span _buf _len = do        -- We've seen '
1244    i1 <- getInput       -- Look ahead to first character
1245    let loc = srcSpanStart span
1246    case alexGetChar' i1 of
1247         Nothing -> lit_error
1248
1249         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
1250                   th_exts <- extension thEnabled
1251                   if th_exts then do
1252                         setInput i2
1253                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1254                    else lit_error
1255
1256         Just ('\\', i2@(AI _end2 _ _)) -> do    -- We've seen 'backslash
1257                   setInput i2
1258                   lit_ch <- lex_escape
1259                   mc <- getCharOrFail   -- Trailing quote
1260                   if mc == '\'' then finish_char_tok loc lit_ch
1261                                 else do setInput i2; lit_error
1262
1263         Just (c, i2@(AI _end2 _ _))
1264                 | not (isAny c) -> lit_error
1265                 | otherwise ->
1266
1267                 -- We've seen 'x, where x is a valid character
1268                 --  (i.e. not newline etc) but not a quote or backslash
1269            case alexGetChar' i2 of      -- Look ahead one more character
1270                 Just ('\'', i3) -> do   -- We've seen 'x'
1271                         setInput i3
1272                         finish_char_tok loc c
1273                 _other -> do            -- We've seen 'x not followed by quote
1274                                         -- (including the possibility of EOF)
1275                                         -- If TH is on, just parse the quote only
1276                         th_exts <- extension thEnabled
1277                         let (AI end _ _) = i1
1278                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1279                                    else do setInput i2; lit_error
1280
1281 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1282 finish_char_tok loc ch  -- We've already seen the closing quote
1283                         -- Just need to check for trailing #
1284   = do  magicHash <- extension magicHashEnabled
1285         i@(AI end _ _) <- getInput
1286         if magicHash then do
1287                 case alexGetChar' i of
1288                         Just ('#',i@(AI end _ _)) -> do
1289                                 setInput i
1290                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1291                         _other ->
1292                                 return (L (mkSrcSpan loc end) (ITchar ch))
1293             else do
1294                    return (L (mkSrcSpan loc end) (ITchar ch))
1295
1296 lex_char :: Char -> AlexInput -> P Char
1297 lex_char c inp = do
1298   case c of
1299       '\\' -> do setInput inp; lex_escape
1300       c | isAny c -> do setInput inp; return c
1301       _other -> lit_error
1302
1303 isAny :: Char -> Bool
1304 isAny c | c > '\x7f' = isPrint c
1305         | otherwise  = is_any c
1306
1307 lex_escape :: P Char
1308 lex_escape = do
1309   c <- getCharOrFail
1310   case c of
1311         'a'   -> return '\a'
1312         'b'   -> return '\b'
1313         'f'   -> return '\f'
1314         'n'   -> return '\n'
1315         'r'   -> return '\r'
1316         't'   -> return '\t'
1317         'v'   -> return '\v'
1318         '\\'  -> return '\\'
1319         '"'   -> return '\"'
1320         '\''  -> return '\''
1321         '^'   -> do c <- getCharOrFail
1322                     if c >= '@' && c <= '_'
1323                         then return (chr (ord c - ord '@'))
1324                         else lit_error
1325
1326         'x'   -> readNum is_hexdigit 16 hexDigit
1327         'o'   -> readNum is_octdigit  8 octDecDigit
1328         x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1329
1330         c1 ->  do
1331            i <- getInput
1332            case alexGetChar' i of
1333             Nothing -> lit_error
1334             Just (c2,i2) ->
1335               case alexGetChar' i2 of
1336                 Nothing -> do setInput i2; lit_error
1337                 Just (c3,i3) ->
1338                    let str = [c1,c2,c3] in
1339                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1340                                      Just rest <- [stripPrefix p str] ] of
1341                           (escape_char,[]):_ -> do
1342                                 setInput i3
1343                                 return escape_char
1344                           (escape_char,_:_):_ -> do
1345                                 setInput i2
1346                                 return escape_char
1347                           [] -> lit_error
1348
1349 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1350 readNum is_digit base conv = do
1351   i <- getInput
1352   c <- getCharOrFail
1353   if is_digit c
1354         then readNum2 is_digit base conv (conv c)
1355         else do setInput i; lit_error
1356
1357 readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char
1358 readNum2 is_digit base conv i = do
1359   input <- getInput
1360   read i input
1361   where read i input = do
1362           case alexGetChar' input of
1363             Just (c,input') | is_digit c -> do
1364                 read (i*base + conv c) input'
1365             _other -> do
1366                 if i >= 0 && i <= 0x10FFFF
1367                    then do setInput input; return (chr i)
1368                    else lit_error
1369
1370 silly_escape_chars :: [(String, Char)]
1371 silly_escape_chars = [
1372         ("NUL", '\NUL'),
1373         ("SOH", '\SOH'),
1374         ("STX", '\STX'),
1375         ("ETX", '\ETX'),
1376         ("EOT", '\EOT'),
1377         ("ENQ", '\ENQ'),
1378         ("ACK", '\ACK'),
1379         ("BEL", '\BEL'),
1380         ("BS", '\BS'),
1381         ("HT", '\HT'),
1382         ("LF", '\LF'),
1383         ("VT", '\VT'),
1384         ("FF", '\FF'),
1385         ("CR", '\CR'),
1386         ("SO", '\SO'),
1387         ("SI", '\SI'),
1388         ("DLE", '\DLE'),
1389         ("DC1", '\DC1'),
1390         ("DC2", '\DC2'),
1391         ("DC3", '\DC3'),
1392         ("DC4", '\DC4'),
1393         ("NAK", '\NAK'),
1394         ("SYN", '\SYN'),
1395         ("ETB", '\ETB'),
1396         ("CAN", '\CAN'),
1397         ("EM", '\EM'),
1398         ("SUB", '\SUB'),
1399         ("ESC", '\ESC'),
1400         ("FS", '\FS'),
1401         ("GS", '\GS'),
1402         ("RS", '\RS'),
1403         ("US", '\US'),
1404         ("SP", '\SP'),
1405         ("DEL", '\DEL')
1406         ]
1407
1408 -- before calling lit_error, ensure that the current input is pointing to
1409 -- the position of the error in the buffer.  This is so that we can report
1410 -- a correct location to the user, but also so we can detect UTF-8 decoding
1411 -- errors if they occur.
1412 lit_error :: P a
1413 lit_error = lexError "lexical error in string/character literal"
1414
1415 getCharOrFail :: P Char
1416 getCharOrFail =  do
1417   i <- getInput
1418   case alexGetChar' i of
1419         Nothing -> lexError "unexpected end-of-file in string/character literal"
1420         Just (c,i)  -> do setInput i; return c
1421
1422 -- -----------------------------------------------------------------------------
1423 -- QuasiQuote
1424
1425 lex_quasiquote_tok :: Action
1426 lex_quasiquote_tok span buf len = do
1427   let quoter = reverse $ takeWhile (/= '$')
1428                $ reverse $ lexemeToString buf (len - 1)
1429   quoteStart <- getSrcLoc
1430   quote <- lex_quasiquote ""
1431   end <- getSrcLoc
1432   return (L (mkSrcSpan (srcSpanStart span) end)
1433            (ITquasiQuote (mkFastString quoter,
1434                           mkFastString (reverse quote),
1435                           mkSrcSpan quoteStart end)))
1436
1437 lex_quasiquote :: String -> P String
1438 lex_quasiquote s = do
1439   i <- getInput
1440   case alexGetChar' i of
1441     Nothing -> lit_error
1442
1443     Just ('\\',i)
1444         | Just ('|',i) <- next -> do
1445                 setInput i; lex_quasiquote ('|' : s)
1446         | Just (']',i) <- next -> do
1447                 setInput i; lex_quasiquote (']' : s)
1448         where next = alexGetChar' i
1449
1450     Just ('|',i)
1451         | Just (']',i) <- next -> do
1452                 setInput i; return s
1453         where next = alexGetChar' i
1454
1455     Just (c, i) -> do
1456          setInput i; lex_quasiquote (c : s)
1457
1458 -- -----------------------------------------------------------------------------
1459 -- Warnings
1460
1461 warn :: DynFlag -> SDoc -> Action
1462 warn option warning srcspan _buf _len = do
1463     addWarning option srcspan warning
1464     lexToken
1465
1466 warnThen :: DynFlag -> SDoc -> Action -> Action
1467 warnThen option warning action srcspan buf len = do
1468     addWarning option srcspan warning
1469     action srcspan buf len
1470
1471 -- -----------------------------------------------------------------------------
1472 -- The Parse Monad
1473
1474 data LayoutContext
1475   = NoLayout
1476   | Layout !Int
1477   deriving Show
1478
1479 data ParseResult a
1480   = POk PState a
1481   | PFailed
1482         SrcSpan         -- The start and end of the text span related to
1483                         -- the error.  Might be used in environments which can
1484                         -- show this span, e.g. by highlighting it.
1485         Message         -- The error message
1486
1487 data PState = PState {
1488         buffer     :: StringBuffer,
1489         dflags     :: DynFlags,
1490         messages   :: Messages,
1491         last_loc   :: SrcSpan,  -- pos of previous token
1492         last_offs  :: !Int,     -- offset of the previous token from the
1493                                 -- beginning of  the current line.
1494                                 -- \t is equal to 8 spaces.
1495         last_len   :: !Int,     -- len of previous token
1496         last_line_len :: !Int,
1497         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1498         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1499         context    :: [LayoutContext],
1500         lex_state  :: [Int],
1501         -- Used in the alternative layout rule:
1502         -- These tokens are the next ones to be sent out. They are
1503         -- just blindly emitted, without the rule looking at them again:
1504         alr_pending_implicit_tokens :: [Located Token],
1505         -- This is the next token to be considered or, if it is Nothing,
1506         -- we need to get the next token from the input stream:
1507         alr_next_token :: Maybe (Located Token),
1508         -- This is what we consider to be the locatino of the last token
1509         -- emitted:
1510         alr_last_loc :: SrcSpan,
1511         -- The stack of layout contexts:
1512         alr_context :: [ALRContext],
1513         -- Are we expecting a '{'? If it's Just, then the ALRLayout tells
1514         -- us what sort of layout the '{' will open:
1515         alr_expecting_ocurly :: Maybe ALRLayout
1516      }
1517         -- last_loc and last_len are used when generating error messages,
1518         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1519         -- current token to happyError, we could at least get rid of last_len.
1520         -- Getting rid of last_loc would require finding another way to
1521         -- implement pushCurrentContext (which is only called from one place).
1522
1523 data ALRContext = ALRNoLayout Bool{- does it contain commas? -}
1524                 | ALRLayout ALRLayout Int
1525 data ALRLayout = ALRLayoutLet
1526                | ALRLayoutWhere
1527                | ALRLayoutOf
1528                | ALRLayoutDo
1529
1530 newtype P a = P { unP :: PState -> ParseResult a }
1531
1532 instance Monad P where
1533   return = returnP
1534   (>>=) = thenP
1535   fail = failP
1536
1537 returnP :: a -> P a
1538 returnP a = a `seq` (P $ \s -> POk s a)
1539
1540 thenP :: P a -> (a -> P b) -> P b
1541 (P m) `thenP` k = P $ \ s ->
1542         case m s of
1543                 POk s1 a         -> (unP (k a)) s1
1544                 PFailed span err -> PFailed span err
1545
1546 failP :: String -> P a
1547 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1548
1549 failMsgP :: String -> P a
1550 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1551
1552 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1553 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (mkSrcSpan loc1 loc2) (text str)
1554
1555 failSpanMsgP :: SrcSpan -> SDoc -> P a
1556 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1557
1558 getPState :: P PState
1559 getPState = P $ \s -> POk s s
1560
1561 getDynFlags :: P DynFlags
1562 getDynFlags = P $ \s -> POk s (dflags s)
1563
1564 withThisPackage :: (PackageId -> a) -> P a
1565 withThisPackage f
1566  = do   pkg     <- liftM thisPackage getDynFlags
1567         return  $ f pkg
1568
1569 extension :: (Int -> Bool) -> P Bool
1570 extension p = P $ \s -> POk s (p $! extsBitmap s)
1571
1572 getExts :: P Int
1573 getExts = P $ \s -> POk s (extsBitmap s)
1574
1575 setExts :: (Int -> Int) -> P ()
1576 setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
1577
1578 setSrcLoc :: SrcLoc -> P ()
1579 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1580
1581 getSrcLoc :: P SrcLoc
1582 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1583
1584 setLastToken :: SrcSpan -> Int -> Int -> P ()
1585 setLastToken loc len line_len = P $ \s -> POk s {
1586   last_loc=loc,
1587   last_len=len,
1588   last_line_len=line_len
1589 } ()
1590
1591 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1592
1593 alexInputPrevChar :: AlexInput -> Char
1594 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1595
1596 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1597 alexGetChar (AI loc ofs s)
1598   | atEnd s   = Nothing
1599   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1600                 --trace (show (ord c)) $
1601                 Just (adj_c, (AI loc' ofs' s'))
1602   where (c,s') = nextChar s
1603         loc'   = advanceSrcLoc loc c
1604         ofs'   = advanceOffs c ofs
1605
1606         non_graphic     = '\x0'
1607         upper           = '\x1'
1608         lower           = '\x2'
1609         digit           = '\x3'
1610         symbol          = '\x4'
1611         space           = '\x5'
1612         other_graphic   = '\x6'
1613
1614         adj_c
1615           | c <= '\x06' = non_graphic
1616           | c <= '\x7f' = c
1617           -- Alex doesn't handle Unicode, so when Unicode
1618           -- character is encountered we output these values
1619           -- with the actual character value hidden in the state.
1620           | otherwise =
1621                 case generalCategory c of
1622                   UppercaseLetter       -> upper
1623                   LowercaseLetter       -> lower
1624                   TitlecaseLetter       -> upper
1625                   ModifierLetter        -> other_graphic
1626                   OtherLetter           -> lower -- see #1103
1627                   NonSpacingMark        -> other_graphic
1628                   SpacingCombiningMark  -> other_graphic
1629                   EnclosingMark         -> other_graphic
1630                   DecimalNumber         -> digit
1631                   LetterNumber          -> other_graphic
1632                   OtherNumber           -> other_graphic
1633                   ConnectorPunctuation  -> symbol
1634                   DashPunctuation       -> symbol
1635                   OpenPunctuation       -> other_graphic
1636                   ClosePunctuation      -> other_graphic
1637                   InitialQuote          -> other_graphic
1638                   FinalQuote            -> other_graphic
1639                   OtherPunctuation      -> symbol
1640                   MathSymbol            -> symbol
1641                   CurrencySymbol        -> symbol
1642                   ModifierSymbol        -> symbol
1643                   OtherSymbol           -> symbol
1644                   Space                 -> space
1645                   _other                -> non_graphic
1646
1647 -- This version does not squash unicode characters, it is used when
1648 -- lexing strings.
1649 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1650 alexGetChar' (AI loc ofs s)
1651   | atEnd s   = Nothing
1652   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1653                 --trace (show (ord c)) $
1654                 Just (c, (AI loc' ofs' s'))
1655   where (c,s') = nextChar s
1656         loc'   = advanceSrcLoc loc c
1657         ofs'   = advanceOffs c ofs
1658
1659 advanceOffs :: Char -> Int -> Int
1660 advanceOffs '\n' _    = 0
1661 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1662 advanceOffs _    offs = offs + 1
1663
1664 getInput :: P AlexInput
1665 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1666
1667 setInput :: AlexInput -> P ()
1668 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1669
1670 pushLexState :: Int -> P ()
1671 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1672
1673 popLexState :: P Int
1674 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1675
1676 getLexState :: P Int
1677 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1678
1679 popNextToken :: P (Maybe (Located Token))
1680 popNextToken
1681     = P $ \s@PState{ alr_next_token = m } ->
1682               POk (s {alr_next_token = Nothing}) m
1683
1684 setAlrLastLoc :: SrcSpan -> P ()
1685 setAlrLastLoc l = P $ \s -> POk (s {alr_last_loc = l}) ()
1686
1687 getAlrLastLoc :: P SrcSpan
1688 getAlrLastLoc = P $ \s@(PState {alr_last_loc = l}) -> POk s l
1689
1690 getALRContext :: P [ALRContext]
1691 getALRContext = P $ \s@(PState {alr_context = cs}) -> POk s cs
1692
1693 setALRContext :: [ALRContext] -> P ()
1694 setALRContext cs = P $ \s -> POk (s {alr_context = cs}) ()
1695
1696 setNextToken :: Located Token -> P ()
1697 setNextToken t = P $ \s -> POk (s {alr_next_token = Just t}) ()
1698
1699 popPendingImplicitToken :: P (Maybe (Located Token))
1700 popPendingImplicitToken
1701     = P $ \s@PState{ alr_pending_implicit_tokens = ts } ->
1702               case ts of
1703               [] -> POk s Nothing
1704               (t : ts') -> POk (s {alr_pending_implicit_tokens = ts'}) (Just t)
1705
1706 setPendingImplicitTokens :: [Located Token] -> P ()
1707 setPendingImplicitTokens ts = P $ \s -> POk (s {alr_pending_implicit_tokens = ts}) ()
1708
1709 getAlrExpectingOCurly :: P (Maybe ALRLayout)
1710 getAlrExpectingOCurly = P $ \s@(PState {alr_expecting_ocurly = b}) -> POk s b
1711
1712 setAlrExpectingOCurly :: Maybe ALRLayout -> P ()
1713 setAlrExpectingOCurly b = P $ \s -> POk (s {alr_expecting_ocurly = b}) ()
1714
1715 -- for reasons of efficiency, flags indicating language extensions (eg,
1716 -- -fglasgow-exts or -XParr) are represented by a bitmap stored in an unboxed
1717 -- integer
1718
1719 genericsBit :: Int
1720 genericsBit = 0 -- {| and |}
1721 ffiBit :: Int
1722 ffiBit     = 1
1723 parrBit :: Int
1724 parrBit    = 2
1725 arrowsBit :: Int
1726 arrowsBit  = 4
1727 thBit :: Int
1728 thBit      = 5
1729 ipBit :: Int
1730 ipBit      = 6
1731 explicitForallBit :: Int
1732 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1733 bangPatBit :: Int
1734 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1735                 -- (doesn't affect the lexer)
1736 tyFamBit :: Int
1737 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1738 haddockBit :: Int
1739 haddockBit = 10 -- Lex and parse Haddock comments
1740 magicHashBit :: Int
1741 magicHashBit = 11 -- "#" in both functions and operators
1742 kindSigsBit :: Int
1743 kindSigsBit = 12 -- Kind signatures on type variables
1744 recursiveDoBit :: Int
1745 recursiveDoBit = 13 -- mdo
1746 unicodeSyntaxBit :: Int
1747 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1748 unboxedTuplesBit :: Int
1749 unboxedTuplesBit = 15 -- (# and #)
1750 standaloneDerivingBit :: Int
1751 standaloneDerivingBit = 16 -- standalone instance deriving declarations
1752 transformComprehensionsBit :: Int
1753 transformComprehensionsBit = 17
1754 qqBit :: Int
1755 qqBit      = 18 -- enable quasiquoting
1756 inRulePragBit :: Int
1757 inRulePragBit = 19
1758 rawTokenStreamBit :: Int
1759 rawTokenStreamBit = 20 -- producing a token stream with all comments included
1760 newQualOpsBit :: Int
1761 newQualOpsBit = 21 -- Haskell' qualified operator syntax, e.g. Prelude.(+)
1762 recBit :: Int
1763 recBit = 22 -- rec
1764 alternativeLayoutRuleBit :: Int
1765 alternativeLayoutRuleBit = 23
1766
1767 always :: Int -> Bool
1768 always           _     = True
1769 genericsEnabled :: Int -> Bool
1770 genericsEnabled  flags = testBit flags genericsBit
1771 parrEnabled :: Int -> Bool
1772 parrEnabled      flags = testBit flags parrBit
1773 arrowsEnabled :: Int -> Bool
1774 arrowsEnabled    flags = testBit flags arrowsBit
1775 thEnabled :: Int -> Bool
1776 thEnabled        flags = testBit flags thBit
1777 ipEnabled :: Int -> Bool
1778 ipEnabled        flags = testBit flags ipBit
1779 explicitForallEnabled :: Int -> Bool
1780 explicitForallEnabled flags = testBit flags explicitForallBit
1781 bangPatEnabled :: Int -> Bool
1782 bangPatEnabled   flags = testBit flags bangPatBit
1783 -- tyFamEnabled :: Int -> Bool
1784 -- tyFamEnabled     flags = testBit flags tyFamBit
1785 haddockEnabled :: Int -> Bool
1786 haddockEnabled   flags = testBit flags haddockBit
1787 magicHashEnabled :: Int -> Bool
1788 magicHashEnabled flags = testBit flags magicHashBit
1789 -- kindSigsEnabled :: Int -> Bool
1790 -- kindSigsEnabled  flags = testBit flags kindSigsBit
1791 unicodeSyntaxEnabled :: Int -> Bool
1792 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1793 unboxedTuplesEnabled :: Int -> Bool
1794 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1795 standaloneDerivingEnabled :: Int -> Bool
1796 standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
1797 qqEnabled :: Int -> Bool
1798 qqEnabled        flags = testBit flags qqBit
1799 -- inRulePrag :: Int -> Bool
1800 -- inRulePrag       flags = testBit flags inRulePragBit
1801 rawTokenStreamEnabled :: Int -> Bool
1802 rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
1803 newQualOps :: Int -> Bool
1804 newQualOps       flags = testBit flags newQualOpsBit
1805 oldQualOps :: Int -> Bool
1806 oldQualOps flags = not (newQualOps flags)
1807 alternativeLayoutRule :: Int -> Bool
1808 alternativeLayoutRule flags = testBit flags alternativeLayoutRuleBit
1809
1810 -- PState for parsing options pragmas
1811 --
1812 pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1813 pragState dynflags buf loc =
1814   PState {
1815       buffer        = buf,
1816       messages      = emptyMessages,
1817       dflags        = dynflags,
1818       last_loc      = mkSrcSpan loc loc,
1819       last_offs     = 0,
1820       last_len      = 0,
1821       last_line_len = 0,
1822       loc           = loc,
1823       extsBitmap    = 0,
1824       context       = [],
1825       lex_state     = [bol, option_prags, 0],
1826       alr_pending_implicit_tokens = [],
1827       alr_next_token = Nothing,
1828       alr_last_loc = noSrcSpan,
1829       alr_context = [],
1830       alr_expecting_ocurly = Nothing
1831     }
1832
1833
1834 -- create a parse state
1835 --
1836 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1837 mkPState buf loc flags  =
1838   PState {
1839       buffer          = buf,
1840       dflags        = flags,
1841       messages      = emptyMessages,
1842       last_loc      = mkSrcSpan loc loc,
1843       last_offs     = 0,
1844       last_len      = 0,
1845       last_line_len = 0,
1846       loc           = loc,
1847       extsBitmap    = fromIntegral bitmap,
1848       context       = [],
1849       lex_state     = [bol, 0],
1850         -- we begin in the layout state if toplev_layout is set
1851       alr_pending_implicit_tokens = [],
1852       alr_next_token = Nothing,
1853       alr_last_loc = noSrcSpan,
1854       alr_context = [],
1855       alr_expecting_ocurly = Nothing
1856     }
1857     where
1858       bitmap = genericsBit `setBitIf` dopt Opt_Generics flags
1859                .|. ffiBit            `setBitIf` dopt Opt_ForeignFunctionInterface flags
1860                .|. parrBit           `setBitIf` dopt Opt_PArr         flags
1861                .|. arrowsBit         `setBitIf` dopt Opt_Arrows       flags
1862                .|. thBit             `setBitIf` dopt Opt_TemplateHaskell flags
1863                .|. qqBit             `setBitIf` dopt Opt_QuasiQuotes flags
1864                .|. ipBit             `setBitIf` dopt Opt_ImplicitParams flags
1865                .|. explicitForallBit `setBitIf` dopt Opt_ExplicitForAll flags
1866                .|. bangPatBit        `setBitIf` dopt Opt_BangPatterns flags
1867                .|. tyFamBit          `setBitIf` dopt Opt_TypeFamilies flags
1868                .|. haddockBit        `setBitIf` dopt Opt_Haddock      flags
1869                .|. magicHashBit      `setBitIf` dopt Opt_MagicHash    flags
1870                .|. kindSigsBit       `setBitIf` dopt Opt_KindSignatures flags
1871                .|. recursiveDoBit    `setBitIf` dopt Opt_RecursiveDo flags
1872                .|. recBit            `setBitIf` dopt Opt_DoRec  flags
1873                .|. recBit            `setBitIf` dopt Opt_Arrows flags
1874                .|. unicodeSyntaxBit  `setBitIf` dopt Opt_UnicodeSyntax flags
1875                .|. unboxedTuplesBit  `setBitIf` dopt Opt_UnboxedTuples flags
1876                .|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags
1877                .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags
1878                .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
1879                .|. newQualOpsBit `setBitIf` dopt Opt_NewQualifiedOperators flags
1880                .|. alternativeLayoutRuleBit `setBitIf` dopt Opt_AlternativeLayoutRule flags
1881       --
1882       setBitIf :: Int -> Bool -> Int
1883       b `setBitIf` cond | cond      = bit b
1884                         | otherwise = 0
1885
1886 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1887 addWarning option srcspan warning
1888  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1889        let warning' = mkWarnMsg srcspan alwaysQualify warning
1890            ws' = if dopt option d then ws `snocBag` warning' else ws
1891        in POk s{messages=(ws', es)} ()
1892
1893 getMessages :: PState -> Messages
1894 getMessages PState{messages=ms} = ms
1895
1896 getContext :: P [LayoutContext]
1897 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1898
1899 setContext :: [LayoutContext] -> P ()
1900 setContext ctx = P $ \s -> POk s{context=ctx} ()
1901
1902 popContext :: P ()
1903 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1904                               last_len = len, last_loc = last_loc }) ->
1905   case ctx of
1906         (_:tl) -> POk s{ context = tl } ()
1907         []     -> PFailed last_loc (srcParseErr buf len)
1908
1909 -- Push a new layout context at the indentation of the last token read.
1910 -- This is only used at the outer level of a module when the 'module'
1911 -- keyword is missing.
1912 pushCurrentContext :: P ()
1913 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1914     POk s{context = Layout (offs-len) : ctx} ()
1915 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1916
1917 getOffside :: P Ordering
1918 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1919                 let ord = case stk of
1920                         (Layout n:_) -> compare offs n
1921                         _            -> GT
1922                 in POk s ord
1923
1924 -- ---------------------------------------------------------------------------
1925 -- Construct a parse error
1926
1927 srcParseErr
1928   :: StringBuffer       -- current buffer (placed just after the last token)
1929   -> Int                -- length of the previous token
1930   -> Message
1931 srcParseErr buf len
1932   = hcat [ if null token
1933              then ptext (sLit "parse error (possibly incorrect indentation)")
1934              else hcat [ptext (sLit "parse error on input "),
1935                         char '`', text token, char '\'']
1936     ]
1937   where token = lexemeToString (offsetBytes (-len) buf) len
1938
1939 -- Report a parse failure, giving the span of the previous token as
1940 -- the location of the error.  This is the entry point for errors
1941 -- detected during parsing.
1942 srcParseFail :: P a
1943 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1944                             last_loc = last_loc } ->
1945     PFailed last_loc (srcParseErr buf len)
1946
1947 -- A lexical error is reported at a particular position in the source file,
1948 -- not over a token range.
1949 lexError :: String -> P a
1950 lexError str = do
1951   loc <- getSrcLoc
1952   (AI end _ buf) <- getInput
1953   reportLexError loc end buf str
1954
1955 -- -----------------------------------------------------------------------------
1956 -- This is the top-level function: called from the parser each time a
1957 -- new token is to be read from the input.
1958
1959 lexer :: (Located Token -> P a) -> P a
1960 lexer cont = do
1961   alr <- extension alternativeLayoutRule
1962   let lexTokenFun = if alr then lexTokenAlr else lexToken
1963   tok@(L _span _tok__) <- lexTokenFun
1964 --  trace ("token: " ++ show tok__) $ do
1965   cont tok
1966
1967 lexTokenAlr :: P (Located Token)
1968 lexTokenAlr = do mPending <- popPendingImplicitToken
1969                  t <- case mPending of
1970                       Nothing ->
1971                           do mNext <- popNextToken
1972                              t <- case mNext of
1973                                   Nothing -> lexToken
1974                                   Just next -> return next
1975                              alternativeLayoutRuleToken t
1976                       Just t ->
1977                           return t
1978                  setAlrLastLoc (getLoc t)
1979                  case unLoc t of
1980                      ITwhere -> setAlrExpectingOCurly (Just ALRLayoutWhere)
1981                      ITlet   -> setAlrExpectingOCurly (Just ALRLayoutLet)
1982                      ITof    -> setAlrExpectingOCurly (Just ALRLayoutOf)
1983                      ITdo    -> setAlrExpectingOCurly (Just ALRLayoutDo)
1984                      _       -> return ()
1985                  return t
1986
1987 alternativeLayoutRuleToken :: Located Token -> P (Located Token)
1988 alternativeLayoutRuleToken t
1989     = do context <- getALRContext
1990          lastLoc <- getAlrLastLoc
1991          mExpectingOCurly <- getAlrExpectingOCurly
1992          let thisLoc = getLoc t
1993              thisCol = srcSpanStartCol thisLoc
1994              newLine = (lastLoc == noSrcSpan)
1995                     || (srcSpanStartLine thisLoc > srcSpanEndLine lastLoc)
1996          case (unLoc t, context, mExpectingOCurly) of
1997              -- This case handles a GHC extension to the original H98
1998              -- layout rule...
1999              (ITocurly, _, Just _) ->
2000                  do setAlrExpectingOCurly Nothing
2001                     setALRContext (ALRNoLayout (containsCommas ITocurly) : context)
2002                     return t
2003              -- ...and makes this case unnecessary
2004              {-
2005              -- I think our implicit open-curly handling is slightly
2006              -- different to John's, in how it interacts with newlines
2007              -- and "in"
2008              (ITocurly, _, Just _) ->
2009                  do setAlrExpectingOCurly Nothing
2010                     setNextToken t
2011                     lexTokenAlr
2012              -}
2013              (_, ALRLayout _ col : ls, Just expectingOCurly)
2014               | (thisCol > col) ||
2015                 (thisCol == col &&
2016                  isNonDecreasingIntentation expectingOCurly) ->
2017                  do setAlrExpectingOCurly Nothing
2018                     setALRContext (ALRLayout expectingOCurly thisCol : context)
2019                     setNextToken t
2020                     return (L thisLoc ITocurly)
2021               | otherwise ->
2022                  do setAlrExpectingOCurly Nothing
2023                     setPendingImplicitTokens [L lastLoc ITccurly]
2024                     setNextToken t
2025                     return (L lastLoc ITocurly)
2026              (_, _, Just expectingOCurly) ->
2027                  do setAlrExpectingOCurly Nothing
2028                     setALRContext (ALRLayout expectingOCurly thisCol : context)
2029                     setNextToken t
2030                     return (L thisLoc ITocurly)
2031              -- We do the [] cases earlier than in the spec, as we
2032              -- have an actual EOF token
2033              (ITeof, ALRLayout _ _ : ls, _) ->
2034                  do setALRContext ls
2035                     setNextToken t
2036                     return (L thisLoc ITccurly)
2037              (ITeof, _, _) ->
2038                  return t
2039              -- the other ITeof case omitted; general case below covers it
2040              (ITin, ALRLayout ALRLayoutLet _ : ls, _)
2041               | newLine ->
2042                  do setPendingImplicitTokens [t]
2043                     setALRContext ls
2044                     return (L thisLoc ITccurly)
2045              (_, ALRLayout _ col : ls, _)
2046               | newLine && thisCol == col ->
2047                  do setNextToken t
2048                     return (L thisLoc ITsemi)
2049               | newLine && thisCol < col ->
2050                  do setALRContext ls
2051                     setNextToken t
2052                     -- Note that we use lastLoc, as we may need to close
2053                     -- more layouts, or give a semicolon
2054                     return (L lastLoc ITccurly)
2055              (u, _, _)
2056               | isALRopen u ->
2057                  do setALRContext (ALRNoLayout (containsCommas u) : context)
2058                     return t
2059              (u, _, _)
2060               | isALRclose u ->
2061                  case context of
2062                  ALRLayout _ _ : ls ->
2063                      do setALRContext ls
2064                         setNextToken t
2065                         return (L thisLoc ITccurly)
2066                  ALRNoLayout _ : ls ->
2067                      do setALRContext ls
2068                         return t
2069                  [] ->
2070                      -- XXX This is an error in John's code, but
2071                      -- it looks reachable to me at first glance
2072                      return t
2073              (ITin, ALRLayout ALRLayoutLet _ : ls, _) ->
2074                  do setALRContext ls
2075                     setPendingImplicitTokens [t]
2076                     return (L thisLoc ITccurly)
2077              (ITin, ALRLayout _ _ : ls, _) ->
2078                  do setALRContext ls
2079                     setNextToken t
2080                     return (L thisLoc ITccurly)
2081              -- the other ITin case omitted; general case below covers it
2082              (ITcomma, ALRLayout _ _ : ls, _)
2083               | topNoLayoutContainsCommas ls ->
2084                  do setALRContext ls
2085                     setNextToken t
2086                     return (L thisLoc ITccurly)
2087              (ITwhere, ALRLayout ALRLayoutDo _ : ls, _) ->
2088                  do setALRContext ls
2089                     setPendingImplicitTokens [t]
2090                     return (L thisLoc ITccurly)
2091              -- the other ITwhere case omitted; general case below covers it
2092              (_, _, _) -> return t
2093
2094 isALRopen :: Token -> Bool
2095 isALRopen ITcase   = True
2096 isALRopen ITif     = True
2097 isALRopen IToparen = True
2098 isALRopen ITobrack = True
2099 isALRopen ITocurly = True
2100 -- GHC Extensions:
2101 isALRopen IToubxparen = True
2102 isALRopen _        = False
2103
2104 isALRclose :: Token -> Bool
2105 isALRclose ITof     = True
2106 isALRclose ITthen   = True
2107 isALRclose ITcparen = True
2108 isALRclose ITcbrack = True
2109 isALRclose ITccurly = True
2110 -- GHC Extensions:
2111 isALRclose ITcubxparen = True
2112 isALRclose _        = False
2113
2114 isNonDecreasingIntentation :: ALRLayout -> Bool
2115 isNonDecreasingIntentation ALRLayoutDo = True
2116 isNonDecreasingIntentation _           = False
2117
2118 containsCommas :: Token -> Bool
2119 containsCommas IToparen = True
2120 containsCommas ITobrack = True
2121 -- John doesn't have {} as containing commas, but records contain them,
2122 -- which caused a problem parsing Cabal's Distribution.Simple.InstallDirs
2123 -- (defaultInstallDirs).
2124 containsCommas ITocurly = True
2125 -- GHC Extensions:
2126 containsCommas IToubxparen = True
2127 containsCommas _        = False
2128
2129 topNoLayoutContainsCommas :: [ALRContext] -> Bool
2130 topNoLayoutContainsCommas [] = False
2131 topNoLayoutContainsCommas (ALRLayout _ _ : ls) = topNoLayoutContainsCommas ls
2132 topNoLayoutContainsCommas (ALRNoLayout b : _) = b
2133
2134 lexToken :: P (Located Token)
2135 lexToken = do
2136   inp@(AI loc1 _ buf) <- getInput
2137   sc <- getLexState
2138   exts <- getExts
2139   case alexScanUser exts inp sc of
2140     AlexEOF -> do
2141         let span = mkSrcSpan loc1 loc1
2142         setLastToken span 0 0
2143         return (L span ITeof)
2144     AlexError (AI loc2 _ buf) ->
2145         reportLexError loc1 loc2 buf "lexical error"
2146     AlexSkip inp2 _ -> do
2147         setInput inp2
2148         lexToken
2149     AlexToken inp2@(AI end _ buf2) _ t -> do
2150         setInput inp2
2151         let span = mkSrcSpan loc1 end
2152         let bytes = byteDiff buf buf2
2153         span `seq` setLastToken span bytes bytes
2154         t span buf bytes
2155
2156 reportLexError :: SrcLoc -> SrcLoc -> StringBuffer -> [Char] -> P a
2157 reportLexError loc1 loc2 buf str
2158   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
2159   | otherwise =
2160   let
2161         c = fst (nextChar buf)
2162   in
2163   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
2164     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
2165     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
2166
2167 lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token]
2168 lexTokenStream buf loc dflags = unP go initState
2169     where initState = mkPState buf loc (dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream)
2170           go = do
2171             ltok <- lexer return
2172             case ltok of
2173               L _ ITeof -> return []
2174               _ -> liftM (ltok:) go
2175
2176 linePrags = Map.singleton "line" (begin line_prag2)
2177
2178 fileHeaderPrags = Map.fromList([("options", lex_string_prag IToptions_prag),
2179                                  ("options_ghc", lex_string_prag IToptions_prag),
2180                                  ("options_haddock", lex_string_prag ITdocOptions),
2181                                  ("language", token ITlanguage_prag),
2182                                  ("include", lex_string_prag ITinclude_prag)])
2183
2184 ignoredPrags = Map.fromList (map ignored pragmas)
2185                where ignored opt = (opt, nested_comment lexToken)
2186                      impls = ["hugs", "nhc98", "jhc", "yhc", "catch", "derive"]
2187                      options_pragmas = map ("options_" ++) impls
2188                      -- CFILES is a hugs-only thing.
2189                      pragmas = options_pragmas ++ ["cfiles", "contract"]
2190
2191 oneWordPrags = Map.fromList([("rules", rulePrag),
2192                            ("inline", token (ITinline_prag True)),
2193                            ("notinline", token (ITinline_prag False)),
2194                            ("specialize", token ITspec_prag),
2195                            ("source", token ITsource_prag),
2196                            ("warning", token ITwarning_prag),
2197                            ("deprecated", token ITdeprecated_prag),
2198                            ("scc", token ITscc_prag),
2199                            ("generated", token ITgenerated_prag),
2200                            ("core", token ITcore_prag),
2201                            ("unpack", token ITunpack_prag),
2202                            ("ann", token ITann_prag)])
2203
2204 twoWordPrags = Map.fromList([("inline conlike", token (ITinline_conlike_prag True)),
2205                              ("notinline conlike", token (ITinline_conlike_prag False)),
2206                              ("specialize inline", token (ITspec_inline_prag True)),
2207                              ("specialize notinline", token (ITspec_inline_prag False))])
2208
2209
2210 dispatch_pragmas :: Map String Action -> Action
2211 dispatch_pragmas prags span buf len = case Map.lookup (clean_pragma (lexemeToString buf len)) prags of
2212                                        Just found -> found span buf len
2213                                        Nothing -> lexError "unknown pragma"
2214
2215 known_pragma :: Map String Action -> AlexAccPred Int
2216 known_pragma prags _ _ len (AI _ _ buf) = (isJust $ Map.lookup (clean_pragma (lexemeToString (offsetBytes (- len) buf) len)) prags)
2217                                           && (nextCharIs buf (\c -> not (isAlphaNum c || c == '_')))
2218
2219 clean_pragma :: String -> String
2220 clean_pragma prag = canon_ws (map toLower (unprefix prag))
2221                     where unprefix prag' = case stripPrefix "{-#" prag' of
2222                                              Just rest -> rest
2223                                              Nothing -> prag'
2224                           canonical prag' = case prag' of
2225                                               "noinline" -> "notinline"
2226                                               "specialise" -> "specialize"
2227                                               "constructorlike" -> "conlike"
2228                                               _ -> prag'
2229                           canon_ws s = unwords (map canonical (words s))
2230 }