compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - parsing integers is a bit slow
  16 --    - readRational is a bit slow
  17 --
  18 --   Known bugs, that were also in the previous version:
  19 --    - M... should be 3 tokens, not 1.
  20 --    - pragma-end should be only valid in a pragma
  21
  22 --   qualified operator NOTES.
  23 --
  24 --   - If M.(+) is a single lexeme, then..
  25 --     - Probably (+) should be a single lexeme too, for consistency.
  26 --       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
  27 --     - But we have to rule out reserved operators, otherwise (..) becomes
  28 --       a different lexeme.
  29 --     - Should we therefore also rule out reserved operators in the qualified
  30 --       form?  This is quite difficult to achieve.  We don't do it for
  31 --       qualified varids.
  32
  33 {
  34 -- XXX The above flags turn off warnings in the generated code:
  35 {-# OPTIONS_GHC -fno-warn-unused-matches #-}
  36 {-# OPTIONS_GHC -fno-warn-unused-binds #-}
  37 {-# OPTIONS_GHC -fno-warn-unused-imports #-}
  38 {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
  39 -- But alex still generates some code that causes the "lazy unlifted bindings"
  40 -- warning, and old compilers don't know about it so we can't easily turn
  41 -- it off, so for now we use the sledge hammer:
  42 {-# OPTIONS_GHC -w #-}
  43
  44 {-# OPTIONS_GHC -funbox-strict-fields #-}
  45
  46 module Lexer (
  47    Token(..), lexer, pragState, mkPState, PState(..),
  48    P(..), ParseResult(..), getSrcLoc,
  49    getPState, getDynFlags, withThisPackage,
  50    failLocMsgP, failSpanMsgP, srcParseFail,
  51    getMessages,
  52    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  53    getLexState, popLexState, pushLexState,
  54    extension, standaloneDerivingEnabled, bangPatEnabled,
  55    addWarning,
  56    lexTokenStream
  57   ) where
  58
  59 import Bag
  60 import ErrUtils
  61 import Outputable
  62 import StringBuffer
  63 import FastString
  64 import SrcLoc
  65 import UniqFM
  66 import DynFlags
  67 import Module
  68 import Ctype
  69 import Util             ( readRational )
  70
  71 import Control.Monad
  72 import Data.Bits
  73 import Data.Char
  74 import Data.List
  75 import Data.Maybe
  76 import Data.Map (Map)
  77 import qualified Data.Map as Map
  78 import Data.Ratio
  79 }
  80
  81 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  82 $whitechar   = [\ \n\r\f\v $unispace]
  83 $white_no_nl = $whitechar # \n
  84 $tab         = \t
  85
  86 $ascdigit  = 0-9
  87 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  88 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  89 $digit     = [$ascdigit $unidigit]
  90
  91 $special   = [\(\)\,\;\[\]\`\{\}]
  92 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  93 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  94 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  95
  96 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
  97 $asclarge  = [A-Z]
  98 $large     = [$asclarge $unilarge]
  99
 100 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
 101 $ascsmall  = [a-z]
 102 $small     = [$ascsmall $unismall \_]
 103
 104 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
 105 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
 106
 107 $octit     = 0-7
 108 $hexit     = [$decdigit A-F a-f]
 109 $symchar   = [$symbol \:]
 110 $nl        = [\n\r]
 111 $idchar    = [$small $large $digit \']
 112
 113 $pragmachar = [$small $large $digit]
 114
 115 $docsym    = [\| \^ \* \$]
 116
 117 @varid     = $small $idchar*
 118 @conid     = $large $idchar*
 119
 120 @varsym    = $symbol $symchar*
 121 @consym    = \: $symchar*
 122
 123 @decimal     = $decdigit+
 124 @octal       = $octit+
 125 @hexadecimal = $hexit+
 126 @exponent    = [eE] [\-\+]? @decimal
 127
 128 -- we support the hierarchical module name extension:
 129 @qual = (@conid \.)+
 130
 131 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 132
 133 -- normal signed numerical literals can only be explicitly negative,
 134 -- not explicitly positive (contrast @exponent)
 135 @negative = \-
 136 @signed = @negative ?
 137
 138 haskell :-
 139
 140 -- everywhere: skip whitespace and comments
 141 $white_no_nl+                           ;
 142 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 143
 144 -- Everywhere: deal with nested comments.  We explicitly rule out
 145 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 146 -- (this can happen even though pragmas will normally take precedence due to
 147 -- longest-match, because pragmas aren't valid in every state, but comments
 148 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 149 -- set.
 150
 151 "{-" / { isNormalComment } { nested_comment lexToken }
 152
 153 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 154 -- more dashes followed by a symbol should be parsed as a varsym, so we
 155 -- have to exclude those.
 156
 157 -- Since Haddock comments aren't valid in every state, we need to rule them
 158 -- out here.
 159
 160 -- The following two rules match comments that begin with two dashes, but
 161 -- continue with a different character. The rules test that this character
 162 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 163 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 164 -- have a Haddock comment). The rules then munch the rest of the line.
 165
 166 "-- " ~[$docsym \#] .* { lineCommentToken }
 167 "--" [^$symbol : \ ] .* { lineCommentToken }
 168
 169 -- Next, match Haddock comments if no -haddock flag
 170
 171 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
 172
 173 -- Now, when we've matched comments that begin with 2 dashes and continue
 174 -- with a different character, we need to match comments that begin with three
 175 -- or more dashes (which clearly can't be Haddock comments). We only need to
 176 -- make sure that the first non-dash character isn't a symbol, and munch the
 177 -- rest of the line.
 178
 179 "---"\-* [^$symbol :] .* { lineCommentToken }
 180
 181 -- Since the previous rules all match dashes followed by at least one
 182 -- character, we also need to match a whole line filled with just dashes.
 183
 184 "--"\-* / { atEOL } { lineCommentToken }
 185
 186 -- We need this rule since none of the other single line comment rules
 187 -- actually match this case.
 188
 189 "-- " / { atEOL } { lineCommentToken }
 190
 191 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 192 -- blank lines) until we find a non-whitespace character, then do layout
 193 -- processing.
 194 --
 195 -- One slight wibble here: what if the line begins with {-#? In
 196 -- theory, we have to lex the pragma to see if it's one we recognise,
 197 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 198 -- as a nested comment.  We don't bother with this: if the line begins
 199 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 200 <bol> {
 201   \n                                    ;
 202   ^\# (line)?                           { begin line_prag1 }
 203   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 204   ^\# \! .* \n                          ; -- #!, for scripts
 205   ()                                    { do_bol }
 206 }
 207
 208 -- after a layout keyword (let, where, do, of), we begin a new layout
 209 -- context if the curly brace is missing.
 210 -- Careful! This stuff is quite delicate.
 211 <layout, layout_do> {
 212   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 213         -- we might encounter {-# here, but {- has been handled already
 214   \n                                    ;
 215   ^\# (line)?                           { begin line_prag1 }
 216 }
 217
 218 -- do is treated in a subtly different way, see new_layout_context
 219 <layout>    ()                          { new_layout_context True }
 220 <layout_do> ()                          { new_layout_context False }
 221
 222 -- after a new layout context which was found to be to the left of the
 223 -- previous context, we have generated a '{' token, and we now need to
 224 -- generate a matching '}' token.
 225 <layout_left>  ()                       { do_layout_left }
 226
 227 <0,option_prags> \n                             { begin bol }
 228
 229 "{-#" $whitechar* $pragmachar+ / { known_pragma linePrags }
 230                                 { dispatch_pragmas linePrags }
 231
 232 -- single-line line pragmas, of the form
 233 --    # <line> "<file>" <extra-stuff> \n
 234 <line_prag1> $decdigit+                 { setLine line_prag1a }
 235 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 236 <line_prag1b> .*                        { pop }
 237
 238 -- Haskell-style line pragmas, of the form
 239 --    {-# LINE <line> "<file>" #-}
 240 <line_prag2> $decdigit+                 { setLine line_prag2a }
 241 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 242 <line_prag2b> "#-}"|"-}"                { pop }
 243    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 244    -- with older versions of GHC which generated these.
 245
 246 <0,option_prags> {
 247   "{-#" $whitechar* $pragmachar+
 248         $whitechar+ $pragmachar+ / { known_pragma twoWordPrags }
 249                                  { dispatch_pragmas twoWordPrags }
 250
 251   "{-#" $whitechar* $pragmachar+ / { known_pragma oneWordPrags }
 252                                  { dispatch_pragmas oneWordPrags }
 253
 254   -- We ignore all these pragmas, but don't generate a warning for them
 255   "{-#" $whitechar* $pragmachar+ / { known_pragma ignoredPrags }
 256                                  { dispatch_pragmas ignoredPrags }
 257
 258   -- ToDo: should only be valid inside a pragma:
 259   "#-}"                                 { endPrag }
 260 }
 261
 262 <option_prags> {
 263   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 264                                    { dispatch_pragmas fileHeaderPrags }
 265
 266   "-- #"                                 { multiline_doc_comment }
 267 }
 268
 269 <0> {
 270   -- In the "0" mode we ignore these pragmas
 271   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 272                      { nested_comment lexToken }
 273 }
 274
 275 <0> {
 276   "-- #" .* { lineCommentToken }
 277 }
 278
 279 <0,option_prags> {
 280   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 281                     (nested_comment lexToken) }
 282 }
 283
 284 -- '0' state: ordinary lexemes
 285
 286 -- Haddock comments
 287
 288 <0> {
 289   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 290   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 291 }
 292
 293 -- "special" symbols
 294
 295 <0> {
 296   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 297   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 298 }
 299
 300 <0> {
 301   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 302   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 303   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 304   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 305   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 306   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 307   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 308   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 309
 310   "[" @varid "|"  / { ifExtension qqEnabled }
 311                      { lex_quasiquote_tok }
 312 }
 313
 314 <0> {
 315   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 316                                         { special IToparenbar }
 317   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 318 }
 319
 320 <0> {
 321   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 322 }
 323
 324 <0> {
 325   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 326          { token IToubxparen }
 327   "#)" / { ifExtension unboxedTuplesEnabled }
 328          { token ITcubxparen }
 329 }
 330
 331 <0> {
 332   "{|" / { ifExtension genericsEnabled } { token ITocurlybar }
 333   "|}" / { ifExtension genericsEnabled } { token ITccurlybar }
 334 }
 335
 336 <0,option_prags> {
 337   \(                                    { special IToparen }
 338   \)                                    { special ITcparen }
 339   \[                                    { special ITobrack }
 340   \]                                    { special ITcbrack }
 341   \,                                    { special ITcomma }
 342   \;                                    { special ITsemi }
 343   \`                                    { special ITbackquote }
 344
 345   \{                                    { open_brace }
 346   \}                                    { close_brace }
 347 }
 348
 349 <0,option_prags> {
 350   @qual @varid                  { idtoken qvarid }
 351   @qual @conid                  { idtoken qconid }
 352   @varid                        { varid }
 353   @conid                        { idtoken conid }
 354 }
 355
 356 <0> {
 357   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 358   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 359   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 360   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 361 }
 362
 363 -- ToDo: - move `var` and (sym) into lexical syntax?
 364 --       - remove backquote from $special?
 365 <0> {
 366   @qual @varsym       / { ifExtension oldQualOps } { idtoken qvarsym }
 367   @qual @consym       / { ifExtension oldQualOps } { idtoken qconsym }
 368   @qual \( @varsym \) / { ifExtension newQualOps } { idtoken prefixqvarsym }
 369   @qual \( @consym \) / { ifExtension newQualOps } { idtoken prefixqconsym }
 370   @varsym                                          { varsym }
 371   @consym                                          { consym }
 372 }
 373
 374 -- For the normal boxed literals we need to be careful
 375 -- when trying to be close to Haskell98
 376 <0> {
 377   -- Normal integral literals (:: Num a => a, from Integer)
 378   @decimal           { tok_num positive 0 0 decimal }
 379   0[oO] @octal       { tok_num positive 2 2 octal }
 380   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 381
 382   -- Normal rational literals (:: Fractional a => a, from Rational)
 383   @floating_point    { strtoken tok_float }
 384 }
 385
 386 <0> {
 387   -- Unboxed ints (:: Int#) and words (:: Word#)
 388   -- It's simpler (and faster?) to give separate cases to the negatives,
 389   -- especially considering octal/hexadecimal prefixes.
 390   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 391   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 392   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 393   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 394   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 395   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 396
 397   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 398   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 399   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 400
 401   -- Unboxed floats and doubles (:: Float#, :: Double#)
 402   -- prim_{float,double} work with signed literals
 403   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 404   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 405 }
 406
 407 -- Strings and chars are lexed by hand-written code.  The reason is
 408 -- that even if we recognise the string or char here in the regex
 409 -- lexer, we would still have to parse the string afterward in order
 410 -- to convert it to a String.
 411 <0> {
 412   \'                            { lex_char_tok }
 413   \"                            { lex_string_tok }
 414 }
 415
 416 {
 417 -- -----------------------------------------------------------------------------
 418 -- The token type
 419
 420 data Token
 421   = ITas                        -- Haskell keywords
 422   | ITcase
 423   | ITclass
 424   | ITdata
 425   | ITdefault
 426   | ITderiving
 427   | ITdo
 428   | ITelse
 429   | IThiding
 430   | ITif
 431   | ITimport
 432   | ITin
 433   | ITinfix
 434   | ITinfixl
 435   | ITinfixr
 436   | ITinstance
 437   | ITlet
 438   | ITmodule
 439   | ITnewtype
 440   | ITof
 441   | ITqualified
 442   | ITthen
 443   | ITtype
 444   | ITwhere
 445   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 446
 447   | ITforall                    -- GHC extension keywords
 448   | ITforeign
 449   | ITexport
 450   | ITlabel
 451   | ITdynamic
 452   | ITsafe
 453   | ITthreadsafe
 454   | ITunsafe
 455   | ITstdcallconv
 456   | ITccallconv
 457   | ITprimcallconv
 458   | ITmdo
 459   | ITfamily
 460   | ITgroup
 461   | ITby
 462   | ITusing
 463
 464         -- Pragmas
 465   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 466   | ITinline_conlike_prag Bool  -- same
 467   | ITspec_prag                 -- SPECIALISE
 468   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 469   | ITsource_prag
 470   | ITrules_prag
 471   | ITwarning_prag
 472   | ITdeprecated_prag
 473   | ITline_prag
 474   | ITscc_prag
 475   | ITgenerated_prag
 476   | ITcore_prag                 -- hdaume: core annotations
 477   | ITunpack_prag
 478   | ITann_prag
 479   | ITclose_prag
 480   | IToptions_prag String
 481   | ITinclude_prag String
 482   | ITlanguage_prag
 483
 484   | ITdotdot                    -- reserved symbols
 485   | ITcolon
 486   | ITdcolon
 487   | ITequal
 488   | ITlam
 489   | ITvbar
 490   | ITlarrow
 491   | ITrarrow
 492   | ITat
 493   | ITtilde
 494   | ITdarrow
 495   | ITminus
 496   | ITbang
 497   | ITstar
 498   | ITdot
 499
 500   | ITbiglam                    -- GHC-extension symbols
 501
 502   | ITocurly                    -- special symbols
 503   | ITccurly
 504   | ITocurlybar                 -- {|, for type applications
 505   | ITccurlybar                 -- |}, for type applications
 506   | ITvocurly
 507   | ITvccurly
 508   | ITobrack
 509   | ITopabrack                  -- [:, for parallel arrays with -XParr
 510   | ITcpabrack                  -- :], for parallel arrays with -XParr
 511   | ITcbrack
 512   | IToparen
 513   | ITcparen
 514   | IToubxparen
 515   | ITcubxparen
 516   | ITsemi
 517   | ITcomma
 518   | ITunderscore
 519   | ITbackquote
 520
 521   | ITvarid   FastString        -- identifiers
 522   | ITconid   FastString
 523   | ITvarsym  FastString
 524   | ITconsym  FastString
 525   | ITqvarid  (FastString,FastString)
 526   | ITqconid  (FastString,FastString)
 527   | ITqvarsym (FastString,FastString)
 528   | ITqconsym (FastString,FastString)
 529   | ITprefixqvarsym (FastString,FastString)
 530   | ITprefixqconsym (FastString,FastString)
 531
 532   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 533
 534   | ITchar       Char
 535   | ITstring     FastString
 536   | ITinteger    Integer
 537   | ITrational   Rational
 538
 539   | ITprimchar   Char
 540   | ITprimstring FastString
 541   | ITprimint    Integer
 542   | ITprimword   Integer
 543   | ITprimfloat  Rational
 544   | ITprimdouble Rational
 545
 546   -- Template Haskell extension tokens
 547   | ITopenExpQuote              --  [| or [e|
 548   | ITopenPatQuote              --  [p|
 549   | ITopenDecQuote              --  [d|
 550   | ITopenTypQuote              --  [t|
 551   | ITcloseQuote                --  |]
 552   | ITidEscape   FastString     --  $x
 553   | ITparenEscape               --  $(
 554   | ITvarQuote                  --  '
 555   | ITtyQuote                   --  ''
 556   | ITquasiQuote (FastString,FastString,SrcSpan) --  [:...|...|]
 557
 558   -- Arrow notation extension
 559   | ITproc
 560   | ITrec
 561   | IToparenbar                 --  (|
 562   | ITcparenbar                 --  |)
 563   | ITlarrowtail                --  -<
 564   | ITrarrowtail                --  >-
 565   | ITLarrowtail                --  -<<
 566   | ITRarrowtail                --  >>-
 567
 568   | ITunknown String            -- Used when the lexer can't make sense of it
 569   | ITeof                       -- end of file token
 570
 571   -- Documentation annotations
 572   | ITdocCommentNext  String     -- something beginning '-- |'
 573   | ITdocCommentPrev  String     -- something beginning '-- ^'
 574   | ITdocCommentNamed String     -- something beginning '-- $'
 575   | ITdocSection      Int String -- a section heading
 576   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 577   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 578   | ITlineComment     String     -- comment starting by "--"
 579   | ITblockComment    String     -- comment in {- -}
 580
 581 #ifdef DEBUG
 582   deriving Show -- debugging
 583 #endif
 584
 585 {-
 586 isSpecial :: Token -> Bool
 587 -- If we see M.x, where x is a keyword, but
 588 -- is special, we treat is as just plain M.x,
 589 -- not as a keyword.
 590 isSpecial ITas          = True
 591 isSpecial IThiding      = True
 592 isSpecial ITqualified   = True
 593 isSpecial ITforall      = True
 594 isSpecial ITexport      = True
 595 isSpecial ITlabel       = True
 596 isSpecial ITdynamic     = True
 597 isSpecial ITsafe        = True
 598 isSpecial ITthreadsafe  = True
 599 isSpecial ITunsafe      = True
 600 isSpecial ITccallconv   = True
 601 isSpecial ITstdcallconv = True
 602 isSpecial ITprimcallconv = True
 603 isSpecial ITmdo         = True
 604 isSpecial ITfamily      = True
 605 isSpecial ITgroup   = True
 606 isSpecial ITby      = True
 607 isSpecial ITusing   = True
 608 isSpecial _             = False
 609 -}
 610
 611 -- the bitmap provided as the third component indicates whether the
 612 -- corresponding extension keyword is valid under the extension options
 613 -- provided to the compiler; if the extension corresponding to *any* of the
 614 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 615 -- facilitates using a keyword in two different extensions that can be
 616 -- activated independently)
 617 --
 618 reservedWordsFM :: UniqFM (Token, Int)
 619 reservedWordsFM = listToUFM $
 620         map (\(x, y, z) -> (mkFastString x, (y, z)))
 621        [( "_",          ITunderscore,   0 ),
 622         ( "as",         ITas,           0 ),
 623         ( "case",       ITcase,         0 ),
 624         ( "class",      ITclass,        0 ),
 625         ( "data",       ITdata,         0 ),
 626         ( "default",    ITdefault,      0 ),
 627         ( "deriving",   ITderiving,     0 ),
 628         ( "do",         ITdo,           0 ),
 629         ( "else",       ITelse,         0 ),
 630         ( "hiding",     IThiding,       0 ),
 631         ( "if",         ITif,           0 ),
 632         ( "import",     ITimport,       0 ),
 633         ( "in",         ITin,           0 ),
 634         ( "infix",      ITinfix,        0 ),
 635         ( "infixl",     ITinfixl,       0 ),
 636         ( "infixr",     ITinfixr,       0 ),
 637         ( "instance",   ITinstance,     0 ),
 638         ( "let",        ITlet,          0 ),
 639         ( "module",     ITmodule,       0 ),
 640         ( "newtype",    ITnewtype,      0 ),
 641         ( "of",         ITof,           0 ),
 642         ( "qualified",  ITqualified,    0 ),
 643         ( "then",       ITthen,         0 ),
 644         ( "type",       ITtype,         0 ),
 645         ( "where",      ITwhere,        0 ),
 646         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 647
 648     ( "forall", ITforall,        bit explicitForallBit .|. bit inRulePragBit),
 649         ( "mdo",        ITmdo,           bit recursiveDoBit),
 650         ( "family",     ITfamily,        bit tyFamBit),
 651     ( "group",  ITgroup,     bit transformComprehensionsBit),
 652     ( "by",     ITby,        bit transformComprehensionsBit),
 653     ( "using",  ITusing,     bit transformComprehensionsBit),
 654
 655         ( "foreign",    ITforeign,       bit ffiBit),
 656         ( "export",     ITexport,        bit ffiBit),
 657         ( "label",      ITlabel,         bit ffiBit),
 658         ( "dynamic",    ITdynamic,       bit ffiBit),
 659         ( "safe",       ITsafe,          bit ffiBit),
 660         ( "threadsafe", ITthreadsafe,    bit ffiBit),  -- ToDo: remove
 661         ( "unsafe",     ITunsafe,        bit ffiBit),
 662         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 663         ( "ccall",      ITccallconv,     bit ffiBit),
 664         ( "prim",       ITprimcallconv,  bit ffiBit),
 665
 666         ( "rec",        ITrec,           bit recBit),
 667         ( "proc",       ITproc,          bit arrowsBit)
 668      ]
 669
 670 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 671 reservedSymsFM = listToUFM $
 672     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 673       [ ("..",  ITdotdot,   always)
 674         -- (:) is a reserved op, meaning only list cons
 675        ,(":",   ITcolon,    always)
 676        ,("::",  ITdcolon,   always)
 677        ,("=",   ITequal,    always)
 678        ,("\\",  ITlam,      always)
 679        ,("|",   ITvbar,     always)
 680        ,("<-",  ITlarrow,   always)
 681        ,("->",  ITrarrow,   always)
 682        ,("@",   ITat,       always)
 683        ,("~",   ITtilde,    always)
 684        ,("=>",  ITdarrow,   always)
 685        ,("-",   ITminus,    always)
 686        ,("!",   ITbang,     always)
 687
 688         -- For data T (a::*) = MkT
 689        ,("*", ITstar, always) -- \i -> kindSigsEnabled i || tyFamEnabled i)
 690         -- For 'forall a . t'
 691        ,(".", ITdot,  always) -- \i -> explicitForallEnabled i || inRulePrag i)
 692
 693        ,("-<",  ITlarrowtail, arrowsEnabled)
 694        ,(">-",  ITrarrowtail, arrowsEnabled)
 695        ,("-<<", ITLarrowtail, arrowsEnabled)
 696        ,(">>-", ITRarrowtail, arrowsEnabled)
 697
 698        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 699        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 700        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 701                                 explicitForallEnabled i)
 702        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 703        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 704        ,("⋯",   ITdotdot, unicodeSyntaxEnabled)
 705
 706        ,("⤙",   ITlarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 707        ,("⤚",   ITrarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 708        ,("⤛",   ITLarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 709        ,("⤜",   ITRarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 710
 711        ,("★", ITstar, unicodeSyntaxEnabled)
 712
 713         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 714         -- form part of a large operator.  This would let us have a better
 715         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 716        ]
 717
 718 -- -----------------------------------------------------------------------------
 719 -- Lexer actions
 720
 721 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 722
 723 special :: Token -> Action
 724 special tok span _buf _len = return (L span tok)
 725
 726 token, layout_token :: Token -> Action
 727 token t span _buf _len = return (L span t)
 728 layout_token t span _buf _len = pushLexState layout >> return (L span t)
 729
 730 idtoken :: (StringBuffer -> Int -> Token) -> Action
 731 idtoken f span buf len = return (L span $! (f buf len))
 732
 733 skip_one_varid :: (FastString -> Token) -> Action
 734 skip_one_varid f span buf len
 735   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 736
 737 strtoken :: (String -> Token) -> Action
 738 strtoken f span buf len =
 739   return (L span $! (f $! lexemeToString buf len))
 740
 741 init_strtoken :: Int -> (String -> Token) -> Action
 742 -- like strtoken, but drops the last N character(s)
 743 init_strtoken drop f span buf len =
 744   return (L span $! (f $! lexemeToString buf (len-drop)))
 745
 746 begin :: Int -> Action
 747 begin code _span _str _len = do pushLexState code; lexToken
 748
 749 pop :: Action
 750 pop _span _buf _len = do _ <- popLexState
 751                          lexToken
 752
 753 pop_and :: Action -> Action
 754 pop_and act span buf len = do _ <- popLexState
 755                               act span buf len
 756
 757 {-# INLINE nextCharIs #-}
 758 nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
 759 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 760
 761 notFollowedBy :: Char -> AlexAccPred Int
 762 notFollowedBy char _ _ _ (AI _ buf)
 763   = nextCharIs buf (/=char)
 764
 765 notFollowedBySymbol :: AlexAccPred Int
 766 notFollowedBySymbol _ _ _ (AI _ buf)
 767   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 768
 769 -- We must reject doc comments as being ordinary comments everywhere.
 770 -- In some cases the doc comment will be selected as the lexeme due to
 771 -- maximal munch, but not always, because the nested comment rule is
 772 -- valid in all states, but the doc-comment rules are only valid in
 773 -- the non-layout states.
 774 isNormalComment :: AlexAccPred Int
 775 isNormalComment bits _ _ (AI _ buf)
 776   | haddockEnabled bits = notFollowedByDocOrPragma
 777   | otherwise           = nextCharIs buf (/='#')
 778   where
 779     notFollowedByDocOrPragma
 780        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 781
 782 spaceAndP :: StringBuffer -> (StringBuffer -> Bool) -> Bool
 783 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 784
 785 {-
 786 haddockDisabledAnd p bits _ _ (AI _ buf)
 787   = if haddockEnabled bits then False else (p buf)
 788 -}
 789
 790 atEOL :: AlexAccPred Int
 791 atEOL _ _ _ (AI _ buf) = atEnd buf || currentChar buf == '\n'
 792
 793 ifExtension :: (Int -> Bool) -> AlexAccPred Int
 794 ifExtension pred bits _ _ _ = pred bits
 795
 796 multiline_doc_comment :: Action
 797 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 798   where
 799     worker commentAcc input docType oneLine = case alexGetChar input of
 800       Just ('\n', input')
 801         | oneLine -> docCommentEnd input commentAcc docType buf span
 802         | otherwise -> case checkIfCommentLine input' of
 803           Just input -> worker ('\n':commentAcc) input docType False
 804           Nothing -> docCommentEnd input commentAcc docType buf span
 805       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 806       Nothing -> docCommentEnd input commentAcc docType buf span
 807
 808     checkIfCommentLine input = check (dropNonNewlineSpace input)
 809       where
 810         check input = case alexGetChar input of
 811           Just ('-', input) -> case alexGetChar input of
 812             Just ('-', input) -> case alexGetChar input of
 813               Just (c, _) | c /= '-' -> Just input
 814               _ -> Nothing
 815             _ -> Nothing
 816           _ -> Nothing
 817
 818         dropNonNewlineSpace input = case alexGetChar input of
 819           Just (c, input')
 820             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 821             | otherwise -> input
 822           Nothing -> input
 823
 824 lineCommentToken :: Action
 825 lineCommentToken span buf len = do
 826   b <- extension rawTokenStreamEnabled
 827   if b then strtoken ITlineComment span buf len else lexToken
 828
 829 {-
 830   nested comments require traversing by hand, they can't be parsed
 831   using regular expressions.
 832 -}
 833 nested_comment :: P (Located Token) -> Action
 834 nested_comment cont span _str _len = do
 835   input <- getInput
 836   go "" (1::Int) input
 837   where
 838     go commentAcc 0 input = do setInput input
 839                                b <- extension rawTokenStreamEnabled
 840                                if b
 841                                  then docCommentEnd input commentAcc ITblockComment _str span
 842                                  else cont
 843     go commentAcc n input = case alexGetChar input of
 844       Nothing -> errBrace input span
 845       Just ('-',input) -> case alexGetChar input of
 846         Nothing  -> errBrace input span
 847         Just ('\125',input) -> go commentAcc (n-1) input
 848         Just (_,_)          -> go ('-':commentAcc) n input
 849       Just ('\123',input) -> case alexGetChar input of
 850         Nothing  -> errBrace input span
 851         Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
 852         Just (_,_)       -> go ('\123':commentAcc) n input
 853       Just (c,input) -> go (c:commentAcc) n input
 854
 855 nested_doc_comment :: Action
 856 nested_doc_comment span buf _len = withLexedDocType (go "")
 857   where
 858     go commentAcc input docType _ = case alexGetChar input of
 859       Nothing -> errBrace input span
 860       Just ('-',input) -> case alexGetChar input of
 861         Nothing -> errBrace input span
 862         Just ('\125',input) ->
 863           docCommentEnd input commentAcc docType buf span
 864         Just (_,_) -> go ('-':commentAcc) input docType False
 865       Just ('\123', input) -> case alexGetChar input of
 866         Nothing  -> errBrace input span
 867         Just ('-',input) -> do
 868           setInput input
 869           let cont = do input <- getInput; go commentAcc input docType False
 870           nested_comment cont span buf _len
 871         Just (_,_) -> go ('\123':commentAcc) input docType False
 872       Just (c,input) -> go (c:commentAcc) input docType False
 873
 874 withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (Located Token))
 875                  -> P (Located Token)
 876 withLexedDocType lexDocComment = do
 877   input@(AI _ buf) <- getInput
 878   case prevChar buf ' ' of
 879     '|' -> lexDocComment input ITdocCommentNext False
 880     '^' -> lexDocComment input ITdocCommentPrev False
 881     '$' -> lexDocComment input ITdocCommentNamed False
 882     '*' -> lexDocSection 1 input
 883     '#' -> lexDocComment input ITdocOptionsOld False
 884     _ -> panic "withLexedDocType: Bad doc type"
 885  where
 886     lexDocSection n input = case alexGetChar input of
 887       Just ('*', input) -> lexDocSection (n+1) input
 888       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 889       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 890
 891 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 892 -- off again at the end of the pragma.
 893 rulePrag :: Action
 894 rulePrag span _buf _len = do
 895   setExts (.|. bit inRulePragBit)
 896   return (L span ITrules_prag)
 897
 898 endPrag :: Action
 899 endPrag span _buf _len = do
 900   setExts (.&. complement (bit inRulePragBit))
 901   return (L span ITclose_prag)
 902
 903 -- docCommentEnd
 904 -------------------------------------------------------------------------------
 905 -- This function is quite tricky. We can't just return a new token, we also
 906 -- need to update the state of the parser. Why? Because the token is longer
 907 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 908 -- it writes the wrong token length to the parser state. This function is
 909 -- called afterwards, so it can just update the state.
 910
 911 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 912                  SrcSpan -> P (Located Token)
 913 docCommentEnd input commentAcc docType buf span = do
 914   setInput input
 915   let (AI loc nextBuf) = input
 916       comment = reverse commentAcc
 917       span' = mkSrcSpan (srcSpanStart span) loc
 918       last_len = byteDiff buf nextBuf
 919
 920   span `seq` setLastToken span' last_len
 921   return (L span' (docType comment))
 922
 923 errBrace :: AlexInput -> SrcSpan -> P a
 924 errBrace (AI end _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 925
 926 open_brace, close_brace :: Action
 927 open_brace span _str _len = do
 928   ctx <- getContext
 929   setContext (NoLayout:ctx)
 930   return (L span ITocurly)
 931 close_brace span _str _len = do
 932   popContext
 933   return (L span ITccurly)
 934
 935 qvarid, qconid :: StringBuffer -> Int -> Token
 936 qvarid buf len = ITqvarid $! splitQualName buf len False
 937 qconid buf len = ITqconid $! splitQualName buf len False
 938
 939 splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
 940 -- takes a StringBuffer and a length, and returns the module name
 941 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 942 -- because of hierarchical module names.
 943 splitQualName orig_buf len parens = split orig_buf orig_buf
 944   where
 945     split buf dot_buf
 946         | orig_buf `byteDiff` buf >= len  = done dot_buf
 947         | c == '.'                        = found_dot buf'
 948         | otherwise                       = split buf' dot_buf
 949       where
 950        (c,buf') = nextChar buf
 951
 952     -- careful, we might get names like M....
 953     -- so, if the character after the dot is not upper-case, this is
 954     -- the end of the qualifier part.
 955     found_dot buf -- buf points after the '.'
 956         | isUpper c    = split buf' buf
 957         | otherwise    = done buf
 958       where
 959        (c,buf') = nextChar buf
 960
 961     done dot_buf =
 962         (lexemeToFastString orig_buf (qual_size - 1),
 963          if parens -- Prelude.(+)
 964             then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
 965             else lexemeToFastString dot_buf (len - qual_size))
 966       where
 967         qual_size = orig_buf `byteDiff` dot_buf
 968
 969 varid :: Action
 970 varid span buf len =
 971   fs `seq`
 972   case lookupUFM reservedWordsFM fs of
 973         Just (keyword,0)    -> do
 974                 maybe_layout keyword
 975                 return (L span keyword)
 976         Just (keyword,exts) -> do
 977                 b <- extension (\i -> exts .&. i /= 0)
 978                 if b then do maybe_layout keyword
 979                              return (L span keyword)
 980                      else return (L span (ITvarid fs))
 981         _other -> return (L span (ITvarid fs))
 982   where
 983         fs = lexemeToFastString buf len
 984
 985 conid :: StringBuffer -> Int -> Token
 986 conid buf len = ITconid fs
 987   where fs = lexemeToFastString buf len
 988
 989 qvarsym, qconsym, prefixqvarsym, prefixqconsym :: StringBuffer -> Int -> Token
 990 qvarsym buf len = ITqvarsym $! splitQualName buf len False
 991 qconsym buf len = ITqconsym $! splitQualName buf len False
 992 prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
 993 prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
 994
 995 varsym, consym :: Action
 996 varsym = sym ITvarsym
 997 consym = sym ITconsym
 998
 999 sym :: (FastString -> Token) -> SrcSpan -> StringBuffer -> Int
1000     -> P (Located Token)
1001 sym con span buf len =
1002   case lookupUFM reservedSymsFM fs of
1003         Just (keyword,exts) -> do
1004                 b <- extension exts
1005                 if b then return (L span keyword)
1006                      else return (L span $! con fs)
1007         _other -> return (L span $! con fs)
1008   where
1009         fs = lexemeToFastString buf len
1010
1011 -- Variations on the integral numeric literal.
1012 tok_integral :: (Integer -> Token)
1013      -> (Integer -> Integer)
1014  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
1015      -> Int -> Int
1016      -> (Integer, (Char->Int)) -> Action
1017 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
1018   return $ L span $ itint $! transint $ parseUnsignedInteger
1019      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
1020
1021 -- some conveniences for use with tok_integral
1022 tok_num :: (Integer -> Integer)
1023         -> Int -> Int
1024         -> (Integer, (Char->Int)) -> Action
1025 tok_num = tok_integral ITinteger
1026 tok_primint :: (Integer -> Integer)
1027             -> Int -> Int
1028             -> (Integer, (Char->Int)) -> Action
1029 tok_primint = tok_integral ITprimint
1030 tok_primword :: Int -> Int
1031              -> (Integer, (Char->Int)) -> Action
1032 tok_primword = tok_integral ITprimword positive
1033 positive, negative :: (Integer -> Integer)
1034 positive = id
1035 negative = negate
1036 decimal, octal, hexadecimal :: (Integer, Char -> Int)
1037 decimal = (10,octDecDigit)
1038 octal = (8,octDecDigit)
1039 hexadecimal = (16,hexDigit)
1040
1041 -- readRational can understand negative rationals, exponents, everything.
1042 tok_float, tok_primfloat, tok_primdouble :: String -> Token
1043 tok_float        str = ITrational   $! readRational str
1044 tok_primfloat    str = ITprimfloat  $! readRational str
1045 tok_primdouble   str = ITprimdouble $! readRational str
1046
1047 -- -----------------------------------------------------------------------------
1048 -- Layout processing
1049
1050 -- we're at the first token on a line, insert layout tokens if necessary
1051 do_bol :: Action
1052 do_bol span _str _len = do
1053         pos <- getOffside
1054         case pos of
1055             LT -> do
1056                 --trace "layout: inserting '}'" $ do
1057                 popContext
1058                 -- do NOT pop the lex state, we might have a ';' to insert
1059                 return (L span ITvccurly)
1060             EQ -> do
1061                 --trace "layout: inserting ';'" $ do
1062                 _ <- popLexState
1063                 return (L span ITsemi)
1064             GT -> do
1065                 _ <- popLexState
1066                 lexToken
1067
1068 -- certain keywords put us in the "layout" state, where we might
1069 -- add an opening curly brace.
1070 maybe_layout :: Token -> P ()
1071 maybe_layout t = do -- If the alternative layout rule is enabled then
1072                     -- we never create an implicit layout context here.
1073                     -- Layout is handled XXX instead.
1074                     -- The code for closing implicit contexts, or
1075                     -- inserting implicit semi-colons, is therefore
1076                     -- irrelevant as it only applies in an implicit
1077                     -- context.
1078                     alr <- extension alternativeLayoutRule
1079                     unless alr $ f t
1080     where f ITdo    = pushLexState layout_do
1081           f ITmdo   = pushLexState layout_do
1082           f ITof    = pushLexState layout
1083           f ITlet   = pushLexState layout
1084           f ITwhere = pushLexState layout
1085           f ITrec   = pushLexState layout
1086           f _       = return ()
1087
1088 -- Pushing a new implicit layout context.  If the indentation of the
1089 -- next token is not greater than the previous layout context, then
1090 -- Haskell 98 says that the new layout context should be empty; that is
1091 -- the lexer must generate {}.
1092 --
1093 -- We are slightly more lenient than this: when the new context is started
1094 -- by a 'do', then we allow the new context to be at the same indentation as
1095 -- the previous context.  This is what the 'strict' argument is for.
1096 --
1097 new_layout_context :: Bool -> Action
1098 new_layout_context strict span _buf _len = do
1099     _ <- popLexState
1100     (AI l _) <- getInput
1101     let offset = srcLocCol l
1102     ctx <- getContext
1103     case ctx of
1104         Layout prev_off : _  |
1105            (strict     && prev_off >= offset  ||
1106             not strict && prev_off > offset) -> do
1107                 -- token is indented to the left of the previous context.
1108                 -- we must generate a {} sequence now.
1109                 pushLexState layout_left
1110                 return (L span ITvocurly)
1111         _ -> do
1112                 setContext (Layout offset : ctx)
1113                 return (L span ITvocurly)
1114
1115 do_layout_left :: Action
1116 do_layout_left span _buf _len = do
1117     _ <- popLexState
1118     pushLexState bol  -- we must be at the start of a line
1119     return (L span ITvccurly)
1120
1121 -- -----------------------------------------------------------------------------
1122 -- LINE pragmas
1123
1124 setLine :: Int -> Action
1125 setLine code span buf len = do
1126   let line = parseUnsignedInteger buf len 10 octDecDigit
1127   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 1)
1128         -- subtract one: the line number refers to the *following* line
1129   _ <- popLexState
1130   pushLexState code
1131   lexToken
1132
1133 setFile :: Int -> Action
1134 setFile code span buf len = do
1135   let file = lexemeToFastString (stepOn buf) (len-2)
1136   setAlrLastLoc noSrcSpan
1137   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1138   _ <- popLexState
1139   pushLexState code
1140   lexToken
1141
1142
1143 -- -----------------------------------------------------------------------------
1144 -- Options, includes and language pragmas.
1145
1146 lex_string_prag :: (String -> Token) -> Action
1147 lex_string_prag mkTok span _buf _len
1148     = do input <- getInput
1149          start <- getSrcLoc
1150          tok <- go [] input
1151          end <- getSrcLoc
1152          return (L (mkSrcSpan start end) tok)
1153     where go acc input
1154               = if isString input "#-}"
1155                    then do setInput input
1156                            return (mkTok (reverse acc))
1157                    else case alexGetChar input of
1158                           Just (c,i) -> go (c:acc) i
1159                           Nothing -> err input
1160           isString _ [] = True
1161           isString i (x:xs)
1162               = case alexGetChar i of
1163                   Just (c,i') | c == x    -> isString i' xs
1164                   _other -> False
1165           err (AI end _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1166
1167
1168 -- -----------------------------------------------------------------------------
1169 -- Strings & Chars
1170
1171 -- This stuff is horrible.  I hates it.
1172
1173 lex_string_tok :: Action
1174 lex_string_tok span _buf _len = do
1175   tok <- lex_string ""
1176   end <- getSrcLoc
1177   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1178
1179 lex_string :: String -> P Token
1180 lex_string s = do
1181   i <- getInput
1182   case alexGetChar' i of
1183     Nothing -> lit_error i
1184
1185     Just ('"',i)  -> do
1186         setInput i
1187         magicHash <- extension magicHashEnabled
1188         if magicHash
1189           then do
1190             i <- getInput
1191             case alexGetChar' i of
1192               Just ('#',i) -> do
1193                    setInput i
1194                    if any (> '\xFF') s
1195                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1196                     else let s' = mkZFastString (reverse s) in
1197                          return (ITprimstring s')
1198                         -- mkZFastString is a hack to avoid encoding the
1199                         -- string in UTF-8.  We just want the exact bytes.
1200               _other ->
1201                 return (ITstring (mkFastString (reverse s)))
1202           else
1203                 return (ITstring (mkFastString (reverse s)))
1204
1205     Just ('\\',i)
1206         | Just ('&',i) <- next -> do
1207                 setInput i; lex_string s
1208         | Just (c,i) <- next, c <= '\x7f' && is_space c -> do
1209                            -- is_space only works for <= '\x7f' (#3751)
1210                 setInput i; lex_stringgap s
1211         where next = alexGetChar' i
1212
1213     Just (c, i1) -> do
1214         case c of
1215           '\\' -> do setInput i1; c' <- lex_escape; lex_string (c':s)
1216           c | isAny c -> do setInput i1; lex_string (c:s)
1217           _other -> lit_error i
1218
1219 lex_stringgap :: String -> P Token
1220 lex_stringgap s = do
1221   i <- getInput
1222   c <- getCharOrFail i
1223   case c of
1224     '\\' -> lex_string s
1225     c | is_space c -> lex_stringgap s
1226     _other -> lit_error i
1227
1228
1229 lex_char_tok :: Action
1230 -- Here we are basically parsing character literals, such as 'x' or '\n'
1231 -- but, when Template Haskell is on, we additionally spot
1232 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1233 -- but WITHOUT CONSUMING the x or T part  (the parser does that).
1234 -- So we have to do two characters of lookahead: when we see 'x we need to
1235 -- see if there's a trailing quote
1236 lex_char_tok span _buf _len = do        -- We've seen '
1237    i1 <- getInput       -- Look ahead to first character
1238    let loc = srcSpanStart span
1239    case alexGetChar' i1 of
1240         Nothing -> lit_error  i1
1241
1242         Just ('\'', i2@(AI end2 _)) -> do       -- We've seen ''
1243                   th_exts <- extension thEnabled
1244                   if th_exts then do
1245                         setInput i2
1246                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1247                    else lit_error i1
1248
1249         Just ('\\', i2@(AI _end2 _)) -> do      -- We've seen 'backslash
1250                   setInput i2
1251                   lit_ch <- lex_escape
1252                   i3 <- getInput
1253                   mc <- getCharOrFail i3 -- Trailing quote
1254                   if mc == '\'' then finish_char_tok loc lit_ch
1255                                 else lit_error i3
1256
1257         Just (c, i2@(AI _end2 _))
1258                 | not (isAny c) -> lit_error i1
1259                 | otherwise ->
1260
1261                 -- We've seen 'x, where x is a valid character
1262                 --  (i.e. not newline etc) but not a quote or backslash
1263            case alexGetChar' i2 of      -- Look ahead one more character
1264                 Just ('\'', i3) -> do   -- We've seen 'x'
1265                         setInput i3
1266                         finish_char_tok loc c
1267                 _other -> do            -- We've seen 'x not followed by quote
1268                                         -- (including the possibility of EOF)
1269                                         -- If TH is on, just parse the quote only
1270                         th_exts <- extension thEnabled
1271                         let (AI end _) = i1
1272                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1273                                    else lit_error i2
1274
1275 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1276 finish_char_tok loc ch  -- We've already seen the closing quote
1277                         -- Just need to check for trailing #
1278   = do  magicHash <- extension magicHashEnabled
1279         i@(AI end _) <- getInput
1280         if magicHash then do
1281                 case alexGetChar' i of
1282                         Just ('#',i@(AI end _)) -> do
1283                                 setInput i
1284                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1285                         _other ->
1286                                 return (L (mkSrcSpan loc end) (ITchar ch))
1287             else do
1288                    return (L (mkSrcSpan loc end) (ITchar ch))
1289
1290 isAny :: Char -> Bool
1291 isAny c | c > '\x7f' = isPrint c
1292         | otherwise  = is_any c
1293
1294 lex_escape :: P Char
1295 lex_escape = do
1296   i0 <- getInput
1297   c <- getCharOrFail i0
1298   case c of
1299         'a'   -> return '\a'
1300         'b'   -> return '\b'
1301         'f'   -> return '\f'
1302         'n'   -> return '\n'
1303         'r'   -> return '\r'
1304         't'   -> return '\t'
1305         'v'   -> return '\v'
1306         '\\'  -> return '\\'
1307         '"'   -> return '\"'
1308         '\''  -> return '\''
1309         '^'   -> do i1 <- getInput
1310                     c <- getCharOrFail i1
1311                     if c >= '@' && c <= '_'
1312                         then return (chr (ord c - ord '@'))
1313                         else lit_error i1
1314
1315         'x'   -> readNum is_hexdigit 16 hexDigit
1316         'o'   -> readNum is_octdigit  8 octDecDigit
1317         x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1318
1319         c1 ->  do
1320            i <- getInput
1321            case alexGetChar' i of
1322             Nothing -> lit_error i0
1323             Just (c2,i2) ->
1324               case alexGetChar' i2 of
1325                 Nothing -> do lit_error i0
1326                 Just (c3,i3) ->
1327                    let str = [c1,c2,c3] in
1328                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1329                                      Just rest <- [stripPrefix p str] ] of
1330                           (escape_char,[]):_ -> do
1331                                 setInput i3
1332                                 return escape_char
1333                           (escape_char,_:_):_ -> do
1334                                 setInput i2
1335                                 return escape_char
1336                           [] -> lit_error i0
1337
1338 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1339 readNum is_digit base conv = do
1340   i <- getInput
1341   c <- getCharOrFail i
1342   if is_digit c
1343         then readNum2 is_digit base conv (conv c)
1344         else lit_error i
1345
1346 readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char
1347 readNum2 is_digit base conv i = do
1348   input <- getInput
1349   read i input
1350   where read i input = do
1351           case alexGetChar' input of
1352             Just (c,input') | is_digit c -> do
1353                 read (i*base + conv c) input'
1354             _other -> do
1355                 if i >= 0 && i <= 0x10FFFF
1356                    then do setInput input; return (chr i)
1357                    else lit_error input
1358
1359 silly_escape_chars :: [(String, Char)]
1360 silly_escape_chars = [
1361         ("NUL", '\NUL'),
1362         ("SOH", '\SOH'),
1363         ("STX", '\STX'),
1364         ("ETX", '\ETX'),
1365         ("EOT", '\EOT'),
1366         ("ENQ", '\ENQ'),
1367         ("ACK", '\ACK'),
1368         ("BEL", '\BEL'),
1369         ("BS", '\BS'),
1370         ("HT", '\HT'),
1371         ("LF", '\LF'),
1372         ("VT", '\VT'),
1373         ("FF", '\FF'),
1374         ("CR", '\CR'),
1375         ("SO", '\SO'),
1376         ("SI", '\SI'),
1377         ("DLE", '\DLE'),
1378         ("DC1", '\DC1'),
1379         ("DC2", '\DC2'),
1380         ("DC3", '\DC3'),
1381         ("DC4", '\DC4'),
1382         ("NAK", '\NAK'),
1383         ("SYN", '\SYN'),
1384         ("ETB", '\ETB'),
1385         ("CAN", '\CAN'),
1386         ("EM", '\EM'),
1387         ("SUB", '\SUB'),
1388         ("ESC", '\ESC'),
1389         ("FS", '\FS'),
1390         ("GS", '\GS'),
1391         ("RS", '\RS'),
1392         ("US", '\US'),
1393         ("SP", '\SP'),
1394         ("DEL", '\DEL')
1395         ]
1396
1397 -- before calling lit_error, ensure that the current input is pointing to
1398 -- the position of the error in the buffer.  This is so that we can report
1399 -- a correct location to the user, but also so we can detect UTF-8 decoding
1400 -- errors if they occur.
1401 lit_error :: AlexInput -> P a
1402 lit_error i = do setInput i; lexError "lexical error in string/character literal"
1403
1404 getCharOrFail :: AlexInput -> P Char
1405 getCharOrFail i =  do
1406   case alexGetChar' i of
1407         Nothing -> lexError "unexpected end-of-file in string/character literal"
1408         Just (c,i)  -> do setInput i; return c
1409
1410 -- -----------------------------------------------------------------------------
1411 -- QuasiQuote
1412
1413 lex_quasiquote_tok :: Action
1414 lex_quasiquote_tok span buf len = do
1415   let quoter = tail (lexemeToString buf (len - 1))
1416                 -- 'tail' drops the initial '[',
1417                 -- while the -1 drops the trailing '|'
1418   quoteStart <- getSrcLoc
1419   quote <- lex_quasiquote ""
1420   end <- getSrcLoc
1421   return (L (mkSrcSpan (srcSpanStart span) end)
1422            (ITquasiQuote (mkFastString quoter,
1423                           mkFastString (reverse quote),
1424                           mkSrcSpan quoteStart end)))
1425
1426 lex_quasiquote :: String -> P String
1427 lex_quasiquote s = do
1428   i <- getInput
1429   case alexGetChar' i of
1430     Nothing -> lit_error i
1431
1432     Just ('\\',i)
1433         | Just ('|',i) <- next -> do
1434                 setInput i; lex_quasiquote ('|' : s)
1435         | Just (']',i) <- next -> do
1436                 setInput i; lex_quasiquote (']' : s)
1437         where next = alexGetChar' i
1438
1439     Just ('|',i)
1440         | Just (']',i) <- next -> do
1441                 setInput i; return s
1442         where next = alexGetChar' i
1443
1444     Just (c, i) -> do
1445          setInput i; lex_quasiquote (c : s)
1446
1447 -- -----------------------------------------------------------------------------
1448 -- Warnings
1449
1450 warn :: DynFlag -> SDoc -> Action
1451 warn option warning srcspan _buf _len = do
1452     addWarning option srcspan warning
1453     lexToken
1454
1455 warnThen :: DynFlag -> SDoc -> Action -> Action
1456 warnThen option warning action srcspan buf len = do
1457     addWarning option srcspan warning
1458     action srcspan buf len
1459
1460 -- -----------------------------------------------------------------------------
1461 -- The Parse Monad
1462
1463 data LayoutContext
1464   = NoLayout
1465   | Layout !Int
1466   deriving Show
1467
1468 data ParseResult a
1469   = POk PState a
1470   | PFailed
1471         SrcSpan         -- The start and end of the text span related to
1472                         -- the error.  Might be used in environments which can
1473                         -- show this span, e.g. by highlighting it.
1474         Message         -- The error message
1475
1476 data PState = PState {
1477         buffer     :: StringBuffer,
1478         dflags     :: DynFlags,
1479         messages   :: Messages,
1480         last_loc   :: SrcSpan,  -- pos of previous token
1481         last_len   :: !Int,     -- len of previous token
1482         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1483         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1484         context    :: [LayoutContext],
1485         lex_state  :: [Int],
1486         -- Used in the alternative layout rule:
1487         -- These tokens are the next ones to be sent out. They are
1488         -- just blindly emitted, without the rule looking at them again:
1489         alr_pending_implicit_tokens :: [Located Token],
1490         -- This is the next token to be considered or, if it is Nothing,
1491         -- we need to get the next token from the input stream:
1492         alr_next_token :: Maybe (Located Token),
1493         -- This is what we consider to be the locatino of the last token
1494         -- emitted:
1495         alr_last_loc :: SrcSpan,
1496         -- The stack of layout contexts:
1497         alr_context :: [ALRContext],
1498         -- Are we expecting a '{'? If it's Just, then the ALRLayout tells
1499         -- us what sort of layout the '{' will open:
1500         alr_expecting_ocurly :: Maybe ALRLayout,
1501         -- Have we just had the '}' for a let block? If so, than an 'in'
1502         -- token doesn't need to close anything:
1503         alr_justClosedExplicitLetBlock :: Bool
1504      }
1505         -- last_loc and last_len are used when generating error messages,
1506         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1507         -- current token to happyError, we could at least get rid of last_len.
1508         -- Getting rid of last_loc would require finding another way to
1509         -- implement pushCurrentContext (which is only called from one place).
1510
1511 data ALRContext = ALRNoLayout Bool{- does it contain commas? -}
1512                               Bool{- is it a 'let' block? -}
1513                 | ALRLayout ALRLayout Int
1514 data ALRLayout = ALRLayoutLet
1515                | ALRLayoutWhere
1516                | ALRLayoutOf
1517                | ALRLayoutDo
1518
1519 newtype P a = P { unP :: PState -> ParseResult a }
1520
1521 instance Monad P where
1522   return = returnP
1523   (>>=) = thenP
1524   fail = failP
1525
1526 returnP :: a -> P a
1527 returnP a = a `seq` (P $ \s -> POk s a)
1528
1529 thenP :: P a -> (a -> P b) -> P b
1530 (P m) `thenP` k = P $ \ s ->
1531         case m s of
1532                 POk s1 a         -> (unP (k a)) s1
1533                 PFailed span err -> PFailed span err
1534
1535 failP :: String -> P a
1536 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1537
1538 failMsgP :: String -> P a
1539 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1540
1541 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1542 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (mkSrcSpan loc1 loc2) (text str)
1543
1544 failSpanMsgP :: SrcSpan -> SDoc -> P a
1545 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1546
1547 getPState :: P PState
1548 getPState = P $ \s -> POk s s
1549
1550 getDynFlags :: P DynFlags
1551 getDynFlags = P $ \s -> POk s (dflags s)
1552
1553 withThisPackage :: (PackageId -> a) -> P a
1554 withThisPackage f
1555  = do   pkg     <- liftM thisPackage getDynFlags
1556         return  $ f pkg
1557
1558 extension :: (Int -> Bool) -> P Bool
1559 extension p = P $ \s -> POk s (p $! extsBitmap s)
1560
1561 getExts :: P Int
1562 getExts = P $ \s -> POk s (extsBitmap s)
1563
1564 setExts :: (Int -> Int) -> P ()
1565 setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
1566
1567 setSrcLoc :: SrcLoc -> P ()
1568 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1569
1570 getSrcLoc :: P SrcLoc
1571 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1572
1573 setLastToken :: SrcSpan -> Int -> P ()
1574 setLastToken loc len = P $ \s -> POk s {
1575   last_loc=loc,
1576   last_len=len
1577   } ()
1578
1579 data AlexInput = AI SrcLoc StringBuffer
1580
1581 alexInputPrevChar :: AlexInput -> Char
1582 alexInputPrevChar (AI _ buf) = prevChar buf '\n'
1583
1584 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1585 alexGetChar (AI loc s)
1586   | atEnd s   = Nothing
1587   | otherwise = adj_c `seq` loc' `seq` s' `seq`
1588                 --trace (show (ord c)) $
1589                 Just (adj_c, (AI loc' s'))
1590   where (c,s') = nextChar s
1591         loc'   = advanceSrcLoc loc c
1592
1593         non_graphic     = '\x0'
1594         upper           = '\x1'
1595         lower           = '\x2'
1596         digit           = '\x3'
1597         symbol          = '\x4'
1598         space           = '\x5'
1599         other_graphic   = '\x6'
1600
1601         adj_c
1602           | c <= '\x06' = non_graphic
1603           | c <= '\x7f' = c
1604           -- Alex doesn't handle Unicode, so when Unicode
1605           -- character is encountered we output these values
1606           -- with the actual character value hidden in the state.
1607           | otherwise =
1608                 case generalCategory c of
1609                   UppercaseLetter       -> upper
1610                   LowercaseLetter       -> lower
1611                   TitlecaseLetter       -> upper
1612                   ModifierLetter        -> other_graphic
1613                   OtherLetter           -> lower -- see #1103
1614                   NonSpacingMark        -> other_graphic
1615                   SpacingCombiningMark  -> other_graphic
1616                   EnclosingMark         -> other_graphic
1617                   DecimalNumber         -> digit
1618                   LetterNumber          -> other_graphic
1619                   OtherNumber           -> other_graphic
1620                   ConnectorPunctuation  -> symbol
1621                   DashPunctuation       -> symbol
1622                   OpenPunctuation       -> other_graphic
1623                   ClosePunctuation      -> other_graphic
1624                   InitialQuote          -> other_graphic
1625                   FinalQuote            -> other_graphic
1626                   OtherPunctuation      -> symbol
1627                   MathSymbol            -> symbol
1628                   CurrencySymbol        -> symbol
1629                   ModifierSymbol        -> symbol
1630                   OtherSymbol           -> symbol
1631                   Space                 -> space
1632                   _other                -> non_graphic
1633
1634 -- This version does not squash unicode characters, it is used when
1635 -- lexing strings.
1636 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1637 alexGetChar' (AI loc s)
1638   | atEnd s   = Nothing
1639   | otherwise = c `seq` loc' `seq` s' `seq`
1640                 --trace (show (ord c)) $
1641                 Just (c, (AI loc' s'))
1642   where (c,s') = nextChar s
1643         loc'   = advanceSrcLoc loc c
1644
1645 getInput :: P AlexInput
1646 getInput = P $ \s@PState{ loc=l, buffer=b } -> POk s (AI l b)
1647
1648 setInput :: AlexInput -> P ()
1649 setInput (AI l b) = P $ \s -> POk s{ loc=l, buffer=b } ()
1650
1651 pushLexState :: Int -> P ()
1652 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1653
1654 popLexState :: P Int
1655 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1656
1657 getLexState :: P Int
1658 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1659
1660 popNextToken :: P (Maybe (Located Token))
1661 popNextToken
1662     = P $ \s@PState{ alr_next_token = m } ->
1663               POk (s {alr_next_token = Nothing}) m
1664
1665 setAlrLastLoc :: SrcSpan -> P ()
1666 setAlrLastLoc l = P $ \s -> POk (s {alr_last_loc = l}) ()
1667
1668 getAlrLastLoc :: P SrcSpan
1669 getAlrLastLoc = P $ \s@(PState {alr_last_loc = l}) -> POk s l
1670
1671 getALRContext :: P [ALRContext]
1672 getALRContext = P $ \s@(PState {alr_context = cs}) -> POk s cs
1673
1674 setALRContext :: [ALRContext] -> P ()
1675 setALRContext cs = P $ \s -> POk (s {alr_context = cs}) ()
1676
1677 getJustClosedExplicitLetBlock :: P Bool
1678 getJustClosedExplicitLetBlock
1679  = P $ \s@(PState {alr_justClosedExplicitLetBlock = b}) -> POk s b
1680
1681 setJustClosedExplicitLetBlock :: Bool -> P ()
1682 setJustClosedExplicitLetBlock b
1683  = P $ \s -> POk (s {alr_justClosedExplicitLetBlock = b}) ()
1684
1685 setNextToken :: Located Token -> P ()
1686 setNextToken t = P $ \s -> POk (s {alr_next_token = Just t}) ()
1687
1688 popPendingImplicitToken :: P (Maybe (Located Token))
1689 popPendingImplicitToken
1690     = P $ \s@PState{ alr_pending_implicit_tokens = ts } ->
1691               case ts of
1692               [] -> POk s Nothing
1693               (t : ts') -> POk (s {alr_pending_implicit_tokens = ts'}) (Just t)
1694
1695 setPendingImplicitTokens :: [Located Token] -> P ()
1696 setPendingImplicitTokens ts = P $ \s -> POk (s {alr_pending_implicit_tokens = ts}) ()
1697
1698 getAlrExpectingOCurly :: P (Maybe ALRLayout)
1699 getAlrExpectingOCurly = P $ \s@(PState {alr_expecting_ocurly = b}) -> POk s b
1700
1701 setAlrExpectingOCurly :: Maybe ALRLayout -> P ()
1702 setAlrExpectingOCurly b = P $ \s -> POk (s {alr_expecting_ocurly = b}) ()
1703
1704 -- for reasons of efficiency, flags indicating language extensions (eg,
1705 -- -fglasgow-exts or -XParr) are represented by a bitmap stored in an unboxed
1706 -- integer
1707
1708 genericsBit :: Int
1709 genericsBit = 0 -- {| and |}
1710 ffiBit :: Int
1711 ffiBit     = 1
1712 parrBit :: Int
1713 parrBit    = 2
1714 arrowsBit :: Int
1715 arrowsBit  = 4
1716 thBit :: Int
1717 thBit      = 5
1718 ipBit :: Int
1719 ipBit      = 6
1720 explicitForallBit :: Int
1721 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1722 bangPatBit :: Int
1723 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1724                 -- (doesn't affect the lexer)
1725 tyFamBit :: Int
1726 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1727 haddockBit :: Int
1728 haddockBit = 10 -- Lex and parse Haddock comments
1729 magicHashBit :: Int
1730 magicHashBit = 11 -- "#" in both functions and operators
1731 kindSigsBit :: Int
1732 kindSigsBit = 12 -- Kind signatures on type variables
1733 recursiveDoBit :: Int
1734 recursiveDoBit = 13 -- mdo
1735 unicodeSyntaxBit :: Int
1736 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1737 unboxedTuplesBit :: Int
1738 unboxedTuplesBit = 15 -- (# and #)
1739 standaloneDerivingBit :: Int
1740 standaloneDerivingBit = 16 -- standalone instance deriving declarations
1741 transformComprehensionsBit :: Int
1742 transformComprehensionsBit = 17
1743 qqBit :: Int
1744 qqBit      = 18 -- enable quasiquoting
1745 inRulePragBit :: Int
1746 inRulePragBit = 19
1747 rawTokenStreamBit :: Int
1748 rawTokenStreamBit = 20 -- producing a token stream with all comments included
1749 newQualOpsBit :: Int
1750 newQualOpsBit = 21 -- Haskell' qualified operator syntax, e.g. Prelude.(+)
1751 recBit :: Int
1752 recBit = 22 -- rec
1753 alternativeLayoutRuleBit :: Int
1754 alternativeLayoutRuleBit = 23
1755
1756 always :: Int -> Bool
1757 always           _     = True
1758 genericsEnabled :: Int -> Bool
1759 genericsEnabled  flags = testBit flags genericsBit
1760 parrEnabled :: Int -> Bool
1761 parrEnabled      flags = testBit flags parrBit
1762 arrowsEnabled :: Int -> Bool
1763 arrowsEnabled    flags = testBit flags arrowsBit
1764 thEnabled :: Int -> Bool
1765 thEnabled        flags = testBit flags thBit
1766 ipEnabled :: Int -> Bool
1767 ipEnabled        flags = testBit flags ipBit
1768 explicitForallEnabled :: Int -> Bool
1769 explicitForallEnabled flags = testBit flags explicitForallBit
1770 bangPatEnabled :: Int -> Bool
1771 bangPatEnabled   flags = testBit flags bangPatBit
1772 -- tyFamEnabled :: Int -> Bool
1773 -- tyFamEnabled     flags = testBit flags tyFamBit
1774 haddockEnabled :: Int -> Bool
1775 haddockEnabled   flags = testBit flags haddockBit
1776 magicHashEnabled :: Int -> Bool
1777 magicHashEnabled flags = testBit flags magicHashBit
1778 -- kindSigsEnabled :: Int -> Bool
1779 -- kindSigsEnabled  flags = testBit flags kindSigsBit
1780 unicodeSyntaxEnabled :: Int -> Bool
1781 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1782 unboxedTuplesEnabled :: Int -> Bool
1783 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1784 standaloneDerivingEnabled :: Int -> Bool
1785 standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
1786 qqEnabled :: Int -> Bool
1787 qqEnabled        flags = testBit flags qqBit
1788 -- inRulePrag :: Int -> Bool
1789 -- inRulePrag       flags = testBit flags inRulePragBit
1790 rawTokenStreamEnabled :: Int -> Bool
1791 rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
1792 newQualOps :: Int -> Bool
1793 newQualOps       flags = testBit flags newQualOpsBit
1794 oldQualOps :: Int -> Bool
1795 oldQualOps flags = not (newQualOps flags)
1796 alternativeLayoutRule :: Int -> Bool
1797 alternativeLayoutRule flags = testBit flags alternativeLayoutRuleBit
1798
1799 -- PState for parsing options pragmas
1800 --
1801 pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1802 pragState dynflags buf loc =
1803   PState {
1804       buffer        = buf,
1805       messages      = emptyMessages,
1806       dflags        = dynflags,
1807       last_loc      = mkSrcSpan loc loc,
1808       last_len      = 0,
1809       loc           = loc,
1810       extsBitmap    = 0,
1811       context       = [],
1812       lex_state     = [bol, option_prags, 0],
1813       alr_pending_implicit_tokens = [],
1814       alr_next_token = Nothing,
1815       alr_last_loc = noSrcSpan,
1816       alr_context = [],
1817       alr_expecting_ocurly = Nothing,
1818       alr_justClosedExplicitLetBlock = False
1819     }
1820
1821
1822 -- create a parse state
1823 --
1824 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1825 mkPState buf loc flags  =
1826   PState {
1827       buffer          = buf,
1828       dflags        = flags,
1829       messages      = emptyMessages,
1830       last_loc      = mkSrcSpan loc loc,
1831       last_len      = 0,
1832       loc           = loc,
1833       extsBitmap    = fromIntegral bitmap,
1834       context       = [],
1835       lex_state     = [bol, 0],
1836         -- we begin in the layout state if toplev_layout is set
1837       alr_pending_implicit_tokens = [],
1838       alr_next_token = Nothing,
1839       alr_last_loc = noSrcSpan,
1840       alr_context = [],
1841       alr_expecting_ocurly = Nothing,
1842       alr_justClosedExplicitLetBlock = False
1843     }
1844     where
1845       bitmap = genericsBit `setBitIf` dopt Opt_Generics flags
1846                .|. ffiBit            `setBitIf` dopt Opt_ForeignFunctionInterface flags
1847                .|. parrBit           `setBitIf` dopt Opt_PArr         flags
1848                .|. arrowsBit         `setBitIf` dopt Opt_Arrows       flags
1849                .|. thBit             `setBitIf` dopt Opt_TemplateHaskell flags
1850                .|. qqBit             `setBitIf` dopt Opt_QuasiQuotes flags
1851                .|. ipBit             `setBitIf` dopt Opt_ImplicitParams flags
1852                .|. explicitForallBit `setBitIf` dopt Opt_ExplicitForAll flags
1853                .|. bangPatBit        `setBitIf` dopt Opt_BangPatterns flags
1854                .|. tyFamBit          `setBitIf` dopt Opt_TypeFamilies flags
1855                .|. haddockBit        `setBitIf` dopt Opt_Haddock      flags
1856                .|. magicHashBit      `setBitIf` dopt Opt_MagicHash    flags
1857                .|. kindSigsBit       `setBitIf` dopt Opt_KindSignatures flags
1858                .|. recursiveDoBit    `setBitIf` dopt Opt_RecursiveDo flags
1859                .|. recBit            `setBitIf` dopt Opt_DoRec  flags
1860                .|. recBit            `setBitIf` dopt Opt_Arrows flags
1861                .|. unicodeSyntaxBit  `setBitIf` dopt Opt_UnicodeSyntax flags
1862                .|. unboxedTuplesBit  `setBitIf` dopt Opt_UnboxedTuples flags
1863                .|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags
1864                .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags
1865                .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
1866                .|. newQualOpsBit `setBitIf` dopt Opt_NewQualifiedOperators flags
1867                .|. alternativeLayoutRuleBit `setBitIf` dopt Opt_AlternativeLayoutRule flags
1868       --
1869       setBitIf :: Int -> Bool -> Int
1870       b `setBitIf` cond | cond      = bit b
1871                         | otherwise = 0
1872
1873 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1874 addWarning option srcspan warning
1875  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1876        let warning' = mkWarnMsg srcspan alwaysQualify warning
1877            ws' = if dopt option d then ws `snocBag` warning' else ws
1878        in POk s{messages=(ws', es)} ()
1879
1880 getMessages :: PState -> Messages
1881 getMessages PState{messages=ms} = ms
1882
1883 getContext :: P [LayoutContext]
1884 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1885
1886 setContext :: [LayoutContext] -> P ()
1887 setContext ctx = P $ \s -> POk s{context=ctx} ()
1888
1889 popContext :: P ()
1890 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1891                               last_len = len, last_loc = last_loc }) ->
1892   case ctx of
1893         (_:tl) -> POk s{ context = tl } ()
1894         []     -> PFailed last_loc (srcParseErr buf len)
1895
1896 -- Push a new layout context at the indentation of the last token read.
1897 -- This is only used at the outer level of a module when the 'module'
1898 -- keyword is missing.
1899 pushCurrentContext :: P ()
1900 pushCurrentContext = P $ \ s@PState{ last_loc=loc, context=ctx } ->
1901     POk s{context = Layout (srcSpanStartCol loc) : ctx} ()
1902
1903 getOffside :: P Ordering
1904 getOffside = P $ \s@PState{last_loc=loc, context=stk} ->
1905                 let offs = srcSpanStartCol loc in
1906                 let ord = case stk of
1907                         (Layout n:_) -> --trace ("layout: " ++ show n ++ ", offs: " ++ show offs) $
1908                                         compare offs n
1909                         _            -> GT
1910                 in POk s ord
1911
1912 -- ---------------------------------------------------------------------------
1913 -- Construct a parse error
1914
1915 srcParseErr
1916   :: StringBuffer       -- current buffer (placed just after the last token)
1917   -> Int                -- length of the previous token
1918   -> Message
1919 srcParseErr buf len
1920   = hcat [ if null token
1921              then ptext (sLit "parse error (possibly incorrect indentation)")
1922              else hcat [ptext (sLit "parse error on input "),
1923                         char '`', text token, char '\'']
1924     ]
1925   where token = lexemeToString (offsetBytes (-len) buf) len
1926
1927 -- Report a parse failure, giving the span of the previous token as
1928 -- the location of the error.  This is the entry point for errors
1929 -- detected during parsing.
1930 srcParseFail :: P a
1931 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1932                             last_loc = last_loc } ->
1933     PFailed last_loc (srcParseErr buf len)
1934
1935 -- A lexical error is reported at a particular position in the source file,
1936 -- not over a token range.
1937 lexError :: String -> P a
1938 lexError str = do
1939   loc <- getSrcLoc
1940   (AI end buf) <- getInput
1941   reportLexError loc end buf str
1942
1943 -- -----------------------------------------------------------------------------
1944 -- This is the top-level function: called from the parser each time a
1945 -- new token is to be read from the input.
1946
1947 lexer :: (Located Token -> P a) -> P a
1948 lexer cont = do
1949   alr <- extension alternativeLayoutRule
1950   let lexTokenFun = if alr then lexTokenAlr else lexToken
1951   tok@(L _span _tok__) <- lexTokenFun
1952   --trace ("token: " ++ show _tok__) $ do
1953   cont tok
1954
1955 lexTokenAlr :: P (Located Token)
1956 lexTokenAlr = do mPending <- popPendingImplicitToken
1957                  t <- case mPending of
1958                       Nothing ->
1959                           do mNext <- popNextToken
1960                              t <- case mNext of
1961                                   Nothing -> lexToken
1962                                   Just next -> return next
1963                              alternativeLayoutRuleToken t
1964                       Just t ->
1965                           return t
1966                  setAlrLastLoc (getLoc t)
1967                  case unLoc t of
1968                      ITwhere -> setAlrExpectingOCurly (Just ALRLayoutWhere)
1969                      ITlet   -> setAlrExpectingOCurly (Just ALRLayoutLet)
1970                      ITof    -> setAlrExpectingOCurly (Just ALRLayoutOf)
1971                      ITdo    -> setAlrExpectingOCurly (Just ALRLayoutDo)
1972                      ITmdo   -> setAlrExpectingOCurly (Just ALRLayoutDo)
1973                      ITrec   -> setAlrExpectingOCurly (Just ALRLayoutDo)
1974                      _       -> return ()
1975                  return t
1976
1977 alternativeLayoutRuleToken :: Located Token -> P (Located Token)
1978 alternativeLayoutRuleToken t
1979     = do context <- getALRContext
1980          lastLoc <- getAlrLastLoc
1981          mExpectingOCurly <- getAlrExpectingOCurly
1982          justClosedExplicitLetBlock <- getJustClosedExplicitLetBlock
1983          setJustClosedExplicitLetBlock False
1984          dflags <- getDynFlags
1985          let transitional = dopt Opt_AlternativeLayoutRuleTransitional dflags
1986              thisLoc = getLoc t
1987              thisCol = srcSpanStartCol thisLoc
1988              newLine = (lastLoc == noSrcSpan)
1989                     || (srcSpanStartLine thisLoc > srcSpanEndLine lastLoc)
1990          case (unLoc t, context, mExpectingOCurly) of
1991              -- This case handles a GHC extension to the original H98
1992              -- layout rule...
1993              (ITocurly, _, Just alrLayout) ->
1994                  do setAlrExpectingOCurly Nothing
1995                     let isLet = case alrLayout of
1996                                 ALRLayoutLet -> True
1997                                 _ -> False
1998                     setALRContext (ALRNoLayout (containsCommas ITocurly) isLet : context)
1999                     return t
2000              -- ...and makes this case unnecessary
2001              {-
2002              -- I think our implicit open-curly handling is slightly
2003              -- different to John's, in how it interacts with newlines
2004              -- and "in"
2005              (ITocurly, _, Just _) ->
2006                  do setAlrExpectingOCurly Nothing
2007                     setNextToken t
2008                     lexTokenAlr
2009              -}
2010              (_, ALRLayout _ col : ls, Just expectingOCurly)
2011               | (thisCol > col) ||
2012                 (thisCol == col &&
2013                  isNonDecreasingIntentation expectingOCurly) ->
2014                  do setAlrExpectingOCurly Nothing
2015                     setALRContext (ALRLayout expectingOCurly thisCol : context)
2016                     setNextToken t
2017                     return (L thisLoc ITocurly)
2018               | otherwise ->
2019                  do setAlrExpectingOCurly Nothing
2020                     setPendingImplicitTokens [L lastLoc ITccurly]
2021                     setNextToken t
2022                     return (L lastLoc ITocurly)
2023              (_, _, Just expectingOCurly) ->
2024                  do setAlrExpectingOCurly Nothing
2025                     setALRContext (ALRLayout expectingOCurly thisCol : context)
2026                     setNextToken t
2027                     return (L thisLoc ITocurly)
2028              -- We do the [] cases earlier than in the spec, as we
2029              -- have an actual EOF token
2030              (ITeof, ALRLayout _ _ : ls, _) ->
2031                  do setALRContext ls
2032                     setNextToken t
2033                     return (L thisLoc ITccurly)
2034              (ITeof, _, _) ->
2035                  return t
2036              -- the other ITeof case omitted; general case below covers it
2037              (ITin, _, _)
2038               | justClosedExplicitLetBlock ->
2039                  return t
2040              (ITin, ALRLayout ALRLayoutLet _ : ls, _)
2041               | newLine ->
2042                  do setPendingImplicitTokens [t]
2043                     setALRContext ls
2044                     return (L thisLoc ITccurly)
2045              -- This next case is to handle a transitional issue:
2046              (ITwhere, ALRLayout _ col : ls, _)
2047               | newLine && thisCol == col && transitional ->
2048                  do addWarning Opt_WarnAlternativeLayoutRuleTransitional
2049                                thisLoc
2050                                (transitionalAlternativeLayoutWarning
2051                                     "`where' clause at the same depth as implicit layout block")
2052                     setALRContext ls
2053                     setNextToken t
2054                     -- Note that we use lastLoc, as we may need to close
2055                     -- more layouts, or give a semicolon
2056                     return (L lastLoc ITccurly)
2057              (_, ALRLayout _ col : ls, _)
2058               | newLine && thisCol == col ->
2059                  do setNextToken t
2060                     return (L thisLoc ITsemi)
2061               | newLine && thisCol < col ->
2062                  do setALRContext ls
2063                     setNextToken t
2064                     -- Note that we use lastLoc, as we may need to close
2065                     -- more layouts, or give a semicolon
2066                     return (L lastLoc ITccurly)
2067              (u, _, _)
2068               | isALRopen u ->
2069                  do setALRContext (ALRNoLayout (containsCommas u) False : context)
2070                     return t
2071              (u, _, _)
2072               | isALRclose u ->
2073                  case context of
2074                  ALRLayout _ _ : ls ->
2075                      do setALRContext ls
2076                         setNextToken t
2077                         return (L thisLoc ITccurly)
2078                  ALRNoLayout _ isLet : ls ->
2079                      do setALRContext ls
2080                         when isLet $ setJustClosedExplicitLetBlock True
2081                         return t
2082                  [] ->
2083                      -- XXX This is an error in John's code, but
2084                      -- it looks reachable to me at first glance
2085                      return t
2086              (ITin, ALRLayout ALRLayoutLet _ : ls, _) ->
2087                  do setALRContext ls
2088                     setPendingImplicitTokens [t]
2089                     return (L thisLoc ITccurly)
2090              (ITin, ALRLayout _ _ : ls, _) ->
2091                  do setALRContext ls
2092                     setNextToken t
2093                     return (L thisLoc ITccurly)
2094              -- the other ITin case omitted; general case below covers it
2095              (ITcomma, ALRLayout _ _ : ls, _)
2096               | topNoLayoutContainsCommas ls ->
2097                  do setALRContext ls
2098                     setNextToken t
2099                     return (L thisLoc ITccurly)
2100              (ITwhere, ALRLayout ALRLayoutDo _ : ls, _) ->
2101                  do setALRContext ls
2102                     setPendingImplicitTokens [t]
2103                     return (L thisLoc ITccurly)
2104              -- the other ITwhere case omitted; general case below covers it
2105              (_, _, _) -> return t
2106
2107 transitionalAlternativeLayoutWarning :: String -> SDoc
2108 transitionalAlternativeLayoutWarning msg
2109     = text "transitional layout will not be accepted in the future:"
2110    $$ text msg
2111
2112 isALRopen :: Token -> Bool
2113 isALRopen ITcase        = True
2114 isALRopen ITif          = True
2115 isALRopen IToparen      = True
2116 isALRopen ITobrack      = True
2117 isALRopen ITocurly      = True
2118 -- GHC Extensions:
2119 isALRopen IToubxparen   = True
2120 isALRopen ITparenEscape = True
2121 isALRopen _             = False
2122
2123 isALRclose :: Token -> Bool
2124 isALRclose ITof     = True
2125 isALRclose ITthen   = True
2126 isALRclose ITcparen = True
2127 isALRclose ITcbrack = True
2128 isALRclose ITccurly = True
2129 -- GHC Extensions:
2130 isALRclose ITcubxparen = True
2131 isALRclose _        = False
2132
2133 isNonDecreasingIntentation :: ALRLayout -> Bool
2134 isNonDecreasingIntentation ALRLayoutDo = True
2135 isNonDecreasingIntentation _           = False
2136
2137 containsCommas :: Token -> Bool
2138 containsCommas IToparen = True
2139 containsCommas ITobrack = True
2140 -- John doesn't have {} as containing commas, but records contain them,
2141 -- which caused a problem parsing Cabal's Distribution.Simple.InstallDirs
2142 -- (defaultInstallDirs).
2143 containsCommas ITocurly = True
2144 -- GHC Extensions:
2145 containsCommas IToubxparen = True
2146 containsCommas _        = False
2147
2148 topNoLayoutContainsCommas :: [ALRContext] -> Bool
2149 topNoLayoutContainsCommas [] = False
2150 topNoLayoutContainsCommas (ALRLayout _ _ : ls) = topNoLayoutContainsCommas ls
2151 topNoLayoutContainsCommas (ALRNoLayout b _ : _) = b
2152
2153 lexToken :: P (Located Token)
2154 lexToken = do
2155   inp@(AI loc1 buf) <- getInput
2156   sc <- getLexState
2157   exts <- getExts
2158   case alexScanUser exts inp sc of
2159     AlexEOF -> do
2160         let span = mkSrcSpan loc1 loc1
2161         setLastToken span 0
2162         return (L span ITeof)
2163     AlexError (AI loc2 buf) ->
2164         reportLexError loc1 loc2 buf "lexical error"
2165     AlexSkip inp2 _ -> do
2166         setInput inp2
2167         lexToken
2168     AlexToken inp2@(AI end buf2) _ t -> do
2169         setInput inp2
2170         let span = mkSrcSpan loc1 end
2171         let bytes = byteDiff buf buf2
2172         span `seq` setLastToken span bytes
2173         t span buf bytes
2174
2175 reportLexError :: SrcLoc -> SrcLoc -> StringBuffer -> [Char] -> P a
2176 reportLexError loc1 loc2 buf str
2177   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
2178   | otherwise =
2179   let
2180         c = fst (nextChar buf)
2181   in
2182   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
2183     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
2184     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
2185
2186 lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token]
2187 lexTokenStream buf loc dflags = unP go initState
2188     where initState = mkPState buf loc (dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream)
2189           go = do
2190             ltok <- lexer return
2191             case ltok of
2192               L _ ITeof -> return []
2193               _ -> liftM (ltok:) go
2194
2195 linePrags = Map.singleton "line" (begin line_prag2)
2196
2197 fileHeaderPrags = Map.fromList([("options", lex_string_prag IToptions_prag),
2198                                  ("options_ghc", lex_string_prag IToptions_prag),
2199                                  ("options_haddock", lex_string_prag ITdocOptions),
2200                                  ("language", token ITlanguage_prag),
2201                                  ("include", lex_string_prag ITinclude_prag)])
2202
2203 ignoredPrags = Map.fromList (map ignored pragmas)
2204                where ignored opt = (opt, nested_comment lexToken)
2205                      impls = ["hugs", "nhc98", "jhc", "yhc", "catch", "derive"]
2206                      options_pragmas = map ("options_" ++) impls
2207                      -- CFILES is a hugs-only thing.
2208                      pragmas = options_pragmas ++ ["cfiles", "contract"]
2209
2210 oneWordPrags = Map.fromList([("rules", rulePrag),
2211                            ("inline", token (ITinline_prag True)),
2212                            ("notinline", token (ITinline_prag False)),
2213                            ("specialize", token ITspec_prag),
2214                            ("source", token ITsource_prag),
2215                            ("warning", token ITwarning_prag),
2216                            ("deprecated", token ITdeprecated_prag),
2217                            ("scc", token ITscc_prag),
2218                            ("generated", token ITgenerated_prag),
2219                            ("core", token ITcore_prag),
2220                            ("unpack", token ITunpack_prag),
2221                            ("ann", token ITann_prag)])
2222
2223 twoWordPrags = Map.fromList([("inline conlike", token (ITinline_conlike_prag True)),
2224                              ("notinline conlike", token (ITinline_conlike_prag False)),
2225                              ("specialize inline", token (ITspec_inline_prag True)),
2226                              ("specialize notinline", token (ITspec_inline_prag False))])
2227
2228
2229 dispatch_pragmas :: Map String Action -> Action
2230 dispatch_pragmas prags span buf len = case Map.lookup (clean_pragma (lexemeToString buf len)) prags of
2231                                        Just found -> found span buf len
2232                                        Nothing -> lexError "unknown pragma"
2233
2234 known_pragma :: Map String Action -> AlexAccPred Int
2235 known_pragma prags _ _ len (AI _ buf) = (isJust $ Map.lookup (clean_pragma (lexemeToString (offsetBytes (- len) buf) len)) prags)
2236                                           && (nextCharIs buf (\c -> not (isAlphaNum c || c == '_')))
2237
2238 clean_pragma :: String -> String
2239 clean_pragma prag = canon_ws (map toLower (unprefix prag))
2240                     where unprefix prag' = case stripPrefix "{-#" prag' of
2241                                              Just rest -> rest
2242                                              Nothing -> prag'
2243                           canonical prag' = case prag' of
2244                                               "noinline" -> "notinline"
2245                                               "specialise" -> "specialize"
2246                                               "constructorlike" -> "conlike"
2247                                               _ -> prag'
2248                           canon_ws s = unwords (map canonical (words s))
2249 }