compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - parsing integers is a bit slow
  16 --    - readRational is a bit slow
  17 --
  18 --   Known bugs, that were also in the previous version:
  19 --    - M... should be 3 tokens, not 1.
  20 --    - pragma-end should be only valid in a pragma
  21
  22 --   qualified operator NOTES.
  23 --
  24 --   - If M.(+) is a single lexeme, then..
  25 --     - Probably (+) should be a single lexeme too, for consistency.
  26 --       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
  27 --     - But we have to rule out reserved operators, otherwise (..) becomes
  28 --       a different lexeme.
  29 --     - Should we therefore also rule out reserved operators in the qualified
  30 --       form?  This is quite difficult to achieve.  We don't do it for
  31 --       qualified varids.
  32
  33 {
  34 -- XXX The above flags turn off warnings in the generated code:
  35 {-# OPTIONS_GHC -fno-warn-unused-matches #-}
  36 {-# OPTIONS_GHC -fno-warn-unused-binds #-}
  37 {-# OPTIONS_GHC -fno-warn-unused-imports #-}
  38 {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
  39 -- But alex still generates some code that causes the "lazy unlifted bindings"
  40 -- warning, and old compilers don't know about it so we can't easily turn
  41 -- it off, so for now we use the sledge hammer:
  42 {-# OPTIONS_GHC -w #-}
  43
  44 {-# OPTIONS_GHC -funbox-strict-fields #-}
  45
  46 module Lexer (
  47    Token(..), lexer, pragState, mkPState, PState(..),
  48    P(..), ParseResult(..), getSrcLoc,
  49    getPState, getDynFlags, withThisPackage,
  50    failLocMsgP, failSpanMsgP, srcParseFail,
  51    getMessages,
  52    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  53    getLexState, popLexState, pushLexState,
  54    extension, standaloneDerivingEnabled, bangPatEnabled,
  55    addWarning,
  56    lexTokenStream
  57   ) where
  58
  59 import Bag
  60 import ErrUtils
  61 import Outputable
  62 import StringBuffer
  63 import FastString
  64 import SrcLoc
  65 import UniqFM
  66 import DynFlags
  67 import Module
  68 import Ctype
  69 import Util             ( readRational )
  70
  71 import Control.Monad
  72 import Data.Bits
  73 import Data.Char
  74 import Data.List
  75 import Data.Maybe
  76 import Data.Map (Map)
  77 import qualified Data.Map as Map
  78 import Data.Ratio
  79 }
  80
  81 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  82 $whitechar   = [\ \n\r\f\v $unispace]
  83 $white_no_nl = $whitechar # \n
  84 $tab         = \t
  85
  86 $ascdigit  = 0-9
  87 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  88 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  89 $digit     = [$ascdigit $unidigit]
  90
  91 $special   = [\(\)\,\;\[\]\`\{\}]
  92 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  93 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  94 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  95
  96 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
  97 $asclarge  = [A-Z]
  98 $large     = [$asclarge $unilarge]
  99
 100 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
 101 $ascsmall  = [a-z]
 102 $small     = [$ascsmall $unismall \_]
 103
 104 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
 105 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
 106
 107 $octit     = 0-7
 108 $hexit     = [$decdigit A-F a-f]
 109 $symchar   = [$symbol \:]
 110 $nl        = [\n\r]
 111 $idchar    = [$small $large $digit \']
 112
 113 $pragmachar = [$small $large $digit]
 114
 115 $docsym    = [\| \^ \* \$]
 116
 117 @varid     = $small $idchar*
 118 @conid     = $large $idchar*
 119
 120 @varsym    = $symbol $symchar*
 121 @consym    = \: $symchar*
 122
 123 @decimal     = $decdigit+
 124 @octal       = $octit+
 125 @hexadecimal = $hexit+
 126 @exponent    = [eE] [\-\+]? @decimal
 127
 128 -- we support the hierarchical module name extension:
 129 @qual = (@conid \.)+
 130
 131 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 132
 133 -- normal signed numerical literals can only be explicitly negative,
 134 -- not explicitly positive (contrast @exponent)
 135 @negative = \-
 136 @signed = @negative ?
 137
 138 haskell :-
 139
 140 -- everywhere: skip whitespace and comments
 141 $white_no_nl+                           ;
 142 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 143
 144 -- Everywhere: deal with nested comments.  We explicitly rule out
 145 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 146 -- (this can happen even though pragmas will normally take precedence due to
 147 -- longest-match, because pragmas aren't valid in every state, but comments
 148 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 149 -- set.
 150
 151 "{-" / { isNormalComment } { nested_comment lexToken }
 152
 153 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 154 -- more dashes followed by a symbol should be parsed as a varsym, so we
 155 -- have to exclude those.
 156
 157 -- Since Haddock comments aren't valid in every state, we need to rule them
 158 -- out here.
 159
 160 -- The following two rules match comments that begin with two dashes, but
 161 -- continue with a different character. The rules test that this character
 162 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 163 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 164 -- have a Haddock comment). The rules then munch the rest of the line.
 165
 166 "-- " ~[$docsym \#] .* { lineCommentToken }
 167 "--" [^$symbol : \ ] .* { lineCommentToken }
 168
 169 -- Next, match Haddock comments if no -haddock flag
 170
 171 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
 172
 173 -- Now, when we've matched comments that begin with 2 dashes and continue
 174 -- with a different character, we need to match comments that begin with three
 175 -- or more dashes (which clearly can't be Haddock comments). We only need to
 176 -- make sure that the first non-dash character isn't a symbol, and munch the
 177 -- rest of the line.
 178
 179 "---"\-* [^$symbol :] .* { lineCommentToken }
 180
 181 -- Since the previous rules all match dashes followed by at least one
 182 -- character, we also need to match a whole line filled with just dashes.
 183
 184 "--"\-* / { atEOL } { lineCommentToken }
 185
 186 -- We need this rule since none of the other single line comment rules
 187 -- actually match this case.
 188
 189 "-- " / { atEOL } { lineCommentToken }
 190
 191 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 192 -- blank lines) until we find a non-whitespace character, then do layout
 193 -- processing.
 194 --
 195 -- One slight wibble here: what if the line begins with {-#? In
 196 -- theory, we have to lex the pragma to see if it's one we recognise,
 197 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 198 -- as a nested comment.  We don't bother with this: if the line begins
 199 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 200 <bol> {
 201   \n                                    ;
 202   ^\# (line)?                           { begin line_prag1 }
 203   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 204   ^\# \! .* \n                          ; -- #!, for scripts
 205   ()                                    { do_bol }
 206 }
 207
 208 -- after a layout keyword (let, where, do, of), we begin a new layout
 209 -- context if the curly brace is missing.
 210 -- Careful! This stuff is quite delicate.
 211 <layout, layout_do> {
 212   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 213         -- we might encounter {-# here, but {- has been handled already
 214   \n                                    ;
 215   ^\# (line)?                           { begin line_prag1 }
 216 }
 217
 218 -- do is treated in a subtly different way, see new_layout_context
 219 <layout>    ()                          { new_layout_context True }
 220 <layout_do> ()                          { new_layout_context False }
 221
 222 -- after a new layout context which was found to be to the left of the
 223 -- previous context, we have generated a '{' token, and we now need to
 224 -- generate a matching '}' token.
 225 <layout_left>  ()                       { do_layout_left }
 226
 227 <0,option_prags> \n                             { begin bol }
 228
 229 "{-#" $whitechar* $pragmachar+ / { known_pragma linePrags }
 230                                 { dispatch_pragmas linePrags }
 231
 232 -- single-line line pragmas, of the form
 233 --    # <line> "<file>" <extra-stuff> \n
 234 <line_prag1> $decdigit+                 { setLine line_prag1a }
 235 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 236 <line_prag1b> .*                        { pop }
 237
 238 -- Haskell-style line pragmas, of the form
 239 --    {-# LINE <line> "<file>" #-}
 240 <line_prag2> $decdigit+                 { setLine line_prag2a }
 241 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 242 <line_prag2b> "#-}"|"-}"                { pop }
 243    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 244    -- with older versions of GHC which generated these.
 245
 246 <0,option_prags> {
 247   "{-#" $whitechar* $pragmachar+
 248         $whitechar+ $pragmachar+ / { known_pragma twoWordPrags }
 249                                  { dispatch_pragmas twoWordPrags }
 250
 251   "{-#" $whitechar* $pragmachar+ / { known_pragma oneWordPrags }
 252                                  { dispatch_pragmas oneWordPrags }
 253
 254   -- We ignore all these pragmas, but don't generate a warning for them
 255   "{-#" $whitechar* $pragmachar+ / { known_pragma ignoredPrags }
 256                                  { dispatch_pragmas ignoredPrags }
 257
 258   -- ToDo: should only be valid inside a pragma:
 259   "#-}"                                 { endPrag }
 260 }
 261
 262 <option_prags> {
 263   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 264                                    { dispatch_pragmas fileHeaderPrags }
 265
 266   "-- #"                                 { multiline_doc_comment }
 267 }
 268
 269 <0> {
 270   -- In the "0" mode we ignore these pragmas
 271   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 272                      { nested_comment lexToken }
 273 }
 274
 275 <0> {
 276   "-- #" .* { lineCommentToken }
 277 }
 278
 279 <0,option_prags> {
 280   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 281                     (nested_comment lexToken) }
 282 }
 283
 284 -- '0' state: ordinary lexemes
 285
 286 -- Haddock comments
 287
 288 <0> {
 289   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 290   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 291 }
 292
 293 -- "special" symbols
 294
 295 <0> {
 296   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 297   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 298 }
 299
 300 <0> {
 301   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 302   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 303   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 304   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 305   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 306   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 307   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 308   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 309
 310   "[$" @varid "|"  / { ifExtension qqEnabled }
 311                      { lex_quasiquote_tok }
 312 }
 313
 314 <0> {
 315   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 316                                         { special IToparenbar }
 317   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 318 }
 319
 320 <0> {
 321   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 322 }
 323
 324 <0> {
 325   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 326          { token IToubxparen }
 327   "#)" / { ifExtension unboxedTuplesEnabled }
 328          { token ITcubxparen }
 329 }
 330
 331 <0> {
 332   "{|" / { ifExtension genericsEnabled } { token ITocurlybar }
 333   "|}" / { ifExtension genericsEnabled } { token ITccurlybar }
 334 }
 335
 336 <0,option_prags> {
 337   \(                                    { special IToparen }
 338   \)                                    { special ITcparen }
 339   \[                                    { special ITobrack }
 340   \]                                    { special ITcbrack }
 341   \,                                    { special ITcomma }
 342   \;                                    { special ITsemi }
 343   \`                                    { special ITbackquote }
 344
 345   \{                                    { open_brace }
 346   \}                                    { close_brace }
 347 }
 348
 349 <0,option_prags> {
 350   @qual @varid                  { idtoken qvarid }
 351   @qual @conid                  { idtoken qconid }
 352   @varid                        { varid }
 353   @conid                        { idtoken conid }
 354 }
 355
 356 <0> {
 357   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 358   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 359   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 360   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 361 }
 362
 363 -- ToDo: - move `var` and (sym) into lexical syntax?
 364 --       - remove backquote from $special?
 365 <0> {
 366   @qual @varsym       / { ifExtension oldQualOps } { idtoken qvarsym }
 367   @qual @consym       / { ifExtension oldQualOps } { idtoken qconsym }
 368   @qual \( @varsym \) / { ifExtension newQualOps } { idtoken prefixqvarsym }
 369   @qual \( @consym \) / { ifExtension newQualOps } { idtoken prefixqconsym }
 370   @varsym                                          { varsym }
 371   @consym                                          { consym }
 372 }
 373
 374 -- For the normal boxed literals we need to be careful
 375 -- when trying to be close to Haskell98
 376 <0> {
 377   -- Normal integral literals (:: Num a => a, from Integer)
 378   @decimal           { tok_num positive 0 0 decimal }
 379   0[oO] @octal       { tok_num positive 2 2 octal }
 380   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 381
 382   -- Normal rational literals (:: Fractional a => a, from Rational)
 383   @floating_point    { strtoken tok_float }
 384 }
 385
 386 <0> {
 387   -- Unboxed ints (:: Int#) and words (:: Word#)
 388   -- It's simpler (and faster?) to give separate cases to the negatives,
 389   -- especially considering octal/hexadecimal prefixes.
 390   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 391   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 392   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 393   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 394   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 395   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 396
 397   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 398   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 399   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 400
 401   -- Unboxed floats and doubles (:: Float#, :: Double#)
 402   -- prim_{float,double} work with signed literals
 403   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 404   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 405 }
 406
 407 -- Strings and chars are lexed by hand-written code.  The reason is
 408 -- that even if we recognise the string or char here in the regex
 409 -- lexer, we would still have to parse the string afterward in order
 410 -- to convert it to a String.
 411 <0> {
 412   \'                            { lex_char_tok }
 413   \"                            { lex_string_tok }
 414 }
 415
 416 {
 417 -- -----------------------------------------------------------------------------
 418 -- The token type
 419
 420 data Token
 421   = ITas                        -- Haskell keywords
 422   | ITcase
 423   | ITclass
 424   | ITdata
 425   | ITdefault
 426   | ITderiving
 427   | ITdo
 428   | ITelse
 429   | IThiding
 430   | ITif
 431   | ITimport
 432   | ITin
 433   | ITinfix
 434   | ITinfixl
 435   | ITinfixr
 436   | ITinstance
 437   | ITlet
 438   | ITmodule
 439   | ITnewtype
 440   | ITof
 441   | ITqualified
 442   | ITthen
 443   | ITtype
 444   | ITwhere
 445   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 446
 447   | ITforall                    -- GHC extension keywords
 448   | ITforeign
 449   | ITexport
 450   | ITlabel
 451   | ITdynamic
 452   | ITsafe
 453   | ITthreadsafe
 454   | ITunsafe
 455   | ITstdcallconv
 456   | ITccallconv
 457   | ITprimcallconv
 458   | ITmdo
 459   | ITfamily
 460   | ITgroup
 461   | ITby
 462   | ITusing
 463
 464         -- Pragmas
 465   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 466   | ITinline_conlike_prag Bool  -- same
 467   | ITspec_prag                 -- SPECIALISE
 468   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 469   | ITsource_prag
 470   | ITrules_prag
 471   | ITwarning_prag
 472   | ITdeprecated_prag
 473   | ITline_prag
 474   | ITscc_prag
 475   | ITgenerated_prag
 476   | ITcore_prag                 -- hdaume: core annotations
 477   | ITunpack_prag
 478   | ITann_prag
 479   | ITclose_prag
 480   | IToptions_prag String
 481   | ITinclude_prag String
 482   | ITlanguage_prag
 483
 484   | ITdotdot                    -- reserved symbols
 485   | ITcolon
 486   | ITdcolon
 487   | ITequal
 488   | ITlam
 489   | ITvbar
 490   | ITlarrow
 491   | ITrarrow
 492   | ITat
 493   | ITtilde
 494   | ITdarrow
 495   | ITminus
 496   | ITbang
 497   | ITstar
 498   | ITdot
 499
 500   | ITbiglam                    -- GHC-extension symbols
 501
 502   | ITocurly                    -- special symbols
 503   | ITccurly
 504   | ITocurlybar                 -- {|, for type applications
 505   | ITccurlybar                 -- |}, for type applications
 506   | ITvocurly
 507   | ITvccurly
 508   | ITobrack
 509   | ITopabrack                  -- [:, for parallel arrays with -XParr
 510   | ITcpabrack                  -- :], for parallel arrays with -XParr
 511   | ITcbrack
 512   | IToparen
 513   | ITcparen
 514   | IToubxparen
 515   | ITcubxparen
 516   | ITsemi
 517   | ITcomma
 518   | ITunderscore
 519   | ITbackquote
 520
 521   | ITvarid   FastString        -- identifiers
 522   | ITconid   FastString
 523   | ITvarsym  FastString
 524   | ITconsym  FastString
 525   | ITqvarid  (FastString,FastString)
 526   | ITqconid  (FastString,FastString)
 527   | ITqvarsym (FastString,FastString)
 528   | ITqconsym (FastString,FastString)
 529   | ITprefixqvarsym (FastString,FastString)
 530   | ITprefixqconsym (FastString,FastString)
 531
 532   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 533
 534   | ITchar       Char
 535   | ITstring     FastString
 536   | ITinteger    Integer
 537   | ITrational   Rational
 538
 539   | ITprimchar   Char
 540   | ITprimstring FastString
 541   | ITprimint    Integer
 542   | ITprimword   Integer
 543   | ITprimfloat  Rational
 544   | ITprimdouble Rational
 545
 546   -- Template Haskell extension tokens
 547   | ITopenExpQuote              --  [| or [e|
 548   | ITopenPatQuote              --  [p|
 549   | ITopenDecQuote              --  [d|
 550   | ITopenTypQuote              --  [t|
 551   | ITcloseQuote                --  |]
 552   | ITidEscape   FastString     --  $x
 553   | ITparenEscape               --  $(
 554   | ITvarQuote                  --  '
 555   | ITtyQuote                   --  ''
 556   | ITquasiQuote (FastString,FastString,SrcSpan) --  [:...|...|]
 557
 558   -- Arrow notation extension
 559   | ITproc
 560   | ITrec
 561   | IToparenbar                 --  (|
 562   | ITcparenbar                 --  |)
 563   | ITlarrowtail                --  -<
 564   | ITrarrowtail                --  >-
 565   | ITLarrowtail                --  -<<
 566   | ITRarrowtail                --  >>-
 567
 568   | ITunknown String            -- Used when the lexer can't make sense of it
 569   | ITeof                       -- end of file token
 570
 571   -- Documentation annotations
 572   | ITdocCommentNext  String     -- something beginning '-- |'
 573   | ITdocCommentPrev  String     -- something beginning '-- ^'
 574   | ITdocCommentNamed String     -- something beginning '-- $'
 575   | ITdocSection      Int String -- a section heading
 576   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 577   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 578   | ITlineComment     String     -- comment starting by "--"
 579   | ITblockComment    String     -- comment in {- -}
 580
 581 #ifdef DEBUG
 582   deriving Show -- debugging
 583 #endif
 584
 585 {-
 586 isSpecial :: Token -> Bool
 587 -- If we see M.x, where x is a keyword, but
 588 -- is special, we treat is as just plain M.x,
 589 -- not as a keyword.
 590 isSpecial ITas          = True
 591 isSpecial IThiding      = True
 592 isSpecial ITqualified   = True
 593 isSpecial ITforall      = True
 594 isSpecial ITexport      = True
 595 isSpecial ITlabel       = True
 596 isSpecial ITdynamic     = True
 597 isSpecial ITsafe        = True
 598 isSpecial ITthreadsafe  = True
 599 isSpecial ITunsafe      = True
 600 isSpecial ITccallconv   = True
 601 isSpecial ITstdcallconv = True
 602 isSpecial ITprimcallconv = True
 603 isSpecial ITmdo         = True
 604 isSpecial ITfamily      = True
 605 isSpecial ITgroup   = True
 606 isSpecial ITby      = True
 607 isSpecial ITusing   = True
 608 isSpecial _             = False
 609 -}
 610
 611 -- the bitmap provided as the third component indicates whether the
 612 -- corresponding extension keyword is valid under the extension options
 613 -- provided to the compiler; if the extension corresponding to *any* of the
 614 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 615 -- facilitates using a keyword in two different extensions that can be
 616 -- activated independently)
 617 --
 618 reservedWordsFM :: UniqFM (Token, Int)
 619 reservedWordsFM = listToUFM $
 620         map (\(x, y, z) -> (mkFastString x, (y, z)))
 621        [( "_",          ITunderscore,   0 ),
 622         ( "as",         ITas,           0 ),
 623         ( "case",       ITcase,         0 ),
 624         ( "class",      ITclass,        0 ),
 625         ( "data",       ITdata,         0 ),
 626         ( "default",    ITdefault,      0 ),
 627         ( "deriving",   ITderiving,     0 ),
 628         ( "do",         ITdo,           0 ),
 629         ( "else",       ITelse,         0 ),
 630         ( "hiding",     IThiding,       0 ),
 631         ( "if",         ITif,           0 ),
 632         ( "import",     ITimport,       0 ),
 633         ( "in",         ITin,           0 ),
 634         ( "infix",      ITinfix,        0 ),
 635         ( "infixl",     ITinfixl,       0 ),
 636         ( "infixr",     ITinfixr,       0 ),
 637         ( "instance",   ITinstance,     0 ),
 638         ( "let",        ITlet,          0 ),
 639         ( "module",     ITmodule,       0 ),
 640         ( "newtype",    ITnewtype,      0 ),
 641         ( "of",         ITof,           0 ),
 642         ( "qualified",  ITqualified,    0 ),
 643         ( "then",       ITthen,         0 ),
 644         ( "type",       ITtype,         0 ),
 645         ( "where",      ITwhere,        0 ),
 646         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 647
 648     ( "forall", ITforall,        bit explicitForallBit .|. bit inRulePragBit),
 649         ( "mdo",        ITmdo,           bit recursiveDoBit),
 650         ( "family",     ITfamily,        bit tyFamBit),
 651     ( "group",  ITgroup,     bit transformComprehensionsBit),
 652     ( "by",     ITby,        bit transformComprehensionsBit),
 653     ( "using",  ITusing,     bit transformComprehensionsBit),
 654
 655         ( "foreign",    ITforeign,       bit ffiBit),
 656         ( "export",     ITexport,        bit ffiBit),
 657         ( "label",      ITlabel,         bit ffiBit),
 658         ( "dynamic",    ITdynamic,       bit ffiBit),
 659         ( "safe",       ITsafe,          bit ffiBit),
 660         ( "threadsafe", ITthreadsafe,    bit ffiBit),  -- ToDo: remove
 661         ( "unsafe",     ITunsafe,        bit ffiBit),
 662         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 663         ( "ccall",      ITccallconv,     bit ffiBit),
 664         ( "prim",       ITprimcallconv,  bit ffiBit),
 665
 666         ( "rec",        ITrec,           bit recBit),
 667         ( "proc",       ITproc,          bit arrowsBit)
 668      ]
 669
 670 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 671 reservedSymsFM = listToUFM $
 672     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 673       [ ("..",  ITdotdot,   always)
 674         -- (:) is a reserved op, meaning only list cons
 675        ,(":",   ITcolon,    always)
 676        ,("::",  ITdcolon,   always)
 677        ,("=",   ITequal,    always)
 678        ,("\\",  ITlam,      always)
 679        ,("|",   ITvbar,     always)
 680        ,("<-",  ITlarrow,   always)
 681        ,("->",  ITrarrow,   always)
 682        ,("@",   ITat,       always)
 683        ,("~",   ITtilde,    always)
 684        ,("=>",  ITdarrow,   always)
 685        ,("-",   ITminus,    always)
 686        ,("!",   ITbang,     always)
 687
 688         -- For data T (a::*) = MkT
 689        ,("*", ITstar, always) -- \i -> kindSigsEnabled i || tyFamEnabled i)
 690         -- For 'forall a . t'
 691        ,(".", ITdot,  always) -- \i -> explicitForallEnabled i || inRulePrag i)
 692
 693        ,("-<",  ITlarrowtail, arrowsEnabled)
 694        ,(">-",  ITrarrowtail, arrowsEnabled)
 695        ,("-<<", ITLarrowtail, arrowsEnabled)
 696        ,(">>-", ITRarrowtail, arrowsEnabled)
 697
 698        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 699        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 700        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 701                                 explicitForallEnabled i)
 702        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 703        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 704        ,("⋯",   ITdotdot, unicodeSyntaxEnabled)
 705         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 706         -- form part of a large operator.  This would let us have a better
 707         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 708        ]
 709
 710 -- -----------------------------------------------------------------------------
 711 -- Lexer actions
 712
 713 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 714
 715 special :: Token -> Action
 716 special tok span _buf _len = return (L span tok)
 717
 718 token, layout_token :: Token -> Action
 719 token t span _buf _len = return (L span t)
 720 layout_token t span _buf _len = pushLexState layout >> return (L span t)
 721
 722 idtoken :: (StringBuffer -> Int -> Token) -> Action
 723 idtoken f span buf len = return (L span $! (f buf len))
 724
 725 skip_one_varid :: (FastString -> Token) -> Action
 726 skip_one_varid f span buf len
 727   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 728
 729 strtoken :: (String -> Token) -> Action
 730 strtoken f span buf len =
 731   return (L span $! (f $! lexemeToString buf len))
 732
 733 init_strtoken :: Int -> (String -> Token) -> Action
 734 -- like strtoken, but drops the last N character(s)
 735 init_strtoken drop f span buf len =
 736   return (L span $! (f $! lexemeToString buf (len-drop)))
 737
 738 begin :: Int -> Action
 739 begin code _span _str _len = do pushLexState code; lexToken
 740
 741 pop :: Action
 742 pop _span _buf _len = do _ <- popLexState
 743                          lexToken
 744
 745 pop_and :: Action -> Action
 746 pop_and act span buf len = do _ <- popLexState
 747                               act span buf len
 748
 749 {-# INLINE nextCharIs #-}
 750 nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
 751 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 752
 753 notFollowedBy :: Char -> AlexAccPred Int
 754 notFollowedBy char _ _ _ (AI _ _ buf)
 755   = nextCharIs buf (/=char)
 756
 757 notFollowedBySymbol :: AlexAccPred Int
 758 notFollowedBySymbol _ _ _ (AI _ _ buf)
 759   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 760
 761 -- We must reject doc comments as being ordinary comments everywhere.
 762 -- In some cases the doc comment will be selected as the lexeme due to
 763 -- maximal munch, but not always, because the nested comment rule is
 764 -- valid in all states, but the doc-comment rules are only valid in
 765 -- the non-layout states.
 766 isNormalComment :: AlexAccPred Int
 767 isNormalComment bits _ _ (AI _ _ buf)
 768   | haddockEnabled bits = notFollowedByDocOrPragma
 769   | otherwise           = nextCharIs buf (/='#')
 770   where
 771     notFollowedByDocOrPragma
 772        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 773
 774 spaceAndP :: StringBuffer -> (StringBuffer -> Bool) -> Bool
 775 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 776
 777 {-
 778 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 779   = if haddockEnabled bits then False else (p buf)
 780 -}
 781
 782 atEOL :: AlexAccPred Int
 783 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 784
 785 ifExtension :: (Int -> Bool) -> AlexAccPred Int
 786 ifExtension pred bits _ _ _ = pred bits
 787
 788 multiline_doc_comment :: Action
 789 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 790   where
 791     worker commentAcc input docType oneLine = case alexGetChar input of
 792       Just ('\n', input')
 793         | oneLine -> docCommentEnd input commentAcc docType buf span
 794         | otherwise -> case checkIfCommentLine input' of
 795           Just input -> worker ('\n':commentAcc) input docType False
 796           Nothing -> docCommentEnd input commentAcc docType buf span
 797       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 798       Nothing -> docCommentEnd input commentAcc docType buf span
 799
 800     checkIfCommentLine input = check (dropNonNewlineSpace input)
 801       where
 802         check input = case alexGetChar input of
 803           Just ('-', input) -> case alexGetChar input of
 804             Just ('-', input) -> case alexGetChar input of
 805               Just (c, _) | c /= '-' -> Just input
 806               _ -> Nothing
 807             _ -> Nothing
 808           _ -> Nothing
 809
 810         dropNonNewlineSpace input = case alexGetChar input of
 811           Just (c, input')
 812             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 813             | otherwise -> input
 814           Nothing -> input
 815
 816 lineCommentToken :: Action
 817 lineCommentToken span buf len = do
 818   b <- extension rawTokenStreamEnabled
 819   if b then strtoken ITlineComment span buf len else lexToken
 820
 821 {-
 822   nested comments require traversing by hand, they can't be parsed
 823   using regular expressions.
 824 -}
 825 nested_comment :: P (Located Token) -> Action
 826 nested_comment cont span _str _len = do
 827   input <- getInput
 828   go "" (1::Int) input
 829   where
 830     go commentAcc 0 input = do setInput input
 831                                b <- extension rawTokenStreamEnabled
 832                                if b
 833                                  then docCommentEnd input commentAcc ITblockComment _str span
 834                                  else cont
 835     go commentAcc n input = case alexGetChar input of
 836       Nothing -> errBrace input span
 837       Just ('-',input) -> case alexGetChar input of
 838         Nothing  -> errBrace input span
 839         Just ('\125',input) -> go commentAcc (n-1) input
 840         Just (_,_)          -> go ('-':commentAcc) n input
 841       Just ('\123',input) -> case alexGetChar input of
 842         Nothing  -> errBrace input span
 843         Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
 844         Just (_,_)       -> go ('\123':commentAcc) n input
 845       Just (c,input) -> go (c:commentAcc) n input
 846
 847 nested_doc_comment :: Action
 848 nested_doc_comment span buf _len = withLexedDocType (go "")
 849   where
 850     go commentAcc input docType _ = case alexGetChar input of
 851       Nothing -> errBrace input span
 852       Just ('-',input) -> case alexGetChar input of
 853         Nothing -> errBrace input span
 854         Just ('\125',input) ->
 855           docCommentEnd input commentAcc docType buf span
 856         Just (_,_) -> go ('-':commentAcc) input docType False
 857       Just ('\123', input) -> case alexGetChar input of
 858         Nothing  -> errBrace input span
 859         Just ('-',input) -> do
 860           setInput input
 861           let cont = do input <- getInput; go commentAcc input docType False
 862           nested_comment cont span buf _len
 863         Just (_,_) -> go ('\123':commentAcc) input docType False
 864       Just (c,input) -> go (c:commentAcc) input docType False
 865
 866 withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (Located Token))
 867                  -> P (Located Token)
 868 withLexedDocType lexDocComment = do
 869   input@(AI _ _ buf) <- getInput
 870   case prevChar buf ' ' of
 871     '|' -> lexDocComment input ITdocCommentNext False
 872     '^' -> lexDocComment input ITdocCommentPrev False
 873     '$' -> lexDocComment input ITdocCommentNamed False
 874     '*' -> lexDocSection 1 input
 875     '#' -> lexDocComment input ITdocOptionsOld False
 876     _ -> panic "withLexedDocType: Bad doc type"
 877  where
 878     lexDocSection n input = case alexGetChar input of
 879       Just ('*', input) -> lexDocSection (n+1) input
 880       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 881       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 882
 883 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 884 -- off again at the end of the pragma.
 885 rulePrag :: Action
 886 rulePrag span _buf _len = do
 887   setExts (.|. bit inRulePragBit)
 888   return (L span ITrules_prag)
 889
 890 endPrag :: Action
 891 endPrag span _buf _len = do
 892   setExts (.&. complement (bit inRulePragBit))
 893   return (L span ITclose_prag)
 894
 895 -- docCommentEnd
 896 -------------------------------------------------------------------------------
 897 -- This function is quite tricky. We can't just return a new token, we also
 898 -- need to update the state of the parser. Why? Because the token is longer
 899 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 900 -- it writes the wrong token length to the parser state. This function is
 901 -- called afterwards, so it can just update the state.
 902
 903 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 904 -- which is something that the original lexer didn't account for.
 905 -- I have added last_line_len in the parser state which represents the length
 906 -- of the part of the token that is on the last line. It is now used for layout
 907 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 908 -- was before, the full length of the token, and it is now only used for error
 909 -- messages. /Waern
 910
 911 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 912                  SrcSpan -> P (Located Token)
 913 docCommentEnd input commentAcc docType buf span = do
 914   setInput input
 915   let (AI loc last_offs nextBuf) = input
 916       comment = reverse commentAcc
 917       span' = mkSrcSpan (srcSpanStart span) loc
 918       last_len = byteDiff buf nextBuf
 919
 920       last_line_len = if (last_offs - last_len < 0)
 921         then last_offs
 922         else last_len
 923
 924   span `seq` setLastToken span' last_len last_line_len
 925   return (L span' (docType comment))
 926
 927 errBrace :: AlexInput -> SrcSpan -> P a
 928 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 929
 930 open_brace, close_brace :: Action
 931 open_brace span _str _len = do
 932   ctx <- getContext
 933   setContext (NoLayout:ctx)
 934   return (L span ITocurly)
 935 close_brace span _str _len = do
 936   popContext
 937   return (L span ITccurly)
 938
 939 qvarid, qconid :: StringBuffer -> Int -> Token
 940 qvarid buf len = ITqvarid $! splitQualName buf len False
 941 qconid buf len = ITqconid $! splitQualName buf len False
 942
 943 splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
 944 -- takes a StringBuffer and a length, and returns the module name
 945 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 946 -- because of hierarchical module names.
 947 splitQualName orig_buf len parens = split orig_buf orig_buf
 948   where
 949     split buf dot_buf
 950         | orig_buf `byteDiff` buf >= len  = done dot_buf
 951         | c == '.'                        = found_dot buf'
 952         | otherwise                       = split buf' dot_buf
 953       where
 954        (c,buf') = nextChar buf
 955
 956     -- careful, we might get names like M....
 957     -- so, if the character after the dot is not upper-case, this is
 958     -- the end of the qualifier part.
 959     found_dot buf -- buf points after the '.'
 960         | isUpper c    = split buf' buf
 961         | otherwise    = done buf
 962       where
 963        (c,buf') = nextChar buf
 964
 965     done dot_buf =
 966         (lexemeToFastString orig_buf (qual_size - 1),
 967          if parens -- Prelude.(+)
 968             then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
 969             else lexemeToFastString dot_buf (len - qual_size))
 970       where
 971         qual_size = orig_buf `byteDiff` dot_buf
 972
 973 varid :: Action
 974 varid span buf len =
 975   fs `seq`
 976   case lookupUFM reservedWordsFM fs of
 977         Just (keyword,0)    -> do
 978                 maybe_layout keyword
 979                 return (L span keyword)
 980         Just (keyword,exts) -> do
 981                 b <- extension (\i -> exts .&. i /= 0)
 982                 if b then do maybe_layout keyword
 983                              return (L span keyword)
 984                      else return (L span (ITvarid fs))
 985         _other -> return (L span (ITvarid fs))
 986   where
 987         fs = lexemeToFastString buf len
 988
 989 conid :: StringBuffer -> Int -> Token
 990 conid buf len = ITconid fs
 991   where fs = lexemeToFastString buf len
 992
 993 qvarsym, qconsym, prefixqvarsym, prefixqconsym :: StringBuffer -> Int -> Token
 994 qvarsym buf len = ITqvarsym $! splitQualName buf len False
 995 qconsym buf len = ITqconsym $! splitQualName buf len False
 996 prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
 997 prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
 998
 999 varsym, consym :: Action
1000 varsym = sym ITvarsym
1001 consym = sym ITconsym
1002
1003 sym :: (FastString -> Token) -> SrcSpan -> StringBuffer -> Int
1004     -> P (Located Token)
1005 sym con span buf len =
1006   case lookupUFM reservedSymsFM fs of
1007         Just (keyword,exts) -> do
1008                 b <- extension exts
1009                 if b then return (L span keyword)
1010                      else return (L span $! con fs)
1011         _other -> return (L span $! con fs)
1012   where
1013         fs = lexemeToFastString buf len
1014
1015 -- Variations on the integral numeric literal.
1016 tok_integral :: (Integer -> Token)
1017      -> (Integer -> Integer)
1018  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
1019      -> Int -> Int
1020      -> (Integer, (Char->Int)) -> Action
1021 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
1022   return $ L span $ itint $! transint $ parseUnsignedInteger
1023      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
1024
1025 -- some conveniences for use with tok_integral
1026 tok_num :: (Integer -> Integer)
1027         -> Int -> Int
1028         -> (Integer, (Char->Int)) -> Action
1029 tok_num = tok_integral ITinteger
1030 tok_primint :: (Integer -> Integer)
1031             -> Int -> Int
1032             -> (Integer, (Char->Int)) -> Action
1033 tok_primint = tok_integral ITprimint
1034 tok_primword :: Int -> Int
1035              -> (Integer, (Char->Int)) -> Action
1036 tok_primword = tok_integral ITprimword positive
1037 positive, negative :: (Integer -> Integer)
1038 positive = id
1039 negative = negate
1040 decimal, octal, hexadecimal :: (Integer, Char -> Int)
1041 decimal = (10,octDecDigit)
1042 octal = (8,octDecDigit)
1043 hexadecimal = (16,hexDigit)
1044
1045 -- readRational can understand negative rationals, exponents, everything.
1046 tok_float, tok_primfloat, tok_primdouble :: String -> Token
1047 tok_float        str = ITrational   $! readRational str
1048 tok_primfloat    str = ITprimfloat  $! readRational str
1049 tok_primdouble   str = ITprimdouble $! readRational str
1050
1051 -- -----------------------------------------------------------------------------
1052 -- Layout processing
1053
1054 -- we're at the first token on a line, insert layout tokens if necessary
1055 do_bol :: Action
1056 do_bol span _str _len = do
1057         pos <- getOffside
1058         case pos of
1059             LT -> do
1060                 --trace "layout: inserting '}'" $ do
1061                 popContext
1062                 -- do NOT pop the lex state, we might have a ';' to insert
1063                 return (L span ITvccurly)
1064             EQ -> do
1065                 --trace "layout: inserting ';'" $ do
1066                 _ <- popLexState
1067                 return (L span ITsemi)
1068             GT -> do
1069                 _ <- popLexState
1070                 lexToken
1071
1072 -- certain keywords put us in the "layout" state, where we might
1073 -- add an opening curly brace.
1074 maybe_layout :: Token -> P ()
1075 maybe_layout t = do -- If the alternative layout rule is enabled then
1076                     -- we never create an implicit layout context here.
1077                     -- Layout is handled XXX instead.
1078                     -- The code for closing implicit contexts, or
1079                     -- inserting implicit semi-colons, is therefore
1080                     -- irrelevant as it only applies in an implicit
1081                     -- context.
1082                     alr <- extension alternativeLayoutRule
1083                     unless alr $ f t
1084     where f ITdo    = pushLexState layout_do
1085           f ITmdo   = pushLexState layout_do
1086           f ITof    = pushLexState layout
1087           f ITlet   = pushLexState layout
1088           f ITwhere = pushLexState layout
1089           f ITrec   = pushLexState layout
1090           f _       = return ()
1091
1092 -- Pushing a new implicit layout context.  If the indentation of the
1093 -- next token is not greater than the previous layout context, then
1094 -- Haskell 98 says that the new layout context should be empty; that is
1095 -- the lexer must generate {}.
1096 --
1097 -- We are slightly more lenient than this: when the new context is started
1098 -- by a 'do', then we allow the new context to be at the same indentation as
1099 -- the previous context.  This is what the 'strict' argument is for.
1100 --
1101 new_layout_context :: Bool -> Action
1102 new_layout_context strict span _buf _len = do
1103     _ <- popLexState
1104     (AI _ offset _) <- getInput
1105     ctx <- getContext
1106     case ctx of
1107         Layout prev_off : _  |
1108            (strict     && prev_off >= offset  ||
1109             not strict && prev_off > offset) -> do
1110                 -- token is indented to the left of the previous context.
1111                 -- we must generate a {} sequence now.
1112                 pushLexState layout_left
1113                 return (L span ITvocurly)
1114         _ -> do
1115                 setContext (Layout offset : ctx)
1116                 return (L span ITvocurly)
1117
1118 do_layout_left :: Action
1119 do_layout_left span _buf _len = do
1120     _ <- popLexState
1121     pushLexState bol  -- we must be at the start of a line
1122     return (L span ITvccurly)
1123
1124 -- -----------------------------------------------------------------------------
1125 -- LINE pragmas
1126
1127 setLine :: Int -> Action
1128 setLine code span buf len = do
1129   let line = parseUnsignedInteger buf len 10 octDecDigit
1130   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
1131         -- subtract one: the line number refers to the *following* line
1132   _ <- popLexState
1133   pushLexState code
1134   lexToken
1135
1136 setFile :: Int -> Action
1137 setFile code span buf len = do
1138   let file = lexemeToFastString (stepOn buf) (len-2)
1139   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1140   _ <- popLexState
1141   pushLexState code
1142   lexToken
1143
1144
1145 -- -----------------------------------------------------------------------------
1146 -- Options, includes and language pragmas.
1147
1148 lex_string_prag :: (String -> Token) -> Action
1149 lex_string_prag mkTok span _buf _len
1150     = do input <- getInput
1151          start <- getSrcLoc
1152          tok <- go [] input
1153          end <- getSrcLoc
1154          return (L (mkSrcSpan start end) tok)
1155     where go acc input
1156               = if isString input "#-}"
1157                    then do setInput input
1158                            return (mkTok (reverse acc))
1159                    else case alexGetChar input of
1160                           Just (c,i) -> go (c:acc) i
1161                           Nothing -> err input
1162           isString _ [] = True
1163           isString i (x:xs)
1164               = case alexGetChar i of
1165                   Just (c,i') | c == x    -> isString i' xs
1166                   _other -> False
1167           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1168
1169
1170 -- -----------------------------------------------------------------------------
1171 -- Strings & Chars
1172
1173 -- This stuff is horrible.  I hates it.
1174
1175 lex_string_tok :: Action
1176 lex_string_tok span _buf _len = do
1177   tok <- lex_string ""
1178   end <- getSrcLoc
1179   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1180
1181 lex_string :: String -> P Token
1182 lex_string s = do
1183   i <- getInput
1184   case alexGetChar' i of
1185     Nothing -> lit_error
1186
1187     Just ('"',i)  -> do
1188         setInput i
1189         magicHash <- extension magicHashEnabled
1190         if magicHash
1191           then do
1192             i <- getInput
1193             case alexGetChar' i of
1194               Just ('#',i) -> do
1195                    setInput i
1196                    if any (> '\xFF') s
1197                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1198                     else let s' = mkZFastString (reverse s) in
1199                          return (ITprimstring s')
1200                         -- mkZFastString is a hack to avoid encoding the
1201                         -- string in UTF-8.  We just want the exact bytes.
1202               _other ->
1203                 return (ITstring (mkFastString (reverse s)))
1204           else
1205                 return (ITstring (mkFastString (reverse s)))
1206
1207     Just ('\\',i)
1208         | Just ('&',i) <- next -> do
1209                 setInput i; lex_string s
1210         | Just (c,i) <- next, is_space c -> do
1211                 setInput i; lex_stringgap s
1212         where next = alexGetChar' i
1213
1214     Just (c, i) -> do
1215         c' <- lex_char c i
1216         lex_string (c':s)
1217
1218 lex_stringgap :: String -> P Token
1219 lex_stringgap s = do
1220   c <- getCharOrFail
1221   case c of
1222     '\\' -> lex_string s
1223     c | is_space c -> lex_stringgap s
1224     _other -> lit_error
1225
1226
1227 lex_char_tok :: Action
1228 -- Here we are basically parsing character literals, such as 'x' or '\n'
1229 -- but, when Template Haskell is on, we additionally spot
1230 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1231 -- but WITHOUT CONSUMING the x or T part  (the parser does that).
1232 -- So we have to do two characters of lookahead: when we see 'x we need to
1233 -- see if there's a trailing quote
1234 lex_char_tok span _buf _len = do        -- We've seen '
1235    i1 <- getInput       -- Look ahead to first character
1236    let loc = srcSpanStart span
1237    case alexGetChar' i1 of
1238         Nothing -> lit_error
1239
1240         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
1241                   th_exts <- extension thEnabled
1242                   if th_exts then do
1243                         setInput i2
1244                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1245                    else lit_error
1246
1247         Just ('\\', i2@(AI _end2 _ _)) -> do    -- We've seen 'backslash
1248                   setInput i2
1249                   lit_ch <- lex_escape
1250                   mc <- getCharOrFail   -- Trailing quote
1251                   if mc == '\'' then finish_char_tok loc lit_ch
1252                                 else do setInput i2; lit_error
1253
1254         Just (c, i2@(AI _end2 _ _))
1255                 | not (isAny c) -> lit_error
1256                 | otherwise ->
1257
1258                 -- We've seen 'x, where x is a valid character
1259                 --  (i.e. not newline etc) but not a quote or backslash
1260            case alexGetChar' i2 of      -- Look ahead one more character
1261                 Just ('\'', i3) -> do   -- We've seen 'x'
1262                         setInput i3
1263                         finish_char_tok loc c
1264                 _other -> do            -- We've seen 'x not followed by quote
1265                                         -- (including the possibility of EOF)
1266                                         -- If TH is on, just parse the quote only
1267                         th_exts <- extension thEnabled
1268                         let (AI end _ _) = i1
1269                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1270                                    else do setInput i2; lit_error
1271
1272 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1273 finish_char_tok loc ch  -- We've already seen the closing quote
1274                         -- Just need to check for trailing #
1275   = do  magicHash <- extension magicHashEnabled
1276         i@(AI end _ _) <- getInput
1277         if magicHash then do
1278                 case alexGetChar' i of
1279                         Just ('#',i@(AI end _ _)) -> do
1280                                 setInput i
1281                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1282                         _other ->
1283                                 return (L (mkSrcSpan loc end) (ITchar ch))
1284                 else do
1285                    return (L (mkSrcSpan loc end) (ITchar ch))
1286
1287 lex_char :: Char -> AlexInput -> P Char
1288 lex_char c inp = do
1289   case c of
1290       '\\' -> do setInput inp; lex_escape
1291       c | isAny c -> do setInput inp; return c
1292       _other -> lit_error
1293
1294 isAny :: Char -> Bool
1295 isAny c | c > '\x7f' = isPrint c
1296         | otherwise  = is_any c
1297
1298 lex_escape :: P Char
1299 lex_escape = do
1300   c <- getCharOrFail
1301   case c of
1302         'a'   -> return '\a'
1303         'b'   -> return '\b'
1304         'f'   -> return '\f'
1305         'n'   -> return '\n'
1306         'r'   -> return '\r'
1307         't'   -> return '\t'
1308         'v'   -> return '\v'
1309         '\\'  -> return '\\'
1310         '"'   -> return '\"'
1311         '\''  -> return '\''
1312         '^'   -> do c <- getCharOrFail
1313                     if c >= '@' && c <= '_'
1314                         then return (chr (ord c - ord '@'))
1315                         else lit_error
1316
1317         'x'   -> readNum is_hexdigit 16 hexDigit
1318         'o'   -> readNum is_octdigit  8 octDecDigit
1319         x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1320
1321         c1 ->  do
1322            i <- getInput
1323            case alexGetChar' i of
1324             Nothing -> lit_error
1325             Just (c2,i2) ->
1326               case alexGetChar' i2 of
1327                 Nothing -> do setInput i2; lit_error
1328                 Just (c3,i3) ->
1329                    let str = [c1,c2,c3] in
1330                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1331                                      Just rest <- [stripPrefix p str] ] of
1332                           (escape_char,[]):_ -> do
1333                                 setInput i3
1334                                 return escape_char
1335                           (escape_char,_:_):_ -> do
1336                                 setInput i2
1337                                 return escape_char
1338                           [] -> lit_error
1339
1340 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1341 readNum is_digit base conv = do
1342   i <- getInput
1343   c <- getCharOrFail
1344   if is_digit c
1345         then readNum2 is_digit base conv (conv c)
1346         else do setInput i; lit_error
1347
1348 readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char
1349 readNum2 is_digit base conv i = do
1350   input <- getInput
1351   read i input
1352   where read i input = do
1353           case alexGetChar' input of
1354             Just (c,input') | is_digit c -> do
1355                 read (i*base + conv c) input'
1356             _other -> do
1357                 if i >= 0 && i <= 0x10FFFF
1358                    then do setInput input; return (chr i)
1359                    else lit_error
1360
1361 silly_escape_chars :: [(String, Char)]
1362 silly_escape_chars = [
1363         ("NUL", '\NUL'),
1364         ("SOH", '\SOH'),
1365         ("STX", '\STX'),
1366         ("ETX", '\ETX'),
1367         ("EOT", '\EOT'),
1368         ("ENQ", '\ENQ'),
1369         ("ACK", '\ACK'),
1370         ("BEL", '\BEL'),
1371         ("BS", '\BS'),
1372         ("HT", '\HT'),
1373         ("LF", '\LF'),
1374         ("VT", '\VT'),
1375         ("FF", '\FF'),
1376         ("CR", '\CR'),
1377         ("SO", '\SO'),
1378         ("SI", '\SI'),
1379         ("DLE", '\DLE'),
1380         ("DC1", '\DC1'),
1381         ("DC2", '\DC2'),
1382         ("DC3", '\DC3'),
1383         ("DC4", '\DC4'),
1384         ("NAK", '\NAK'),
1385         ("SYN", '\SYN'),
1386         ("ETB", '\ETB'),
1387         ("CAN", '\CAN'),
1388         ("EM", '\EM'),
1389         ("SUB", '\SUB'),
1390         ("ESC", '\ESC'),
1391         ("FS", '\FS'),
1392         ("GS", '\GS'),
1393         ("RS", '\RS'),
1394         ("US", '\US'),
1395         ("SP", '\SP'),
1396         ("DEL", '\DEL')
1397         ]
1398
1399 -- before calling lit_error, ensure that the current input is pointing to
1400 -- the position of the error in the buffer.  This is so that we can report
1401 -- a correct location to the user, but also so we can detect UTF-8 decoding
1402 -- errors if they occur.
1403 lit_error :: P a
1404 lit_error = lexError "lexical error in string/character literal"
1405
1406 getCharOrFail :: P Char
1407 getCharOrFail =  do
1408   i <- getInput
1409   case alexGetChar' i of
1410         Nothing -> lexError "unexpected end-of-file in string/character literal"
1411         Just (c,i)  -> do setInput i; return c
1412
1413 -- -----------------------------------------------------------------------------
1414 -- QuasiQuote
1415
1416 lex_quasiquote_tok :: Action
1417 lex_quasiquote_tok span buf len = do
1418   let quoter = reverse $ takeWhile (/= '$')
1419                $ reverse $ lexemeToString buf (len - 1)
1420   quoteStart <- getSrcLoc
1421   quote <- lex_quasiquote ""
1422   end <- getSrcLoc
1423   return (L (mkSrcSpan (srcSpanStart span) end)
1424            (ITquasiQuote (mkFastString quoter,
1425                           mkFastString (reverse quote),
1426                           mkSrcSpan quoteStart end)))
1427
1428 lex_quasiquote :: String -> P String
1429 lex_quasiquote s = do
1430   i <- getInput
1431   case alexGetChar' i of
1432     Nothing -> lit_error
1433
1434     Just ('\\',i)
1435         | Just ('|',i) <- next -> do
1436                 setInput i; lex_quasiquote ('|' : s)
1437         | Just (']',i) <- next -> do
1438                 setInput i; lex_quasiquote (']' : s)
1439         where next = alexGetChar' i
1440
1441     Just ('|',i)
1442         | Just (']',i) <- next -> do
1443                 setInput i; return s
1444         where next = alexGetChar' i
1445
1446     Just (c, i) -> do
1447          setInput i; lex_quasiquote (c : s)
1448
1449 -- -----------------------------------------------------------------------------
1450 -- Warnings
1451
1452 warn :: DynFlag -> SDoc -> Action
1453 warn option warning srcspan _buf _len = do
1454     addWarning option srcspan warning
1455     lexToken
1456
1457 warnThen :: DynFlag -> SDoc -> Action -> Action
1458 warnThen option warning action srcspan buf len = do
1459     addWarning option srcspan warning
1460     action srcspan buf len
1461
1462 -- -----------------------------------------------------------------------------
1463 -- The Parse Monad
1464
1465 data LayoutContext
1466   = NoLayout
1467   | Layout !Int
1468   deriving Show
1469
1470 data ParseResult a
1471   = POk PState a
1472   | PFailed
1473         SrcSpan         -- The start and end of the text span related to
1474                         -- the error.  Might be used in environments which can
1475                         -- show this span, e.g. by highlighting it.
1476         Message         -- The error message
1477
1478 data PState = PState {
1479         buffer     :: StringBuffer,
1480         dflags     :: DynFlags,
1481         messages   :: Messages,
1482         last_loc   :: SrcSpan,  -- pos of previous token
1483         last_offs  :: !Int,     -- offset of the previous token from the
1484                                 -- beginning of  the current line.
1485                                 -- \t is equal to 8 spaces.
1486         last_len   :: !Int,     -- len of previous token
1487         last_line_len :: !Int,
1488         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1489         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1490         context    :: [LayoutContext],
1491         lex_state  :: [Int],
1492         -- Used in the alternative layout rule:
1493         alr_pending_implicit_tokens :: [Located Token],
1494         alr_next_token :: Maybe (Located Token),
1495         alr_last_loc :: SrcSpan,
1496         alr_context :: [ALRContext],
1497         alr_expecting_ocurly :: Maybe ALRLayout
1498      }
1499         -- last_loc and last_len are used when generating error messages,
1500         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1501         -- current token to happyError, we could at least get rid of last_len.
1502         -- Getting rid of last_loc would require finding another way to
1503         -- implement pushCurrentContext (which is only called from one place).
1504
1505 data ALRContext = ALRNoLayout Bool{- does it contain commas? -}
1506                 | ALRLayout ALRLayout Int
1507 data ALRLayout = ALRLayoutLet
1508                | ALRLayoutWhere
1509                | ALRLayoutOf
1510                | ALRLayoutDo
1511
1512 newtype P a = P { unP :: PState -> ParseResult a }
1513
1514 instance Monad P where
1515   return = returnP
1516   (>>=) = thenP
1517   fail = failP
1518
1519 returnP :: a -> P a
1520 returnP a = a `seq` (P $ \s -> POk s a)
1521
1522 thenP :: P a -> (a -> P b) -> P b
1523 (P m) `thenP` k = P $ \ s ->
1524         case m s of
1525                 POk s1 a         -> (unP (k a)) s1
1526                 PFailed span err -> PFailed span err
1527
1528 failP :: String -> P a
1529 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1530
1531 failMsgP :: String -> P a
1532 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1533
1534 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1535 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (mkSrcSpan loc1 loc2) (text str)
1536
1537 failSpanMsgP :: SrcSpan -> SDoc -> P a
1538 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1539
1540 getPState :: P PState
1541 getPState = P $ \s -> POk s s
1542
1543 getDynFlags :: P DynFlags
1544 getDynFlags = P $ \s -> POk s (dflags s)
1545
1546 withThisPackage :: (PackageId -> a) -> P a
1547 withThisPackage f
1548  = do   pkg     <- liftM thisPackage getDynFlags
1549         return  $ f pkg
1550
1551 extension :: (Int -> Bool) -> P Bool
1552 extension p = P $ \s -> POk s (p $! extsBitmap s)
1553
1554 getExts :: P Int
1555 getExts = P $ \s -> POk s (extsBitmap s)
1556
1557 setExts :: (Int -> Int) -> P ()
1558 setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
1559
1560 setSrcLoc :: SrcLoc -> P ()
1561 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1562
1563 getSrcLoc :: P SrcLoc
1564 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1565
1566 setLastToken :: SrcSpan -> Int -> Int -> P ()
1567 setLastToken loc len line_len = P $ \s -> POk s {
1568   last_loc=loc,
1569   last_len=len,
1570   last_line_len=line_len
1571 } ()
1572
1573 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1574
1575 alexInputPrevChar :: AlexInput -> Char
1576 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1577
1578 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1579 alexGetChar (AI loc ofs s)
1580   | atEnd s   = Nothing
1581   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1582                 --trace (show (ord c)) $
1583                 Just (adj_c, (AI loc' ofs' s'))
1584   where (c,s') = nextChar s
1585         loc'   = advanceSrcLoc loc c
1586         ofs'   = advanceOffs c ofs
1587
1588         non_graphic     = '\x0'
1589         upper           = '\x1'
1590         lower           = '\x2'
1591         digit           = '\x3'
1592         symbol          = '\x4'
1593         space           = '\x5'
1594         other_graphic   = '\x6'
1595
1596         adj_c
1597           | c <= '\x06' = non_graphic
1598           | c <= '\x7f' = c
1599           -- Alex doesn't handle Unicode, so when Unicode
1600           -- character is encountered we output these values
1601           -- with the actual character value hidden in the state.
1602           | otherwise =
1603                 case generalCategory c of
1604                   UppercaseLetter       -> upper
1605                   LowercaseLetter       -> lower
1606                   TitlecaseLetter       -> upper
1607                   ModifierLetter        -> other_graphic
1608                   OtherLetter           -> lower -- see #1103
1609                   NonSpacingMark        -> other_graphic
1610                   SpacingCombiningMark  -> other_graphic
1611                   EnclosingMark         -> other_graphic
1612                   DecimalNumber         -> digit
1613                   LetterNumber          -> other_graphic
1614                   OtherNumber           -> other_graphic
1615                   ConnectorPunctuation  -> symbol
1616                   DashPunctuation       -> symbol
1617                   OpenPunctuation       -> other_graphic
1618                   ClosePunctuation      -> other_graphic
1619                   InitialQuote          -> other_graphic
1620                   FinalQuote            -> other_graphic
1621                   OtherPunctuation      -> symbol
1622                   MathSymbol            -> symbol
1623                   CurrencySymbol        -> symbol
1624                   ModifierSymbol        -> symbol
1625                   OtherSymbol           -> symbol
1626                   Space                 -> space
1627                   _other                -> non_graphic
1628
1629 -- This version does not squash unicode characters, it is used when
1630 -- lexing strings.
1631 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1632 alexGetChar' (AI loc ofs s)
1633   | atEnd s   = Nothing
1634   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1635                 --trace (show (ord c)) $
1636                 Just (c, (AI loc' ofs' s'))
1637   where (c,s') = nextChar s
1638         loc'   = advanceSrcLoc loc c
1639         ofs'   = advanceOffs c ofs
1640
1641 advanceOffs :: Char -> Int -> Int
1642 advanceOffs '\n' _    = 0
1643 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1644 advanceOffs _    offs = offs + 1
1645
1646 getInput :: P AlexInput
1647 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1648
1649 setInput :: AlexInput -> P ()
1650 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1651
1652 pushLexState :: Int -> P ()
1653 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1654
1655 popLexState :: P Int
1656 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1657
1658 getLexState :: P Int
1659 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1660
1661 popNextToken :: P (Maybe (Located Token))
1662 popNextToken
1663     = P $ \s@PState{ alr_next_token = m } ->
1664               POk (s {alr_next_token = Nothing}) m
1665
1666 setAlrLastLoc :: SrcSpan -> P ()
1667 setAlrLastLoc l = P $ \s -> POk (s {alr_last_loc = l}) ()
1668
1669 getAlrLastLoc :: P SrcSpan
1670 getAlrLastLoc = P $ \s@(PState {alr_last_loc = l}) -> POk s l
1671
1672 getALRContext :: P [ALRContext]
1673 getALRContext = P $ \s@(PState {alr_context = cs}) -> POk s cs
1674
1675 setALRContext :: [ALRContext] -> P ()
1676 setALRContext cs = P $ \s -> POk (s {alr_context = cs}) ()
1677
1678 setNextToken :: Located Token -> P ()
1679 setNextToken t = P $ \s -> POk (s {alr_next_token = Just t}) ()
1680
1681 popPendingImplicitToken :: P (Maybe (Located Token))
1682 popPendingImplicitToken
1683     = P $ \s@PState{ alr_pending_implicit_tokens = ts } ->
1684               case ts of
1685               [] -> POk s Nothing
1686               (t : ts') -> POk (s {alr_pending_implicit_tokens = ts'}) (Just t)
1687
1688 setPendingImplicitTokens :: [Located Token] -> P ()
1689 setPendingImplicitTokens ts = P $ \s -> POk (s {alr_pending_implicit_tokens = ts}) ()
1690
1691 getAlrExpectingOCurly :: P (Maybe ALRLayout)
1692 getAlrExpectingOCurly = P $ \s@(PState {alr_expecting_ocurly = b}) -> POk s b
1693
1694 setAlrExpectingOCurly :: Maybe ALRLayout -> P ()
1695 setAlrExpectingOCurly b = P $ \s -> POk (s {alr_expecting_ocurly = b}) ()
1696
1697 -- for reasons of efficiency, flags indicating language extensions (eg,
1698 -- -fglasgow-exts or -XParr) are represented by a bitmap stored in an unboxed
1699 -- integer
1700
1701 genericsBit :: Int
1702 genericsBit = 0 -- {| and |}
1703 ffiBit :: Int
1704 ffiBit     = 1
1705 parrBit :: Int
1706 parrBit    = 2
1707 arrowsBit :: Int
1708 arrowsBit  = 4
1709 thBit :: Int
1710 thBit      = 5
1711 ipBit :: Int
1712 ipBit      = 6
1713 explicitForallBit :: Int
1714 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1715 bangPatBit :: Int
1716 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1717                 -- (doesn't affect the lexer)
1718 tyFamBit :: Int
1719 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1720 haddockBit :: Int
1721 haddockBit = 10 -- Lex and parse Haddock comments
1722 magicHashBit :: Int
1723 magicHashBit = 11 -- "#" in both functions and operators
1724 kindSigsBit :: Int
1725 kindSigsBit = 12 -- Kind signatures on type variables
1726 recursiveDoBit :: Int
1727 recursiveDoBit = 13 -- mdo
1728 unicodeSyntaxBit :: Int
1729 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1730 unboxedTuplesBit :: Int
1731 unboxedTuplesBit = 15 -- (# and #)
1732 standaloneDerivingBit :: Int
1733 standaloneDerivingBit = 16 -- standalone instance deriving declarations
1734 transformComprehensionsBit :: Int
1735 transformComprehensionsBit = 17
1736 qqBit :: Int
1737 qqBit      = 18 -- enable quasiquoting
1738 inRulePragBit :: Int
1739 inRulePragBit = 19
1740 rawTokenStreamBit :: Int
1741 rawTokenStreamBit = 20 -- producing a token stream with all comments included
1742 newQualOpsBit :: Int
1743 newQualOpsBit = 21 -- Haskell' qualified operator syntax, e.g. Prelude.(+)
1744 recBit :: Int
1745 recBit = 22 -- rec
1746 alternativeLayoutRuleBit :: Int
1747 alternativeLayoutRuleBit = 23
1748
1749 always :: Int -> Bool
1750 always           _     = True
1751 genericsEnabled :: Int -> Bool
1752 genericsEnabled  flags = testBit flags genericsBit
1753 parrEnabled :: Int -> Bool
1754 parrEnabled      flags = testBit flags parrBit
1755 arrowsEnabled :: Int -> Bool
1756 arrowsEnabled    flags = testBit flags arrowsBit
1757 thEnabled :: Int -> Bool
1758 thEnabled        flags = testBit flags thBit
1759 ipEnabled :: Int -> Bool
1760 ipEnabled        flags = testBit flags ipBit
1761 explicitForallEnabled :: Int -> Bool
1762 explicitForallEnabled flags = testBit flags explicitForallBit
1763 bangPatEnabled :: Int -> Bool
1764 bangPatEnabled   flags = testBit flags bangPatBit
1765 -- tyFamEnabled :: Int -> Bool
1766 -- tyFamEnabled     flags = testBit flags tyFamBit
1767 haddockEnabled :: Int -> Bool
1768 haddockEnabled   flags = testBit flags haddockBit
1769 magicHashEnabled :: Int -> Bool
1770 magicHashEnabled flags = testBit flags magicHashBit
1771 -- kindSigsEnabled :: Int -> Bool
1772 -- kindSigsEnabled  flags = testBit flags kindSigsBit
1773 unicodeSyntaxEnabled :: Int -> Bool
1774 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1775 unboxedTuplesEnabled :: Int -> Bool
1776 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1777 standaloneDerivingEnabled :: Int -> Bool
1778 standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
1779 qqEnabled :: Int -> Bool
1780 qqEnabled        flags = testBit flags qqBit
1781 -- inRulePrag :: Int -> Bool
1782 -- inRulePrag       flags = testBit flags inRulePragBit
1783 rawTokenStreamEnabled :: Int -> Bool
1784 rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
1785 newQualOps :: Int -> Bool
1786 newQualOps       flags = testBit flags newQualOpsBit
1787 oldQualOps :: Int -> Bool
1788 oldQualOps flags = not (newQualOps flags)
1789 alternativeLayoutRule :: Int -> Bool
1790 alternativeLayoutRule flags = testBit flags alternativeLayoutRuleBit
1791
1792 -- PState for parsing options pragmas
1793 --
1794 pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1795 pragState dynflags buf loc =
1796   PState {
1797       buffer        = buf,
1798       messages      = emptyMessages,
1799       dflags        = dynflags,
1800       last_loc      = mkSrcSpan loc loc,
1801       last_offs     = 0,
1802       last_len      = 0,
1803       last_line_len = 0,
1804       loc           = loc,
1805       extsBitmap    = 0,
1806       context       = [],
1807       lex_state     = [bol, option_prags, 0],
1808       alr_pending_implicit_tokens = [],
1809       alr_next_token = Nothing,
1810       alr_last_loc = noSrcSpan,
1811       alr_context = [],
1812       alr_expecting_ocurly = Nothing
1813     }
1814
1815
1816 -- create a parse state
1817 --
1818 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1819 mkPState buf loc flags  =
1820   PState {
1821       buffer          = buf,
1822       dflags        = flags,
1823       messages      = emptyMessages,
1824       last_loc      = mkSrcSpan loc loc,
1825       last_offs     = 0,
1826       last_len      = 0,
1827       last_line_len = 0,
1828       loc           = loc,
1829       extsBitmap    = fromIntegral bitmap,
1830       context       = [],
1831       lex_state     = [bol, 0],
1832         -- we begin in the layout state if toplev_layout is set
1833       alr_pending_implicit_tokens = [],
1834       alr_next_token = Nothing,
1835       alr_last_loc = noSrcSpan,
1836       alr_context = [],
1837       alr_expecting_ocurly = Nothing
1838     }
1839     where
1840       bitmap = genericsBit `setBitIf` dopt Opt_Generics flags
1841                .|. ffiBit            `setBitIf` dopt Opt_ForeignFunctionInterface flags
1842                .|. parrBit           `setBitIf` dopt Opt_PArr         flags
1843                .|. arrowsBit         `setBitIf` dopt Opt_Arrows       flags
1844                .|. thBit             `setBitIf` dopt Opt_TemplateHaskell flags
1845                .|. qqBit             `setBitIf` dopt Opt_QuasiQuotes flags
1846                .|. ipBit             `setBitIf` dopt Opt_ImplicitParams flags
1847                .|. explicitForallBit `setBitIf` dopt Opt_ExplicitForAll flags
1848                .|. bangPatBit        `setBitIf` dopt Opt_BangPatterns flags
1849                .|. tyFamBit          `setBitIf` dopt Opt_TypeFamilies flags
1850                .|. haddockBit        `setBitIf` dopt Opt_Haddock      flags
1851                .|. magicHashBit      `setBitIf` dopt Opt_MagicHash    flags
1852                .|. kindSigsBit       `setBitIf` dopt Opt_KindSignatures flags
1853                .|. recursiveDoBit    `setBitIf` dopt Opt_RecursiveDo flags
1854                .|. recBit            `setBitIf` dopt Opt_DoRec  flags
1855                .|. recBit            `setBitIf` dopt Opt_Arrows flags
1856                .|. unicodeSyntaxBit  `setBitIf` dopt Opt_UnicodeSyntax flags
1857                .|. unboxedTuplesBit  `setBitIf` dopt Opt_UnboxedTuples flags
1858                .|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags
1859                .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags
1860                .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
1861                .|. newQualOpsBit `setBitIf` dopt Opt_NewQualifiedOperators flags
1862                .|. alternativeLayoutRuleBit `setBitIf` dopt Opt_AlternativeLayoutRule flags
1863       --
1864       setBitIf :: Int -> Bool -> Int
1865       b `setBitIf` cond | cond      = bit b
1866                         | otherwise = 0
1867
1868 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1869 addWarning option srcspan warning
1870  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1871        let warning' = mkWarnMsg srcspan alwaysQualify warning
1872            ws' = if dopt option d then ws `snocBag` warning' else ws
1873        in POk s{messages=(ws', es)} ()
1874
1875 getMessages :: PState -> Messages
1876 getMessages PState{messages=ms} = ms
1877
1878 getContext :: P [LayoutContext]
1879 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1880
1881 setContext :: [LayoutContext] -> P ()
1882 setContext ctx = P $ \s -> POk s{context=ctx} ()
1883
1884 popContext :: P ()
1885 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1886                               last_len = len, last_loc = last_loc }) ->
1887   case ctx of
1888         (_:tl) -> POk s{ context = tl } ()
1889         []     -> PFailed last_loc (srcParseErr buf len)
1890
1891 -- Push a new layout context at the indentation of the last token read.
1892 -- This is only used at the outer level of a module when the 'module'
1893 -- keyword is missing.
1894 pushCurrentContext :: P ()
1895 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1896     POk s{context = Layout (offs-len) : ctx} ()
1897 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1898
1899 getOffside :: P Ordering
1900 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1901                 let ord = case stk of
1902                         (Layout n:_) -> compare offs n
1903                         _            -> GT
1904                 in POk s ord
1905
1906 -- ---------------------------------------------------------------------------
1907 -- Construct a parse error
1908
1909 srcParseErr
1910   :: StringBuffer       -- current buffer (placed just after the last token)
1911   -> Int                -- length of the previous token
1912   -> Message
1913 srcParseErr buf len
1914   = hcat [ if null token
1915              then ptext (sLit "parse error (possibly incorrect indentation)")
1916              else hcat [ptext (sLit "parse error on input "),
1917                         char '`', text token, char '\'']
1918     ]
1919   where token = lexemeToString (offsetBytes (-len) buf) len
1920
1921 -- Report a parse failure, giving the span of the previous token as
1922 -- the location of the error.  This is the entry point for errors
1923 -- detected during parsing.
1924 srcParseFail :: P a
1925 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1926                             last_loc = last_loc } ->
1927     PFailed last_loc (srcParseErr buf len)
1928
1929 -- A lexical error is reported at a particular position in the source file,
1930 -- not over a token range.
1931 lexError :: String -> P a
1932 lexError str = do
1933   loc <- getSrcLoc
1934   (AI end _ buf) <- getInput
1935   reportLexError loc end buf str
1936
1937 -- -----------------------------------------------------------------------------
1938 -- This is the top-level function: called from the parser each time a
1939 -- new token is to be read from the input.
1940
1941 lexer :: (Located Token -> P a) -> P a
1942 lexer cont = do
1943   alr <- extension alternativeLayoutRule
1944   let lexTokenFun = if alr then lexTokenAlr else lexToken
1945   tok@(L _span _tok__) <- lexTokenFun
1946 --  trace ("token: " ++ show tok__) $ do
1947   cont tok
1948
1949 lexTokenAlr :: P (Located Token)
1950 lexTokenAlr = do mPending <- popPendingImplicitToken
1951                  t <- case mPending of
1952                       Nothing ->
1953                           do mNext <- popNextToken
1954                              t <- case mNext of
1955                                   Nothing -> lexToken
1956                                   Just next -> return next
1957                              alternativeLayoutRuleToken t
1958                       Just t ->
1959                           return t
1960                  setAlrLastLoc (getLoc t)
1961                  case unLoc t of
1962                      ITwhere -> setAlrExpectingOCurly (Just ALRLayoutWhere)
1963                      ITlet   -> setAlrExpectingOCurly (Just ALRLayoutLet)
1964                      ITof    -> setAlrExpectingOCurly (Just ALRLayoutOf)
1965                      ITdo    -> setAlrExpectingOCurly (Just ALRLayoutDo)
1966                      _       -> return ()
1967                  return t
1968
1969 alternativeLayoutRuleToken :: Located Token -> P (Located Token)
1970 alternativeLayoutRuleToken t
1971     = do context <- getALRContext
1972          lastLoc <- getAlrLastLoc
1973          mExpectingOCurly <- getAlrExpectingOCurly
1974          let thisLoc = getLoc t
1975              thisCol = srcSpanStartCol thisLoc
1976              newLine = srcSpanStartLine thisLoc > srcSpanEndLine lastLoc
1977          case (unLoc t, context, mExpectingOCurly) of
1978              -- I think our implicit open-curly handling is slightly
1979              -- different to John's, in how it interacts with newlines
1980              -- and "in"
1981              (ITocurly, _, Just _) ->
1982                  do setAlrExpectingOCurly Nothing
1983                     setNextToken t
1984                     lexTokenAlr
1985              (_, ALRLayout _ col : ls, Just expectingOCurly)
1986               | thisCol > col ->
1987                  do setAlrExpectingOCurly Nothing
1988                     setALRContext (ALRLayout expectingOCurly thisCol : context)
1989                     setNextToken t
1990                     return (L thisLoc ITocurly)
1991               | otherwise ->
1992                  do setAlrExpectingOCurly Nothing
1993                     setPendingImplicitTokens [L thisLoc ITccurly]
1994                     setNextToken t
1995                     return (L thisLoc ITocurly)
1996              (_, _, Just expectingOCurly) ->
1997                  do setAlrExpectingOCurly Nothing
1998                     setALRContext (ALRLayout expectingOCurly thisCol : context)
1999                     setNextToken t
2000                     return (L thisLoc ITocurly)
2001              (ITin, ALRLayout ALRLayoutLet _ : ls, _)
2002               | newLine ->
2003                  do setPendingImplicitTokens [t]
2004                     setALRContext ls
2005                     return (L thisLoc ITccurly)
2006              (_, ls@(ALRLayout _ col : _), _)
2007               | newLine && thisCol <= col ->
2008                  do let f ls'@(ALRLayout _ col' : ls'')
2009                          | thisCol < col' = case f ls'' of
2010                                             (ts, ls''') ->
2011                                                 (L thisLoc ITccurly : ts, ls''')
2012                          | thisCol == col' = ([L thisLoc ITsemi], ls')
2013                          | otherwise       = ([], ls')
2014                         f ls' = ([], ls')
2015                     case f ls of
2016                         (t' : ts, ls') ->
2017                             do setPendingImplicitTokens ts
2018                                setNextToken t
2019                                return t'
2020                         _ -> panic "Layout rule: [] when considering newline"
2021              (u, _, _)
2022               | isALRopen u ->
2023                  do setALRContext (ALRNoLayout (containsCommas u) : context)
2024                     return t
2025              (u, _, _)
2026               | isALRclose u ->
2027                  case context of
2028                  ALRLayout _ _ : ls ->
2029                      do setALRContext ls
2030                         setNextToken t
2031                         return (L thisLoc ITccurly)
2032                  ALRNoLayout _ : ls ->
2033                      do setALRContext ls
2034                         return t
2035                  [] ->
2036                      -- XXX This is an error in John's code, but
2037                      -- it looks reachable to me at first glance
2038                      return t
2039              (ITin, ALRLayout ALRLayoutLet _ : ls, _) ->
2040                  do setALRContext ls
2041                     setPendingImplicitTokens [t]
2042                     return (L thisLoc ITccurly)
2043              (ITin, _ : ls, _) ->
2044                  do setALRContext ls
2045                     setNextToken t
2046                     return (L thisLoc ITccurly)
2047              -- the other ITin case omitted; general case below covers it
2048              (ITcomma, ALRLayout _ _ : ls, _)
2049               | topNoLayoutContainsCommas ls ->
2050                  do setALRContext ls
2051                     setNextToken t
2052                     return (L thisLoc ITccurly)
2053              (ITwhere, ALRLayout ALRLayoutDo _ : ls, _) ->
2054                  do setALRContext ls
2055                     setPendingImplicitTokens [t]
2056                     return (L thisLoc ITccurly)
2057              -- the other ITwhere case omitted; general case below covers it
2058              -- The first [] case comes before the general case, as we
2059              -- have an actual EOF token
2060              (ITeof, ALRLayout _ _ : ls, _) ->
2061                  do setALRContext ls
2062                     setNextToken t
2063                     return (L thisLoc ITccurly)
2064              -- the other ITeof case omitted; general case below covers it
2065              (_, _, _) -> return t
2066
2067 isALRopen :: Token -> Bool
2068 isALRopen ITcase   = True
2069 isALRopen ITif     = True
2070 isALRopen IToparen = True
2071 isALRopen ITobrack = True
2072 isALRopen ITocurly = True
2073 -- GHC Extensions:
2074 isALRopen IToubxparen = True
2075 isALRopen _        = False
2076
2077 isALRclose :: Token -> Bool
2078 isALRclose ITof     = True
2079 isALRclose ITthen   = True
2080 isALRclose ITcparen = True
2081 isALRclose ITcbrack = True
2082 isALRclose ITccurly = True
2083 -- GHC Extensions:
2084 isALRclose ITcubxparen = True
2085 isALRclose _        = False
2086
2087 containsCommas :: Token -> Bool
2088 containsCommas IToparen = True
2089 containsCommas ITobrack = True
2090 -- GHC Extensions:
2091 containsCommas IToubxparen = True
2092 containsCommas _        = False
2093
2094 topNoLayoutContainsCommas :: [ALRContext] -> Bool
2095 topNoLayoutContainsCommas [] = False
2096 topNoLayoutContainsCommas (ALRLayout _ _ : ls) = topNoLayoutContainsCommas ls
2097 topNoLayoutContainsCommas (ALRNoLayout b : _) = b
2098
2099 lexToken :: P (Located Token)
2100 lexToken = do
2101   inp@(AI loc1 _ buf) <- getInput
2102   sc <- getLexState
2103   exts <- getExts
2104   case alexScanUser exts inp sc of
2105     AlexEOF -> do
2106         let span = mkSrcSpan loc1 loc1
2107         setLastToken span 0 0
2108         return (L span ITeof)
2109     AlexError (AI loc2 _ buf) ->
2110         reportLexError loc1 loc2 buf "lexical error"
2111     AlexSkip inp2 _ -> do
2112         setInput inp2
2113         lexToken
2114     AlexToken inp2@(AI end _ buf2) _ t -> do
2115         setInput inp2
2116         let span = mkSrcSpan loc1 end
2117         let bytes = byteDiff buf buf2
2118         span `seq` setLastToken span bytes bytes
2119         t span buf bytes
2120
2121 reportLexError :: SrcLoc -> SrcLoc -> StringBuffer -> [Char] -> P a
2122 reportLexError loc1 loc2 buf str
2123   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
2124   | otherwise =
2125   let
2126         c = fst (nextChar buf)
2127   in
2128   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
2129     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
2130     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
2131
2132 lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token]
2133 lexTokenStream buf loc dflags = unP go initState
2134     where initState = mkPState buf loc (dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream)
2135           go = do
2136             ltok <- lexer return
2137             case ltok of
2138               L _ ITeof -> return []
2139               _ -> liftM (ltok:) go
2140
2141 linePrags = Map.singleton "line" (begin line_prag2)
2142
2143 fileHeaderPrags = Map.fromList([("options", lex_string_prag IToptions_prag),
2144                                  ("options_ghc", lex_string_prag IToptions_prag),
2145                                  ("options_haddock", lex_string_prag ITdocOptions),
2146                                  ("language", token ITlanguage_prag),
2147                                  ("include", lex_string_prag ITinclude_prag)])
2148
2149 ignoredPrags = Map.fromList (map ignored pragmas)
2150                where ignored opt = (opt, nested_comment lexToken)
2151                      impls = ["hugs", "nhc98", "jhc", "yhc", "catch", "derive"]
2152                      options_pragmas = map ("options_" ++) impls
2153                      -- CFILES is a hugs-only thing.
2154                      pragmas = options_pragmas ++ ["cfiles", "contract"]
2155
2156 oneWordPrags = Map.fromList([("rules", rulePrag),
2157                            ("inline", token (ITinline_prag True)),
2158                            ("notinline", token (ITinline_prag False)),
2159                            ("specialize", token ITspec_prag),
2160                            ("source", token ITsource_prag),
2161                            ("warning", token ITwarning_prag),
2162                            ("deprecated", token ITdeprecated_prag),
2163                            ("scc", token ITscc_prag),
2164                            ("generated", token ITgenerated_prag),
2165                            ("core", token ITcore_prag),
2166                            ("unpack", token ITunpack_prag),
2167                            ("ann", token ITann_prag)])
2168
2169 twoWordPrags = Map.fromList([("inline conlike", token (ITinline_conlike_prag True)),
2170                              ("notinline conlike", token (ITinline_conlike_prag False)),
2171                              ("specialize inline", token (ITspec_inline_prag True)),
2172                              ("specialize notinline", token (ITspec_inline_prag False))])
2173
2174
2175 dispatch_pragmas :: Map String Action -> Action
2176 dispatch_pragmas prags span buf len = case Map.lookup (clean_pragma (lexemeToString buf len)) prags of
2177                                        Just found -> found span buf len
2178                                        Nothing -> lexError "unknown pragma"
2179
2180 known_pragma :: Map String Action -> AlexAccPred Int
2181 known_pragma prags _ _ len (AI _ _ buf) = (isJust $ Map.lookup (clean_pragma (lexemeToString (offsetBytes (- len) buf) len)) prags)
2182                                           && (nextCharIs buf (\c -> not (isAlphaNum c || c == '_')))
2183
2184 clean_pragma :: String -> String
2185 clean_pragma prag = canon_ws (map toLower (unprefix prag))
2186                     where unprefix prag' = case stripPrefix "{-#" prag' of
2187                                              Just rest -> rest
2188                                              Nothing -> prag'
2189                           canonical prag' = case prag' of
2190                                               "noinline" -> "notinline"
2191                                               "specialise" -> "specialize"
2192                                               "constructorlike" -> "conlike"
2193                                               _ -> prag'
2194                           canon_ws s = unwords (map canonical (words s))
2195 }