compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - parsing integers is a bit slow
  16 --    - readRational is a bit slow
  17 --
  18 --   Known bugs, that were also in the previous version:
  19 --    - M... should be 3 tokens, not 1.
  20 --    - pragma-end should be only valid in a pragma
  21
  22 --   qualified operator NOTES.
  23 --
  24 --   - If M.(+) is a single lexeme, then..
  25 --     - Probably (+) should be a single lexeme too, for consistency.
  26 --       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
  27 --     - But we have to rule out reserved operators, otherwise (..) becomes
  28 --       a different lexeme.
  29 --     - Should we therefore also rule out reserved operators in the qualified
  30 --       form?  This is quite difficult to achieve.  We don't do it for
  31 --       qualified varids.
  32
  33 {
  34 {-# OPTIONS -Wwarn -w #-}
  35 -- The above warning suppression flag is a temporary kludge.
  36 -- While working on this module you are encouraged to remove it and fix
  37 -- any warnings in the module. See
  38 --     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
  39 -- for details
  40 --
  41 -- Note that Alex itself generates code with some unused bindings and
  42 -- without type signatures, so removing the flag might not be possible.
  43
  44 {-# OPTIONS_GHC -funbox-strict-fields #-}
  45
  46 module Lexer (
  47    Token(..), lexer, pragState, mkPState, PState(..),
  48    P(..), ParseResult(..), getSrcLoc,
  49    failLocMsgP, failSpanMsgP, srcParseFail,
  50    getMessages,
  51    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  52    getLexState, popLexState, pushLexState,
  53    extension, standaloneDerivingEnabled, bangPatEnabled,
  54    addWarning,
  55    lexTokenStream
  56   ) where
  57
  58 import Bag
  59 import ErrUtils
  60 import Maybe
  61 import Outputable
  62 import StringBuffer
  63 import FastString
  64 import SrcLoc
  65 import UniqFM
  66 import DynFlags
  67 import Ctype
  68 import Util             ( readRational )
  69
  70 import Control.Monad
  71 import Data.Bits
  72 import Data.Char
  73 import Data.List
  74 import Data.Map (Map)
  75 import qualified Data.Map as Map
  76 import Data.Ratio
  77 }
  78
  79 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
     -- NOTE: every $uni* set in this section is a single low control code
     -- (\x01 .. \x06).  Per the comments on them, alexGetChar is what makes
     -- these stand for Unicode input; presumably it maps each non-ASCII
     -- character onto the code of its class (alexGetChar is not in this
     -- part of the file -- confirm there).
  80 $whitechar   = [\ \n\r\f\v $unispace]
  81 $white_no_nl = $whitechar # \n
     -- \t is not in $whitechar: tabs have their own set so that a separate
     -- rule can treat them differently from other whitespace.
  82 $tab         = \t
  83
  84 $ascdigit  = 0-9
  85 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  86 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  87 $digit     = [$ascdigit $unidigit]
  88
  89 $special   = [\(\)\,\;\[\]\`\{\}]
  90 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  91 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
     -- '#' is set difference: symbol characters minus the specials and
     -- minus _ : " '.  The colon is re-admitted for the tail of an operator
     -- by $symchar below.
  92 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  93
  94 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
  95 $asclarge  = [A-Z]
  96 $large     = [$asclarge $unilarge]
  97
  98 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
  99 $ascsmall  = [a-z]
     -- underscore counts as a small letter, so it can start a varid
 100 $small     = [$ascsmall $unismall \_]
 101
 102 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
 103 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
 104
 105 $octit     = 0-7
     -- hex digits are built on $decdigit, i.e. ASCII digits only for now
 106 $hexit     = [$decdigit A-F a-f]
 107 $symchar   = [$symbol \:]
 108 $nl        = [\n\r]
 109 $idchar    = [$small $large $digit \']
 110
     -- like $idchar but without the prime
 111 $pragmachar = [$small $large $digit]
 112
     -- the characters that mark a comment as a Haddock comment
 113 $docsym    = [\| \^ \* \$]
 114
 115 @varid     = $small $idchar*
     -- @varid / @conid differ only in the class of the first character.
     -- Reserved words have the same shape as @varid, so they are not
     -- distinguished at this level.
 116 @conid     = $large $idchar*
 117
 118 @varsym    = $symbol $symchar*
     -- a constructor operator is any operator that starts with a colon
 119 @consym    = \: $symchar*
 120
 121 @decimal     = $decdigit+
 122 @octal       = $octit+
 123 @hexadecimal = $hexit+
     -- the exponent may carry an explicit '+' as well as '-'
 124 @exponent    = [eE] [\-\+]? @decimal
 125
 126 -- we support the hierarchical module name extension:
     -- (one or more "Conid." prefixes, each including its trailing dot)
 127 @qual = (@conid \.)+
 128
     -- digits are required on both sides of the dot; without a dot an
     -- exponent is mandatory
 129 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 130
 131 -- normal signed numerical literals can only be explicitly negative,
 132 -- not explicitly positive (contrast @exponent)
 133 @negative = \-
 134 @signed = @negative ?
 135
 136 haskell :-
 137
 138 -- everywhere: skip whitespace and comments
     -- (these rules carry no start code, which is what makes them apply in
     -- every lexer state)
 139 $white_no_nl+                           ;
     -- Tabs are not part of $white_no_nl, so instead of being skipped by the
     -- rule above they reach this one, which hands them to 'warn' with
     -- Opt_WarnTabs.
 140 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 141
 142 -- Everywhere: deal with nested comments.  We explicitly rule out
 143 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 144 -- (this can happen even though pragmas will normally take precedence due to
 145 -- longest-match, because pragmas aren't valid in every state, but comments
 146 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 147 -- set.
     --
     -- Only the opening "{-" is matched by the regex; the isNormalComment
     -- predicate inspects what follows it, and nested_comment then scans the
     -- body by hand before continuing with the action it is given (lexToken).
 148
 149 "{-" / { isNormalComment } { nested_comment lexToken }
 150
 151 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 152 -- more dashes followed by a symbol should be parsed as a varsym, so we
 153 -- have to exclude those.
     --
     -- All of the rules below end in lineCommentToken, which only produces a
     -- token (ITlineComment) when rawTokenStreamEnabled is on; otherwise it
     -- simply carries on lexing, i.e. the comment is dropped.
 154
 155 -- Since Haddock comments aren't valid in every state, we need to rule them
 156 -- out here.
 157
 158 -- The following two rules match comments that begin with two dashes, but
 159 -- continue with a different character. The rules test that this character
 160 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 161 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 162 -- have a Haddock comment). The rules then munch the rest of the line.
     --
     -- '#' is excluded alongside $docsym in the first rule: "-- #" has its
     -- own state-specific rules (a plain line comment in state 0, a doc
     -- comment in option_prags).
 163
 164 "-- " ~[$docsym \#] .* { lineCommentToken }
 165 "--" [^$symbol : \ ] .* { lineCommentToken }
 166
 167 -- Next, match Haddock comments if no -haddock flag
     -- (with Haddock disabled they are just ordinary line comments)
 168
 169 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
 170
 171 -- Now, when we've matched comments that begin with 2 dashes and continue
 172 -- with a different character, we need to match comments that begin with three
 173 -- or more dashes (which clearly can't be Haddock comments). We only need to
 174 -- make sure that the first non-dash character isn't a symbol, and munch the
 175 -- rest of the line.
 176
 177 "---"\-* [^$symbol :] .* { lineCommentToken }
 178
 179 -- Since the previous rules all match dashes followed by at least one
 180 -- character, we also need to match a whole line filled with just dashes.
     -- (atEOL holds at a newline and also at the very end of the input)
 181
 182 "--"\-* / { atEOL } { lineCommentToken }
 183
 184 -- We need this rule since none of the other single line comment rules
 185 -- actually match this case.
     -- (the case being: two dashes and a single space, then end of line)
 186
 187 "-- " / { atEOL } { lineCommentToken }
 188
 189 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 190 -- blank lines) until we find a non-whitespace character, then do layout
 191 -- processing.
 192 --
 193 -- One slight wibble here: what if the line begins with {-#? In
 194 -- theory, we have to lex the pragma to see if it's one we recognise,
 195 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 196 -- as a nested comment.  We don't bother with this: if the line begins
 197 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
     --
     -- Reading the rules below:
     --   * '^' anchors the '#' rules to the start of a line.
     --   * "# <digits>" and "# line <digits>" push the line_prag1 state via
     --     'begin', which then continues lexing in that state.
     --   * "#pragma ..." and "#! ..." lines are discarded whole, newline
     --     included.
     --   * the empty pattern () is the fallback when nothing above applies;
     --     do_bol is defined elsewhere and, per the text above, is where the
     --     layout processing happens.
 198 <bol> {
 199   \n                                    ;
 200   ^\# (line)?                           { begin line_prag1 }
 201   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 202   ^\# \! .* \n                          ; -- #!, for scripts
 203   ()                                    { do_bol }
 204 }
 205
 206 -- after a layout keyword (let, where, do, of), we begin a new layout
 207 -- context if the curly brace is missing.
 208 -- Careful! This stuff is quite delicate.
     --
     -- An explicit '{' means no implicit layout block is wanted: pop_and
     -- leaves the layout state first and then runs open_brace on the same
     -- lexeme.  The notFollowedBy '-' guard keeps "{-" out of this rule.
 209 <layout, layout_do> {
 210   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 211         -- we might encounter {-# here, but {- has been handled already
 212   \n                                    ;
 213   ^\# (line)?                           { begin line_prag1 }
 214 }
 215
 216 -- do is treated in a subtly different way, see new_layout_context
     -- The empty patterns below are the fallback for "no explicit brace
     -- found"; the Bool passed to new_layout_context is the only difference
     -- between the two states (its meaning is defined with
     -- new_layout_context, not visible here).
 217 <layout>    ()                          { new_layout_context True }
 218 <layout_do> ()                          { new_layout_context False }
 219
 220 -- after a new layout context which was found to be to the left of the
 221 -- previous context, we have generated a '{' token, and we now need to
 222 -- generate a matching '}' token.
 223 <layout_left>  ()                       { do_layout_left }
 224
 225 <0,option_prags> \n                             { begin bol }
     -- ^ a newline in ordinary code (or in the file-header state) pushes the
     --   bol state, so the next thing lexed is treated as a line start.
 226
     -- No start code on the next rule, so it is not restricted to any
     -- particular state.  The regex only recognises the shape "{-# word";
     -- known_pragma decides whether the word is one of linePrags, and
     -- dispatch_pragmas picks the action (both are defined elsewhere).
 227 "{-#" $whitechar* $pragmachar+ / { known_pragma linePrags }
 228                                 { dispatch_pragmas linePrags }
 229
 230 -- single-line line pragmas, of the form
 231 --    # <line> "<file>" <extra-stuff> \n
     -- Three states walk through the three parts in order: the number, the
     -- quoted file name, then the rest of the line, after which 'pop'
     -- returns to the state that was active before.
     -- NOTE(review): setLine/setFile presumably record the value and switch
     -- to the state they are given -- confirm at their definitions.
 232 <line_prag1> $decdigit+                 { setLine line_prag1a }
 233 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 234 <line_prag1b> .*                        { pop }
 235
 236 -- Haskell-style line pragmas, of the form
 237 --    {-# LINE <line> "<file>" #-}
     -- Same three-step shape as above, except that the final step must see
     -- the closing delimiter rather than swallowing the rest of the line.
 238 <line_prag2> $decdigit+                 { setLine line_prag2a }
 239 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 240 <line_prag2b> "#-}"|"-}"                { pop }
 241    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 242    -- with older versions of GHC which generated these.
 243
 244 <0,option_prags> {
       -- All the "{-# word" rules share one regex shape and are told apart
       -- only by their known_pragma predicate, i.e. by which table the
       -- matched text belongs to.  The two-word form consumes more input,
       -- so when its predicate holds it is the longer match.
 245   "{-#" $whitechar* $pragmachar+
 246         $whitechar+ $pragmachar+ / { known_pragma twoWordPrags }
 247                                  { dispatch_pragmas twoWordPrags }
 248
 249   "{-#" $whitechar* $pragmachar+ / { known_pragma oneWordPrags }
 250                                  { dispatch_pragmas oneWordPrags }
 251
 252   -- We ignore all these pragmas, but don't generate a warning for them
 253   "{-#" $whitechar* $pragmachar+ / { known_pragma ignoredPrags }
 254                                  { dispatch_pragmas ignoredPrags }
 255
 256   -- ToDo: should only be valid inside a pragma:
 257   "#-}"                                 { endPrag }
 258 }
 259
     -- File-header pragmas are dispatched as pragmas only in the
     -- option_prags state; in state 0 (further down) the very same text is
     -- consumed as a nested comment instead.
 260 <option_prags> {
 261   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 262                                    { dispatch_pragmas fileHeaderPrags }
 263
       -- "-- #" is lexed as a doc comment in this state (contrast state 0
       -- below, where it is an ordinary line comment)
 264   "-- #"                                 { multiline_doc_comment }
 265 }
 266
 267 <0> {
 268   -- In the "0" mode we ignore these pragmas
 269   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 270                      { nested_comment lexToken }
 271 }
 272
 273 <0> {
 274   "-- #" .* { lineCommentToken }
 275 }
 276
     -- Fallback: a bare "{-#" is a shorter match than any of the
     -- "{-# word" rules above, so this is only reached when none of those
     -- predicates accepted.  The text is then skipped like a nested comment,
     -- after going through warnThen with Opt_WarnUnrecognisedPragmas.
 277 <0,option_prags> {
 278   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 279                     (nested_comment lexToken) }
 280 }
 281
 282 -- '0' state: ordinary lexemes
 283
 284 -- Haddock comments
     -- Only recognised when haddockEnabled holds.  The regex matches just
     -- the opener (including the doc symbol); the comment body is read by
     -- the hand-written action.
 285
 286 <0> {
 287   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 288   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 289 }
 290
 291 -- "special" symbols
     -- Every rule from here to the end of the extension-gated groups is
     -- guarded by ifExtension, which simply applies the given test to the
     -- extension bitmap; with the extension off the rule does not fire and
     -- the same characters are lexed by the ordinary rules further down.
 292
 293 <0> {
 294   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 295   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 296 }
 297
 298 <0> {
 299   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 300   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 301   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
       -- the declaration quote uses layout_token, which also pushes the
       -- layout state before returning the token
 302   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 303   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 304   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
       -- skip_one_varid drops the leading '$' and keeps only the identifier
 305   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 306   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 307
 308   "[$" @varid "|"  / { ifExtension qqEnabled }
 309                      { lex_quasiquote_tok }
 310 }
 311
     -- The opening "(|" additionally requires notFollowedBySymbol, i.e. the
     -- next character must not be an operator character; the closing form
     -- has no such guard.
 312 <0> {
 313   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 314                                         { special IToparenbar }
 315   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 316 }
 317
 318 <0> {
       -- skip_one_varid drops the leading '?' and keeps only the identifier
 319   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 320 }
 321
     -- Same asymmetry as for "(|": only the opener checks the following
     -- character.
 322 <0> {
 323   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 324          { token IToubxparen }
 325   "#)" / { ifExtension unboxedTuplesEnabled }
 326          { token ITcubxparen }
 327 }
 328
 329 <0> {
 330   "{|" / { ifExtension genericsEnabled } { token ITocurlybar }
 331   "|}" / { ifExtension genericsEnabled } { token ITccurlybar }
 332 }
 333
 334 <0,option_prags> {
       -- single-character punctuation; the braces have dedicated actions
       -- (open_brace / close_brace) instead of the generic 'special'
 335   \(                                    { special IToparen }
 336   \)                                    { special ITcparen }
 337   \[                                    { special ITobrack }
 338   \]                                    { special ITcbrack }
 339   \,                                    { special ITcomma }
 340   \;                                    { special ITsemi }
 341   \`                                    { special ITbackquote }
 342
 343   \{                                    { open_brace }
 344   \}                                    { close_brace }
 345 }
 346
     -- Identifiers.  Unqualified lower-case names go through the dedicated
     -- 'varid' action rather than 'idtoken'.
     -- NOTE(review): presumably 'varid' is where reserved words are looked
     -- up (reservedWordsFM) -- confirm at its definition.
 347 <0,option_prags> {
 348   @qual @varid                  { idtoken qvarid }
 349   @qual @conid                  { idtoken qconid }
 350   @varid                        { varid }
 351   @conid                        { idtoken conid }
 352 }
 353
     -- With magicHashEnabled an identifier may end in one or more '#'; the
     -- hashes are part of the lexeme, so these are longer matches than the
     -- plain identifier rules above.
 354 <0> {
 355   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 356   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 357   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 358   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 359 }
 360
 361 -- ToDo: - move `var` and (sym) into lexical syntax?
 362 --       - remove backquote from $special?
     -- Qualified operators come in two spellings selected by extension
     -- predicates: M.+ (oldQualOps) and M.(+) (newQualOps).  Unqualified
     -- operators use the dedicated varsym / consym actions.
 363 <0> {
 364   @qual @varsym       / { ifExtension oldQualOps } { idtoken qvarsym }
 365   @qual @consym       / { ifExtension oldQualOps } { idtoken qconsym }
 366   @qual \( @varsym \) / { ifExtension newQualOps } { idtoken prefixqvarsym }
 367   @qual \( @consym \) / { ifExtension newQualOps } { idtoken prefixqconsym }
 368   @varsym                                          { varsym }
 369   @consym                                          { consym }
 370 }
 371
 372 -- For the normal boxed literals we need to be careful
 373 -- when trying to be close to Haskell98
     --
     -- NOTE(review): tok_num / tok_primint / tok_primword are defined
     -- elsewhere.  Judging only from the call sites below, the two Int
     -- arguments look like "characters to skip before the digits" and
     -- "total characters in the lexeme that are not digits" (e.g. 2 and 3
     -- for 0x..#, 3 and 4 for -0x..#) -- confirm against the definitions.
 374 <0> {
 375   -- Normal integral literals (:: Num a => a, from Integer)
 376   @decimal           { tok_num positive 0 0 decimal }
 377   0[oO] @octal       { tok_num positive 2 2 octal }
 378   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 379
 380   -- Normal rational literals (:: Fractional a => a, from Rational)
 381   @floating_point    { strtoken tok_float }
 382 }
 383
 384 <0> {
 385   -- Unboxed ints (:: Int#) and words (:: Word#)
 386   -- It's simpler (and faster?) to give separate cases to the negatives,
 387   -- especially considering octal/hexadecimal prefixes.
 388   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 389   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 390   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 391   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 392   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 393   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 394
       -- word literals end in ## and have no negative forms
 395   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 396   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 397   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 398
 399   -- Unboxed floats and doubles (:: Float#, :: Double#)
 400   -- prim_{float,double} work with signed literals
       -- init_strtoken N drops the last N characters (the '#' suffix) before
       -- handing the text to the token builder
 401   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 402   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 403 }
 404
 405 -- Strings and chars are lexed by hand-written code.  The reason is
 406 -- that even if we recognise the string or char here in the regex
 407 -- lexer, we would still have to parse the string afterward in order
 408 -- to convert it to a String.
     -- Consequently only the opening quote is matched here.
 409 <0> {
 410   \'                            { lex_char_tok }
 411   \"                            { lex_string_tok }
 412 }
 413
 414 {
 415 -- -----------------------------------------------------------------------------
 416 -- The token type
 417
 418 data Token
     -- One constructor per kind of lexeme.  Lexer actions return these
     -- wrapped in 'Located' (see the 'Action' type).  Keyword and reserved
     -- symbol constructors are the ones listed in reservedWordsFM and
     -- reservedSymsFM.
 419   = ITas                        -- Haskell keywords
 420   | ITcase
 421   | ITclass
 422   | ITdata
 423   | ITdefault
 424   | ITderiving
 425   | ITdo
 426   | ITelse
 427   | IThiding
 428   | ITif
 429   | ITimport
 430   | ITin
 431   | ITinfix
 432   | ITinfixl
 433   | ITinfixr
 434   | ITinstance
 435   | ITlet
 436   | ITmodule
 437   | ITnewtype
 438   | ITof
 439   | ITqualified
 440   | ITthen
 441   | ITtype
 442   | ITwhere
 443   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 444
 445   | ITforall                    -- GHC extension keywords
 446   | ITforeign
 447   | ITexport
 448   | ITlabel
 449   | ITdynamic
 450   | ITsafe
 451   | ITthreadsafe
 452   | ITunsafe
 453   | ITstdcallconv
 454   | ITccallconv
 455   | ITprimcallconv
 456   | ITdotnet
 457   | ITmdo
 458   | ITfamily
 459   | ITgroup
 460   | ITby
 461   | ITusing
 462
 463         -- Pragmas
 464   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 465   | ITinline_conlike_prag Bool  -- same
 466   | ITspec_prag                 -- SPECIALISE
 467   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 468   | ITsource_prag
 469   | ITrules_prag
 470   | ITwarning_prag
 471   | ITdeprecated_prag
 472   | ITline_prag
 473   | ITscc_prag
 474   | ITgenerated_prag
 475   | ITcore_prag                 -- hdaume: core annotations
 476   | ITunpack_prag
 477   | ITann_prag
 478   | ITclose_prag
 479   | IToptions_prag String
 480   | ITinclude_prag String
 481   | ITlanguage_prag
 482
 483   | ITdotdot                    -- reserved symbols
 484   | ITcolon
 485   | ITdcolon
 486   | ITequal
 487   | ITlam
 488   | ITvbar
 489   | ITlarrow
 490   | ITrarrow
 491   | ITat
 492   | ITtilde
 493   | ITdarrow
 494   | ITminus
 495   | ITbang
 496   | ITstar
 497   | ITdot
 498
 499   | ITbiglam                    -- GHC-extension symbols
 500
 501   | ITocurly                    -- special symbols
 502   | ITccurly
 503   | ITocurlybar                 -- {|, for type applications
 504   | ITccurlybar                 -- |}, for type applications
 505   | ITvocurly
 506   | ITvccurly
 507   | ITobrack
 508   | ITopabrack                  -- [:, for parallel arrays with -XParr
 509   | ITcpabrack                  -- :], for parallel arrays with -XParr
 510   | ITcbrack
 511   | IToparen
 512   | ITcparen
 513   | IToubxparen                 -- (#, when unboxed tuples are enabled
 514   | ITcubxparen                 -- #), when unboxed tuples are enabled
 515   | ITsemi
 516   | ITcomma
 517   | ITunderscore                -- "_" (an entry in reservedWordsFM)
 518   | ITbackquote
 519
 520   | ITvarid   FastString        -- identifiers
 521   | ITconid   FastString
 522   | ITvarsym  FastString
 523   | ITconsym  FastString
 524   | ITqvarid  (FastString,FastString)
 525   | ITqconid  (FastString,FastString)
 526   | ITqvarsym (FastString,FastString)
 527   | ITqconsym (FastString,FastString)
 528   | ITprefixqvarsym (FastString,FastString)
 529   | ITprefixqconsym (FastString,FastString)
 530
 531   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 532
 533   | ITpragma StringBuffer
 534
 535   | ITchar       Char
 536   | ITstring     FastString
 537   | ITinteger    Integer
 538   | ITrational   Rational
 539
       -- Unboxed literals.  NOTE(review): presumably built by the
       -- tok_prim* actions used in the magic-hash literal rules (trailing
       -- '#' or '##') -- confirm at their definitions.
 540   | ITprimchar   Char
 541   | ITprimstring FastString
 542   | ITprimint    Integer
 543   | ITprimword   Integer
 544   | ITprimfloat  Rational
 545   | ITprimdouble Rational
 546
 547   -- MetaHaskell extension tokens
 548   | ITopenExpQuote              --  [| or [e|
 549   | ITopenPatQuote              --  [p|
 550   | ITopenDecQuote              --  [d|
 551   | ITopenTypQuote              --  [t|
 552   | ITcloseQuote                --  |]
 553   | ITidEscape   FastString     --  $x
 554   | ITparenEscape               --  $(
 555   | ITvarQuote                  --  '
 556   | ITtyQuote                   --  ''
 557   | ITquasiQuote (FastString,FastString,SrcSpan) --  [$name|...  (quasi-quote; the lexer rule opens on "[$" varid "|")
 558
 559   -- Arrow notation extension
 560   | ITproc
 561   | ITrec
 562   | IToparenbar                 --  (|
 563   | ITcparenbar                 --  |)
 564   | ITlarrowtail                --  -<
 565   | ITrarrowtail                --  >-
 566   | ITLarrowtail                --  -<<
 567   | ITRarrowtail                --  >>-
 568
 569   | ITunknown String            -- Used when the lexer can't make sense of it
 570   | ITeof                       -- end of file token
 571
 572   -- Documentation annotations
 573   | ITdocCommentNext  String     -- something beginning '-- |'
 574   | ITdocCommentPrev  String     -- something beginning '-- ^'
 575   | ITdocCommentNamed String     -- something beginning '-- $'
 576   | ITdocSection      Int String -- a section heading
 577   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 578   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 579   | ITlineComment     String     -- comment starting by "--"
 580   | ITblockComment    String     -- comment in {- -}
 581
     -- Show is only derived in DEBUG builds
 582 #ifdef DEBUG
 583   deriving Show -- debugging
 584 #endif
 585
 586 {-
 587 isSpecial :: Token -> Bool
 588 -- If we see M.x, where x is a keyword, but
 589 -- is special, we treat it as just plain M.x,
 590 -- not as a keyword.
 591 isSpecial ITas          = True
 592 isSpecial IThiding      = True
 593 isSpecial ITqualified   = True
 594 isSpecial ITforall      = True
 595 isSpecial ITexport      = True
 596 isSpecial ITlabel       = True
 597 isSpecial ITdynamic     = True
 598 isSpecial ITsafe        = True
 599 isSpecial ITthreadsafe  = True
 600 isSpecial ITunsafe      = True
 601 isSpecial ITccallconv   = True
 602 isSpecial ITstdcallconv = True
 603 isSpecial ITprimcallconv = True
 604 isSpecial ITmdo         = True
 605 isSpecial ITfamily      = True
 606 isSpecial ITgroup   = True
 607 isSpecial ITby      = True
 608 isSpecial ITusing   = True
 609 isSpecial _             = False
 610 -}
 611
 612 -- Table of reserved words, keyed by spelling.  Each spelling maps to the
 613 -- token to produce plus a bitmap of extension bits.  The keyword is valid
 614 -- if the extension corresponding to *any* bit set in its bitmap is
 615 -- enabled, so one keyword can belong to several extensions that are
 616 -- switched on independently.  The plain Haskell keywords all carry the
 617 -- bitmap 0 (NOTE(review): the code interpreting the bitmap is not in this
 618 -- part of the file -- confirm that 0 is treated as "always valid").
 619 reservedWordsFM :: UniqFM (Token, Int)
 620 reservedWordsFM = listToUFM
 621     [ (mkFastString kw, (tok, exts)) | (kw, tok, exts) <-
 622       [ ( "_",          ITunderscore,   0 )
 623       , ( "as",         ITas,           0 )
 624       , ( "case",       ITcase,         0 )
 625       , ( "class",      ITclass,        0 )
 626       , ( "data",       ITdata,         0 )
 627       , ( "default",    ITdefault,      0 )
 628       , ( "deriving",   ITderiving,     0 )
 629       , ( "do",         ITdo,           0 )
 630       , ( "else",       ITelse,         0 )
 631       , ( "hiding",     IThiding,       0 )
 632       , ( "if",         ITif,           0 )
 633       , ( "import",     ITimport,       0 )
 634       , ( "in",         ITin,           0 )
 635       , ( "infix",      ITinfix,        0 )
 636       , ( "infixl",     ITinfixl,       0 )
 637       , ( "infixr",     ITinfixr,       0 )
 638       , ( "instance",   ITinstance,     0 )
 639       , ( "let",        ITlet,          0 )
 640       , ( "module",     ITmodule,       0 )
 641       , ( "newtype",    ITnewtype,      0 )
 642       , ( "of",         ITof,           0 )
 643       , ( "qualified",  ITqualified,    0 )
 644       , ( "then",       ITthen,         0 )
 645       , ( "type",       ITtype,         0 )
 646       , ( "where",      ITwhere,        0 )
 647       , ( "_scc_",      ITscc,          0 )            -- ToDo: remove
 648
 649       , ( "forall",     ITforall,       bit explicitForallBit .|. bit inRulePragBit )
 650       , ( "mdo",        ITmdo,          bit recursiveDoBit )
 651       , ( "family",     ITfamily,       bit tyFamBit )
 652       , ( "group",      ITgroup,        bit transformComprehensionsBit )
 653       , ( "by",         ITby,           bit transformComprehensionsBit )
 654       , ( "using",      ITusing,        bit transformComprehensionsBit )
 655
 656       , ( "foreign",    ITforeign,      bit ffiBit )
 657       , ( "export",     ITexport,       bit ffiBit )
 658       , ( "label",      ITlabel,        bit ffiBit )
 659       , ( "dynamic",    ITdynamic,      bit ffiBit )
 660       , ( "safe",       ITsafe,         bit ffiBit )
 661       , ( "threadsafe", ITthreadsafe,   bit ffiBit )  -- ToDo: remove
 662       , ( "unsafe",     ITunsafe,       bit ffiBit )
 663       , ( "stdcall",    ITstdcallconv,  bit ffiBit )
 664       , ( "ccall",      ITccallconv,    bit ffiBit )
 665       , ( "prim",       ITprimcallconv, bit ffiBit )
 666       , ( "dotnet",     ITdotnet,       bit ffiBit )
 667
 668       , ( "rec",        ITrec,          bit arrowsBit )
 669       , ( "proc",       ITproc,         bit arrowsBit )
 670       ] ]
 671
 672 reservedSymsFM :: UniqFM (Token, Int -> Bool)
     -- Table of reserved operator symbols, keyed by spelling.  Each spelling
     -- maps to the token to produce and a test on the extension bitmap that
     -- says whether the symbol is reserved under the current options.
 673 reservedSymsFM = listToUFM
 674     [ (mkFastString sym, (tok, enabled)) | (sym, tok, enabled) <-
 675       [ ( "..",  ITdotdot,     always )
 676         -- (:) is a reserved op, meaning only list cons
 677       , ( ":",   ITcolon,      always )
 678       , ( "::",  ITdcolon,     always )
 679       , ( "=",   ITequal,      always )
 680       , ( "\\",  ITlam,        always )
 681       , ( "|",   ITvbar,       always )
 682       , ( "<-",  ITlarrow,     always )
 683       , ( "->",  ITrarrow,     always )
 684       , ( "@",   ITat,         always )
 685       , ( "~",   ITtilde,      always )
 686       , ( "=>",  ITdarrow,     always )
 687       , ( "-",   ITminus,      always )
 688       , ( "!",   ITbang,       always )
 689
 690         -- For data T (a::*) = MkT
 691       , ( "*",   ITstar,       always ) -- \i -> kindSigsEnabled i || tyFamEnabled i)
 692         -- For 'forall a . t'
 693       , ( ".",   ITdot,        always ) -- \i -> explicitForallEnabled i || inRulePrag i)
 694
 695       , ( "-<",  ITlarrowtail, arrowsEnabled )
 696       , ( ">-",  ITrarrowtail, arrowsEnabled )
 697       , ( "-<<", ITLarrowtail, arrowsEnabled )
 698       , ( ">>-", ITRarrowtail, arrowsEnabled )
 699
 700       , ( "∷",   ITdcolon,     unicodeSyntaxEnabled )
 701       , ( "⇒",   ITdarrow,     unicodeSyntaxEnabled )
 702       , ( "∀",   ITforall,     \exts -> unicodeSyntaxEnabled exts &&
 703                                         explicitForallEnabled exts )
 704       , ( "→",   ITrarrow,     unicodeSyntaxEnabled )
 705       , ( "←",   ITlarrow,     unicodeSyntaxEnabled )
 706       , ( "⋯",   ITdotdot,     unicodeSyntaxEnabled )
 707         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 708         -- form part of a large operator.  This would let us have a better
 709         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 710       ] ]
 711
 712 -- -----------------------------------------------------------------------------
 713 -- Lexer actions
 714
 715 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
     -- The shape of every lexer action.  As used by the actions below
     -- (e.g. strtoken, skip_one_varid): the SrcSpan is attached to the
     -- resulting token, and the StringBuffer/Int pair is passed to
     -- lexemeToString / lexemeToFastString to extract the matched text, so
     -- the Int is the lexeme's length.  The result runs in the P monad.
 716
 717 special :: Token -> Action
     -- Return the given token as-is, located at the lexeme's span; the
     -- buffer and length are not looked at.
 718 special tok loc _ _ = return $ L loc tok
 719
 720 token, layout_token :: Token -> Action
     -- 'token' returns a fixed token located at the lexeme's span.
     -- 'layout_token' does the same but pushes the 'layout' lexer state
     -- first.
 721 token tok loc _ _ = return $ L loc tok
 722 layout_token tok loc _ _ = do { pushLexState layout; return $ L loc tok }
 723
 724 idtoken :: (StringBuffer -> Int -> Token) -> Action
     -- Build the token from the raw buffer and length using the supplied
     -- constructor function; ($!) forces the token when the located result
     -- is forced.
 725 idtoken mkTok loc buf len = return (L loc $! mkTok buf len)
 726
 727 skip_one_varid :: (FastString -> Token) -> Action
     -- Build a token from the lexeme minus its first character: the buffer
     -- is stepped on by one and the length reduced by one before the text is
     -- extracted.
 728 skip_one_varid mkTok loc buf len = return (L loc $! mkTok name)
 729   where name = lexemeToFastString (stepOn buf) (len - 1)
 730
 731 strtoken :: (String -> Token) -> Action
     -- Build a token from the whole lexeme, extracted as a String.
 732 strtoken mkTok loc buf len = return (L loc $! (mkTok $! lexeme))
 733   where lexeme = lexemeToString buf len
 734
 735 init_strtoken :: Int -> (String -> Token) -> Action
 736 -- As strtoken, except that the final n characters of the lexeme are left
     -- out of the String handed to the token builder.
 737 init_strtoken n mkTok loc buf len = return (L loc $! (mkTok $! lexeme))
 738   where lexeme = lexemeToString buf (len - n)
 739
 740 begin :: Int -> Action
     -- Push the given lexer state and carry on lexing; yields no token of
     -- its own.
 741 begin sc _ _ _ = pushLexState sc >> lexToken
 742
 743 pop :: Action
     -- Pop the current lexer state and carry on lexing; yields no token of
     -- its own.
 744 pop _ _ _ = popLexState >> lexToken
 745
 746 pop_and :: Action -> Action
     -- Pop the current lexer state, then run the given action on the same
     -- lexeme.
 747 pop_and next loc buf len = popLexState >> next loc buf len
 748
 749 {-# INLINE nextCharIs #-}
 750 nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
     -- Does the buffer have a current character, and does it satisfy the
     -- test?  Always False at the end of the buffer, whatever the test is.
 751 nextCharIs buf test
     | atEnd buf = False
     | otherwise = test (currentChar buf)
 752
 753 notFollowedBy :: Char -> AlexAccPred Int
     -- Right-context predicate: holds unless the character immediately after
     -- the match is @char@.  The buffer examined is the one from the input
     -- following the match (last argument).
     --
     -- End of input counts as "not followed by @char@": there is no next
     -- character at all.  This is why the test is written as the negation
     -- of a positive 'nextCharIs' check; asking 'nextCharIs' about (/= char)
     -- would answer False at end of input, since 'nextCharIs' is False there
     -- for every test.
 754 notFollowedBy char _ _ _ (AI _ _ buf)
 755   = not (nextCharIs buf (== char))
 756
 757 notFollowedBySymbol :: AlexAccPred Int
     -- Right-context predicate: holds unless the character immediately after
     -- the match is one of the ASCII operator characters listed below.
     --
     -- End of input counts as "not followed by a symbol".  The test is the
     -- negation of a positive 'nextCharIs' check because 'nextCharIs' is
     -- False at end of input for every test, so a `notElem` test passed to
     -- it would wrongly reject a match that ends the input.
 758 notFollowedBySymbol _ _ _ (AI _ _ buf)
 759   = not (nextCharIs buf (`elem` "!#$%&*+./<=>?@\\^|-~"))
 760
 761 -- We must reject doc comments as being ordinary comments everywhere.
 762 -- In some cases the doc comment will be selected as the lexeme due to
 763 -- maximal munch, but not always, because the nested comment rule is
 764 -- valid in all states, but the doc-comment rules are only valid in
 765 -- the non-layout states.
     --
     -- The predicate looks at the input right after the matched "{-":
     --   * with Haddock enabled, the comment is normal unless the next
     --     character, or the one after a single space, is one of | ^ * $ #;
     --   * otherwise it is normal unless the next character is '#'
     --     (a pragma).
     -- In both cases a "{-" that ends the input counts as a normal comment,
     -- so it reaches nested_comment.  The second branch is therefore written
     -- as a negated positive test: 'nextCharIs' is False at end of input for
     -- every test, so testing (/= '#') through it would reject that case and
     -- disagree with the Haddock branch.
 766 isNormalComment :: AlexAccPred Int
 767 isNormalComment bits _ _ (AI _ _ buf)
 768   | haddockEnabled bits = notFollowedByDocOrPragma
 769   | otherwise           = not (nextCharIs buf (== '#'))
 770   where
 771     notFollowedByDocOrPragma
 772        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 773
 774 spaceAndP :: StringBuffer -> (StringBuffer -> Bool) -> Bool
     -- Holds if the test succeeds on the buffer as it stands, or if the
     -- buffer starts with a single space and the test succeeds just past it.
 775 spaceAndP buf test
     | test buf                = True
     | nextCharIs buf (== ' ') = test (snd (nextChar buf))
     | otherwise               = False
 776
 777 {-
 778 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 779   = if haddockEnabled bits then False else (p buf)
 780 -}
 781
 782 atEOL :: AlexAccPred Int
     -- Right-context predicate: the input after the match is exhausted, or
     -- its next character is a newline.
 783 atEOL _ _ _ (AI _ _ rest)
     | atEnd rest = True
     | otherwise  = currentChar rest == '\n'
 784
 785 ifExtension :: (Int -> Bool) -> AlexAccPred Int
     -- Lift a test on the extension bitmap into a rule predicate; the input
     -- positions and match length are ignored.
 786 ifExtension enabled exts _ _ _ = enabled exts
 787
 788 multiline_doc_comment :: Action
     -- Read a "--"-style documentation comment by hand.  Characters are
     -- consed onto the accumulator one at a time (so it is built in reverse).
     -- At a newline the comment ends if it is a one-line comment; otherwise
     -- it continues only if the next line, after optional non-newline
     -- whitespace, starts with exactly two dashes followed by a non-dash
     -- character.  When the comment ends (or input runs out) everything is
     -- handed to docCommentEnd together with the input position at which
     -- reading stopped; a terminating newline is not consumed.
 789 multiline_doc_comment span buf _len = withLexedDocType (collect "")
 790   where
 791     collect acc here docType oneLine = case alexGetChar here of
 792         Nothing -> finish
 793         Just ('\n', afterNl)
 794           | oneLine -> finish
 795           | otherwise -> case continuation afterNl of
 796               Nothing   -> finish
 797               Just rest -> collect ('\n':acc) rest docType False
 798         Just (c, rest) -> collect (c:acc) rest docType oneLine
 799       where finish = docCommentEnd here acc docType buf span
 800
 801     continuation = afterDashes . skipBlanks
 802       where
 803         afterDashes i0 = case alexGetChar i0 of
 804           Just ('-', i1) -> case alexGetChar i1 of
 805             Just ('-', i2) -> case alexGetChar i2 of
 806               Just (c, _) | c /= '-' -> Just i2
 807               _ -> Nothing
 808             _ -> Nothing
 809           _ -> Nothing
 810
 811         skipBlanks i = case alexGetChar i of
 812           Just (c, i')
 813             | isSpace c && c /= '\n' -> skipBlanks i'
 814           _ -> i
 815
 816 lineCommentToken :: Action
 817 lineCommentToken span buf len = do
 818   b <- extension rawTokenStreamEnabled
 819   if b then strtoken ITlineComment span buf len else lexToken
 820
 821 {-
 822   nested comments require traversing by hand, they can't be parsed
 823   using regular expressions.
 824 -}
 825 nested_comment :: P (Located Token) -> Action
 826 nested_comment cont span _str _len = do
 827   input <- getInput
 828   go "" (1::Int) input
 829   where
 830     go commentAcc 0 input = do setInput input
 831                                b <- extension rawTokenStreamEnabled
 832                                if b
 833                                  then docCommentEnd input commentAcc ITblockComment _str span
 834                                  else cont
 835     go commentAcc n input = case alexGetChar input of
 836       Nothing -> errBrace input span
 837       Just ('-',input) -> case alexGetChar input of
 838         Nothing  -> errBrace input span
 839         Just ('\125',input) -> go commentAcc (n-1) input
 840         Just (_,_)          -> go ('-':commentAcc) n input
 841       Just ('\123',input) -> case alexGetChar input of
 842         Nothing  -> errBrace input span
 843         Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
 844         Just (_,_)       -> go ('\123':commentAcc) n input
 845       Just (c,input) -> go (c:commentAcc) n input
 846
 847 nested_doc_comment :: Action
 848 nested_doc_comment span buf _len = withLexedDocType (go "")
 849   where
 850     go commentAcc input docType _ = case alexGetChar input of
 851       Nothing -> errBrace input span
 852       Just ('-',input) -> case alexGetChar input of
 853         Nothing -> errBrace input span
 854         Just ('\125',input) ->
 855           docCommentEnd input commentAcc docType buf span
 856         Just (_,_) -> go ('-':commentAcc) input docType False
 857       Just ('\123', input) -> case alexGetChar input of
 858         Nothing  -> errBrace input span
 859         Just ('-',input) -> do
 860           setInput input
 861           let cont = do input <- getInput; go commentAcc input docType False
 862           nested_comment cont span buf _len
 863         Just (_,_) -> go ('\123':commentAcc) input docType False
 864       Just (c,input) -> go (c:commentAcc) input docType False
 865
 866 withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (Located Token))
 867                  -> P (Located Token)
 868 withLexedDocType lexDocComment = do
 869   input@(AI _ _ buf) <- getInput
 870   case prevChar buf ' ' of
 871     '|' -> lexDocComment input ITdocCommentNext False
 872     '^' -> lexDocComment input ITdocCommentPrev False
 873     '$' -> lexDocComment input ITdocCommentNamed False
 874     '*' -> lexDocSection 1 input
 875     '#' -> lexDocComment input ITdocOptionsOld False
 876     _ -> panic "withLexedDocType: Bad doc type"
 877  where
 878     lexDocSection n input = case alexGetChar input of
 879       Just ('*', input) -> lexDocSection (n+1) input
 880       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 881       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 882
 883 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 884 -- off again at the end of the pragma.
 885 rulePrag :: Action
 886 rulePrag span _ _ = do
 887   setExts (.|. bit inRulePragBit)
 888   return (L span ITrules_prag)
 889
 890 endPrag :: Action
 891 endPrag span _ _ = do
 892   setExts (.&. complement (bit inRulePragBit))
 893   return (L span ITclose_prag)
 894
 895 -- docCommentEnd
 896 -------------------------------------------------------------------------------
 897 -- This function is quite tricky. We can't just return a new token, we also
 898 -- need to update the state of the parser. Why? Because the token is longer
 899 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 900 -- it writes the wrong token length to the parser state. This function is
 901 -- called afterwards, so it can just update the state.
 902
 903 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 904 -- which is something that the original lexer didn't account for.
 905 -- I have added last_line_len in the parser state which represents the length
 906 -- of the part of the token that is on the last line. It is now used for layout
 907 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 908 -- was before, the full length of the token, and it is now only used for error
 909 -- messages. /Waern
 910
 911 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 912                  SrcSpan -> P (Located Token)
 913 docCommentEnd input commentAcc docType buf span = do
 914   setInput input
 915   let (AI loc last_offs nextBuf) = input
 916       comment = reverse commentAcc
 917       span' = mkSrcSpan (srcSpanStart span) loc
 918       last_len = byteDiff buf nextBuf
 919
 920       last_line_len = if (last_offs - last_len < 0)
 921         then last_offs
 922         else last_len
 923
 924   span `seq` setLastToken span' last_len last_line_len
 925   return (L span' (docType comment))
 926
 927 errBrace :: AlexInput -> SrcSpan -> P a
 928 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 929
 930 open_brace, close_brace :: Action
 931 open_brace span _str _len = do
 932   ctx <- getContext
 933   setContext (NoLayout:ctx)
 934   return (L span ITocurly)
 935 close_brace span _str _len = do
 936   popContext
 937   return (L span ITccurly)
 938
 939 qvarid, qconid :: StringBuffer -> Int -> Token
 940 qvarid buf len = ITqvarid $! splitQualName buf len False
 941 qconid buf len = ITqconid $! splitQualName buf len False
 942
 943 splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
 944 -- takes a StringBuffer and a length, and returns the module name
 945 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 946 -- because of hierarchical module names.
 947 splitQualName orig_buf len parens = split orig_buf orig_buf
 948   where
 949     split buf dot_buf
 950         | orig_buf `byteDiff` buf >= len  = done dot_buf
 951         | c == '.'                        = found_dot buf'
 952         | otherwise                       = split buf' dot_buf
 953       where
 954        (c,buf') = nextChar buf
 955
 956     -- careful, we might get names like M....
 957     -- so, if the character after the dot is not upper-case, this is
 958     -- the end of the qualifier part.
 959     found_dot buf -- buf points after the '.'
 960         | isUpper c    = split buf' buf
 961         | otherwise    = done buf
 962       where
 963        (c,buf') = nextChar buf
 964
 965     done dot_buf =
 966         (lexemeToFastString orig_buf (qual_size - 1),
 967          if parens -- Prelude.(+)
 968             then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
 969             else lexemeToFastString dot_buf (len - qual_size))
 970       where
 971         qual_size = orig_buf `byteDiff` dot_buf
 972
 973 varid :: Action
 974 varid span buf len =
 975   fs `seq`
 976   case lookupUFM reservedWordsFM fs of
 977         Just (keyword,0)    -> do
 978                 maybe_layout keyword
 979                 return (L span keyword)
 980         Just (keyword,exts) -> do
 981                 b <- extension (\i -> exts .&. i /= 0)
 982                 if b then do maybe_layout keyword
 983                              return (L span keyword)
 984                      else return (L span (ITvarid fs))
 985         _other -> return (L span (ITvarid fs))
 986   where
 987         fs = lexemeToFastString buf len
 988
 989 conid :: StringBuffer -> Int -> Token
 990 conid buf len = ITconid fs
 991   where fs = lexemeToFastString buf len
 992
 993 qvarsym, qconsym, prefixqvarsym, prefixqconsym :: StringBuffer -> Int -> Token
 994 qvarsym buf len = ITqvarsym $! splitQualName buf len False
 995 qconsym buf len = ITqconsym $! splitQualName buf len False
 996 prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
 997 prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
 998
 999 varsym, consym :: Action
1000 varsym = sym ITvarsym
1001 consym = sym ITconsym
1002
-- Lex an operator symbol.  If the symbol is in reservedSymsFM and its
-- extension test passes, the reserved token is returned; otherwise the
-- symbol is wrapped with the given constructor.
sym :: (FastString -> Token) -> SrcSpan -> StringBuffer -> Int
    -> P (Located Token)
sym con span buf len =
  case lookupUFM reservedSymsFM fs of
    Just (keyword, isEnabled) -> do
      active <- extension isEnabled
      if active then return (L span keyword) else asOperator
    Nothing -> asOperator
  where
    fs = lexemeToFastString buf len
    asOperator = return (L span $! con fs)
1014
-- Variations on the integral numeric literal.
--
-- Arguments: the token constructor; a transformation of the parsed value
-- (see 'positive' / 'negative'); the number of bytes to skip at the start
-- of the lexeme; the number of bytes of the lexeme that are not digits;
-- and the radix together with its digit-conversion function.
tok_integral :: (Integer -> Token)
     -> (Integer -> Integer)
     -> Int -> Int
     -> (Integer, (Char->Int)) -> Action
tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
  return (L span tok)
  where
    digits  = offsetBytes transbuf buf
    ndigits = subtract translen len
    value   = transint (parseUnsignedInteger digits ndigits radix char_to_int)
    -- force the value before wrapping it in the token
    tok     = itint $! value
1024
-- some conveniences for use with tok_integral:
-- tok_num produces an ordinary ITinteger token.
tok_num :: (Integer -> Integer)
        -> Int -> Int
        -> (Integer, (Char->Int)) -> Action
tok_num transint = tok_integral ITinteger transint
-- tok_integral specialised to produce an ITprimint token.
tok_primint :: (Integer -> Integer)
            -> Int -> Int
            -> (Integer, (Char->Int)) -> Action
tok_primint transint = tok_integral ITprimint transint
-- tok_integral specialised to ITprimword; the value is always taken as
-- positive.
tok_primword :: Int -> Int
             -> (Integer, (Char->Int)) -> Action
tok_primword transbuf translen = tok_integral ITprimword positive transbuf translen
-- Value transformations for tok_integral: keep the parsed magnitude as it
-- is, or negate it.
positive, negative :: (Integer -> Integer)
positive n = n
negative n = negate n
-- Radix descriptions for tok_integral: the base paired with the function
-- that converts a digit character of that base to its value.
decimal, octal, hexadecimal :: (Integer, Char -> Int)
decimal     = (10, octDecDigit)
octal       = ( 8, octDecDigit)
hexadecimal = (16, hexDigit)
1044
-- Floating-point literal tokens.  readRational can understand negative
-- rationals, exponents, everything; the parsed value is forced before the
-- token is built.
tok_float, tok_primfloat, tok_primdouble :: String -> Token
tok_float str = r `seq` ITrational r
  where r = readRational str
tok_primfloat str = r `seq` ITprimfloat r
  where r = readRational str
tok_primdouble str = r `seq` ITprimdouble r
  where r = readRational str
1050
1051 -- -----------------------------------------------------------------------------
1052 -- Layout processing
1053
-- we're at the first token on a line, insert layout tokens if necessary.
-- The action depends on the result of getOffside:
--   LT: close the current implicit block with a virtual '}'
--   EQ: separate items with a virtual ';'
--   GT: nothing to insert, carry on lexing
do_bol :: Action
do_bol span _str _len = getOffside >>= \pos -> case pos of
    -- do NOT pop the lex state here, we might have a ';' to insert
    LT -> popContext >> return (L span ITvccurly)
    EQ -> popLexState >> return (L span ITsemi)
    GT -> popLexState >> lexToken
1071
-- certain keywords put us in the "layout" state, where we might
-- add an opening curly brace.  'do' and 'mdo' use the separate layout_do
-- state; every other token leaves the lexer state alone.
maybe_layout :: Token -> P ()
maybe_layout tok = case tok of
    ITdo    -> pushLexState layout_do
    ITmdo   -> pushLexState layout_do
    ITof    -> pushLexState layout
    ITlet   -> pushLexState layout
    ITwhere -> pushLexState layout
    ITrec   -> pushLexState layout
    _       -> return ()
1082
-- Pushing a new implicit layout context.  If the indentation of the
-- next token is not greater than the previous layout context, then
-- Haskell 98 says that the new layout context should be empty; that is
-- the lexer must generate {}.
--
-- We are slightly more lenient than this: when the new context is started
-- by a 'do', then we allow the new context to be at the same indentation as
-- the previous context.  This is what the 'strict' argument is for.
--
new_layout_context :: Bool -> Action
new_layout_context strict span _buf _len = do
    popLexState
    (AI _ offset _) <- getInput
    ctx <- getContext
    if mustBeEmpty offset ctx
      -- token is indented to the left of the previous context:
      -- we must generate a {} sequence now.
      then pushLexState layout_left
      else setContext (Layout offset : ctx)
    return (L span ITvocurly)
  where
    mustBeEmpty offset (Layout prev_off : _)
      | strict    = prev_off >= offset
      | otherwise = prev_off >  offset
    mustBeEmpty _ _ = False
1108
-- Emit the virtual '}' of an empty layout block and switch to the bol
-- state, since we must be at the start of a line.
do_layout_left :: Action
do_layout_left span _buf _len =
    popLexState >>
    pushLexState bol >>
    return (L span ITvccurly)
1114
1115 -- -----------------------------------------------------------------------------
1116 -- LINE pragmas
1117
-- Handle the line number of a LINE pragma: the lexeme is a decimal number.
-- The current location is moved to that line minus one, because the number
-- refers to the *following* line; then the lexer switches to state 'code'.
setLine :: Int -> Action
setLine code span buf len = do
  setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral lineNo - 1) 0)
  popLexState
  pushLexState code
  lexToken
  where
    lineNo = parseUnsignedInteger buf len 10 octDecDigit
1126
-- Handle the file name of a LINE pragma: the first and last character of
-- the lexeme are dropped and the rest becomes the file of the current
-- location; then the lexer switches to state 'code'.
setFile :: Int -> Action
setFile code span buf len = do
  setSrcLoc (mkSrcLoc fileName (srcSpanEndLine span) (srcSpanEndCol span))
  popLexState
  pushLexState code
  lexToken
  where
    fileName = lexemeToFastString (stepOn buf) (len-2)
1134
1135
1136 -- -----------------------------------------------------------------------------
1137 -- Options, includes and language pragmas.
1138
-- Collect the raw text of a pragma up to (but not including) the closing
-- "#-}" and wrap it with 'mkTok'.  The terminator itself is left in the
-- input.  Running out of input first is an error.
lex_string_prag :: (String -> Token) -> Action
lex_string_prag mkTok span _buf _len = do
    input <- getInput
    start <- getSrcLoc
    tok <- collect [] input
    end <- getSrcLoc
    return (L (mkSrcSpan start end) tok)
  where
    collect acc input
      | input `startsWith` "#-}" = do setInput input
                                      return (mkTok (reverse acc))
      | otherwise = case alexGetChar input of
          Just (c,rest) -> collect (c:acc) rest
          Nothing       -> unterminated input

    -- does the input begin with the given string?
    startsWith _ [] = True
    startsWith i (x:xs) = case alexGetChar i of
      Just (c,i') | c == x -> startsWith i' xs
      _                    -> False

    unterminated (AI end _ _) =
      failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1159
1160
1161 -- -----------------------------------------------------------------------------
1162 -- Strings & Chars
1163
1164 -- This stuff is horrible.  I hates it.
1165
-- The opening quote has been lexed by Alex; read the rest of the string
-- literal by hand and give the token a span up to the current location.
lex_string_tok :: Action
lex_string_tok span _buf _len =
  lex_string "" >>= \tok ->
  getSrcLoc >>= \end ->
  return (L (mkSrcSpan (srcSpanStart span) end) tok)
1171
-- Read the body of a string literal; the argument is the text read so far,
-- in reverse.  Handles the closing quote (with an optional trailing '#'
-- under MagicHash, giving a primitive string), the empty escape \& and
-- string gaps; anything else goes through lex_char.
lex_string :: String -> P Token
lex_string s = do
  i <- getInput
  case alexGetChar' i of
    Nothing -> lit_error

    Just ('"', afterQuote) -> do
        setInput afterQuote
        magicHash <- extension magicHashEnabled
        case alexGetChar' afterQuote of
          Just ('#', afterHash) | magicHash -> do
              setInput afterHash
              if any (> '\xFF') s
                then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
                -- mkZFastString is a hack to avoid encoding the
                -- string in UTF-8.  We just want the exact bytes.
                else return (ITprimstring (mkZFastString (reverse s)))
          _other ->
              return (ITstring (mkFastString (reverse s)))

    -- if neither guard matches we fall through to the general case below
    Just ('\\', afterSlash)
        | Just ('&', rest) <- next -> do
                setInput rest; lex_string s
        | Just (c, rest) <- next, is_space c -> do
                setInput rest; lex_stringgap s
        where next = alexGetChar' afterSlash

    Just (c, rest) -> do
        c' <- lex_char c rest
        lex_string (c':s)
1208
-- Inside a string gap: skip white space until the closing backslash, then
-- resume the string.  Any other character is a literal error.
lex_stringgap :: String -> P Token
lex_stringgap s = getCharOrFail >>= step
  where
    step '\\' = lex_string s
    step c
      | is_space c = lex_stringgap s
      | otherwise  = lit_error
1216
1217
-- Here we are basically parsing character literals, such as 'x' or '\n'
-- but, when Template Haskell is on, we additionally spot
-- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
-- but WITHOUT CONSUMING the x or T part  (the parser does that).
-- So we have to do two characters of lookahead: when we see 'x we need to
-- see if there's a trailing quote
lex_char_tok :: Action
lex_char_tok span _buf _len = do        -- We've seen '
   afterQuote <- getInput               -- Look ahead to first character
   case alexGetChar' afterQuote of
     Nothing -> lit_error

     -- We've seen ''
     Just ('\'', rest@(AI restLoc _ _)) -> do
       th_exts <- extension thEnabled
       if th_exts
         then do setInput rest
                 return (L (mkSrcSpan start restLoc) ITtyQuote)
         else lit_error

     -- We've seen 'backslash
     Just ('\\', rest) -> do
       setInput rest
       lit_ch <- lex_escape
       closing <- getCharOrFail         -- Trailing quote
       if closing == '\''
         then finish_char_tok start lit_ch
         else do setInput rest; lit_error

     Just (c, rest)
       | not (isAny c) -> lit_error
       | otherwise     -> plainChar afterQuote c rest
  where
   start = srcSpanStart span

   -- We've seen 'x, where x is a valid character
   --  (i.e. not newline etc) but not a quote or backslash
   plainChar afterQuote c rest =
     case alexGetChar' rest of          -- Look ahead one more character
       Just ('\'', afterClose) -> do    -- We've seen 'x'
         setInput afterClose
         finish_char_tok start c
       _other -> do                     -- We've seen 'x not followed by quote
                                        -- (including the possibility of EOF)
                                        -- If TH is on, just parse the quote only
         th_exts <- extension thEnabled
         let AI quoteEnd _ _ = afterQuote
         if th_exts
           then return (L (mkSrcSpan start quoteEnd) ITvarQuote)
           else do setInput rest; lit_error
1262
-- The closing quote of a character literal has been consumed.  Under
-- MagicHash a directly following '#' is consumed too and makes the literal
-- a primitive character; otherwise an ordinary ITchar is returned.
finish_char_tok :: SrcLoc -> Char -> P (Located Token)
finish_char_tok loc ch = do
  magicHash <- extension magicHashEnabled
  i@(AI end _ _) <- getInput
  case alexGetChar' i of
    Just ('#', afterHash@(AI hashEnd _ _)) | magicHash -> do
      setInput afterHash
      return (L (mkSrcSpan loc hashEnd) (ITprimchar ch))
    _other ->
      return (L (mkSrcSpan loc end) (ITchar ch))
1277
-- Interpret one character of a literal: 'c' has just been read and 'inp'
-- is the input after it.  A backslash starts an escape sequence, any other
-- acceptable character stands for itself, everything else is an error.
lex_char :: Char -> AlexInput -> P Char
lex_char c inp
  | c == '\\' = do setInput inp; lex_escape
  | isAny c   = do setInput inp; return c
  | otherwise = lit_error
1284
-- Characters allowed verbatim inside a literal: above '\x7f' anything that
-- is printable, otherwise whatever is_any accepts.
isAny :: Char -> Bool
isAny c = if c > '\x7f' then isPrint c else is_any c
1288
-- Lex an escape sequence; the backslash has already been consumed.
-- Handles the single-character escapes, control escapes (\^A ...), numeric
-- escapes in decimal, octal and hexadecimal, and the named ASCII escapes
-- from silly_escape_chars.
lex_escape :: P Char
lex_escape = getCharOrFail >>= dispatch
  where
    dispatch c
      | Just plain <- lookup c single_char_escapes = return plain
      | c == '^'      = control_escape
      | c == 'x'      = readNum is_hexdigit 16 hexDigit
      | c == 'o'      = readNum is_octdigit  8 octDecDigit
      | is_decdigit c = readNum2 is_decdigit 10 octDecDigit (octDecDigit c)
      | otherwise     = named_escape c

    single_char_escapes =
      [ ('a', '\a'), ('b', '\b'), ('f', '\f'), ('n', '\n'), ('r', '\r')
      , ('t', '\t'), ('v', '\v'), ('\\', '\\'), ('"', '\"'), ('\'', '\'') ]

    -- \^@ .. \^_ denote the characters with codes 0 .. 31
    control_escape = do
      c <- getCharOrFail
      if c >= '@' && c <= '_'
        then return (chr (ord c - ord '@'))
        else lit_error

    -- Named escapes are two or three characters long.  We look at the two
    -- characters after c1 and take the first table entry that is a prefix
    -- of those three characters; a two-character name leaves the third
    -- character unconsumed.
    named_escape c1 = do
      i <- getInput
      case alexGetChar' i of
        Nothing -> lit_error
        Just (c2,i2) ->
          case alexGetChar' i2 of
            Nothing -> do setInput i2; lit_error
            Just (c3,i3) ->
              let candidates = [ (esc,rest) | (name,esc) <- silly_escape_chars,
                                              Just rest <- [stripPrefix name [c1,c2,c3]] ]
              in case candidates of
                   (esc,[]):_  -> do setInput i3; return esc
                   (esc,_:_):_ -> do setInput i2; return esc
                   []          -> lit_error
1330
-- Read a numeric escape that needs at least one digit (used after \x and
-- \o).  If the next character is not a digit the input is reset to where
-- it was and a literal error is raised.
readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
readNum is_digit base conv = do
  start <- getInput
  c <- getCharOrFail
  case is_digit c of
    True  -> readNum2 is_digit base conv (conv c)
    False -> do setInput start; lit_error
1338
-- Read the remaining digits of a numeric escape.  'i' is the value of the
-- digits consumed so far; digits are accepted while 'is_digit' holds and
-- are converted with 'conv' in the given 'base'.  The result is the
-- character with the accumulated code.
--
-- The accumulator is range-checked after every digit rather than once at
-- the end.  Otherwise a long enough run of digits could overflow Int and
-- wrap around to a value that looks valid, so that an out-of-range escape
-- would silently be accepted as some unrelated character.  The check also
-- stops scanning at the first digit that pushes the value past 0x10FFFF,
-- and the input is set to that point before the error is raised, as
-- lit_error expects.
readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char
readNum2 is_digit base conv i = do
  input <- getInput
  read i input
  where
    read acc input
      | acc < 0 || acc > 0x10FFFF = do setInput input; lit_error
      | otherwise = case alexGetChar' input of
          Just (c,input') | is_digit c -> read (acc*base + conv c) input'
          _other -> do setInput input; return (chr acc)
1351
-- Table of the named ASCII escapes: the names of the characters with codes
-- 0 .. 32 in code order, followed by DEL.  The order matters to lex_escape,
-- which takes the first matching entry: "SOH" must come before "SO".
silly_escape_chars :: [(String, Char)]
silly_escape_chars = zip low_names ['\NUL' ..] ++ [("DEL", '\DEL')]
  where
    low_names =
      [ "NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL"
      , "BS",  "HT",  "LF",  "VT",  "FF",  "CR",  "SO",  "SI"
      , "DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB"
      , "CAN", "EM",  "SUB", "ESC", "FS",  "GS",  "RS",  "US"
      , "SP"
      ]
1389
-- Report a malformed string or character literal.
--
-- Callers must first make sure that the current input points at the
-- position of the error in the buffer: this gives the user a correct
-- location, and it lets UTF-8 decoding errors be detected if they occur.
lit_error :: P a
lit_error = lexError message
  where
    message = "lexical error in string/character literal"
1396
-- Consume and return the next character (not squashed, see alexGetChar'),
-- failing with a lexical error at end of input.
getCharOrFail :: P Char
getCharOrFail = getInput >>= \i ->
  case alexGetChar' i of
    Just (c,rest) -> do setInput rest; return c
    Nothing       -> lexError "unexpected end-of-file in string/character literal"
1403
1404 -- -----------------------------------------------------------------------------
1405 -- QuasiQuote
1406
-- Lex a quasi-quotation.  The quoter name is taken from the already lexed
-- opening part: the last character of the lexeme is dropped and the name is
-- whatever follows the last '$' in the rest.  The quoted text is then read
-- by hand; the token carries the quoter, the text and the span of the text.
lex_quasiquote_tok :: Action
lex_quasiquote_tok span buf len = do
  quoteStart <- getSrcLoc
  quote <- lex_quasiquote ""
  end <- getSrcLoc
  let payload = ( mkFastString quoter
                , mkFastString (reverse quote)
                , mkSrcSpan quoteStart end )
  return (L (mkSrcSpan (srcSpanStart span) end) (ITquasiQuote payload))
  where
    opening = lexemeToString buf (len - 1)
    quoter  = reverse (takeWhile (/= '$') (reverse opening))
1418
-- Read the body of a quasi-quotation up to the closing "|]", returning the
-- text in reverse (the argument is the text read so far).  A backslash
-- directly before '|' or ']' is dropped and the character is kept, so the
-- terminator can be written inside a quotation.
lex_quasiquote :: String -> P String
lex_quasiquote s = do
  i <- getInput
  case alexGetChar' i of
    Nothing -> lit_error
    Just (c, rest) -> case (c, alexGetChar' rest) of
      ('\\', Just ('|', rest')) -> do setInput rest'; lex_quasiquote ('|' : s)
      ('\\', Just (']', rest')) -> do setInput rest'; lex_quasiquote (']' : s)
      ('|',  Just (']', rest')) -> do setInput rest'; return s
      _                         -> do setInput rest;  lex_quasiquote (c : s)
1439
1440 -- -----------------------------------------------------------------------------
1441 -- Warnings
1442
-- Action that records a warning for the lexeme's span and then goes on
-- lexing the next token.
warn :: DynFlag -> SDoc -> Action
warn option warning srcspan _buf _len =
    addWarning option srcspan warning >> lexToken
1447
-- Like 'warn', but runs the given action on the same lexeme after the
-- warning has been recorded.
warnThen :: DynFlag -> SDoc -> Action -> Action
warnThen option warning action srcspan buf len =
    addWarning option srcspan warning >> action srcspan buf len
1452
1453 -- -----------------------------------------------------------------------------
1454 -- The Parse Monad
1455
-- One entry of the layout context stack.
data LayoutContext
  = NoLayout        -- pushed by an explicit opening brace (see open_brace)
  | Layout !Int     -- implicit block; the Int is the offset recorded by
                    -- new_layout_context when the block was opened
  deriving Show
1460
-- Outcome of running a P computation.
data ParseResult a
  = POk PState a          -- success: the new state and the result
  | PFailed
        SrcSpan           -- The start and end of the text span related to
                          -- the error.  Might be used in environments which
                          -- can show this span, e.g. by highlighting it.
        Message           -- The error message
1468
-- The state threaded through the P monad.
data PState = PState
  { buffer        :: StringBuffer
  , dflags        :: DynFlags
  , messages      :: Messages
  , last_loc      :: SrcSpan          -- pos of previous token
  , last_offs     :: !Int             -- offset of the previous token from the
                                      -- beginning of  the current line.
                                      -- \t is equal to 8 spaces.
  , last_len      :: !Int             -- len of previous token
  , last_line_len :: !Int             -- length of the part of the previous
                                      -- token that is on its last line
                                      -- (see docCommentEnd)
  , loc           :: SrcLoc           -- current loc (end of prev token + 1)
  , extsBitmap    :: !Int             -- bitmap that determines permitted extensions
  , context       :: [LayoutContext]
  , lex_state     :: [Int]
  }
        -- last_loc and last_len are used when generating error messages,
        -- and in pushCurrentContext only.  Sigh, if only Happy passed the
        -- current token to happyError, we could at least get rid of last_len.
        -- Getting rid of last_loc would require finding another way to
        -- implement pushCurrentContext (which is only called from one place).
        -- NOTE(review): the notes on docCommentEnd say pushCurrentContext now
        -- uses last_line_len instead of last_len -- confirm and update.
1489
-- The parse monad: a state transformer over PState that can fail.
newtype P a = P
  { unP :: PState -> ParseResult a
  }
1491
-- Monad structure of P, defined by thenP, returnP and failP.
instance Monad P where
  (>>=)  = thenP
  return = returnP
  fail   = failP
1496
-- Succeed with the given value, leaving the state untouched.  The value is
-- forced (to weak head normal form) when the computation is built.
returnP :: a -> P a
returnP a = a `seq` P (\s -> POk s a)
1499
-- Sequencing: run the first computation and feed its result and state to
-- the continuation; a failure is passed on unchanged.
thenP :: P a -> (a -> P b) -> P b
thenP (P runFirst) k = P $ \s0 ->
        case runFirst s0 of
                PFailed span err -> PFailed span err
                POk s1 a         -> unP (k a) s1
1505
-- Fail with a plain-text message located at the previous token.
failP :: String -> P a
failP msg = P failAtLastToken
  where
    failAtLastToken s = PFailed (last_loc s) (text msg)
1508
-- Fail with a plain-text message located at the previous token.
failMsgP :: String -> P a
failMsgP msg = P failAtLastToken
  where
    failAtLastToken s = PFailed (last_loc s) (text msg)
1511
-- Fail with a plain-text message spanning the two given locations.
failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
failLocMsgP loc1 loc2 str = P (const failure)
  where
    failure = PFailed (mkSrcSpan loc1 loc2) (text str)
1514
-- Fail with the given message at the given span.
failSpanMsgP :: SrcSpan -> SDoc -> P a
failSpanMsgP span msg = P (const (PFailed span msg))
1517
-- Apply a test to the current extensions bitmap; the bitmap is forced
-- before the test is applied.
extension :: (Int -> Bool) -> P Bool
extension p = P check
  where
    check s = POk s (bitmap `seq` p bitmap)
      where bitmap = extsBitmap s
1520
-- Read the current extensions bitmap.
getExts :: P Int
getExts = P (\s -> POk s (extsBitmap s))
1523
-- Replace the extensions bitmap by the result of applying 'f' to it.
setExts :: (Int -> Int) -> P ()
setExts f = P update
  where
    update s = POk s{ extsBitmap = f (extsBitmap s) } ()
1526
-- Overwrite the current source location.
setSrcLoc :: SrcLoc -> P ()
setSrcLoc new_loc = P update
  where
    update s = POk s{ loc = new_loc } ()
1529
-- Read the current source location.
getSrcLoc :: P SrcLoc
getSrcLoc = P $ \s -> case s of
  PState{ loc = here } -> POk s here
1532
-- Record span, length and last-line length of the token just lexed.
setLastToken :: SrcSpan -> Int -> Int -> P ()
setLastToken loc len line_len = P record
  where
    record s = POk s { last_loc      = loc
                     , last_len      = len
                     , last_line_len = line_len
                     } ()
1539
-- The lexer's view of the input.  getInput fills the three fields from the
-- 'loc', 'last_offs' and 'buffer' fields of the parser state.
data AlexInput
  = AI SrcLoc               -- current source location
       {-#UNPACK#-}!Int     -- current offset within the line
       StringBuffer         -- the remaining input
1541
-- Character just before the current position; prevChar is given a newline
-- as its fallback argument.
alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar input = case input of
  AI _ _ buf -> prevChar buf '\n'
1544
-- Fetch the next character for Alex, advancing location and line offset.
--
-- Alex doesn't handle Unicode, so a character above '\x7f' is not returned
-- as itself: it is replaced by a small code standing for its character
-- class, chosen from its Unicode general category.  The actual character
-- stays available through the buffer in the state.  Characters up to '\x06'
-- are all reported as non-graphic, because those values are used as the
-- class codes.
alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
alexGetChar (AI loc ofs s)
  | atEnd s   = Nothing
  | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
                Just (adj_c, AI loc' ofs' s')
  where
    (c,s') = nextChar s
    loc'   = advanceSrcLoc loc c
    ofs'   = advanceOffs c ofs

    -- the character-class codes
    non_graphic   = '\x0'
    upper         = '\x1'
    lower         = '\x2'
    digit         = '\x3'
    symbol        = '\x4'
    space         = '\x5'
    other_graphic = '\x6'

    adj_c
      | c <= '\x06' = non_graphic
      | c <= '\x7f' = c
      | otherwise   = classify (generalCategory c)

    classify category = case category of
      -- letters
      UppercaseLetter       -> upper
      TitlecaseLetter       -> upper
      LowercaseLetter       -> lower
      OtherLetter           -> lower -- see #1103
      ModifierLetter        -> other_graphic
      -- marks
      NonSpacingMark        -> other_graphic
      SpacingCombiningMark  -> other_graphic
      EnclosingMark         -> other_graphic
      -- numbers
      DecimalNumber         -> digit
      LetterNumber          -> other_graphic
      OtherNumber           -> other_graphic
      -- punctuation
      ConnectorPunctuation  -> symbol
      DashPunctuation       -> symbol
      OtherPunctuation      -> symbol
      OpenPunctuation       -> other_graphic
      ClosePunctuation      -> other_graphic
      InitialQuote          -> other_graphic
      FinalQuote            -> other_graphic
      -- symbols
      MathSymbol            -> symbol
      CurrencySymbol        -> symbol
      ModifierSymbol        -> symbol
      OtherSymbol           -> symbol
      -- separators and everything else
      Space                 -> space
      _other                -> non_graphic
1595
-- | Like 'alexGetChar', but this version does not squash unicode
-- characters: the real character is returned.  It is used when lexing
-- strings.
alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
alexGetChar' (AI pos col buf)
  | atEnd buf = Nothing
  | otherwise =
      case nextChar buf of
        (ch, rest) ->
          let pos' = advanceSrcLoc pos ch
              col' = advanceOffs ch col
          in  -- force every component before building the result
              ch `seq` pos' `seq` col' `seq` rest `seq`
                Just (ch, AI pos' col' rest)
1607
-- | Offset after consuming one character: a newline resets it to 0, a tab
-- moves it to the next multiple of 8, and any other character advances it
-- by one.
advanceOffs :: Char -> Int -> Int
advanceOffs ch offs = case ch of
  '\n' -> 0
  '\t' -> 8 * (1 + offs `quot` 8)
  _    -> offs + 1
1612
-- | Build an 'AlexInput' from the parser state's 'loc', 'last_offs' and
-- 'buffer' fields.  The state is not modified.
getInput :: P AlexInput
getInput = P $ \st ->
  case st of
    PState{ loc = l, last_offs = o, buffer = b } -> POk st (AI l o b)
1615
-- | Write the components of an 'AlexInput' back into the parser state's
-- 'loc', 'last_offs' and 'buffer' fields.
setInput :: AlexInput -> P ()
setInput (AI l o b) = P store
  where
    store st = POk st{ loc = l, last_offs = o, buffer = b } ()
1618
-- | Push a new lexer state onto the front of the 'lex_state' stack.
pushLexState :: Int -> P ()
pushLexState ls = P $ \st ->
  case st of
    PState{ lex_state = stack } -> POk st{ lex_state = ls : stack } ()
1621
-- | Remove the top entry of the 'lex_state' stack and return it.
--
-- The stack is expected to be non-empty.  An empty stack is an internal
-- invariant violation; it is reported with an explicit error naming this
-- function rather than an anonymous pattern-match failure in a lambda.
popLexState :: P Int
popLexState = P $ \st ->
  case lex_state st of
    top : rest -> POk st{ lex_state = rest } top
    []         -> error "Lexer.popLexState: empty lex-state stack"
1624
-- | Return the top entry of the 'lex_state' stack without removing it.
--
-- The stack is expected to be non-empty.  An empty stack is an internal
-- invariant violation; it is reported with an explicit error naming this
-- function rather than an anonymous pattern-match failure in a lambda.
getLexState :: P Int
getLexState = P $ \st ->
  case lex_state st of
    top : _ -> POk st top
    []      -> error "Lexer.getLexState: empty lex-state stack"
1627
-- For reasons of efficiency, flags indicating language extensions (eg,
-- -fglasgow-exts or -XParr) are represented by a bitmap stored in an unboxed
-- integer.  Each constant below is the index of one bit in that bitmap.
-- Index 3 is not assigned to any constant here.

genericsBit, ffiBit, parrBit, arrowsBit, thBit, ipBit, explicitForallBit,
  bangPatBit, tyFamBit, haddockBit, magicHashBit, kindSigsBit,
  recursiveDoBit, unicodeSyntaxBit, unboxedTuplesBit, standaloneDerivingBit,
  transformComprehensionsBit, qqBit, inRulePragBit, rawTokenStreamBit,
  newQualOpsBit :: Int

genericsBit                = 0  -- {| and |}
ffiBit                     = 1
parrBit                    = 2
arrowsBit                  = 4
thBit                      = 5
ipBit                      = 6
explicitForallBit          = 7  -- the 'forall' keyword and '.' symbol
bangPatBit                 = 8  -- Tells the parser to understand bang-patterns
                                -- (doesn't affect the lexer)
tyFamBit                   = 9  -- indexed type families: 'family' keyword and kind sigs
haddockBit                 = 10 -- Lex and parse Haddock comments
magicHashBit               = 11 -- "#" in both functions and operators
kindSigsBit                = 12 -- Kind signatures on type variables
recursiveDoBit             = 13 -- mdo
unicodeSyntaxBit           = 14 -- the forall symbol, arrow symbols, etc
unboxedTuplesBit           = 15 -- (# and #)
standaloneDerivingBit      = 16 -- standalone instance deriving declarations
transformComprehensionsBit = 17
qqBit                      = 18 -- enable quasiquoting
inRulePragBit              = 19
rawTokenStreamBit          = 20 -- producing a token stream with all comments included
newQualOpsBit              = 21 -- Haskell' qualified operator syntax, e.g. Prelude.(+)
1675
-- Predicates over the extension bitmap.  Each one tests a single bit of the
-- bitmap passed to it; 'always' ignores the bitmap altogether.

always, genericsEnabled, parrEnabled, arrowsEnabled, thEnabled, ipEnabled,
  explicitForallEnabled, bangPatEnabled, haddockEnabled, magicHashEnabled,
  unicodeSyntaxEnabled, unboxedTuplesEnabled, standaloneDerivingEnabled,
  qqEnabled, rawTokenStreamEnabled, newQualOps, oldQualOps :: Int -> Bool

always                    = const True
genericsEnabled           = (`testBit` genericsBit)
parrEnabled               = (`testBit` parrBit)
arrowsEnabled             = (`testBit` arrowsBit)
thEnabled                 = (`testBit` thBit)
ipEnabled                 = (`testBit` ipBit)
explicitForallEnabled     = (`testBit` explicitForallBit)
bangPatEnabled            = (`testBit` bangPatBit)
-- tyFamEnabled :: Int -> Bool
-- tyFamEnabled     flags = testBit flags tyFamBit
haddockEnabled            = (`testBit` haddockBit)
magicHashEnabled          = (`testBit` magicHashBit)
-- kindSigsEnabled :: Int -> Bool
-- kindSigsEnabled  flags = testBit flags kindSigsBit
unicodeSyntaxEnabled      = (`testBit` unicodeSyntaxBit)
unboxedTuplesEnabled      = (`testBit` unboxedTuplesBit)
standaloneDerivingEnabled = (`testBit` standaloneDerivingBit)
qqEnabled                 = (`testBit` qqBit)
-- inRulePrag :: Int -> Bool
-- inRulePrag       flags = testBit flags inRulePragBit
rawTokenStreamEnabled     = (`testBit` rawTokenStreamBit)
newQualOps                = (`testBit` newQualOpsBit)
-- the old qualified-operator syntax is simply the absence of the new one
oldQualOps                = not . newQualOps
1716
-- | PState for parsing options pragmas.
--
-- All positions and lengths start at zero, no extension bits are set, the
-- layout context is empty, and the lex-state stack starts as
-- @[bol, option_prags, 0]@.
pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
pragState dynflags buf loc = PState
  { buffer        = buf
  , dflags        = dynflags
  , messages      = emptyMessages
  , loc           = loc
  , last_loc      = mkSrcSpan loc loc
  , last_offs     = 0
  , last_len      = 0
  , last_line_len = 0
  , extsBitmap    = 0
  , context       = []
  , lex_state     = [bol, option_prags, 0]
  }
1734
1735
-- | Create a parse state.
--
-- Positions and lengths start at zero, the layout context is empty, and
-- the extension bitmap has one bit set for every entry of the table below
-- whose 'DynFlag' is switched on in the supplied flags.
mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
mkPState buf loc flags = PState
  { buffer        = buf
  , dflags        = flags
  , messages      = emptyMessages
  , loc           = loc
  , last_loc      = mkSrcSpan loc loc
  , last_offs     = 0
  , last_len      = 0
  , last_line_len = 0
  , extsBitmap    = fromIntegral bitmap
  , context       = []
  , lex_state     = [bol, 0]
      -- we begin in the layout state if toplev_layout is set
  }
  where
    -- OR together the bit of every enabled entry
    bitmap :: Int
    bitmap = foldr (.|.) 0 [ bit b | (b, opt) <- extTable, dopt opt flags ]

    -- (bit index, flag that turns it on); several flags share
    -- explicitForallBit, so any one of them enables it
    extTable :: [(Int, DynFlag)]
    extTable =
      [ (genericsBit,                Opt_Generics)
      , (ffiBit,                     Opt_ForeignFunctionInterface)
      , (parrBit,                    Opt_PArr)
      , (arrowsBit,                  Opt_Arrows)
      , (thBit,                      Opt_TemplateHaskell)
      , (qqBit,                      Opt_QuasiQuotes)
      , (ipBit,                      Opt_ImplicitParams)
      , (explicitForallBit,          Opt_ScopedTypeVariables)
      , (explicitForallBit,          Opt_LiberalTypeSynonyms)
      , (explicitForallBit,          Opt_PolymorphicComponents)
      , (explicitForallBit,          Opt_ExistentialQuantification)
      , (explicitForallBit,          Opt_Rank2Types)
      , (explicitForallBit,          Opt_RankNTypes)
      , (bangPatBit,                 Opt_BangPatterns)
      , (tyFamBit,                   Opt_TypeFamilies)
      , (haddockBit,                 Opt_Haddock)
      , (magicHashBit,               Opt_MagicHash)
      , (kindSigsBit,                Opt_KindSignatures)
      , (recursiveDoBit,             Opt_RecursiveDo)
      , (unicodeSyntaxBit,           Opt_UnicodeSyntax)
      , (unboxedTuplesBit,           Opt_UnboxedTuples)
      , (standaloneDerivingBit,      Opt_StandaloneDeriving)
      , (transformComprehensionsBit, Opt_TransformListComp)
      , (rawTokenStreamBit,          Opt_KeepRawTokenStream)
      , (newQualOpsBit,              Opt_NewQualifiedOperators)
      ]
1784
-- | Append a warning for the given span to the warning bag in the parser
-- state, but only when the given 'DynFlag' is enabled in the state's
-- 'dflags'.  The error bag is left as it was.
addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
addWarning option srcspan warning = P $ \st ->
  case st of
    PState{ messages = (ws, es), dflags = d } ->
      let msg = mkWarnMsg srcspan alwaysQualify warning
          ws' | dopt option d = ws `snocBag` msg
              | otherwise     = ws
      in  POk st{ messages = (ws', es) } ()
1791
-- | Extract the 'messages' field from a parser state.
getMessages :: PState -> Messages
getMessages st = messages st
1794
-- | Return the current stack of layout contexts.
getContext :: P [LayoutContext]
getContext = P $ \st ->
  case st of
    PState{ context = ctx } -> POk st ctx
1797
-- | Replace the whole stack of layout contexts.
setContext :: [LayoutContext] -> P ()
setContext ctx = P replace
  where
    replace st = POk st{ context = ctx } ()
1800
-- | Discard the innermost layout context.  If the context stack is already
-- empty this fails with a parse error located at the last token.
popContext :: P ()
popContext = P $ \st ->
  case st of
    PState{ buffer = buf, context = ctx, last_len = len, last_loc = prev } ->
      case ctx of
        []        -> PFailed prev (srcParseErr buf len)
        _ : outer -> POk st{ context = outer } ()
1807
-- | Push a new layout context at the indentation of the last token read.
-- This is only used at the outer level of a module when the 'module'
-- keyword is missing.
--
-- The indentation is computed as @last_offs - last_line_len@.
pushCurrentContext :: P ()
pushCurrentContext = P $ \st ->
  case st of
    PState{ last_offs = offs, last_line_len = len, context = ctx } ->
      let indent = offs - len
      in  -- debugging aid: trace ("off: " ++ show offs ++ ", len: " ++ show len) $
          POk st{ context = Layout indent : ctx } ()
1815
-- | Compare 'last_offs' with the indentation of the innermost layout
-- context.  The answer is 'GT' when the innermost context is not a
-- 'Layout' context or when the context stack is empty.
getOffside :: P Ordering
getOffside = P $ \st ->
  case st of
    PState{ last_offs = offs, context = stk } -> POk st (offside offs stk)
  where
    offside offs (Layout n : _) = compare offs n
    offside _    _              = GT
1822
1823 -- ---------------------------------------------------------------------------
1824 -- Construct a parse error
1825
-- | Build the message for a parse error.  The offending token is recovered
-- from the buffer by stepping back over the previous token's bytes; when
-- that text is empty the message suggests an indentation problem instead
-- of quoting the token.
srcParseErr
  :: StringBuffer       -- current buffer (placed just after the last token)
  -> Int                -- length of the previous token
  -> Message
srcParseErr buf len
  | null token = ptext (sLit "parse error (possibly incorrect indentation)")
  | otherwise  = hcat [ ptext (sLit "parse error on input ")
                      , char '`', text token, char '\'' ]
  where
    token = lexemeToString (offsetBytes (-len) buf) len
1837
-- | Report a parse failure, giving the span of the previous token as
-- the location of the error.  This is the entry point for errors
-- detected during parsing.
srcParseFail :: P a
srcParseFail = P $ \st ->
  case st of
    PState{ buffer = buf, last_len = len, last_loc = prev } ->
      PFailed prev (srcParseErr buf len)
1845
-- | A lexical error is reported at a particular position in the source file,
-- not over a token range.  The positions and buffer are taken from the
-- current parser state and handed to 'reportLexError' with the message.
lexError :: String -> P a
lexError str =
  getSrcLoc >>= \start ->
  getInput  >>= \(AI end _ buf) ->
  reportLexError start end buf str
1853
1854 -- -----------------------------------------------------------------------------
-- | This is the top-level function: called from the parser each time a
-- new token is to be read from the input.  The next token is lexed and
-- passed to the continuation.
lexer :: (Located Token -> P a) -> P a
lexer cont =
  -- for debugging, wrap the continuation call in: trace ("token: " ++ show tok)
  lexToken >>= \tok@(L _ _) -> cont tok
1863
-- | Lex one token from the current input.
--
-- Runs the Alex scanner in the current lex state with the current
-- extension bitmap and acts on its verdict: skipped input is consumed and
-- lexing restarts; a match records the token's span and byte length and
-- then runs the rule's action; end of input yields 'ITeof' with an empty
-- span; a scanner error is turned into a lexical error.
lexToken :: P (Located Token)
lexToken = do
  input@(AI start _ startBuf) <- getInput
  startCode <- getLexState
  exts <- getExts
  case alexScanUser exts input startCode of
    AlexToken rest@(AI end _ endBuf) _ action -> do
        setInput rest
        let tokSpan = mkSrcSpan start end
            nbytes  = byteDiff startBuf endBuf
        -- force the span before it is stored in the state
        tokSpan `seq` setLastToken tokSpan nbytes nbytes
        action tokSpan startBuf nbytes
    AlexSkip rest _ ->
        setInput rest >> lexToken
    AlexError (AI errLoc _ errBuf) ->
        reportLexError start errLoc errBuf "lexical error"
    AlexEOF -> do
        let eofSpan = mkSrcSpan start start
        setLastToken eofSpan 0 0
        return (L eofSpan ITeof)
1885
-- | Fail with a lexical error message, with a suffix describing where
-- lexing stopped: the end of the input, a UTF-8 decoding error, or the
-- offending character.  A decoding error is located at the second position
-- alone; the other two cases span from the first position to the second.
reportLexError :: SrcLoc -> SrcLoc -> StringBuffer -> [Char] -> P a
reportLexError loc1 loc2 buf str
  | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
    -- decoding errors are mapped to '\0', see utf8DecodeChar#
  | c == '\0' = failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
  | otherwise = failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
  where
    c = fst (nextChar buf)
1896
-- | Lex an entire buffer into a list of located tokens, stopping at (and
-- not including) 'ITeof'.  The flags are adjusted first: 'Opt_Haddock' is
-- unset and 'Opt_KeepRawTokenStream' is set.
lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token]
lexTokenStream buf loc dflags = unP go initState
  where
    streamFlags = dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream
    initState   = mkPState buf loc streamFlags
    go = do
      ltok <- lexer return
      case ltok of
        L _ ITeof -> return []
        _         -> do
          rest <- go
          return (ltok : rest)
1905
-- | Pragma table with a single entry: @line@ switches the lexer into the
-- @line_prag2@ state.  Keys are in the form produced by 'clean_pragma'.
linePrags :: Map String Action
linePrags = Map.singleton "line" (begin line_prag2)
1907
-- | Table of the file-header pragmas and the action that lexes each one.
-- Keys are in the form produced by 'clean_pragma'.
fileHeaderPrags :: Map String Action
fileHeaderPrags = Map.fromList
  [ ("options",         lex_string_prag IToptions_prag)
  , ("options_ghc",     lex_string_prag IToptions_prag)
  , ("options_haddock", lex_string_prag ITdocOptions)
  , ("language",        token ITlanguage_prag)
  , ("include",         lex_string_prag ITinclude_prag)
  ]
1913
-- | Table of pragmas that are lexed as a nested comment, after which
-- lexing continues with 'lexToken': the @options_*@ pragmas of the listed
-- implementations/tools, plus @cfiles@ and @contract@.
ignoredPrags :: Map String Action
ignoredPrags = Map.fromList (map ignored pragmas)
  where
    ignored opt     = (opt, nested_comment lexToken)
    impls           = ["hugs", "nhc98", "jhc", "yhc", "catch", "derive"]
    options_pragmas = map ("options_" ++) impls
    -- CFILES is a hugs-only thing.
    pragmas         = options_pragmas ++ ["cfiles", "contract"]
1920
-- | Table of the single-word pragmas and the action for each one.  Keys
-- are in the form produced by 'clean_pragma', so alternative spellings
-- such as @noinline@ or @specialise@ need no entries of their own.
oneWordPrags :: Map String Action
oneWordPrags = Map.fromList
  [ ("rules",      rulePrag)
  , ("inline",     token (ITinline_prag True))
  , ("notinline",  token (ITinline_prag False))
  , ("specialize", token ITspec_prag)
  , ("source",     token ITsource_prag)
  , ("warning",    token ITwarning_prag)
  , ("deprecated", token ITdeprecated_prag)
  , ("scc",        token ITscc_prag)
  , ("generated",  token ITgenerated_prag)
  , ("core",       token ITcore_prag)
  , ("unpack",     token ITunpack_prag)
  , ("ann",        token ITann_prag)
  ]
1933
-- | Table of the two-word pragmas and the token each one produces.  Keys
-- are in the form produced by 'clean_pragma': lower case, canonical
-- spelling, words separated by a single space.
twoWordPrags :: Map String Action
twoWordPrags = Map.fromList
  [ ("inline conlike",       token (ITinline_conlike_prag True))
  , ("notinline conlike",    token (ITinline_conlike_prag False))
  , ("specialize inline",    token (ITspec_inline_prag True))
  , ("specialize notinline", token (ITspec_inline_prag False))
  ]
1938
1939
-- | Look the current lexeme up in a pragma table (after normalising it
-- with 'clean_pragma') and run the action found there.  A lexeme that is
-- not in the table is a lexical error.
dispatch_pragmas :: Map String Action -> Action
dispatch_pragmas prags span buf len =
  maybe (lexError "unknown pragma") run (Map.lookup key prags)
  where
    key     = clean_pragma (lexemeToString buf len)
    run act = act span buf len
1944
-- | Alex predicate that accepts a match only when the text just matched
-- (normalised with 'clean_pragma') is a key of the given pragma table and
-- the character after it is neither alphanumeric nor an underscore.
known_pragma :: Map String Action -> AlexAccPred Int
known_pragma prags _ _ len (AI _ _ buf) = recognised && delimited
  where
    -- the buffer is positioned after the match, so step back over it
    name       = clean_pragma (lexemeToString (offsetBytes (- len) buf) len)
    recognised = Map.member name prags
    delimited  = nextCharIs buf (\c -> not (isAlphaNum c) && c /= '_')
1948
-- | Normalise the text of a pragma so it can be used as a table key.
--
-- A leading @{-#@ is removed if present, the text is lower-cased, each
-- word is replaced by its canonical spelling (@noinline@ becomes
-- @notinline@, @specialise@ becomes @specialize@, @constructorlike@
-- becomes @conlike@), and the words are rejoined with single spaces, so
-- leading, trailing and repeated white space disappears.
clean_pragma :: String -> String
clean_pragma prag = unwords (map canonical (words (map toLower body)))
  where
    -- the pragma text without its opening marker, when there is one
    body = maybe prag id (stripPrefix "{-#" prag)

    canonical "noinline"        = "notinline"
    canonical "specialise"      = "specialize"
    canonical "constructorlike" = "conlike"
    canonical word              = word
1960 }