compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - parsing integers is a bit slow
  16 --    - readRational is a bit slow
  17 --
  18 --   Known bugs, that were also in the previous version:
  19 --    - M... should be 3 tokens, not 1.
  20 --    - pragma-end should be only valid in a pragma
  21
  22 --   qualified operator NOTES.
  23 --
  24 --   - If M.(+) is a single lexeme, then..
  25 --     - Probably (+) should be a single lexeme too, for consistency.
  26 --       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
  27 --     - But we have to rule out reserved operators, otherwise (..) becomes
  28 --       a different lexeme.
  29 --     - Should we therefore also rule out reserved operators in the qualified
  30 --       form?  This is quite difficult to achieve.  We don't do it for
  31 --       qualified varids.
  32
  33 {
  34 {-# OPTIONS -Wwarn -w #-}
  35 -- The above warning supression flag is a temporary kludge.
  36 -- While working on this module you are encouraged to remove it and fix
  37 -- any warnings in the module. See
  38 --     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
  39 -- for details
  40 --
  41 -- Note that Alex itself generates code with with some unused bindings and
  42 -- without type signatures, so removing the flag might not be possible.
  43
  44 {-# OPTIONS_GHC -funbox-strict-fields #-}
  45
  46 module Lexer (
  47    Token(..), lexer, pragState, mkPState, PState(..),
  48    P(..), ParseResult(..), getSrcLoc,
  49    failLocMsgP, failSpanMsgP, srcParseFail,
  50    getMessages,
  51    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  52    getLexState, popLexState, pushLexState,
  53    extension, standaloneDerivingEnabled, bangPatEnabled,
  54    addWarning,
  55    lexTokenStream
  56   ) where
  57
  58 import Bag
  59 import ErrUtils
  60 import Maybe
  61 import Outputable
  62 import StringBuffer
  63 import FastString
  64 import SrcLoc
  65 import UniqFM
  66 import DynFlags
  67 import Ctype
  68 import Util             ( maybePrefixMatch, readRational )
  69
  70 import Control.Monad
  71 import Data.Bits
  72 import Data.Char
  73 import Data.List
  74 import Data.Map (Map)
  75 import qualified Data.Map as Map
  76 import Data.Ratio
  77 }
  78
  79 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  80 $whitechar   = [\ \n\r\f\v $unispace]
  81 $white_no_nl = $whitechar # \n
  82 $tab         = \t
  83
  84 $ascdigit  = 0-9
  85 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  86 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  87 $digit     = [$ascdigit $unidigit]
  88
  89 $special   = [\(\)\,\;\[\]\`\{\}]
  90 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  91 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  92 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  93
  94 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
  95 $asclarge  = [A-Z]
  96 $large     = [$asclarge $unilarge]
  97
  98 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
  99 $ascsmall  = [a-z]
 100 $small     = [$ascsmall $unismall \_]
 101
 102 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
 103 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
 104
 105 $octit     = 0-7
 106 $hexit     = [$decdigit A-F a-f]
 107 $symchar   = [$symbol \:]
 108 $nl        = [\n\r]
 109 $idchar    = [$small $large $digit \']
 110
 111 $pragmachar = [$small $large $digit]
 112
 113 $docsym    = [\| \^ \* \$]
 114
 115 @varid     = $small $idchar*
 116 @conid     = $large $idchar*
 117
 118 @varsym    = $symbol $symchar*
 119 @consym    = \: $symchar*
 120
 121 @decimal     = $decdigit+
 122 @octal       = $octit+
 123 @hexadecimal = $hexit+
 124 @exponent    = [eE] [\-\+]? @decimal
 125
 126 -- we support the hierarchical module name extension:
 127 @qual = (@conid \.)+
 128
 129 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 130
 131 -- normal signed numerical literals can only be explicitly negative,
 132 -- not explicitly positive (contrast @exponent)
 133 @negative = \-
 134 @signed = @negative ?
 135
 136 haskell :-
 137
 138 -- everywhere: skip whitespace and comments
 139 $white_no_nl+                           ;
 140 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 141
 142 -- Everywhere: deal with nested comments.  We explicitly rule out
 143 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 144 -- (this can happen even though pragmas will normally take precedence due to
 145 -- longest-match, because pragmas aren't valid in every state, but comments
 146 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 147 -- set.
 148
 149 "{-" / { isNormalComment } { nested_comment lexToken }
 150
 151 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 152 -- more dashes followed by a symbol should be parsed as a varsym, so we
 153 -- have to exclude those.
 154
 155 -- Since Haddock comments aren't valid in every state, we need to rule them
 156 -- out here.
 157
 158 -- The following two rules match comments that begin with two dashes, but
 159 -- continue with a different character. The rules test that this character
 160 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 161 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 162 -- have a Haddock comment). The rules then munch the rest of the line.
 163
 164 "-- " ~[$docsym \#] .* { lineCommentToken }
 165 "--" [^$symbol : \ ] .* { lineCommentToken }
 166
 167 -- Next, match Haddock comments if no -haddock flag
 168
 169 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
 170
 171 -- Now, when we've matched comments that begin with 2 dashes and continue
 172 -- with a different character, we need to match comments that begin with three
 173 -- or more dashes (which clearly can't be Haddock comments). We only need to
 174 -- make sure that the first non-dash character isn't a symbol, and munch the
 175 -- rest of the line.
 176
 177 "---"\-* [^$symbol :] .* { lineCommentToken }
 178
 179 -- Since the previous rules all match dashes followed by at least one
 180 -- character, we also need to match a whole line filled with just dashes.
 181
 182 "--"\-* / { atEOL } { lineCommentToken }
 183
 184 -- We need this rule since none of the other single line comment rules
 185 -- actually match this case.
 186
 187 "-- " / { atEOL } { lineCommentToken }
 188
 189 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 190 -- blank lines) until we find a non-whitespace character, then do layout
 191 -- processing.
 192 --
 193 -- One slight wibble here: what if the line begins with {-#? In
 194 -- theory, we have to lex the pragma to see if it's one we recognise,
 195 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 196 -- as a nested comment.  We don't bother with this: if the line begins
 197 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 198 <bol> {
 199   \n                                    ;
 200   ^\# (line)?                           { begin line_prag1 }
 201   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 202   ^\# \! .* \n                          ; -- #!, for scripts
 203   ()                                    { do_bol }
 204 }
 205
 206 -- after a layout keyword (let, where, do, of), we begin a new layout
 207 -- context if the curly brace is missing.
 208 -- Careful! This stuff is quite delicate.
 209 <layout, layout_do> {
 210   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 211         -- we might encounter {-# here, but {- has been handled already
 212   \n                                    ;
 213   ^\# (line)?                           { begin line_prag1 }
 214 }
 215
 216 -- do is treated in a subtly different way, see new_layout_context
 217 <layout>    ()                          { new_layout_context True }
 218 <layout_do> ()                          { new_layout_context False }
 219
 220 -- after a new layout context which was found to be to the left of the
 221 -- previous context, we have generated a '{' token, and we now need to
 222 -- generate a matching '}' token.
 223 <layout_left>  ()                       { do_layout_left }
 224
 225 <0,option_prags> \n                             { begin bol }
 226
 227 "{-#" $whitechar* (line|LINE) / { notFollowedByPragmaChar }
 228                             { begin line_prag2 }
 229
 230 -- single-line line pragmas, of the form
 231 --    # <line> "<file>" <extra-stuff> \n
 232 <line_prag1> $decdigit+                 { setLine line_prag1a }
 233 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 234 <line_prag1b> .*                        { pop }
 235
 236 -- Haskell-style line pragmas, of the form
 237 --    {-# LINE <line> "<file>" #-}
 238 <line_prag2> $decdigit+                 { setLine line_prag2a }
 239 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 240 <line_prag2b> "#-}"|"-}"                { pop }
 241    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 242    -- with older versions of GHC which generated these.
 243
 244 <0,option_prags> {
 245   "{-#" $whitechar* $pragmachar+
 246         $whitechar+ $pragmachar+ / { known_pragma twoWordPrags }
 247                                  { dispatch_pragmas twoWordPrags }
 248
 249   "{-#" $whitechar* $pragmachar+ / { known_pragma oneWordPrags }
 250                                  { dispatch_pragmas oneWordPrags }
 251
 252   -- We ignore all these pragmas, but don't generate a warning for them
 253   "{-#" $whitechar* $pragmachar+ / { known_pragma ignoredPrags }
 254                                  { dispatch_pragmas ignoredPrags }
 255
 256   -- ToDo: should only be valid inside a pragma:
 257   "#-}"                                 { endPrag }
 258 }
 259
 260 <option_prags> {
 261   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 262                                    { dispatch_pragmas fileHeaderPrags }
 263
 264   "-- #"                                 { multiline_doc_comment }
 265 }
 266
 267 <0> {
 268   -- In the "0" mode we ignore these pragmas
 269   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 270                      { nested_comment lexToken }
 271 }
 272
 273 <0> {
 274   "-- #" .* { lineCommentToken }
 275 }
 276
 277 <0,option_prags> {
 278   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 279                     (nested_comment lexToken) }
 280 }
 281
 282 -- '0' state: ordinary lexemes
 283
 284 -- Haddock comments
 285
 286 <0> {
 287   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 288   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 289 }
 290
 291 -- "special" symbols
 292
 293 <0> {
 294   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 295   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 296 }
 297
 298 <0> {
 299   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 300   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 301   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 302   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 303   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 304   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 305   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 306   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 307
 308   "[$" @varid "|"  / { ifExtension qqEnabled }
 309                      { lex_quasiquote_tok }
 310 }
 311
 312 <0> {
 313   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 314                                         { special IToparenbar }
 315   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 316 }
 317
 318 <0> {
 319   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 320 }
 321
 322 <0> {
 323   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 324          { token IToubxparen }
 325   "#)" / { ifExtension unboxedTuplesEnabled }
 326          { token ITcubxparen }
 327 }
 328
 329 <0> {
 330   "{|" / { ifExtension genericsEnabled } { token ITocurlybar }
 331   "|}" / { ifExtension genericsEnabled } { token ITccurlybar }
 332 }
 333
 334 <0,option_prags> {
 335   \(                                    { special IToparen }
 336   \)                                    { special ITcparen }
 337   \[                                    { special ITobrack }
 338   \]                                    { special ITcbrack }
 339   \,                                    { special ITcomma }
 340   \;                                    { special ITsemi }
 341   \`                                    { special ITbackquote }
 342
 343   \{                                    { open_brace }
 344   \}                                    { close_brace }
 345 }
 346
 347 <0,option_prags> {
 348   @qual @varid                  { idtoken qvarid }
 349   @qual @conid                  { idtoken qconid }
 350   @varid                        { varid }
 351   @conid                        { idtoken conid }
 352 }
 353
 354 <0> {
 355   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 356   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 357   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 358   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 359 }
 360
 361 -- ToDo: - move `var` and (sym) into lexical syntax?
 362 --       - remove backquote from $special?
 363 <0> {
 364   @qual @varsym       / { ifExtension oldQualOps } { idtoken qvarsym }
 365   @qual @consym       / { ifExtension oldQualOps } { idtoken qconsym }
 366   @qual \( @varsym \) / { ifExtension newQualOps } { idtoken prefixqvarsym }
 367   @qual \( @consym \) / { ifExtension newQualOps } { idtoken prefixqconsym }
 368   @varsym                                          { varsym }
 369   @consym                                          { consym }
 370 }
 371
 372 -- For the normal boxed literals we need to be careful
 373 -- when trying to be close to Haskell98
 374 <0> {
 375   -- Normal integral literals (:: Num a => a, from Integer)
 376   @decimal           { tok_num positive 0 0 decimal }
 377   0[oO] @octal       { tok_num positive 2 2 octal }
 378   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 379
 380   -- Normal rational literals (:: Fractional a => a, from Rational)
 381   @floating_point    { strtoken tok_float }
 382 }
 383
 384 <0> {
 385   -- Unboxed ints (:: Int#) and words (:: Word#)
 386   -- It's simpler (and faster?) to give separate cases to the negatives,
 387   -- especially considering octal/hexadecimal prefixes.
 388   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 389   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 390   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 391   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 392   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 393   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 394
 395   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 396   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 397   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 398
 399   -- Unboxed floats and doubles (:: Float#, :: Double#)
 400   -- prim_{float,double} work with signed literals
 401   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 402   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 403 }
 404
 405 -- Strings and chars are lexed by hand-written code.  The reason is
 406 -- that even if we recognise the string or char here in the regex
 407 -- lexer, we would still have to parse the string afterward in order
 408 -- to convert it to a String.
 409 <0> {
 410   \'                            { lex_char_tok }
 411   \"                            { lex_string_tok }
 412 }
 413
 414 {
 415 -- -----------------------------------------------------------------------------
 416 -- The token type
 417
 418 data Token
 419   = ITas                        -- Haskell keywords
 420   | ITcase
 421   | ITclass
 422   | ITdata
 423   | ITdefault
 424   | ITderiving
 425   | ITdo
 426   | ITelse
 427   | IThiding
 428   | ITif
 429   | ITimport
 430   | ITin
 431   | ITinfix
 432   | ITinfixl
 433   | ITinfixr
 434   | ITinstance
 435   | ITlet
 436   | ITmodule
 437   | ITnewtype
 438   | ITof
 439   | ITqualified
 440   | ITthen
 441   | ITtype
 442   | ITwhere
 443   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 444
 445   | ITforall                    -- GHC extension keywords
 446   | ITforeign
 447   | ITexport
 448   | ITlabel
 449   | ITdynamic
 450   | ITsafe
 451   | ITthreadsafe
 452   | ITunsafe
 453   | ITstdcallconv
 454   | ITccallconv
 455   | ITprimcallconv
 456   | ITdotnet
 457   | ITmdo
 458   | ITfamily
 459   | ITgroup
 460   | ITby
 461   | ITusing
 462
 463         -- Pragmas
 464   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 465   | ITinline_conlike_prag Bool  -- same
 466   | ITspec_prag                 -- SPECIALISE
 467   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 468   | ITsource_prag
 469   | ITrules_prag
 470   | ITwarning_prag
 471   | ITdeprecated_prag
 472   | ITline_prag
 473   | ITscc_prag
 474   | ITgenerated_prag
 475   | ITcore_prag                 -- hdaume: core annotations
 476   | ITunpack_prag
 477   | ITann_prag
 478   | ITclose_prag
 479   | IToptions_prag String
 480   | ITinclude_prag String
 481   | ITlanguage_prag
 482
 483   | ITdotdot                    -- reserved symbols
 484   | ITcolon
 485   | ITdcolon
 486   | ITequal
 487   | ITlam
 488   | ITvbar
 489   | ITlarrow
 490   | ITrarrow
 491   | ITat
 492   | ITtilde
 493   | ITdarrow
 494   | ITminus
 495   | ITbang
 496   | ITstar
 497   | ITdot
 498
 499   | ITbiglam                    -- GHC-extension symbols
 500
 501   | ITocurly                    -- special symbols
 502   | ITccurly
 503   | ITocurlybar                 -- {|, for type applications
 504   | ITccurlybar                 -- |}, for type applications
 505   | ITvocurly
 506   | ITvccurly
 507   | ITobrack
 508   | ITopabrack                  -- [:, for parallel arrays with -XParr
 509   | ITcpabrack                  -- :], for parallel arrays with -XParr
 510   | ITcbrack
 511   | IToparen
 512   | ITcparen
 513   | IToubxparen
 514   | ITcubxparen
 515   | ITsemi
 516   | ITcomma
 517   | ITunderscore
 518   | ITbackquote
 519
 520   | ITvarid   FastString        -- identifiers
 521   | ITconid   FastString
 522   | ITvarsym  FastString
 523   | ITconsym  FastString
 524   | ITqvarid  (FastString,FastString)
 525   | ITqconid  (FastString,FastString)
 526   | ITqvarsym (FastString,FastString)
 527   | ITqconsym (FastString,FastString)
 528   | ITprefixqvarsym (FastString,FastString)
 529   | ITprefixqconsym (FastString,FastString)
 530
 531   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 532
 533   | ITpragma StringBuffer
 534
 535   | ITchar       Char
 536   | ITstring     FastString
 537   | ITinteger    Integer
 538   | ITrational   Rational
 539
 540   | ITprimchar   Char
 541   | ITprimstring FastString
 542   | ITprimint    Integer
 543   | ITprimword   Integer
 544   | ITprimfloat  Rational
 545   | ITprimdouble Rational
 546
 547   -- MetaHaskell extension tokens
 548   | ITopenExpQuote              --  [| or [e|
 549   | ITopenPatQuote              --  [p|
 550   | ITopenDecQuote              --  [d|
 551   | ITopenTypQuote              --  [t|
 552   | ITcloseQuote                --  |]
 553   | ITidEscape   FastString     --  $x
 554   | ITparenEscape               --  $(
 555   | ITvarQuote                  --  '
 556   | ITtyQuote                   --  ''
 557   | ITquasiQuote (FastString,FastString,SrcSpan) --  [:...|...|]
 558
 559   -- Arrow notation extension
 560   | ITproc
 561   | ITrec
 562   | IToparenbar                 --  (|
 563   | ITcparenbar                 --  |)
 564   | ITlarrowtail                --  -<
 565   | ITrarrowtail                --  >-
 566   | ITLarrowtail                --  -<<
 567   | ITRarrowtail                --  >>-
 568
 569   | ITunknown String            -- Used when the lexer can't make sense of it
 570   | ITeof                       -- end of file token
 571
 572   -- Documentation annotations
 573   | ITdocCommentNext  String     -- something beginning '-- |'
 574   | ITdocCommentPrev  String     -- something beginning '-- ^'
 575   | ITdocCommentNamed String     -- something beginning '-- $'
 576   | ITdocSection      Int String -- a section heading
 577   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 578   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 579   | ITlineComment     String     -- comment starting by "--"
 580   | ITblockComment    String     -- comment in {- -}
 581
 582 #ifdef DEBUG
 583   deriving Show -- debugging
 584 #endif
 585
 586 {-
 587 isSpecial :: Token -> Bool
 588 -- If we see M.x, where x is a keyword, but
 589 -- is special, we treat is as just plain M.x,
 590 -- not as a keyword.
 591 isSpecial ITas          = True
 592 isSpecial IThiding      = True
 593 isSpecial ITqualified   = True
 594 isSpecial ITforall      = True
 595 isSpecial ITexport      = True
 596 isSpecial ITlabel       = True
 597 isSpecial ITdynamic     = True
 598 isSpecial ITsafe        = True
 599 isSpecial ITthreadsafe  = True
 600 isSpecial ITunsafe      = True
 601 isSpecial ITccallconv   = True
 602 isSpecial ITstdcallconv = True
 603 isSpecial ITprimcallconv = True
 604 isSpecial ITmdo         = True
 605 isSpecial ITfamily      = True
 606 isSpecial ITgroup   = True
 607 isSpecial ITby      = True
 608 isSpecial ITusing   = True
 609 isSpecial _             = False
 610 -}
 611
 612 -- the bitmap provided as the third component indicates whether the
 613 -- corresponding extension keyword is valid under the extension options
 614 -- provided to the compiler; if the extension corresponding to *any* of the
 615 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 616 -- facilitates using a keyword in two different extensions that can be
 617 -- activated independently)
 618 --
 619 reservedWordsFM :: UniqFM (Token, Int)
 620 reservedWordsFM = listToUFM $
 621         map (\(x, y, z) -> (mkFastString x, (y, z)))
 622        [( "_",          ITunderscore,   0 ),
 623         ( "as",         ITas,           0 ),
 624         ( "case",       ITcase,         0 ),
 625         ( "class",      ITclass,        0 ),
 626         ( "data",       ITdata,         0 ),
 627         ( "default",    ITdefault,      0 ),
 628         ( "deriving",   ITderiving,     0 ),
 629         ( "do",         ITdo,           0 ),
 630         ( "else",       ITelse,         0 ),
 631         ( "hiding",     IThiding,       0 ),
 632         ( "if",         ITif,           0 ),
 633         ( "import",     ITimport,       0 ),
 634         ( "in",         ITin,           0 ),
 635         ( "infix",      ITinfix,        0 ),
 636         ( "infixl",     ITinfixl,       0 ),
 637         ( "infixr",     ITinfixr,       0 ),
 638         ( "instance",   ITinstance,     0 ),
 639         ( "let",        ITlet,          0 ),
 640         ( "module",     ITmodule,       0 ),
 641         ( "newtype",    ITnewtype,      0 ),
 642         ( "of",         ITof,           0 ),
 643         ( "qualified",  ITqualified,    0 ),
 644         ( "then",       ITthen,         0 ),
 645         ( "type",       ITtype,         0 ),
 646         ( "where",      ITwhere,        0 ),
 647         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 648
 649     ( "forall", ITforall,        bit explicitForallBit .|. bit inRulePragBit),
 650         ( "mdo",        ITmdo,           bit recursiveDoBit),
 651         ( "family",     ITfamily,        bit tyFamBit),
 652     ( "group",  ITgroup,     bit transformComprehensionsBit),
 653     ( "by",     ITby,        bit transformComprehensionsBit),
 654     ( "using",  ITusing,     bit transformComprehensionsBit),
 655
 656         ( "foreign",    ITforeign,       bit ffiBit),
 657         ( "export",     ITexport,        bit ffiBit),
 658         ( "label",      ITlabel,         bit ffiBit),
 659         ( "dynamic",    ITdynamic,       bit ffiBit),
 660         ( "safe",       ITsafe,          bit ffiBit),
 661         ( "threadsafe", ITthreadsafe,    bit ffiBit),  -- ToDo: remove
 662         ( "unsafe",     ITunsafe,        bit ffiBit),
 663         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 664         ( "ccall",      ITccallconv,     bit ffiBit),
 665         ( "prim",       ITprimcallconv,  bit ffiBit),
 666         ( "dotnet",     ITdotnet,        bit ffiBit),
 667
 668         ( "rec",        ITrec,           bit arrowsBit),
 669         ( "proc",       ITproc,          bit arrowsBit)
 670      ]
 671
 672 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 673 reservedSymsFM = listToUFM $
 674     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 675       [ ("..",  ITdotdot,   always)
 676         -- (:) is a reserved op, meaning only list cons
 677        ,(":",   ITcolon,    always)
 678        ,("::",  ITdcolon,   always)
 679        ,("=",   ITequal,    always)
 680        ,("\\",  ITlam,      always)
 681        ,("|",   ITvbar,     always)
 682        ,("<-",  ITlarrow,   always)
 683        ,("->",  ITrarrow,   always)
 684        ,("@",   ITat,       always)
 685        ,("~",   ITtilde,    always)
 686        ,("=>",  ITdarrow,   always)
 687        ,("-",   ITminus,    always)
 688        ,("!",   ITbang,     always)
 689
 690         -- For data T (a::*) = MkT
 691        ,("*", ITstar, always) -- \i -> kindSigsEnabled i || tyFamEnabled i)
 692         -- For 'forall a . t'
 693        ,(".", ITdot,  always) -- \i -> explicitForallEnabled i || inRulePrag i)
 694
 695        ,("-<",  ITlarrowtail, arrowsEnabled)
 696        ,(">-",  ITrarrowtail, arrowsEnabled)
 697        ,("-<<", ITLarrowtail, arrowsEnabled)
 698        ,(">>-", ITRarrowtail, arrowsEnabled)
 699
 700        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 701        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 702        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 703                                 explicitForallEnabled i)
 704        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 705        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 706        ,("⋯",   ITdotdot, unicodeSyntaxEnabled)
 707         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 708         -- form part of a large operator.  This would let us have a better
 709         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 710        ]
 711
 712 -- -----------------------------------------------------------------------------
 713 -- Lexer actions
 714
 715 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 716
 717 special :: Token -> Action
 718 special tok span _buf _len = return (L span tok)
 719
 720 token, layout_token :: Token -> Action
 721 token t span _buf _len = return (L span t)
 722 layout_token t span _buf _len = pushLexState layout >> return (L span t)
 723
 724 idtoken :: (StringBuffer -> Int -> Token) -> Action
 725 idtoken f span buf len = return (L span $! (f buf len))
 726
 727 skip_one_varid :: (FastString -> Token) -> Action
 728 skip_one_varid f span buf len
 729   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 730
 731 strtoken :: (String -> Token) -> Action
 732 strtoken f span buf len =
 733   return (L span $! (f $! lexemeToString buf len))
 734
 735 init_strtoken :: Int -> (String -> Token) -> Action
 736 -- like strtoken, but drops the last N character(s)
 737 init_strtoken drop f span buf len =
 738   return (L span $! (f $! lexemeToString buf (len-drop)))
 739
 740 begin :: Int -> Action
 741 begin code _span _str _len = do pushLexState code; lexToken
 742
 743 pop :: Action
 744 pop _span _buf _len = do popLexState; lexToken
 745
 746 pop_and :: Action -> Action
 747 pop_and act span buf len = do popLexState; act span buf len
 748
 749 {-# INLINE nextCharIs #-}
 750 nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
 751 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 752
 753 notFollowedBy :: Char -> AlexAccPred Int
 754 notFollowedBy char _ _ _ (AI _ _ buf)
 755   = nextCharIs buf (/=char)
 756
 757 notFollowedBySymbol :: AlexAccPred Int
 758 notFollowedBySymbol _ _ _ (AI _ _ buf)
 759   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 760
 761 notFollowedByPragmaChar :: AlexAccPred Int
 762 notFollowedByPragmaChar _ _ _ (AI _ _ buf)
 763   = nextCharIs buf (\c -> not (isAlphaNum c || c == '_'))
 764
 765 -- We must reject doc comments as being ordinary comments everywhere.
 766 -- In some cases the doc comment will be selected as the lexeme due to
 767 -- maximal munch, but not always, because the nested comment rule is
 768 -- valid in all states, but the doc-comment rules are only valid in
 769 -- the non-layout states.
 770 isNormalComment :: AlexAccPred Int
 771 isNormalComment bits _ _ (AI _ _ buf)
 772   | haddockEnabled bits = notFollowedByDocOrPragma
 773   | otherwise           = nextCharIs buf (/='#')
 774   where
 775     notFollowedByDocOrPragma
 776        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 777
 778 spaceAndP :: StringBuffer -> (StringBuffer -> Bool) -> Bool
 779 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 780
 781 {-
 782 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 783   = if haddockEnabled bits then False else (p buf)
 784 -}
 785
 786 atEOL :: AlexAccPred Int
 787 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 788
 789 ifExtension :: (Int -> Bool) -> AlexAccPred Int
 790 ifExtension pred bits _ _ _ = pred bits
 791
 792 multiline_doc_comment :: Action
 793 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 794   where
 795     worker commentAcc input docType oneLine = case alexGetChar input of
 796       Just ('\n', input')
 797         | oneLine -> docCommentEnd input commentAcc docType buf span
 798         | otherwise -> case checkIfCommentLine input' of
 799           Just input -> worker ('\n':commentAcc) input docType False
 800           Nothing -> docCommentEnd input commentAcc docType buf span
 801       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 802       Nothing -> docCommentEnd input commentAcc docType buf span
 803
 804     checkIfCommentLine input = check (dropNonNewlineSpace input)
 805       where
 806         check input = case alexGetChar input of
 807           Just ('-', input) -> case alexGetChar input of
 808             Just ('-', input) -> case alexGetChar input of
 809               Just (c, _) | c /= '-' -> Just input
 810               _ -> Nothing
 811             _ -> Nothing
 812           _ -> Nothing
 813
 814         dropNonNewlineSpace input = case alexGetChar input of
 815           Just (c, input')
 816             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 817             | otherwise -> input
 818           Nothing -> input
 819
 820 lineCommentToken :: Action
 821 lineCommentToken span buf len = do
 822   b <- extension rawTokenStreamEnabled
 823   if b then strtoken ITlineComment span buf len else lexToken
 824
 825 {-
 826   nested comments require traversing by hand, they can't be parsed
 827   using regular expressions.
 828 -}
 829 nested_comment :: P (Located Token) -> Action
 830 nested_comment cont span _str _len = do
 831   input <- getInput
 832   go "" (1::Int) input
 833   where
 834     go commentAcc 0 input = do setInput input
 835                                b <- extension rawTokenStreamEnabled
 836                                if b
 837                                  then docCommentEnd input commentAcc ITblockComment _str span
 838                                  else cont
 839     go commentAcc n input = case alexGetChar input of
 840       Nothing -> errBrace input span
 841       Just ('-',input) -> case alexGetChar input of
 842         Nothing  -> errBrace input span
 843         Just ('\125',input) -> go commentAcc (n-1) input
 844         Just (_,_)          -> go ('-':commentAcc) n input
 845       Just ('\123',input) -> case alexGetChar input of
 846         Nothing  -> errBrace input span
 847         Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
 848         Just (_,_)       -> go ('\123':commentAcc) n input
 849       Just (c,input) -> go (c:commentAcc) n input
 850
 851 nested_doc_comment :: Action
 852 nested_doc_comment span buf _len = withLexedDocType (go "")
 853   where
 854     go commentAcc input docType _ = case alexGetChar input of
 855       Nothing -> errBrace input span
 856       Just ('-',input) -> case alexGetChar input of
 857         Nothing -> errBrace input span
 858         Just ('\125',input) ->
 859           docCommentEnd input commentAcc docType buf span
 860         Just (_,_) -> go ('-':commentAcc) input docType False
 861       Just ('\123', input) -> case alexGetChar input of
 862         Nothing  -> errBrace input span
 863         Just ('-',input) -> do
 864           setInput input
 865           let cont = do input <- getInput; go commentAcc input docType False
 866           nested_comment cont span buf _len
 867         Just (_,_) -> go ('\123':commentAcc) input docType False
 868       Just (c,input) -> go (c:commentAcc) input docType False
 869
 870 withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (Located Token))
 871                  -> P (Located Token)
 872 withLexedDocType lexDocComment = do
 873   input@(AI _ _ buf) <- getInput
 874   case prevChar buf ' ' of
 875     '|' -> lexDocComment input ITdocCommentNext False
 876     '^' -> lexDocComment input ITdocCommentPrev False
 877     '$' -> lexDocComment input ITdocCommentNamed False
 878     '*' -> lexDocSection 1 input
 879     '#' -> lexDocComment input ITdocOptionsOld False
 880     _ -> panic "withLexedDocType: Bad doc type"
 881  where
 882     lexDocSection n input = case alexGetChar input of
 883       Just ('*', input) -> lexDocSection (n+1) input
 884       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 885       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 886
 887 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 888 -- off again at the end of the pragma.
 889 rulePrag :: Action
 890 rulePrag span _ _ = do
 891   setExts (.|. bit inRulePragBit)
 892   return (L span ITrules_prag)
 893
 894 endPrag :: Action
 895 endPrag span _ _ = do
 896   setExts (.&. complement (bit inRulePragBit))
 897   return (L span ITclose_prag)
 898
 899 -- docCommentEnd
 900 -------------------------------------------------------------------------------
 901 -- This function is quite tricky. We can't just return a new token, we also
 902 -- need to update the state of the parser. Why? Because the token is longer
 903 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 904 -- it writes the wrong token length to the parser state. This function is
 905 -- called afterwards, so it can just update the state.
 906
 907 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 908 -- which is something that the original lexer didn't account for.
 909 -- I have added last_line_len in the parser state which represents the length
 910 -- of the part of the token that is on the last line. It is now used for layout
 911 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 912 -- was before, the full length of the token, and it is now only used for error
 913 -- messages. /Waern
 914
 915 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 916                  SrcSpan -> P (Located Token)
 917 docCommentEnd input commentAcc docType buf span = do
 918   setInput input
 919   let (AI loc last_offs nextBuf) = input
 920       comment = reverse commentAcc
 921       span' = mkSrcSpan (srcSpanStart span) loc
 922       last_len = byteDiff buf nextBuf
 923
 924       last_line_len = if (last_offs - last_len < 0)
 925         then last_offs
 926         else last_len
 927
 928   span `seq` setLastToken span' last_len last_line_len
 929   return (L span' (docType comment))
 930
 931 errBrace :: AlexInput -> SrcSpan -> P a
 932 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 933
 934 open_brace, close_brace :: Action
 935 open_brace span _str _len = do
 936   ctx <- getContext
 937   setContext (NoLayout:ctx)
 938   return (L span ITocurly)
 939 close_brace span _str _len = do
 940   popContext
 941   return (L span ITccurly)
 942
 943 qvarid, qconid :: StringBuffer -> Int -> Token
 944 qvarid buf len = ITqvarid $! splitQualName buf len False
 945 qconid buf len = ITqconid $! splitQualName buf len False
 946
 947 splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
 948 -- takes a StringBuffer and a length, and returns the module name
 949 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 950 -- because of hierarchical module names.
 951 splitQualName orig_buf len parens = split orig_buf orig_buf
 952   where
 953     split buf dot_buf
 954         | orig_buf `byteDiff` buf >= len  = done dot_buf
 955         | c == '.'                        = found_dot buf'
 956         | otherwise                       = split buf' dot_buf
 957       where
 958        (c,buf') = nextChar buf
 959
 960     -- careful, we might get names like M....
 961     -- so, if the character after the dot is not upper-case, this is
 962     -- the end of the qualifier part.
 963     found_dot buf -- buf points after the '.'
 964         | isUpper c    = split buf' buf
 965         | otherwise    = done buf
 966       where
 967        (c,buf') = nextChar buf
 968
 969     done dot_buf =
 970         (lexemeToFastString orig_buf (qual_size - 1),
 971          if parens -- Prelude.(+)
 972             then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
 973             else lexemeToFastString dot_buf (len - qual_size))
 974       where
 975         qual_size = orig_buf `byteDiff` dot_buf
 976
 977 varid :: Action
 978 varid span buf len =
 979   fs `seq`
 980   case lookupUFM reservedWordsFM fs of
 981         Just (keyword,0)    -> do
 982                 maybe_layout keyword
 983                 return (L span keyword)
 984         Just (keyword,exts) -> do
 985                 b <- extension (\i -> exts .&. i /= 0)
 986                 if b then do maybe_layout keyword
 987                              return (L span keyword)
 988                      else return (L span (ITvarid fs))
 989         _other -> return (L span (ITvarid fs))
 990   where
 991         fs = lexemeToFastString buf len
 992
 993 conid :: StringBuffer -> Int -> Token
 994 conid buf len = ITconid fs
 995   where fs = lexemeToFastString buf len
 996
 997 qvarsym, qconsym, prefixqvarsym, prefixqconsym :: StringBuffer -> Int -> Token
 998 qvarsym buf len = ITqvarsym $! splitQualName buf len False
 999 qconsym buf len = ITqconsym $! splitQualName buf len False
1000 prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
1001 prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
1002
1003 varsym, consym :: Action
1004 varsym = sym ITvarsym
1005 consym = sym ITconsym
1006
1007 sym :: (FastString -> Token) -> SrcSpan -> StringBuffer -> Int
1008     -> P (Located Token)
1009 sym con span buf len =
1010   case lookupUFM reservedSymsFM fs of
1011         Just (keyword,exts) -> do
1012                 b <- extension exts
1013                 if b then return (L span keyword)
1014                      else return (L span $! con fs)
1015         _other -> return (L span $! con fs)
1016   where
1017         fs = lexemeToFastString buf len
1018
1019 -- Variations on the integral numeric literal.
1020 tok_integral :: (Integer -> Token)
1021      -> (Integer -> Integer)
1022  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
1023      -> Int -> Int
1024      -> (Integer, (Char->Int)) -> Action
1025 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
1026   return $ L span $ itint $! transint $ parseUnsignedInteger
1027      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
1028
1029 -- some conveniences for use with tok_integral
1030 tok_num :: (Integer -> Integer)
1031         -> Int -> Int
1032         -> (Integer, (Char->Int)) -> Action
1033 tok_num = tok_integral ITinteger
1034 tok_primint :: (Integer -> Integer)
1035             -> Int -> Int
1036             -> (Integer, (Char->Int)) -> Action
1037 tok_primint = tok_integral ITprimint
1038 tok_primword :: Int -> Int
1039              -> (Integer, (Char->Int)) -> Action
1040 tok_primword = tok_integral ITprimword positive
1041 positive, negative :: (Integer -> Integer)
1042 positive = id
1043 negative = negate
1044 decimal, octal, hexadecimal :: (Integer, Char -> Int)
1045 decimal = (10,octDecDigit)
1046 octal = (8,octDecDigit)
1047 hexadecimal = (16,hexDigit)
1048
1049 -- readRational can understand negative rationals, exponents, everything.
1050 tok_float, tok_primfloat, tok_primdouble :: String -> Token
1051 tok_float        str = ITrational   $! readRational str
1052 tok_primfloat    str = ITprimfloat  $! readRational str
1053 tok_primdouble   str = ITprimdouble $! readRational str
1054
1055 -- -----------------------------------------------------------------------------
1056 -- Layout processing
1057
1058 -- we're at the first token on a line, insert layout tokens if necessary
1059 do_bol :: Action
1060 do_bol span _str _len = do
1061         pos <- getOffside
1062         case pos of
1063             LT -> do
1064                 --trace "layout: inserting '}'" $ do
1065                 popContext
1066                 -- do NOT pop the lex state, we might have a ';' to insert
1067                 return (L span ITvccurly)
1068             EQ -> do
1069                 --trace "layout: inserting ';'" $ do
1070                 popLexState
1071                 return (L span ITsemi)
1072             GT -> do
1073                 popLexState
1074                 lexToken
1075
1076 -- certain keywords put us in the "layout" state, where we might
1077 -- add an opening curly brace.
1078 maybe_layout :: Token -> P ()
1079 maybe_layout ITdo       = pushLexState layout_do
1080 maybe_layout ITmdo      = pushLexState layout_do
1081 maybe_layout ITof       = pushLexState layout
1082 maybe_layout ITlet      = pushLexState layout
1083 maybe_layout ITwhere    = pushLexState layout
1084 maybe_layout ITrec      = pushLexState layout
1085 maybe_layout _          = return ()
1086
1087 -- Pushing a new implicit layout context.  If the indentation of the
1088 -- next token is not greater than the previous layout context, then
1089 -- Haskell 98 says that the new layout context should be empty; that is
1090 -- the lexer must generate {}.
1091 --
1092 -- We are slightly more lenient than this: when the new context is started
1093 -- by a 'do', then we allow the new context to be at the same indentation as
1094 -- the previous context.  This is what the 'strict' argument is for.
1095 --
1096 new_layout_context :: Bool -> Action
1097 new_layout_context strict span _buf _len = do
1098     popLexState
1099     (AI _ offset _) <- getInput
1100     ctx <- getContext
1101     case ctx of
1102         Layout prev_off : _  |
1103            (strict     && prev_off >= offset  ||
1104             not strict && prev_off > offset) -> do
1105                 -- token is indented to the left of the previous context.
1106                 -- we must generate a {} sequence now.
1107                 pushLexState layout_left
1108                 return (L span ITvocurly)
1109         _ -> do
1110                 setContext (Layout offset : ctx)
1111                 return (L span ITvocurly)
1112
1113 do_layout_left :: Action
1114 do_layout_left span _buf _len = do
1115     popLexState
1116     pushLexState bol  -- we must be at the start of a line
1117     return (L span ITvccurly)
1118
1119 -- -----------------------------------------------------------------------------
1120 -- LINE pragmas
1121
1122 setLine :: Int -> Action
1123 setLine code span buf len = do
1124   let line = parseUnsignedInteger buf len 10 octDecDigit
1125   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
1126         -- subtract one: the line number refers to the *following* line
1127   popLexState
1128   pushLexState code
1129   lexToken
1130
1131 setFile :: Int -> Action
1132 setFile code span buf len = do
1133   let file = lexemeToFastString (stepOn buf) (len-2)
1134   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1135   popLexState
1136   pushLexState code
1137   lexToken
1138
1139
1140 -- -----------------------------------------------------------------------------
1141 -- Options, includes and language pragmas.
1142
1143 lex_string_prag :: (String -> Token) -> Action
1144 lex_string_prag mkTok span _buf _len
1145     = do input <- getInput
1146          start <- getSrcLoc
1147          tok <- go [] input
1148          end <- getSrcLoc
1149          return (L (mkSrcSpan start end) tok)
1150     where go acc input
1151               = if isString input "#-}"
1152                    then do setInput input
1153                            return (mkTok (reverse acc))
1154                    else case alexGetChar input of
1155                           Just (c,i) -> go (c:acc) i
1156                           Nothing -> err input
1157           isString _ [] = True
1158           isString i (x:xs)
1159               = case alexGetChar i of
1160                   Just (c,i') | c == x    -> isString i' xs
1161                   _other -> False
1162           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1163
1164
1165 -- -----------------------------------------------------------------------------
1166 -- Strings & Chars
1167
1168 -- This stuff is horrible.  I hates it.
1169
1170 lex_string_tok :: Action
1171 lex_string_tok span _buf _len = do
1172   tok <- lex_string ""
1173   end <- getSrcLoc
1174   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1175
1176 lex_string :: String -> P Token
1177 lex_string s = do
1178   i <- getInput
1179   case alexGetChar' i of
1180     Nothing -> lit_error
1181
1182     Just ('"',i)  -> do
1183         setInput i
1184         magicHash <- extension magicHashEnabled
1185         if magicHash
1186           then do
1187             i <- getInput
1188             case alexGetChar' i of
1189               Just ('#',i) -> do
1190                    setInput i
1191                    if any (> '\xFF') s
1192                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1193                     else let s' = mkZFastString (reverse s) in
1194                          return (ITprimstring s')
1195                         -- mkZFastString is a hack to avoid encoding the
1196                         -- string in UTF-8.  We just want the exact bytes.
1197               _other ->
1198                 return (ITstring (mkFastString (reverse s)))
1199           else
1200                 return (ITstring (mkFastString (reverse s)))
1201
1202     Just ('\\',i)
1203         | Just ('&',i) <- next -> do
1204                 setInput i; lex_string s
1205         | Just (c,i) <- next, is_space c -> do
1206                 setInput i; lex_stringgap s
1207         where next = alexGetChar' i
1208
1209     Just (c, i) -> do
1210         c' <- lex_char c i
1211         lex_string (c':s)
1212
1213 lex_stringgap :: String -> P Token
1214 lex_stringgap s = do
1215   c <- getCharOrFail
1216   case c of
1217     '\\' -> lex_string s
1218     c | is_space c -> lex_stringgap s
1219     _other -> lit_error
1220
1221
1222 lex_char_tok :: Action
1223 -- Here we are basically parsing character literals, such as 'x' or '\n'
1224 -- but, when Template Haskell is on, we additionally spot
1225 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1226 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
1227 -- So we have to do two characters of lookahead: when we see 'x we need to
1228 -- see if there's a trailing quote
1229 lex_char_tok span _buf _len = do        -- We've seen '
1230    i1 <- getInput       -- Look ahead to first character
1231    let loc = srcSpanStart span
1232    case alexGetChar' i1 of
1233         Nothing -> lit_error
1234
1235         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
1236                   th_exts <- extension thEnabled
1237                   if th_exts then do
1238                         setInput i2
1239                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1240                    else lit_error
1241
1242         Just ('\\', i2@(AI _end2 _ _)) -> do    -- We've seen 'backslash
1243                   setInput i2
1244                   lit_ch <- lex_escape
1245                   mc <- getCharOrFail   -- Trailing quote
1246                   if mc == '\'' then finish_char_tok loc lit_ch
1247                                 else do setInput i2; lit_error
1248
1249         Just (c, i2@(AI _end2 _ _))
1250                 | not (isAny c) -> lit_error
1251                 | otherwise ->
1252
1253                 -- We've seen 'x, where x is a valid character
1254                 --  (i.e. not newline etc) but not a quote or backslash
1255            case alexGetChar' i2 of      -- Look ahead one more character
1256                 Just ('\'', i3) -> do   -- We've seen 'x'
1257                         setInput i3
1258                         finish_char_tok loc c
1259                 _other -> do            -- We've seen 'x not followed by quote
1260                                         -- (including the possibility of EOF)
1261                                         -- If TH is on, just parse the quote only
1262                         th_exts <- extension thEnabled
1263                         let (AI end _ _) = i1
1264                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1265                                    else do setInput i2; lit_error
1266
1267 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1268 finish_char_tok loc ch  -- We've already seen the closing quote
1269                         -- Just need to check for trailing #
1270   = do  magicHash <- extension magicHashEnabled
1271         i@(AI end _ _) <- getInput
1272         if magicHash then do
1273                 case alexGetChar' i of
1274                         Just ('#',i@(AI end _ _)) -> do
1275                                 setInput i
1276                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1277                         _other ->
1278                                 return (L (mkSrcSpan loc end) (ITchar ch))
1279                 else do
1280                    return (L (mkSrcSpan loc end) (ITchar ch))
1281
1282 lex_char :: Char -> AlexInput -> P Char
1283 lex_char c inp = do
1284   case c of
1285       '\\' -> do setInput inp; lex_escape
1286       c | isAny c -> do setInput inp; return c
1287       _other -> lit_error
1288
1289 isAny :: Char -> Bool
1290 isAny c | c > '\x7f' = isPrint c
1291         | otherwise  = is_any c
1292
1293 lex_escape :: P Char
1294 lex_escape = do
1295   c <- getCharOrFail
1296   case c of
1297         'a'   -> return '\a'
1298         'b'   -> return '\b'
1299         'f'   -> return '\f'
1300         'n'   -> return '\n'
1301         'r'   -> return '\r'
1302         't'   -> return '\t'
1303         'v'   -> return '\v'
1304         '\\'  -> return '\\'
1305         '"'   -> return '\"'
1306         '\''  -> return '\''
1307         '^'   -> do c <- getCharOrFail
1308                     if c >= '@' && c <= '_'
1309                         then return (chr (ord c - ord '@'))
1310                         else lit_error
1311
1312         'x'   -> readNum is_hexdigit 16 hexDigit
1313         'o'   -> readNum is_octdigit  8 octDecDigit
1314         x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1315
1316         c1 ->  do
1317            i <- getInput
1318            case alexGetChar' i of
1319             Nothing -> lit_error
1320             Just (c2,i2) ->
1321               case alexGetChar' i2 of
1322                 Nothing -> do setInput i2; lit_error
1323                 Just (c3,i3) ->
1324                    let str = [c1,c2,c3] in
1325                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1326                                      Just rest <- [maybePrefixMatch p str] ] of
1327                           (escape_char,[]):_ -> do
1328                                 setInput i3
1329                                 return escape_char
1330                           (escape_char,_:_):_ -> do
1331                                 setInput i2
1332                                 return escape_char
1333                           [] -> lit_error
1334
1335 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1336 readNum is_digit base conv = do
1337   i <- getInput
1338   c <- getCharOrFail
1339   if is_digit c
1340         then readNum2 is_digit base conv (conv c)
1341         else do setInput i; lit_error
1342
1343 readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char
1344 readNum2 is_digit base conv i = do
1345   input <- getInput
1346   read i input
1347   where read i input = do
1348           case alexGetChar' input of
1349             Just (c,input') | is_digit c -> do
1350                 read (i*base + conv c) input'
1351             _other -> do
1352                 if i >= 0 && i <= 0x10FFFF
1353                    then do setInput input; return (chr i)
1354                    else lit_error
1355
1356 silly_escape_chars :: [(String, Char)]
1357 silly_escape_chars = [
1358         ("NUL", '\NUL'),
1359         ("SOH", '\SOH'),
1360         ("STX", '\STX'),
1361         ("ETX", '\ETX'),
1362         ("EOT", '\EOT'),
1363         ("ENQ", '\ENQ'),
1364         ("ACK", '\ACK'),
1365         ("BEL", '\BEL'),
1366         ("BS", '\BS'),
1367         ("HT", '\HT'),
1368         ("LF", '\LF'),
1369         ("VT", '\VT'),
1370         ("FF", '\FF'),
1371         ("CR", '\CR'),
1372         ("SO", '\SO'),
1373         ("SI", '\SI'),
1374         ("DLE", '\DLE'),
1375         ("DC1", '\DC1'),
1376         ("DC2", '\DC2'),
1377         ("DC3", '\DC3'),
1378         ("DC4", '\DC4'),
1379         ("NAK", '\NAK'),
1380         ("SYN", '\SYN'),
1381         ("ETB", '\ETB'),
1382         ("CAN", '\CAN'),
1383         ("EM", '\EM'),
1384         ("SUB", '\SUB'),
1385         ("ESC", '\ESC'),
1386         ("FS", '\FS'),
1387         ("GS", '\GS'),
1388         ("RS", '\RS'),
1389         ("US", '\US'),
1390         ("SP", '\SP'),
1391         ("DEL", '\DEL')
1392         ]
1393
1394 -- before calling lit_error, ensure that the current input is pointing to
1395 -- the position of the error in the buffer.  This is so that we can report
1396 -- a correct location to the user, but also so we can detect UTF-8 decoding
1397 -- errors if they occur.
1398 lit_error :: P a
1399 lit_error = lexError "lexical error in string/character literal"
1400
1401 getCharOrFail :: P Char
1402 getCharOrFail =  do
1403   i <- getInput
1404   case alexGetChar' i of
1405         Nothing -> lexError "unexpected end-of-file in string/character literal"
1406         Just (c,i)  -> do setInput i; return c
1407
1408 -- -----------------------------------------------------------------------------
1409 -- QuasiQuote
1410
1411 lex_quasiquote_tok :: Action
1412 lex_quasiquote_tok span buf len = do
1413   let quoter = reverse $ takeWhile (/= '$')
1414                $ reverse $ lexemeToString buf (len - 1)
1415   quoteStart <- getSrcLoc
1416   quote <- lex_quasiquote ""
1417   end <- getSrcLoc
1418   return (L (mkSrcSpan (srcSpanStart span) end)
1419            (ITquasiQuote (mkFastString quoter,
1420                           mkFastString (reverse quote),
1421                           mkSrcSpan quoteStart end)))
1422
1423 lex_quasiquote :: String -> P String
1424 lex_quasiquote s = do
1425   i <- getInput
1426   case alexGetChar' i of
1427     Nothing -> lit_error
1428
1429     Just ('\\',i)
1430         | Just ('|',i) <- next -> do
1431                 setInput i; lex_quasiquote ('|' : s)
1432         | Just (']',i) <- next -> do
1433                 setInput i; lex_quasiquote (']' : s)
1434         where next = alexGetChar' i
1435
1436     Just ('|',i)
1437         | Just (']',i) <- next -> do
1438                 setInput i; return s
1439         where next = alexGetChar' i
1440
1441     Just (c, i) -> do
1442          setInput i; lex_quasiquote (c : s)
1443
1444 -- -----------------------------------------------------------------------------
1445 -- Warnings
1446
1447 warn :: DynFlag -> SDoc -> Action
1448 warn option warning srcspan _buf _len = do
1449     addWarning option srcspan warning
1450     lexToken
1451
1452 warnThen :: DynFlag -> SDoc -> Action -> Action
1453 warnThen option warning action srcspan buf len = do
1454     addWarning option srcspan warning
1455     action srcspan buf len
1456
1457 -- -----------------------------------------------------------------------------
1458 -- The Parse Monad
1459
1460 data LayoutContext
1461   = NoLayout
1462   | Layout !Int
1463   deriving Show
1464
1465 data ParseResult a
1466   = POk PState a
1467   | PFailed
1468         SrcSpan         -- The start and end of the text span related to
1469                         -- the error.  Might be used in environments which can
1470                         -- show this span, e.g. by highlighting it.
1471         Message         -- The error message
1472
1473 data PState = PState {
1474         buffer     :: StringBuffer,
1475         dflags     :: DynFlags,
1476         messages   :: Messages,
1477         last_loc   :: SrcSpan,  -- pos of previous token
1478         last_offs  :: !Int,     -- offset of the previous token from the
1479                                 -- beginning of  the current line.
1480                                 -- \t is equal to 8 spaces.
1481         last_len   :: !Int,     -- len of previous token
1482         last_line_len :: !Int,
1483         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1484         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1485         context    :: [LayoutContext],
1486         lex_state  :: [Int]
1487      }
1488         -- last_loc and last_len are used when generating error messages,
1489         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1490         -- current token to happyError, we could at least get rid of last_len.
1491         -- Getting rid of last_loc would require finding another way to
1492         -- implement pushCurrentContext (which is only called from one place).
1493
1494 newtype P a = P { unP :: PState -> ParseResult a }
1495
1496 instance Monad P where
1497   return = returnP
1498   (>>=) = thenP
1499   fail = failP
1500
1501 returnP :: a -> P a
1502 returnP a = a `seq` (P $ \s -> POk s a)
1503
1504 thenP :: P a -> (a -> P b) -> P b
1505 (P m) `thenP` k = P $ \ s ->
1506         case m s of
1507                 POk s1 a         -> (unP (k a)) s1
1508                 PFailed span err -> PFailed span err
1509
1510 failP :: String -> P a
1511 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1512
1513 failMsgP :: String -> P a
1514 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1515
1516 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1517 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (mkSrcSpan loc1 loc2) (text str)
1518
1519 failSpanMsgP :: SrcSpan -> SDoc -> P a
1520 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1521
1522 extension :: (Int -> Bool) -> P Bool
1523 extension p = P $ \s -> POk s (p $! extsBitmap s)
1524
1525 getExts :: P Int
1526 getExts = P $ \s -> POk s (extsBitmap s)
1527
1528 setExts :: (Int -> Int) -> P ()
1529 setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
1530
1531 setSrcLoc :: SrcLoc -> P ()
1532 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1533
1534 getSrcLoc :: P SrcLoc
1535 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1536
1537 setLastToken :: SrcSpan -> Int -> Int -> P ()
1538 setLastToken loc len line_len = P $ \s -> POk s {
1539   last_loc=loc,
1540   last_len=len,
1541   last_line_len=line_len
1542 } ()
1543
1544 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1545
1546 alexInputPrevChar :: AlexInput -> Char
1547 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1548
1549 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1550 alexGetChar (AI loc ofs s)
1551   | atEnd s   = Nothing
1552   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1553                 --trace (show (ord c)) $
1554                 Just (adj_c, (AI loc' ofs' s'))
1555   where (c,s') = nextChar s
1556         loc'   = advanceSrcLoc loc c
1557         ofs'   = advanceOffs c ofs
1558
1559         non_graphic     = '\x0'
1560         upper           = '\x1'
1561         lower           = '\x2'
1562         digit           = '\x3'
1563         symbol          = '\x4'
1564         space           = '\x5'
1565         other_graphic   = '\x6'
1566
1567         adj_c
1568           | c <= '\x06' = non_graphic
1569           | c <= '\x7f' = c
1570           -- Alex doesn't handle Unicode, so when Unicode
1571           -- character is encountered we output these values
1572           -- with the actual character value hidden in the state.
1573           | otherwise =
1574                 case generalCategory c of
1575                   UppercaseLetter       -> upper
1576                   LowercaseLetter       -> lower
1577                   TitlecaseLetter       -> upper
1578                   ModifierLetter        -> other_graphic
1579                   OtherLetter           -> lower -- see #1103
1580                   NonSpacingMark        -> other_graphic
1581                   SpacingCombiningMark  -> other_graphic
1582                   EnclosingMark         -> other_graphic
1583                   DecimalNumber         -> digit
1584                   LetterNumber          -> other_graphic
1585                   OtherNumber           -> other_graphic
1586                   ConnectorPunctuation  -> symbol
1587                   DashPunctuation       -> symbol
1588                   OpenPunctuation       -> other_graphic
1589                   ClosePunctuation      -> other_graphic
1590                   InitialQuote          -> other_graphic
1591                   FinalQuote            -> other_graphic
1592                   OtherPunctuation      -> symbol
1593                   MathSymbol            -> symbol
1594                   CurrencySymbol        -> symbol
1595                   ModifierSymbol        -> symbol
1596                   OtherSymbol           -> symbol
1597                   Space                 -> space
1598                   _other                -> non_graphic
1599
1600 -- This version does not squash unicode characters, it is used when
1601 -- lexing strings.
1602 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1603 alexGetChar' (AI loc ofs s)
1604   | atEnd s   = Nothing
1605   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1606                 --trace (show (ord c)) $
1607                 Just (c, (AI loc' ofs' s'))
1608   where (c,s') = nextChar s
1609         loc'   = advanceSrcLoc loc c
1610         ofs'   = advanceOffs c ofs
1611
1612 advanceOffs :: Char -> Int -> Int
1613 advanceOffs '\n' _    = 0
1614 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1615 advanceOffs _    offs = offs + 1
1616
1617 getInput :: P AlexInput
1618 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1619
1620 setInput :: AlexInput -> P ()
1621 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1622
1623 pushLexState :: Int -> P ()
1624 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1625
1626 popLexState :: P Int
1627 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1628
1629 getLexState :: P Int
1630 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1631
1632 -- for reasons of efficiency, flags indicating language extensions (eg,
1633 -- -fglasgow-exts or -XParr) are represented by a bitmap stored in an unboxed
1634 -- integer
1635
1636 genericsBit :: Int
1637 genericsBit = 0 -- {| and |}
1638 ffiBit :: Int
1639 ffiBit     = 1
1640 parrBit :: Int
1641 parrBit    = 2
1642 arrowsBit :: Int
1643 arrowsBit  = 4
1644 thBit :: Int
1645 thBit      = 5
1646 ipBit :: Int
1647 ipBit      = 6
1648 explicitForallBit :: Int
1649 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1650 bangPatBit :: Int
1651 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1652                 -- (doesn't affect the lexer)
1653 tyFamBit :: Int
1654 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1655 haddockBit :: Int
1656 haddockBit = 10 -- Lex and parse Haddock comments
1657 magicHashBit :: Int
1658 magicHashBit = 11 -- "#" in both functions and operators
1659 kindSigsBit :: Int
1660 kindSigsBit = 12 -- Kind signatures on type variables
1661 recursiveDoBit :: Int
1662 recursiveDoBit = 13 -- mdo
1663 unicodeSyntaxBit :: Int
1664 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1665 unboxedTuplesBit :: Int
1666 unboxedTuplesBit = 15 -- (# and #)
1667 standaloneDerivingBit :: Int
1668 standaloneDerivingBit = 16 -- standalone instance deriving declarations
1669 transformComprehensionsBit :: Int
1670 transformComprehensionsBit = 17
1671 qqBit :: Int
1672 qqBit      = 18 -- enable quasiquoting
1673 inRulePragBit :: Int
1674 inRulePragBit = 19
1675 rawTokenStreamBit :: Int
1676 rawTokenStreamBit = 20 -- producing a token stream with all comments included
1677 newQualOpsBit :: Int
1678 newQualOpsBit = 21 -- Haskell' qualified operator syntax, e.g. Prelude.(+)
1679
1680 always :: Int -> Bool
1681 always           _     = True
1682 genericsEnabled :: Int -> Bool
1683 genericsEnabled  flags = testBit flags genericsBit
1684 parrEnabled :: Int -> Bool
1685 parrEnabled      flags = testBit flags parrBit
1686 arrowsEnabled :: Int -> Bool
1687 arrowsEnabled    flags = testBit flags arrowsBit
1688 thEnabled :: Int -> Bool
1689 thEnabled        flags = testBit flags thBit
1690 ipEnabled :: Int -> Bool
1691 ipEnabled        flags = testBit flags ipBit
1692 explicitForallEnabled :: Int -> Bool
1693 explicitForallEnabled flags = testBit flags explicitForallBit
1694 bangPatEnabled :: Int -> Bool
1695 bangPatEnabled   flags = testBit flags bangPatBit
1696 -- tyFamEnabled :: Int -> Bool
1697 -- tyFamEnabled     flags = testBit flags tyFamBit
1698 haddockEnabled :: Int -> Bool
1699 haddockEnabled   flags = testBit flags haddockBit
1700 magicHashEnabled :: Int -> Bool
1701 magicHashEnabled flags = testBit flags magicHashBit
1702 -- kindSigsEnabled :: Int -> Bool
1703 -- kindSigsEnabled  flags = testBit flags kindSigsBit
1704 unicodeSyntaxEnabled :: Int -> Bool
1705 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1706 unboxedTuplesEnabled :: Int -> Bool
1707 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1708 standaloneDerivingEnabled :: Int -> Bool
1709 standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
1710 qqEnabled :: Int -> Bool
1711 qqEnabled        flags = testBit flags qqBit
1712 -- inRulePrag :: Int -> Bool
1713 -- inRulePrag       flags = testBit flags inRulePragBit
1714 rawTokenStreamEnabled :: Int -> Bool
1715 rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
1716 newQualOps :: Int -> Bool
1717 newQualOps       flags = testBit flags newQualOpsBit
1718 oldQualOps :: Int -> Bool
1719 oldQualOps flags = not (newQualOps flags)
1720
1721 -- PState for parsing options pragmas
1722 --
1723 pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1724 pragState dynflags buf loc =
1725   PState {
1726       buffer        = buf,
1727       messages      = emptyMessages,
1728       dflags        = dynflags,
1729       last_loc      = mkSrcSpan loc loc,
1730       last_offs     = 0,
1731       last_len      = 0,
1732       last_line_len = 0,
1733       loc           = loc,
1734       extsBitmap    = 0,
1735       context       = [],
1736       lex_state     = [bol, option_prags, 0]
1737     }
1738
1739
1740 -- create a parse state
1741 --
1742 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1743 mkPState buf loc flags  =
1744   PState {
1745       buffer          = buf,
1746       dflags        = flags,
1747       messages      = emptyMessages,
1748       last_loc      = mkSrcSpan loc loc,
1749       last_offs     = 0,
1750       last_len      = 0,
1751       last_line_len = 0,
1752       loc           = loc,
1753       extsBitmap    = fromIntegral bitmap,
1754       context       = [],
1755       lex_state     = [bol, 0]
1756         -- we begin in the layout state if toplev_layout is set
1757     }
1758     where
1759       bitmap = genericsBit `setBitIf` dopt Opt_Generics flags
1760                .|. ffiBit       `setBitIf` dopt Opt_ForeignFunctionInterface flags
1761                .|. parrBit      `setBitIf` dopt Opt_PArr         flags
1762                .|. arrowsBit    `setBitIf` dopt Opt_Arrows       flags
1763                .|. thBit        `setBitIf` dopt Opt_TemplateHaskell flags
1764                .|. qqBit        `setBitIf` dopt Opt_QuasiQuotes flags
1765                .|. ipBit        `setBitIf` dopt Opt_ImplicitParams flags
1766                .|. explicitForallBit `setBitIf` dopt Opt_ScopedTypeVariables flags
1767                .|. explicitForallBit `setBitIf` dopt Opt_LiberalTypeSynonyms flags
1768                .|. explicitForallBit `setBitIf` dopt Opt_PolymorphicComponents flags
1769                .|. explicitForallBit `setBitIf` dopt Opt_ExistentialQuantification flags
1770                .|. explicitForallBit `setBitIf` dopt Opt_Rank2Types flags
1771                .|. explicitForallBit `setBitIf` dopt Opt_RankNTypes flags
1772                .|. bangPatBit   `setBitIf` dopt Opt_BangPatterns flags
1773                .|. tyFamBit     `setBitIf` dopt Opt_TypeFamilies flags
1774                .|. haddockBit   `setBitIf` dopt Opt_Haddock      flags
1775                .|. magicHashBit `setBitIf` dopt Opt_MagicHash    flags
1776                .|. kindSigsBit  `setBitIf` dopt Opt_KindSignatures flags
1777                .|. recursiveDoBit `setBitIf` dopt Opt_RecursiveDo flags
1778                .|. unicodeSyntaxBit `setBitIf` dopt Opt_UnicodeSyntax flags
1779                .|. unboxedTuplesBit `setBitIf` dopt Opt_UnboxedTuples flags
1780                .|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags
1781                .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags
1782                .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
1783                .|. newQualOpsBit `setBitIf` dopt Opt_NewQualifiedOperators flags
1784       --
1785       setBitIf :: Int -> Bool -> Int
1786       b `setBitIf` cond | cond      = bit b
1787                         | otherwise = 0
1788
1789 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1790 addWarning option srcspan warning
1791  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1792        let warning' = mkWarnMsg srcspan alwaysQualify warning
1793            ws' = if dopt option d then ws `snocBag` warning' else ws
1794        in POk s{messages=(ws', es)} ()
1795
1796 getMessages :: PState -> Messages
1797 getMessages PState{messages=ms} = ms
1798
1799 getContext :: P [LayoutContext]
1800 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1801
1802 setContext :: [LayoutContext] -> P ()
1803 setContext ctx = P $ \s -> POk s{context=ctx} ()
1804
1805 popContext :: P ()
1806 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1807                               last_len = len, last_loc = last_loc }) ->
1808   case ctx of
1809         (_:tl) -> POk s{ context = tl } ()
1810         []     -> PFailed last_loc (srcParseErr buf len)
1811
1812 -- Push a new layout context at the indentation of the last token read.
1813 -- This is only used at the outer level of a module when the 'module'
1814 -- keyword is missing.
1815 pushCurrentContext :: P ()
1816 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1817     POk s{context = Layout (offs-len) : ctx} ()
1818 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1819
1820 getOffside :: P Ordering
1821 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1822                 let ord = case stk of
1823                         (Layout n:_) -> compare offs n
1824                         _            -> GT
1825                 in POk s ord
1826
1827 -- ---------------------------------------------------------------------------
1828 -- Construct a parse error
1829
1830 srcParseErr
1831   :: StringBuffer       -- current buffer (placed just after the last token)
1832   -> Int                -- length of the previous token
1833   -> Message
1834 srcParseErr buf len
1835   = hcat [ if null token
1836              then ptext (sLit "parse error (possibly incorrect indentation)")
1837              else hcat [ptext (sLit "parse error on input "),
1838                         char '`', text token, char '\'']
1839     ]
1840   where token = lexemeToString (offsetBytes (-len) buf) len
1841
1842 -- Report a parse failure, giving the span of the previous token as
1843 -- the location of the error.  This is the entry point for errors
1844 -- detected during parsing.
1845 srcParseFail :: P a
1846 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1847                             last_loc = last_loc } ->
1848     PFailed last_loc (srcParseErr buf len)
1849
1850 -- A lexical error is reported at a particular position in the source file,
1851 -- not over a token range.
1852 lexError :: String -> P a
1853 lexError str = do
1854   loc <- getSrcLoc
1855   (AI end _ buf) <- getInput
1856   reportLexError loc end buf str
1857
1858 -- -----------------------------------------------------------------------------
1859 -- This is the top-level function: called from the parser each time a
1860 -- new token is to be read from the input.
1861
1862 lexer :: (Located Token -> P a) -> P a
1863 lexer cont = do
1864   tok@(L _span _tok__) <- lexToken
1865 --  trace ("token: " ++ show tok__) $ do
1866   cont tok
1867
1868 lexToken :: P (Located Token)
1869 lexToken = do
1870   inp@(AI loc1 _ buf) <- getInput
1871   sc <- getLexState
1872   exts <- getExts
1873   case alexScanUser exts inp sc of
1874     AlexEOF -> do
1875         let span = mkSrcSpan loc1 loc1
1876         setLastToken span 0 0
1877         return (L span ITeof)
1878     AlexError (AI loc2 _ buf) ->
1879         reportLexError loc1 loc2 buf "lexical error"
1880     AlexSkip inp2 _ -> do
1881         setInput inp2
1882         lexToken
1883     AlexToken inp2@(AI end _ buf2) _ t -> do
1884         setInput inp2
1885         let span = mkSrcSpan loc1 end
1886         let bytes = byteDiff buf buf2
1887         span `seq` setLastToken span bytes bytes
1888         t span buf bytes
1889
1890 reportLexError :: SrcLoc -> SrcLoc -> StringBuffer -> [Char] -> P a
1891 reportLexError loc1 loc2 buf str
1892   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1893   | otherwise =
1894   let
1895         c = fst (nextChar buf)
1896   in
1897   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1898     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1899     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1900
1901 lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token]
1902 lexTokenStream buf loc dflags = unP go initState
1903     where initState = mkPState buf loc (dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream)
1904           go = do
1905             ltok <- lexer return
1906             case ltok of
1907               L _ ITeof -> return []
1908               _ -> liftM (ltok:) go
1909
1910 fileHeaderPrags = Map.fromList([("options", lex_string_prag IToptions_prag),
1911                                  ("options_ghc", lex_string_prag IToptions_prag),
1912                                  ("options_haddock", lex_string_prag ITdocOptions),
1913                                  ("language", token ITlanguage_prag),
1914                                  ("include", lex_string_prag ITinclude_prag)])
1915
1916 ignoredPrags = Map.fromList (map ignored pragmas)
1917                where ignored opt = (opt, nested_comment lexToken)
1918                      impls = ["hugs", "nhc98", "jhc", "yhc", "catch", "derive"]
1919                      options_pragmas = map ("options_" ++) impls
1920                      -- CFILES is a hugs-only thing.
1921                      pragmas = options_pragmas ++ ["cfiles", "contract"]
1922
1923 oneWordPrags = Map.fromList([("rules", rulePrag),
1924                            ("inline", token (ITinline_prag True)),
1925                            ("notinline", token (ITinline_prag False)),
1926                            ("specialize", token ITspec_prag),
1927                            ("source", token ITsource_prag),
1928                            ("warning", token ITwarning_prag),
1929                            ("deprecated", token ITdeprecated_prag),
1930                            ("scc", token ITscc_prag),
1931                            ("generated", token ITgenerated_prag),
1932                            ("core", token ITcore_prag),
1933                            ("unpack", token ITunpack_prag),
1934                            ("ann", token ITann_prag)])
1935
1936 twoWordPrags = Map.fromList([("inline conlike", token (ITinline_conlike_prag True)),
1937                              ("notinline conlike", token (ITinline_conlike_prag False)),
1938                              ("specialize inline", token (ITspec_inline_prag True)),
1939                              ("specialize notinline", token (ITspec_inline_prag False))])
1940
1941
1942 dispatch_pragmas :: Map String Action -> Action
1943 dispatch_pragmas prags span buf len = case Map.lookup (clean_pragma (lexemeToString buf len)) prags of
1944                                        Just found -> found span buf len
1945                                        Nothing -> lexError "unknown pragma"
1946
1947 known_pragma :: Map String Action -> AlexAccPred Int
1948 known_pragma prags q r len (AI s t buf) = (isJust $ Map.lookup (clean_pragma (lexemeToString (offsetBytes (- len) buf) len)) prags)
1949                                           && (notFollowedByPragmaChar q r len (AI s t buf))
1950
1951 clean_pragma :: String -> String
1952 clean_pragma prag = canon_ws (map toLower (unprefix prag))
1953                     where unprefix prag' = (case stripPrefix "{-#" prag' of
1954                                               Just rest -> rest
1955                                               Nothing -> prag')
1956                           canonical prag' = case prag' of
1957                                               "noinline" -> "notinline"
1958                                               "specialise" -> "specialize"
1959                                               "constructorlike" -> "conlike"
1960                                               otherwise -> prag'
1961                           canon_ws s = unwords (map canonical (words s))
1962 }