compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end RealSrcLoc
  11 -- attached to it.
  12 --
  13 -----------------------------------------------------------------------------
  14
  15 --   ToDo / known bugs:
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 --   qualified operator NOTES.
  24 --
  25 --   - If M.(+) is a single lexeme, then..
  26 --     - Probably (+) should be a single lexeme too, for consistency.
  27 --       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
  28 --     - But we have to rule out reserved operators, otherwise (..) becomes
  29 --       a different lexeme.
  30 --     - Should we therefore also rule out reserved operators in the qualified
  31 --       form?  This is quite difficult to achieve.  We don't do it for
  32 --       qualified varids.
  33
  34 {
  35 -- XXX The above flags turn off warnings in the generated code:
  36 {-# LANGUAGE BangPatterns #-}
  37 {-# OPTIONS_GHC -fno-warn-unused-matches #-}
  38 {-# OPTIONS_GHC -fno-warn-unused-binds #-}
  39 {-# OPTIONS_GHC -fno-warn-unused-imports #-}
  40 {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
  41 -- But alex still generates some code that causes the "lazy unlifted bindings"
  42 -- warning, and old compilers don't know about it so we can't easily turn
  43 -- it off, so for now we use the sledge hammer:
  44 {-# OPTIONS_GHC -w #-}
  45
  46 {-# OPTIONS_GHC -funbox-strict-fields #-}
  47
  48 module Lexer (
  49    Token(..), lexer, pragState, mkPState, PState(..),
  50    P(..), ParseResult(..), getSrcLoc,
  51    getPState, getDynFlags, withThisPackage,
  52    failLocMsgP, failSpanMsgP, srcParseFail,
  53    getMessages,
  54    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  55    activeContext, nextIsEOF,
  56    getLexState, popLexState, pushLexState,
  57    extension, bangPatEnabled, datatypeContextsEnabled,
  58    addWarning,
  59    lexTokenStream
  60   ) where
  61
  62 import Bag
  63 import ErrUtils
  64 import Outputable
  65 import StringBuffer
  66 import FastString
  67 import SrcLoc
  68 import UniqFM
  69 import DynFlags
  70 import Module
  71 import Ctype
  72 import BasicTypes       ( InlineSpec(..), RuleMatchInfo(..), FractionalLit(..) )
  73 import Util             ( readRational )
  74
  75 import Control.Monad
  76 import Data.Bits
  77 import Data.Char
  78 import Data.List
  79 import Data.Maybe
  80 import Data.Map (Map)
  81 import qualified Data.Map as Map
  82 import Data.Ratio
  83 }
  84
  85 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  86 $whitechar   = [\ \n\r\f\v $unispace]
  87 $white_no_nl = $whitechar # \n
  88 $tab         = \t
  89
  90 $ascdigit  = 0-9
  91 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  92 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  93 $digit     = [$ascdigit $unidigit]
  94
  95 $special   = [\(\)\,\;\[\]\`\{\}]
  96 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  97 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  98 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  99
 100 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
 101 $asclarge  = [A-Z]
 102 $large     = [$asclarge $unilarge]
 103
 104 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
 105 $ascsmall  = [a-z]
 106 $small     = [$ascsmall $unismall \_]
 107
 108 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
 109 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
 110
 111 $octit     = 0-7
 112 $hexit     = [$decdigit A-F a-f]
 113 $symchar   = [$symbol \:]
 114 $nl        = [\n\r]
 115 $idchar    = [$small $large $digit \']
 116
 117 $pragmachar = [$small $large $digit]
 118
 119 $docsym    = [\| \^ \* \$]
 120
 121 @varid     = $small $idchar*
 122 @conid     = $large $idchar*
 123
 124 @varsym    = $symbol $symchar*
 125 @consym    = \: $symchar*
 126
 127 @decimal     = $decdigit+
 128 @octal       = $octit+
 129 @hexadecimal = $hexit+
 130 @exponent    = [eE] [\-\+]? @decimal
 131
 132 -- we support the hierarchical module name extension:
 133 @qual = (@conid \.)+
 134
 135 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 136
 137 -- normal signed numerical literals can only be explicitly negative,
 138 -- not explicitly positive (contrast @exponent)
 139 @negative = \-
 140 @signed = @negative ?
 141
 142 haskell :-
 143
 144 -- everywhere: skip whitespace and comments
 145 $white_no_nl+                           ;
 146 $tab+         { warn Opt_WarnTabs (text "Warning: Tab character") }
 147
 148 -- Everywhere: deal with nested comments.  We explicitly rule out
 149 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 150 -- (this can happen even though pragmas will normally take precedence due to
 151 -- longest-match, because pragmas aren't valid in every state, but comments
 152 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 153 -- set.
 154
 155 "{-" / { isNormalComment } { nested_comment lexToken }
 156
 157 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 158 -- more dashes followed by a symbol should be parsed as a varsym, so we
 159 -- have to exclude those.
 160
 161 -- Since Haddock comments aren't valid in every state, we need to rule them
 162 -- out here.
 163
 164 -- The following two rules match comments that begin with two dashes, but
 165 -- continue with a different character. The rules test that this character
 166 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 167 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 168 -- have a Haddock comment). The rules then munch the rest of the line.
 169
 170 "-- " ~[$docsym \#] .* { lineCommentToken }
 171 "--" [^$symbol : \ ] .* { lineCommentToken }
 172
 173 -- Next, match Haddock comments if no -haddock flag
 174
 175 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
 176
 177 -- Now, when we've matched comments that begin with 2 dashes and continue
 178 -- with a different character, we need to match comments that begin with three
 179 -- or more dashes (which clearly can't be Haddock comments). We only need to
 180 -- make sure that the first non-dash character isn't a symbol, and munch the
 181 -- rest of the line.
 182
 183 "---"\-* [^$symbol :] .* { lineCommentToken }
 184
 185 -- Since the previous rules all match dashes followed by at least one
 186 -- character, we also need to match a whole line filled with just dashes.
 187
 188 "--"\-* / { atEOL } { lineCommentToken }
 189
 190 -- We need this rule since none of the other single line comment rules
 191 -- actually match this case.
 192
 193 "-- " / { atEOL } { lineCommentToken }
 194
 195 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 196 -- blank lines) until we find a non-whitespace character, then do layout
 197 -- processing.
 198 --
 199 -- One slight wibble here: what if the line begins with {-#? In
 200 -- theory, we have to lex the pragma to see if it's one we recognise,
 201 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 202 -- as a nested comment.  We don't bother with this: if the line begins
 203 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 204 <bol> {
 205   \n                                    ;
 206   ^\# (line)?                           { begin line_prag1 }
 207   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 208   ^\# \! .* \n                          ; -- #!, for scripts
 209   ()                                    { do_bol }
 210 }
 211
 212 -- after a layout keyword (let, where, do, of), we begin a new layout
 213 -- context if the curly brace is missing.
 214 -- Careful! This stuff is quite delicate.
 215 <layout, layout_do> {
 216   \{ / { notFollowedBy '-' }            { hopefully_open_brace }
 217         -- we might encounter {-# here, but {- has been handled already
 218   \n                                    ;
 219   ^\# (line)?                           { begin line_prag1 }
 220 }
 221
 222 -- do is treated in a subtly different way, see new_layout_context
 223 <layout>    ()                          { new_layout_context True }
 224 <layout_do> ()                          { new_layout_context False }
 225
 226 -- after a new layout context which was found to be to the left of the
 227 -- previous context, we have generated a '{' token, and we now need to
 228 -- generate a matching '}' token.
 229 <layout_left>  ()                       { do_layout_left }
 230
 231 <0,option_prags> \n                             { begin bol }
 232
 233 "{-#" $whitechar* $pragmachar+ / { known_pragma linePrags }
 234                                 { dispatch_pragmas linePrags }
 235
 236 -- single-line line pragmas, of the form
 237 --    # <line> "<file>" <extra-stuff> \n
 238 <line_prag1> $decdigit+                 { setLine line_prag1a }
 239 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 240 <line_prag1b> .*                        { pop }
 241
 242 -- Haskell-style line pragmas, of the form
 243 --    {-# LINE <line> "<file>" #-}
 244 <line_prag2> $decdigit+                 { setLine line_prag2a }
 245 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 246 <line_prag2b> "#-}"|"-}"                { pop }
 247    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 248    -- with older versions of GHC which generated these.
 249
 250 <0,option_prags> {
 251   "{-#" $whitechar* $pragmachar+
 252         $whitechar+ $pragmachar+ / { known_pragma twoWordPrags }
 253                                  { dispatch_pragmas twoWordPrags }
 254
 255   "{-#" $whitechar* $pragmachar+ / { known_pragma oneWordPrags }
 256                                  { dispatch_pragmas oneWordPrags }
 257
 258   -- We ignore all these pragmas, but don't generate a warning for them
 259   "{-#" $whitechar* $pragmachar+ / { known_pragma ignoredPrags }
 260                                  { dispatch_pragmas ignoredPrags }
 261
 262   -- ToDo: should only be valid inside a pragma:
 263   "#-}"                                 { endPrag }
 264 }
 265
 266 <option_prags> {
 267   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 268                                    { dispatch_pragmas fileHeaderPrags }
 269
 270   "-- #"                                 { multiline_doc_comment }
 271 }
 272
 273 <0> {
 274   -- In the "0" mode we ignore these pragmas
 275   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 276                      { nested_comment lexToken }
 277 }
 278
 279 <0> {
 280   "-- #" .* { lineCommentToken }
 281 }
 282
 283 <0,option_prags> {
 284   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 285                     (nested_comment lexToken) }
 286 }
 287
 288 -- '0' state: ordinary lexemes
 289
 290 -- Haddock comments
 291
 292 <0,option_prags> {
 293   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 294   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 295 }
 296
 297 -- "special" symbols
 298
 299 <0> {
 300   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 301   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 302 }
 303
 304 <0> {
 305   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 306   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 307   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 308   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 309   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 310   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 311   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 312   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 313
 314 -- For backward compatibility, accept the old dollar syntax
 315   "[$" @varid "|"  / { ifExtension qqEnabled }
 316                      { lex_quasiquote_tok }
 317
 318   "[" @varid "|"  / { ifExtension qqEnabled }
 319                      { lex_quasiquote_tok }
 320 }
 321
 322 <0> {
 323   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 324                                         { special IToparenbar }
 325   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 326 }
 327
 328 <0> {
 329   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 330 }
 331
 332 <0> {
 333   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 334          { token IToubxparen }
 335   "#)" / { ifExtension unboxedTuplesEnabled }
 336          { token ITcubxparen }
 337 }
 338
 339 <0,option_prags> {
 340   \(                                    { special IToparen }
 341   \)                                    { special ITcparen }
 342   \[                                    { special ITobrack }
 343   \]                                    { special ITcbrack }
 344   \,                                    { special ITcomma }
 345   \;                                    { special ITsemi }
 346   \`                                    { special ITbackquote }
 347
 348   \{                                    { open_brace }
 349   \}                                    { close_brace }
 350 }
 351
 352 <0,option_prags> {
 353   @qual @varid                  { idtoken qvarid }
 354   @qual @conid                  { idtoken qconid }
 355   @varid                        { varid }
 356   @conid                        { idtoken conid }
 357 }
 358
 359 <0> {
 360   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 361   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 362   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 363   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 364 }
 365
 366 -- ToDo: - move `var` and (sym) into lexical syntax?
 367 --       - remove backquote from $special?
 368 <0> {
 369   @qual @varsym                                    { idtoken qvarsym }
 370   @qual @consym                                    { idtoken qconsym }
 371   @varsym                                          { varsym }
 372   @consym                                          { consym }
 373 }
 374
 375 -- For the normal boxed literals we need to be careful
 376 -- when trying to be close to Haskell98
 377 <0> {
 378   -- Normal integral literals (:: Num a => a, from Integer)
 379   @decimal           { tok_num positive 0 0 decimal }
 380   0[oO] @octal       { tok_num positive 2 2 octal }
 381   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 382
 383   -- Normal rational literals (:: Fractional a => a, from Rational)
 384   @floating_point    { strtoken tok_float }
 385 }
 386
 387 <0> {
 388   -- Unboxed ints (:: Int#) and words (:: Word#)
 389   -- It's simpler (and faster?) to give separate cases to the negatives,
 390   -- especially considering octal/hexadecimal prefixes.
 391   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 392   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 393   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 394   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 395   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 396   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 397
 398   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 399   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 400   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 401
 402   -- Unboxed floats and doubles (:: Float#, :: Double#)
 403   -- prim_{float,double} work with signed literals
 404   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 405   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 406 }
 407
 408 -- Strings and chars are lexed by hand-written code.  The reason is
 409 -- that even if we recognise the string or char here in the regex
 410 -- lexer, we would still have to parse the string afterward in order
 411 -- to convert it to a String.
 412 <0> {
 413   \'                            { lex_char_tok }
 414   \"                            { lex_string_tok }
 415 }
 416
 417 {
 418 -- -----------------------------------------------------------------------------
 419 -- The token type
 420
 421 data Token
 422   = ITas                        -- Haskell keywords
 423   | ITcase
 424   | ITclass
 425   | ITdata
 426   | ITdefault
 427   | ITderiving
 428   | ITdo
 429   | ITelse
 430   | IThiding
 431   | ITif
 432   | ITimport
 433   | ITin
 434   | ITinfix
 435   | ITinfixl
 436   | ITinfixr
 437   | ITinstance
 438   | ITlet
 439   | ITmodule
 440   | ITnewtype
 441   | ITof
 442   | ITqualified
 443   | ITthen
 444   | ITtype
 445   | ITwhere
 446   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 447
 448   | ITforall                    -- GHC extension keywords
 449   | ITforeign
 450   | ITexport
 451   | ITlabel
 452   | ITdynamic
 453   | ITsafe
 454   | ITthreadsafe
 455   | ITinterruptible
 456   | ITunsafe
 457   | ITstdcallconv
 458   | ITccallconv
 459   | ITprimcallconv
 460   | ITmdo
 461   | ITfamily
 462   | ITgroup
 463   | ITby
 464   | ITusing
 465
 466         -- Pragmas
 467   | ITinline_prag InlineSpec RuleMatchInfo
 468   | ITspec_prag                 -- SPECIALISE
 469   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 470   | ITsource_prag
 471   | ITrules_prag
 472   | ITwarning_prag
 473   | ITdeprecated_prag
 474   | ITline_prag
 475   | ITscc_prag
 476   | ITgenerated_prag
 477   | ITcore_prag                 -- hdaume: core annotations
 478   | ITunpack_prag
 479   | ITann_prag
 480   | ITclose_prag
 481   | IToptions_prag String
 482   | ITinclude_prag String
 483   | ITlanguage_prag
 484   | ITvect_prag
 485   | ITvect_scalar_prag
 486
 487   | ITdotdot                    -- reserved symbols
 488   | ITcolon
 489   | ITdcolon
 490   | ITequal
 491   | ITlam
 492   | ITvbar
 493   | ITlarrow
 494   | ITrarrow
 495   | ITat
 496   | ITtilde
 497   | ITdarrow
 498   | ITminus
 499   | ITbang
 500   | ITstar
 501   | ITdot
 502
 503   | ITbiglam                    -- GHC-extension symbols
 504
 505   | ITocurly                    -- special symbols
 506   | ITccurly
 507   | ITocurlybar                 -- {|, for type applications
 508   | ITccurlybar                 -- |}, for type applications
 509   | ITvocurly
 510   | ITvccurly
 511   | ITobrack
 512   | ITopabrack                  -- [:, for parallel arrays with -XParallelArrays
 513   | ITcpabrack                  -- :], for parallel arrays with -XParallelArrays
 514   | ITcbrack
 515   | IToparen
 516   | ITcparen
 517   | IToubxparen
 518   | ITcubxparen
 519   | ITsemi
 520   | ITcomma
 521   | ITunderscore
 522   | ITbackquote
 523
 524   | ITvarid   FastString        -- identifiers
 525   | ITconid   FastString
 526   | ITvarsym  FastString
 527   | ITconsym  FastString
 528   | ITqvarid  (FastString,FastString)
 529   | ITqconid  (FastString,FastString)
 530   | ITqvarsym (FastString,FastString)
 531   | ITqconsym (FastString,FastString)
 532   | ITprefixqvarsym (FastString,FastString)
 533   | ITprefixqconsym (FastString,FastString)
 534
 535   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 536
 537   | ITchar       Char
 538   | ITstring     FastString
 539   | ITinteger    Integer
 540   | ITrational   FractionalLit
 541
 542   | ITprimchar   Char
 543   | ITprimstring FastString
 544   | ITprimint    Integer
 545   | ITprimword   Integer
 546   | ITprimfloat  FractionalLit
 547   | ITprimdouble FractionalLit
 548
 549   -- Template Haskell extension tokens
 550   | ITopenExpQuote              --  [| or [e|
 551   | ITopenPatQuote              --  [p|
 552   | ITopenDecQuote              --  [d|
 553   | ITopenTypQuote              --  [t|
 554   | ITcloseQuote                --  |]
 555   | ITidEscape   FastString     --  $x
 556   | ITparenEscape               --  $(
 557   | ITvarQuote                  --  '
 558   | ITtyQuote                   --  ''
 559   | ITquasiQuote (FastString,FastString,RealSrcSpan) --  [:...|...|]
 560
 561   -- Arrow notation extension
 562   | ITproc
 563   | ITrec
 564   | IToparenbar                 --  (|
 565   | ITcparenbar                 --  |)
 566   | ITlarrowtail                --  -<
 567   | ITrarrowtail                --  >-
 568   | ITLarrowtail                --  -<<
 569   | ITRarrowtail                --  >>-
 570
 571   | ITunknown String            -- Used when the lexer can't make sense of it
 572   | ITeof                       -- end of file token
 573
 574   -- Documentation annotations
 575   | ITdocCommentNext  String     -- something beginning '-- |'
 576   | ITdocCommentPrev  String     -- something beginning '-- ^'
 577   | ITdocCommentNamed String     -- something beginning '-- $'
 578   | ITdocSection      Int String -- a section heading
 579   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 580   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 581   | ITlineComment     String     -- comment starting by "--"
 582   | ITblockComment    String     -- comment in {- -}
 583
 584 #ifdef DEBUG
 585   deriving Show -- debugging
 586 #endif
 587
 588 {-
 589 isSpecial :: Token -> Bool
 590 -- If we see M.x, where x is a keyword, but
 591 -- is special, we treat is as just plain M.x,
 592 -- not as a keyword.
 593 isSpecial ITas          = True
 594 isSpecial IThiding      = True
 595 isSpecial ITqualified   = True
 596 isSpecial ITforall      = True
 597 isSpecial ITexport      = True
 598 isSpecial ITlabel       = True
 599 isSpecial ITdynamic     = True
 600 isSpecial ITsafe        = True
 601 isSpecial ITthreadsafe  = True
 602 isSpecial ITinterruptible = True
 603 isSpecial ITunsafe      = True
 604 isSpecial ITccallconv   = True
 605 isSpecial ITstdcallconv = True
 606 isSpecial ITprimcallconv = True
 607 isSpecial ITmdo         = True
 608 isSpecial ITfamily      = True
 609 isSpecial ITgroup   = True
 610 isSpecial ITby      = True
 611 isSpecial ITusing   = True
 612 isSpecial _             = False
 613 -}
 614
 615 -- the bitmap provided as the third component indicates whether the
 616 -- corresponding extension keyword is valid under the extension options
 617 -- provided to the compiler; if the extension corresponding to *any* of the
 618 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 619 -- facilitates using a keyword in two different extensions that can be
 620 -- activated independently)
 621 --
 622 reservedWordsFM :: UniqFM (Token, Int)
 623 reservedWordsFM = listToUFM $
 624         map (\(x, y, z) -> (mkFastString x, (y, z)))
 625        [( "_",          ITunderscore,   0 ),
 626         ( "as",         ITas,           0 ),
 627         ( "case",       ITcase,         0 ),
 628         ( "class",      ITclass,        0 ),
 629         ( "data",       ITdata,         0 ),
 630         ( "default",    ITdefault,      0 ),
 631         ( "deriving",   ITderiving,     0 ),
 632         ( "do",         ITdo,           0 ),
 633         ( "else",       ITelse,         0 ),
 634         ( "hiding",     IThiding,       0 ),
 635         ( "if",         ITif,           0 ),
 636         ( "import",     ITimport,       0 ),
 637         ( "in",         ITin,           0 ),
 638         ( "infix",      ITinfix,        0 ),
 639         ( "infixl",     ITinfixl,       0 ),
 640         ( "infixr",     ITinfixr,       0 ),
 641         ( "instance",   ITinstance,     0 ),
 642         ( "let",        ITlet,          0 ),
 643         ( "module",     ITmodule,       0 ),
 644         ( "newtype",    ITnewtype,      0 ),
 645         ( "of",         ITof,           0 ),
 646         ( "qualified",  ITqualified,    0 ),
 647         ( "then",       ITthen,         0 ),
 648         ( "type",       ITtype,         0 ),
 649         ( "where",      ITwhere,        0 ),
 650         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 651
 652     ( "forall", ITforall,        bit explicitForallBit .|. bit inRulePragBit),
 653         ( "mdo",        ITmdo,           bit recursiveDoBit),
 654         ( "family",     ITfamily,        bit tyFamBit),
 655     ( "group",  ITgroup,     bit transformComprehensionsBit),
 656     ( "by",     ITby,        bit transformComprehensionsBit),
 657     ( "using",  ITusing,     bit transformComprehensionsBit),
 658
 659         ( "foreign",    ITforeign,       bit ffiBit),
 660         ( "export",     ITexport,        bit ffiBit),
 661         ( "label",      ITlabel,         bit ffiBit),
 662         ( "dynamic",    ITdynamic,       bit ffiBit),
 663         ( "safe",       ITsafe,          bit ffiBit),
 664         ( "threadsafe", ITthreadsafe,    bit ffiBit),  -- ToDo: remove
 665         ( "interruptible", ITinterruptible, bit ffiBit),
 666         ( "unsafe",     ITunsafe,        bit ffiBit),
 667         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 668         ( "ccall",      ITccallconv,     bit ffiBit),
 669         ( "prim",       ITprimcallconv,  bit ffiBit),
 670
 671         ( "rec",        ITrec,           bit recBit),
 672         ( "proc",       ITproc,          bit arrowsBit)
 673      ]
 674
 675 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 676 reservedSymsFM = listToUFM $
 677     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 678       [ ("..",  ITdotdot,   always)
 679         -- (:) is a reserved op, meaning only list cons
 680        ,(":",   ITcolon,    always)
 681        ,("::",  ITdcolon,   always)
 682        ,("=",   ITequal,    always)
 683        ,("\\",  ITlam,      always)
 684        ,("|",   ITvbar,     always)
 685        ,("<-",  ITlarrow,   always)
 686        ,("->",  ITrarrow,   always)
 687        ,("@",   ITat,       always)
 688        ,("~",   ITtilde,    always)
 689        ,("=>",  ITdarrow,   always)
 690        ,("-",   ITminus,    always)
 691        ,("!",   ITbang,     always)
 692
 693         -- For data T (a::*) = MkT
 694        ,("*", ITstar, always) -- \i -> kindSigsEnabled i || tyFamEnabled i)
 695         -- For 'forall a . t'
 696        ,(".", ITdot,  always) -- \i -> explicitForallEnabled i || inRulePrag i)
 697
 698        ,("-<",  ITlarrowtail, arrowsEnabled)
 699        ,(">-",  ITrarrowtail, arrowsEnabled)
 700        ,("-<<", ITLarrowtail, arrowsEnabled)
 701        ,(">>-", ITRarrowtail, arrowsEnabled)
 702
 703        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 704        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 705        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 706                                 explicitForallEnabled i)
 707        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 708        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 709
 710        ,("⤙",   ITlarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 711        ,("⤚",   ITrarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 712        ,("⤛",   ITLarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 713        ,("⤜",   ITRarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 714
 715        ,("★", ITstar, unicodeSyntaxEnabled)
 716
 717         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 718         -- form part of a large operator.  This would let us have a better
 719         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 720        ]
 721
 722 -- -----------------------------------------------------------------------------
 723 -- Lexer actions
 724
 725 type Action = RealSrcSpan -> StringBuffer -> Int -> P (RealLocated Token)
 726
 727 special :: Token -> Action
 728 special tok span _buf _len = return (L span tok)
 729
 730 token, layout_token :: Token -> Action
 731 token t span _buf _len = return (L span t)
 732 layout_token t span _buf _len = pushLexState layout >> return (L span t)
 733
 734 idtoken :: (StringBuffer -> Int -> Token) -> Action
 735 idtoken f span buf len = return (L span $! (f buf len))
 736
 737 skip_one_varid :: (FastString -> Token) -> Action
 738 skip_one_varid f span buf len
 739   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 740
 741 strtoken :: (String -> Token) -> Action
 742 strtoken f span buf len =
 743   return (L span $! (f $! lexemeToString buf len))
 744
 745 init_strtoken :: Int -> (String -> Token) -> Action
 746 -- like strtoken, but drops the last N character(s)
 747 init_strtoken drop f span buf len =
 748   return (L span $! (f $! lexemeToString buf (len-drop)))
 749
 750 begin :: Int -> Action
 751 begin code _span _str _len = do pushLexState code; lexToken
 752
 753 pop :: Action
 754 pop _span _buf _len = do _ <- popLexState
 755                          lexToken
 756
 757 hopefully_open_brace :: Action
 758 hopefully_open_brace span buf len
 759  = do relaxed <- extension relaxedLayout
 760       ctx <- getContext
 761       (AI l _) <- getInput
 762       let offset = srcLocCol l
 763           isOK = relaxed ||
 764                  case ctx of
 765                  Layout prev_off : _ -> prev_off < offset
 766                  _                   -> True
 767       if isOK then pop_and open_brace span buf len
 768               else failSpanMsgP (RealSrcSpan span) (text "Missing block")
 769
 770 pop_and :: Action -> Action
 771 pop_and act span buf len = do _ <- popLexState
 772                               act span buf len
 773
 774 {-# INLINE nextCharIs #-}
 775 nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
 776 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 777
 778 notFollowedBy :: Char -> AlexAccPred Int
 779 notFollowedBy char _ _ _ (AI _ buf)
 780   = nextCharIs buf (/=char)
 781
 782 notFollowedBySymbol :: AlexAccPred Int
 783 notFollowedBySymbol _ _ _ (AI _ buf)
 784   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 785
 786 -- We must reject doc comments as being ordinary comments everywhere.
 787 -- In some cases the doc comment will be selected as the lexeme due to
 788 -- maximal munch, but not always, because the nested comment rule is
 789 -- valid in all states, but the doc-comment rules are only valid in
 790 -- the non-layout states.
 791 isNormalComment :: AlexAccPred Int
 792 isNormalComment bits _ _ (AI _ buf)
 793   | haddockEnabled bits = notFollowedByDocOrPragma
 794   | otherwise           = nextCharIs buf (/='#')
 795   where
 796     notFollowedByDocOrPragma
 797        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 798
 799 spaceAndP :: StringBuffer -> (StringBuffer -> Bool) -> Bool
 800 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 801
 802 {-
 803 haddockDisabledAnd p bits _ _ (AI _ buf)
 804   = if haddockEnabled bits then False else (p buf)
 805 -}
 806
 807 atEOL :: AlexAccPred Int
 808 atEOL _ _ _ (AI _ buf) = atEnd buf || currentChar buf == '\n'
 809
 810 ifExtension :: (Int -> Bool) -> AlexAccPred Int
 811 ifExtension pred bits _ _ _ = pred bits
 812
 813 multiline_doc_comment :: Action
 814 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 815   where
 816     worker commentAcc input docType oneLine = case alexGetChar input of
 817       Just ('\n', input')
 818         | oneLine -> docCommentEnd input commentAcc docType buf span
 819         | otherwise -> case checkIfCommentLine input' of
 820           Just input -> worker ('\n':commentAcc) input docType False
 821           Nothing -> docCommentEnd input commentAcc docType buf span
 822       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 823       Nothing -> docCommentEnd input commentAcc docType buf span
 824
 825     checkIfCommentLine input = check (dropNonNewlineSpace input)
 826       where
 827         check input = case alexGetChar input of
 828           Just ('-', input) -> case alexGetChar input of
 829             Just ('-', input) -> case alexGetChar input of
 830               Just (c, _) | c /= '-' -> Just input
 831               _ -> Nothing
 832             _ -> Nothing
 833           _ -> Nothing
 834
 835         dropNonNewlineSpace input = case alexGetChar input of
 836           Just (c, input')
 837             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 838             | otherwise -> input
 839           Nothing -> input
 840
 841 lineCommentToken :: Action
 842 lineCommentToken span buf len = do
 843   b <- extension rawTokenStreamEnabled
 844   if b then strtoken ITlineComment span buf len else lexToken
 845
 846 {-
 847   nested comments require traversing by hand, they can't be parsed
 848   using regular expressions.
 849 -}
 850 nested_comment :: P (RealLocated Token) -> Action
 851 nested_comment cont span _str _len = do
 852   input <- getInput
 853   go "" (1::Int) input
 854   where
 855     go commentAcc 0 input = do setInput input
 856                                b <- extension rawTokenStreamEnabled
 857                                if b
 858                                  then docCommentEnd input commentAcc ITblockComment _str span
 859                                  else cont
 860     go commentAcc n input = case alexGetChar input of
 861       Nothing -> errBrace input span
 862       Just ('-',input) -> case alexGetChar input of
 863         Nothing  -> errBrace input span
 864         Just ('\125',input) -> go commentAcc (n-1) input
 865         Just (_,_)          -> go ('-':commentAcc) n input
 866       Just ('\123',input) -> case alexGetChar input of
 867         Nothing  -> errBrace input span
 868         Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
 869         Just (_,_)       -> go ('\123':commentAcc) n input
 870       Just (c,input) -> go (c:commentAcc) n input
 871
 872 nested_doc_comment :: Action
 873 nested_doc_comment span buf _len = withLexedDocType (go "")
 874   where
 875     go commentAcc input docType _ = case alexGetChar input of
 876       Nothing -> errBrace input span
 877       Just ('-',input) -> case alexGetChar input of
 878         Nothing -> errBrace input span
 879         Just ('\125',input) ->
 880           docCommentEnd input commentAcc docType buf span
 881         Just (_,_) -> go ('-':commentAcc) input docType False
 882       Just ('\123', input) -> case alexGetChar input of
 883         Nothing  -> errBrace input span
 884         Just ('-',input) -> do
 885           setInput input
 886           let cont = do input <- getInput; go commentAcc input docType False
 887           nested_comment cont span buf _len
 888         Just (_,_) -> go ('\123':commentAcc) input docType False
 889       Just (c,input) -> go (c:commentAcc) input docType False
 890
 891 withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (RealLocated Token))
 892                  -> P (RealLocated Token)
 893 withLexedDocType lexDocComment = do
 894   input@(AI _ buf) <- getInput
 895   case prevChar buf ' ' of
 896     '|' -> lexDocComment input ITdocCommentNext False
 897     '^' -> lexDocComment input ITdocCommentPrev False
 898     '$' -> lexDocComment input ITdocCommentNamed False
 899     '*' -> lexDocSection 1 input
 900     '#' -> lexDocComment input ITdocOptionsOld False
 901     _ -> panic "withLexedDocType: Bad doc type"
 902  where
 903     lexDocSection n input = case alexGetChar input of
 904       Just ('*', input) -> lexDocSection (n+1) input
 905       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 906       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 907
 908 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 909 -- off again at the end of the pragma.
 910 rulePrag :: Action
 911 rulePrag span _buf _len = do
 912   setExts (.|. bit inRulePragBit)
 913   return (L span ITrules_prag)
 914
 915 endPrag :: Action
 916 endPrag span _buf _len = do
 917   setExts (.&. complement (bit inRulePragBit))
 918   return (L span ITclose_prag)
 919
 920 -- docCommentEnd
 921 -------------------------------------------------------------------------------
 922 -- This function is quite tricky. We can't just return a new token, we also
 923 -- need to update the state of the parser. Why? Because the token is longer
 924 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 925 -- it writes the wrong token length to the parser state. This function is
 926 -- called afterwards, so it can just update the state.
 927
 928 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 929                  RealSrcSpan -> P (RealLocated Token)
 930 docCommentEnd input commentAcc docType buf span = do
 931   setInput input
 932   let (AI loc nextBuf) = input
 933       comment = reverse commentAcc
 934       span' = mkRealSrcSpan (realSrcSpanStart span) loc
 935       last_len = byteDiff buf nextBuf
 936
 937   span `seq` setLastToken span' last_len
 938   return (L span' (docType comment))
 939
 940 errBrace :: AlexInput -> RealSrcSpan -> P a
 941 errBrace (AI end _) span = failLocMsgP (realSrcSpanStart span) end "unterminated `{-'"
 942
 943 open_brace, close_brace :: Action
 944 open_brace span _str _len = do
 945   ctx <- getContext
 946   setContext (NoLayout:ctx)
 947   return (L span ITocurly)
 948 close_brace span _str _len = do
 949   popContext
 950   return (L span ITccurly)
 951
 952 qvarid, qconid :: StringBuffer -> Int -> Token
 953 qvarid buf len = ITqvarid $! splitQualName buf len False
 954 qconid buf len = ITqconid $! splitQualName buf len False
 955
 956 splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
 957 -- takes a StringBuffer and a length, and returns the module name
 958 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 959 -- because of hierarchical module names.
 960 splitQualName orig_buf len parens = split orig_buf orig_buf
 961   where
 962     split buf dot_buf
 963         | orig_buf `byteDiff` buf >= len  = done dot_buf
 964         | c == '.'                        = found_dot buf'
 965         | otherwise                       = split buf' dot_buf
 966       where
 967        (c,buf') = nextChar buf
 968
 969     -- careful, we might get names like M....
 970     -- so, if the character after the dot is not upper-case, this is
 971     -- the end of the qualifier part.
 972     found_dot buf -- buf points after the '.'
 973         | isUpper c    = split buf' buf
 974         | otherwise    = done buf
 975       where
 976        (c,buf') = nextChar buf
 977
 978     done dot_buf =
 979         (lexemeToFastString orig_buf (qual_size - 1),
 980          if parens -- Prelude.(+)
 981             then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
 982             else lexemeToFastString dot_buf (len - qual_size))
 983       where
 984         qual_size = orig_buf `byteDiff` dot_buf
 985
 986 varid :: Action
 987 varid span buf len =
 988   fs `seq`
 989   case lookupUFM reservedWordsFM fs of
 990         Just (keyword,0)    -> do
 991                 maybe_layout keyword
 992                 return (L span keyword)
 993         Just (keyword,exts) -> do
 994                 b <- extension (\i -> exts .&. i /= 0)
 995                 if b then do maybe_layout keyword
 996                              return (L span keyword)
 997                      else return (L span (ITvarid fs))
 998         _other -> return (L span (ITvarid fs))
 999   where
1000         fs = lexemeToFastString buf len
1001
1002 conid :: StringBuffer -> Int -> Token
1003 conid buf len = ITconid fs
1004   where fs = lexemeToFastString buf len
1005
1006 qvarsym, qconsym, prefixqvarsym, prefixqconsym :: StringBuffer -> Int -> Token
1007 qvarsym buf len = ITqvarsym $! splitQualName buf len False
1008 qconsym buf len = ITqconsym $! splitQualName buf len False
1009 prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
1010 prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
1011
1012 varsym, consym :: Action
1013 varsym = sym ITvarsym
1014 consym = sym ITconsym
1015
1016 sym :: (FastString -> Token) -> RealSrcSpan -> StringBuffer -> Int
1017     -> P (RealLocated Token)
1018 sym con span buf len =
1019   case lookupUFM reservedSymsFM fs of
1020         Just (keyword,exts) -> do
1021                 b <- extension exts
1022                 if b then return (L span keyword)
1023                      else return (L span $! con fs)
1024         _other -> return (L span $! con fs)
1025   where
1026         fs = lexemeToFastString buf len
1027
1028 -- Variations on the integral numeric literal.
1029 tok_integral :: (Integer -> Token)
1030      -> (Integer -> Integer)
1031  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
1032      -> Int -> Int
1033      -> (Integer, (Char->Int)) -> Action
1034 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
1035   return $ L span $ itint $! transint $ parseUnsignedInteger
1036      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
1037
1038 -- some conveniences for use with tok_integral
1039 tok_num :: (Integer -> Integer)
1040         -> Int -> Int
1041         -> (Integer, (Char->Int)) -> Action
1042 tok_num = tok_integral ITinteger
1043 tok_primint :: (Integer -> Integer)
1044             -> Int -> Int
1045             -> (Integer, (Char->Int)) -> Action
1046 tok_primint = tok_integral ITprimint
1047 tok_primword :: Int -> Int
1048              -> (Integer, (Char->Int)) -> Action
1049 tok_primword = tok_integral ITprimword positive
1050 positive, negative :: (Integer -> Integer)
1051 positive = id
1052 negative = negate
1053 decimal, octal, hexadecimal :: (Integer, Char -> Int)
1054 decimal = (10,octDecDigit)
1055 octal = (8,octDecDigit)
1056 hexadecimal = (16,hexDigit)
1057
1058 -- readRational can understand negative rationals, exponents, everything.
1059 tok_float, tok_primfloat, tok_primdouble :: String -> Token
1060 tok_float        str = ITrational   $! readFractionalLit str
1061 tok_primfloat    str = ITprimfloat  $! readFractionalLit str
1062 tok_primdouble   str = ITprimdouble $! readFractionalLit str
1063
1064 readFractionalLit :: String -> FractionalLit
1065 readFractionalLit str = (FL $! str) $! readRational str
1066
1067 -- -----------------------------------------------------------------------------
1068 -- Layout processing
1069
1070 -- we're at the first token on a line, insert layout tokens if necessary
1071 do_bol :: Action
1072 do_bol span _str _len = do
1073         pos <- getOffside
1074         case pos of
1075             LT -> do
1076                 --trace "layout: inserting '}'" $ do
1077                 popContext
1078                 -- do NOT pop the lex state, we might have a ';' to insert
1079                 return (L span ITvccurly)
1080             EQ -> do
1081                 --trace "layout: inserting ';'" $ do
1082                 _ <- popLexState
1083                 return (L span ITsemi)
1084             GT -> do
1085                 _ <- popLexState
1086                 lexToken
1087
1088 -- certain keywords put us in the "layout" state, where we might
1089 -- add an opening curly brace.
1090 maybe_layout :: Token -> P ()
1091 maybe_layout t = do -- If the alternative layout rule is enabled then
1092                     -- we never create an implicit layout context here.
1093                     -- Layout is handled XXX instead.
1094                     -- The code for closing implicit contexts, or
1095                     -- inserting implicit semi-colons, is therefore
1096                     -- irrelevant as it only applies in an implicit
1097                     -- context.
1098                     alr <- extension alternativeLayoutRule
1099                     unless alr $ f t
1100     where f ITdo    = pushLexState layout_do
1101           f ITmdo   = pushLexState layout_do
1102           f ITof    = pushLexState layout
1103           f ITlet   = pushLexState layout
1104           f ITwhere = pushLexState layout
1105           f ITrec   = pushLexState layout
1106           f _       = return ()
1107
1108 -- Pushing a new implicit layout context.  If the indentation of the
1109 -- next token is not greater than the previous layout context, then
1110 -- Haskell 98 says that the new layout context should be empty; that is
1111 -- the lexer must generate {}.
1112 --
1113 -- We are slightly more lenient than this: when the new context is started
1114 -- by a 'do', then we allow the new context to be at the same indentation as
1115 -- the previous context.  This is what the 'strict' argument is for.
1116 --
1117 new_layout_context :: Bool -> Action
1118 new_layout_context strict span _buf _len = do
1119     _ <- popLexState
1120     (AI l _) <- getInput
1121     let offset = srcLocCol l
1122     ctx <- getContext
1123     nondecreasing <- extension nondecreasingIndentation
1124     let strict' = strict || not nondecreasing
1125     case ctx of
1126         Layout prev_off : _  |
1127            (strict'     && prev_off >= offset  ||
1128             not strict' && prev_off > offset) -> do
1129                 -- token is indented to the left of the previous context.
1130                 -- we must generate a {} sequence now.
1131                 pushLexState layout_left
1132                 return (L span ITvocurly)
1133         _ -> do
1134                 setContext (Layout offset : ctx)
1135                 return (L span ITvocurly)
1136
1137 do_layout_left :: Action
1138 do_layout_left span _buf _len = do
1139     _ <- popLexState
1140     pushLexState bol  -- we must be at the start of a line
1141     return (L span ITvccurly)
1142
1143 -- -----------------------------------------------------------------------------
1144 -- LINE pragmas
1145
1146 setLine :: Int -> Action
1147 setLine code span buf len = do
1148   let line = parseUnsignedInteger buf len 10 octDecDigit
1149   setSrcLoc (mkRealSrcLoc (srcSpanFile span) (fromIntegral line - 1) 1)
1150         -- subtract one: the line number refers to the *following* line
1151   _ <- popLexState
1152   pushLexState code
1153   lexToken
1154
1155 setFile :: Int -> Action
1156 setFile code span buf len = do
1157   let file = lexemeToFastString (stepOn buf) (len-2)
1158   setAlrLastLoc $ alrInitialLoc file
1159   setSrcLoc (mkRealSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1160   _ <- popLexState
1161   pushLexState code
1162   lexToken
1163
1164 alrInitialLoc :: FastString -> RealSrcSpan
1165 alrInitialLoc file = mkRealSrcSpan loc loc
1166     where -- This is a hack to ensure that the first line in a file
1167           -- looks like it is after the initial location:
1168           loc = mkRealSrcLoc file (-1) (-1)
1169
1170 -- -----------------------------------------------------------------------------
1171 -- Options, includes and language pragmas.
1172
1173 lex_string_prag :: (String -> Token) -> Action
1174 lex_string_prag mkTok span _buf _len
1175     = do input <- getInput
1176          start <- getSrcLoc
1177          tok <- go [] input
1178          end <- getSrcLoc
1179          return (L (mkRealSrcSpan start end) tok)
1180     where go acc input
1181               = if isString input "#-}"
1182                    then do setInput input
1183                            return (mkTok (reverse acc))
1184                    else case alexGetChar input of
1185                           Just (c,i) -> go (c:acc) i
1186                           Nothing -> err input
1187           isString _ [] = True
1188           isString i (x:xs)
1189               = case alexGetChar i of
1190                   Just (c,i') | c == x    -> isString i' xs
1191                   _other -> False
1192           err (AI end _) = failLocMsgP (realSrcSpanStart span) end "unterminated options pragma"
1193
1194
1195 -- -----------------------------------------------------------------------------
1196 -- Strings & Chars
1197
1198 -- This stuff is horrible.  I hates it.
1199
1200 lex_string_tok :: Action
1201 lex_string_tok span _buf _len = do
1202   tok <- lex_string ""
1203   end <- getSrcLoc
1204   return (L (mkRealSrcSpan (realSrcSpanStart span) end) tok)
1205
1206 lex_string :: String -> P Token
1207 lex_string s = do
1208   i <- getInput
1209   case alexGetChar' i of
1210     Nothing -> lit_error i
1211
1212     Just ('"',i)  -> do
1213         setInput i
1214         magicHash <- extension magicHashEnabled
1215         if magicHash
1216           then do
1217             i <- getInput
1218             case alexGetChar' i of
1219               Just ('#',i) -> do
1220                    setInput i
1221                    if any (> '\xFF') s
1222                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1223                     else let s' = mkZFastString (reverse s) in
1224                          return (ITprimstring s')
1225                         -- mkZFastString is a hack to avoid encoding the
1226                         -- string in UTF-8.  We just want the exact bytes.
1227               _other ->
1228                 return (ITstring (mkFastString (reverse s)))
1229           else
1230                 return (ITstring (mkFastString (reverse s)))
1231
1232     Just ('\\',i)
1233         | Just ('&',i) <- next -> do
1234                 setInput i; lex_string s
1235         | Just (c,i) <- next, c <= '\x7f' && is_space c -> do
1236                            -- is_space only works for <= '\x7f' (#3751)
1237                 setInput i; lex_stringgap s
1238         where next = alexGetChar' i
1239
1240     Just (c, i1) -> do
1241         case c of
1242           '\\' -> do setInput i1; c' <- lex_escape; lex_string (c':s)
1243           c | isAny c -> do setInput i1; lex_string (c:s)
1244           _other -> lit_error i
1245
1246 lex_stringgap :: String -> P Token
1247 lex_stringgap s = do
1248   i <- getInput
1249   c <- getCharOrFail i
1250   case c of
1251     '\\' -> lex_string s
1252     c | is_space c -> lex_stringgap s
1253     _other -> lit_error i
1254
1255
1256 lex_char_tok :: Action
1257 -- Here we are basically parsing character literals, such as 'x' or '\n'
1258 -- but, when Template Haskell is on, we additionally spot
1259 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1260 -- but WITHOUT CONSUMING the x or T part  (the parser does that).
1261 -- So we have to do two characters of lookahead: when we see 'x we need to
1262 -- see if there's a trailing quote
1263 lex_char_tok span _buf _len = do        -- We've seen '
1264    i1 <- getInput       -- Look ahead to first character
1265    let loc = realSrcSpanStart span
1266    case alexGetChar' i1 of
1267         Nothing -> lit_error  i1
1268
1269         Just ('\'', i2@(AI end2 _)) -> do       -- We've seen ''
1270                   th_exts <- extension thEnabled
1271                   if th_exts then do
1272                         setInput i2
1273                         return (L (mkRealSrcSpan loc end2)  ITtyQuote)
1274                    else lit_error i1
1275
1276         Just ('\\', i2@(AI _end2 _)) -> do      -- We've seen 'backslash
1277                   setInput i2
1278                   lit_ch <- lex_escape
1279                   i3 <- getInput
1280                   mc <- getCharOrFail i3 -- Trailing quote
1281                   if mc == '\'' then finish_char_tok loc lit_ch
1282                                 else lit_error i3
1283
1284         Just (c, i2@(AI _end2 _))
1285                 | not (isAny c) -> lit_error i1
1286                 | otherwise ->
1287
1288                 -- We've seen 'x, where x is a valid character
1289                 --  (i.e. not newline etc) but not a quote or backslash
1290            case alexGetChar' i2 of      -- Look ahead one more character
1291                 Just ('\'', i3) -> do   -- We've seen 'x'
1292                         setInput i3
1293                         finish_char_tok loc c
1294                 _other -> do            -- We've seen 'x not followed by quote
1295                                         -- (including the possibility of EOF)
1296                                         -- If TH is on, just parse the quote only
1297                         th_exts <- extension thEnabled
1298                         let (AI end _) = i1
1299                         if th_exts then return (L (mkRealSrcSpan loc end) ITvarQuote)
1300                                    else lit_error i2
1301
1302 finish_char_tok :: RealSrcLoc -> Char -> P (RealLocated Token)
1303 finish_char_tok loc ch  -- We've already seen the closing quote
1304                         -- Just need to check for trailing #
1305   = do  magicHash <- extension magicHashEnabled
1306         i@(AI end _) <- getInput
1307         if magicHash then do
1308                 case alexGetChar' i of
1309                         Just ('#',i@(AI end _)) -> do
1310                                 setInput i
1311                                 return (L (mkRealSrcSpan loc end) (ITprimchar ch))
1312                         _other ->
1313                                 return (L (mkRealSrcSpan loc end) (ITchar ch))
1314             else do
1315                    return (L (mkRealSrcSpan loc end) (ITchar ch))
1316
1317 isAny :: Char -> Bool
1318 isAny c | c > '\x7f' = isPrint c
1319         | otherwise  = is_any c
1320
1321 lex_escape :: P Char
1322 lex_escape = do
1323   i0 <- getInput
1324   c <- getCharOrFail i0
1325   case c of
1326         'a'   -> return '\a'
1327         'b'   -> return '\b'
1328         'f'   -> return '\f'
1329         'n'   -> return '\n'
1330         'r'   -> return '\r'
1331         't'   -> return '\t'
1332         'v'   -> return '\v'
1333         '\\'  -> return '\\'
1334         '"'   -> return '\"'
1335         '\''  -> return '\''
1336         '^'   -> do i1 <- getInput
1337                     c <- getCharOrFail i1
1338                     if c >= '@' && c <= '_'
1339                         then return (chr (ord c - ord '@'))
1340                         else lit_error i1
1341
1342         'x'   -> readNum is_hexdigit 16 hexDigit
1343         'o'   -> readNum is_octdigit  8 octDecDigit
1344         x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1345
1346         c1 ->  do
1347            i <- getInput
1348            case alexGetChar' i of
1349             Nothing -> lit_error i0
1350             Just (c2,i2) ->
1351               case alexGetChar' i2 of
1352                 Nothing -> do lit_error i0
1353                 Just (c3,i3) ->
1354                    let str = [c1,c2,c3] in
1355                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1356                                      Just rest <- [stripPrefix p str] ] of
1357                           (escape_char,[]):_ -> do
1358                                 setInput i3
1359                                 return escape_char
1360                           (escape_char,_:_):_ -> do
1361                                 setInput i2
1362                                 return escape_char
1363                           [] -> lit_error i0
1364
1365 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1366 readNum is_digit base conv = do
1367   i <- getInput
1368   c <- getCharOrFail i
1369   if is_digit c
1370         then readNum2 is_digit base conv (conv c)
1371         else lit_error i
1372
1373 readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char
1374 readNum2 is_digit base conv i = do
1375   input <- getInput
1376   read i input
1377   where read i input = do
1378           case alexGetChar' input of
1379             Just (c,input') | is_digit c -> do
1380                let i' = i*base + conv c
1381                if i' > 0x10ffff
1382                   then setInput input >> lexError "numeric escape sequence out of range"
1383                   else read i' input'
1384             _other -> do
1385               setInput input; return (chr i)
1386
1387
1388 silly_escape_chars :: [(String, Char)]
1389 silly_escape_chars = [
1390         ("NUL", '\NUL'),
1391         ("SOH", '\SOH'),
1392         ("STX", '\STX'),
1393         ("ETX", '\ETX'),
1394         ("EOT", '\EOT'),
1395         ("ENQ", '\ENQ'),
1396         ("ACK", '\ACK'),
1397         ("BEL", '\BEL'),
1398         ("BS", '\BS'),
1399         ("HT", '\HT'),
1400         ("LF", '\LF'),
1401         ("VT", '\VT'),
1402         ("FF", '\FF'),
1403         ("CR", '\CR'),
1404         ("SO", '\SO'),
1405         ("SI", '\SI'),
1406         ("DLE", '\DLE'),
1407         ("DC1", '\DC1'),
1408         ("DC2", '\DC2'),
1409         ("DC3", '\DC3'),
1410         ("DC4", '\DC4'),
1411         ("NAK", '\NAK'),
1412         ("SYN", '\SYN'),
1413         ("ETB", '\ETB'),
1414         ("CAN", '\CAN'),
1415         ("EM", '\EM'),
1416         ("SUB", '\SUB'),
1417         ("ESC", '\ESC'),
1418         ("FS", '\FS'),
1419         ("GS", '\GS'),
1420         ("RS", '\RS'),
1421         ("US", '\US'),
1422         ("SP", '\SP'),
1423         ("DEL", '\DEL')
1424         ]
1425
1426 -- before calling lit_error, ensure that the current input is pointing to
1427 -- the position of the error in the buffer.  This is so that we can report
1428 -- a correct location to the user, but also so we can detect UTF-8 decoding
1429 -- errors if they occur.
1430 lit_error :: AlexInput -> P a
1431 lit_error i = do setInput i; lexError "lexical error in string/character literal"
1432
1433 getCharOrFail :: AlexInput -> P Char
1434 getCharOrFail i =  do
1435   case alexGetChar' i of
1436         Nothing -> lexError "unexpected end-of-file in string/character literal"
1437         Just (c,i)  -> do setInput i; return c
1438
1439 -- -----------------------------------------------------------------------------
1440 -- QuasiQuote
1441
1442 lex_quasiquote_tok :: Action
1443 lex_quasiquote_tok span buf len = do
1444   let quoter = tail (lexemeToString buf (len - 1))
1445                 -- 'tail' drops the initial '[',
1446                 -- while the -1 drops the trailing '|'
1447   quoteStart <- getSrcLoc
1448   quote <- lex_quasiquote ""
1449   end <- getSrcLoc
1450   return (L (mkRealSrcSpan (realSrcSpanStart span) end)
1451            (ITquasiQuote (mkFastString quoter,
1452                           mkFastString (reverse quote),
1453                           mkRealSrcSpan quoteStart end)))
1454
1455 lex_quasiquote :: String -> P String
1456 lex_quasiquote s = do
1457   i <- getInput
1458   case alexGetChar' i of
1459     Nothing -> lit_error i
1460
1461     Just ('\\',i)
1462         | Just ('|',i) <- next -> do
1463                 setInput i; lex_quasiquote ('|' : s)
1464         | Just (']',i) <- next -> do
1465                 setInput i; lex_quasiquote (']' : s)
1466         where next = alexGetChar' i
1467
1468     Just ('|',i)
1469         | Just (']',i) <- next -> do
1470                 setInput i; return s
1471         where next = alexGetChar' i
1472
1473     Just (c, i) -> do
1474          setInput i; lex_quasiquote (c : s)
1475
1476 -- -----------------------------------------------------------------------------
1477 -- Warnings
1478
1479 warn :: DynFlag -> SDoc -> Action
1480 warn option warning srcspan _buf _len = do
1481     addWarning option (RealSrcSpan srcspan) warning
1482     lexToken
1483
1484 warnThen :: DynFlag -> SDoc -> Action -> Action
1485 warnThen option warning action srcspan buf len = do
1486     addWarning option (RealSrcSpan srcspan) warning
1487     action srcspan buf len
1488
1489 -- -----------------------------------------------------------------------------
1490 -- The Parse Monad
1491
1492 data LayoutContext
1493   = NoLayout
1494   | Layout !Int
1495   deriving Show
1496
1497 data ParseResult a
1498   = POk PState a
1499   | PFailed
1500         SrcSpan         -- The start and end of the text span related to
1501                         -- the error.  Might be used in environments which can
1502                         -- show this span, e.g. by highlighting it.
1503         Message         -- The error message
1504
1505 data PState = PState {
1506         buffer     :: StringBuffer,
1507         dflags     :: DynFlags,
1508         messages   :: Messages,
1509         last_loc   :: RealSrcSpan,      -- pos of previous token
1510         last_len   :: !Int,     -- len of previous token
1511         loc        :: RealSrcLoc,   -- current loc (end of prev token + 1)
1512         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1513         context    :: [LayoutContext],
1514         lex_state  :: [Int],
1515         -- Used in the alternative layout rule:
1516         -- These tokens are the next ones to be sent out. They are
1517         -- just blindly emitted, without the rule looking at them again:
1518         alr_pending_implicit_tokens :: [RealLocated Token],
1519         -- This is the next token to be considered or, if it is Nothing,
1520         -- we need to get the next token from the input stream:
1521         alr_next_token :: Maybe (RealLocated Token),
1522         -- This is what we consider to be the locatino of the last token
1523         -- emitted:
1524         alr_last_loc :: RealSrcSpan,
1525         -- The stack of layout contexts:
1526         alr_context :: [ALRContext],
1527         -- Are we expecting a '{'? If it's Just, then the ALRLayout tells
1528         -- us what sort of layout the '{' will open:
1529         alr_expecting_ocurly :: Maybe ALRLayout,
1530         -- Have we just had the '}' for a let block? If so, than an 'in'
1531         -- token doesn't need to close anything:
1532         alr_justClosedExplicitLetBlock :: Bool
1533      }
1534         -- last_loc and last_len are used when generating error messages,
1535         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1536         -- current token to happyError, we could at least get rid of last_len.
1537         -- Getting rid of last_loc would require finding another way to
1538         -- implement pushCurrentContext (which is only called from one place).
1539
1540 data ALRContext = ALRNoLayout Bool{- does it contain commas? -}
1541                               Bool{- is it a 'let' block? -}
1542                 | ALRLayout ALRLayout Int
1543 data ALRLayout = ALRLayoutLet
1544                | ALRLayoutWhere
1545                | ALRLayoutOf
1546                | ALRLayoutDo
1547
1548 newtype P a = P { unP :: PState -> ParseResult a }
1549
1550 instance Monad P where
1551   return = returnP
1552   (>>=) = thenP
1553   fail = failP
1554
1555 returnP :: a -> P a
1556 returnP a = a `seq` (P $ \s -> POk s a)
1557
1558 thenP :: P a -> (a -> P b) -> P b
1559 (P m) `thenP` k = P $ \ s ->
1560         case m s of
1561                 POk s1 a         -> (unP (k a)) s1
1562                 PFailed span err -> PFailed span err
1563
1564 failP :: String -> P a
1565 failP msg = P $ \s -> PFailed (RealSrcSpan (last_loc s)) (text msg)
1566
1567 failMsgP :: String -> P a
1568 failMsgP msg = P $ \s -> PFailed (RealSrcSpan (last_loc s)) (text msg)
1569
1570 failLocMsgP :: RealSrcLoc -> RealSrcLoc -> String -> P a
1571 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (RealSrcSpan (mkRealSrcSpan loc1 loc2)) (text str)
1572
1573 failSpanMsgP :: SrcSpan -> SDoc -> P a
1574 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1575
1576 getPState :: P PState
1577 getPState = P $ \s -> POk s s
1578
1579 getDynFlags :: P DynFlags
1580 getDynFlags = P $ \s -> POk s (dflags s)
1581
1582 withThisPackage :: (PackageId -> a) -> P a
1583 withThisPackage f
1584  = do   pkg     <- liftM thisPackage getDynFlags
1585         return  $ f pkg
1586
1587 extension :: (Int -> Bool) -> P Bool
1588 extension p = P $ \s -> POk s (p $! extsBitmap s)
1589
1590 getExts :: P Int
1591 getExts = P $ \s -> POk s (extsBitmap s)
1592
1593 setExts :: (Int -> Int) -> P ()
1594 setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
1595
1596 setSrcLoc :: RealSrcLoc -> P ()
1597 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1598
1599 getSrcLoc :: P RealSrcLoc
1600 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1601
1602 setLastToken :: RealSrcSpan -> Int -> P ()
1603 setLastToken loc len = P $ \s -> POk s {
1604   last_loc=loc,
1605   last_len=len
1606   } ()
1607
1608 data AlexInput = AI RealSrcLoc StringBuffer
1609
1610 alexInputPrevChar :: AlexInput -> Char
1611 alexInputPrevChar (AI _ buf) = prevChar buf '\n'
1612
1613 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1614 alexGetChar (AI loc s)
1615   | atEnd s   = Nothing
1616   | otherwise = adj_c `seq` loc' `seq` s' `seq`
1617                 --trace (show (ord c)) $
1618                 Just (adj_c, (AI loc' s'))
1619   where (c,s') = nextChar s
1620         loc'   = advanceSrcLoc loc c
1621
1622         non_graphic     = '\x0'
1623         upper           = '\x1'
1624         lower           = '\x2'
1625         digit           = '\x3'
1626         symbol          = '\x4'
1627         space           = '\x5'
1628         other_graphic   = '\x6'
1629
1630         adj_c
1631           | c <= '\x06' = non_graphic
1632           | c <= '\x7f' = c
1633           -- Alex doesn't handle Unicode, so when Unicode
1634           -- character is encountered we output these values
1635           -- with the actual character value hidden in the state.
1636           | otherwise =
1637                 case generalCategory c of
1638                   UppercaseLetter       -> upper
1639                   LowercaseLetter       -> lower
1640                   TitlecaseLetter       -> upper
1641                   ModifierLetter        -> other_graphic
1642                   OtherLetter           -> lower -- see #1103
1643                   NonSpacingMark        -> other_graphic
1644                   SpacingCombiningMark  -> other_graphic
1645                   EnclosingMark         -> other_graphic
1646                   DecimalNumber         -> digit
1647                   LetterNumber          -> other_graphic
1648                   OtherNumber           -> digit -- see #4373
1649                   ConnectorPunctuation  -> symbol
1650                   DashPunctuation       -> symbol
1651                   OpenPunctuation       -> other_graphic
1652                   ClosePunctuation      -> other_graphic
1653                   InitialQuote          -> other_graphic
1654                   FinalQuote            -> other_graphic
1655                   OtherPunctuation      -> symbol
1656                   MathSymbol            -> symbol
1657                   CurrencySymbol        -> symbol
1658                   ModifierSymbol        -> symbol
1659                   OtherSymbol           -> symbol
1660                   Space                 -> space
1661                   _other                -> non_graphic
1662
1663 -- This version does not squash unicode characters, it is used when
1664 -- lexing strings.
1665 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1666 alexGetChar' (AI loc s)
1667   | atEnd s   = Nothing
1668   | otherwise = c `seq` loc' `seq` s' `seq`
1669                 --trace (show (ord c)) $
1670                 Just (c, (AI loc' s'))
1671   where (c,s') = nextChar s
1672         loc'   = advanceSrcLoc loc c
1673
1674 getInput :: P AlexInput
1675 getInput = P $ \s@PState{ loc=l, buffer=b } -> POk s (AI l b)
1676
1677 setInput :: AlexInput -> P ()
1678 setInput (AI l b) = P $ \s -> POk s{ loc=l, buffer=b } ()
1679
1680 nextIsEOF :: P Bool
1681 nextIsEOF = do
1682   AI _ s <- getInput
1683   return $ atEnd s
1684
1685 pushLexState :: Int -> P ()
1686 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1687
1688 popLexState :: P Int
1689 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1690
1691 getLexState :: P Int
1692 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1693
1694 popNextToken :: P (Maybe (RealLocated Token))
1695 popNextToken
1696     = P $ \s@PState{ alr_next_token = m } ->
1697               POk (s {alr_next_token = Nothing}) m
1698
1699 activeContext :: P Bool
1700 activeContext = do
1701   ctxt <- getALRContext
1702   expc <- getAlrExpectingOCurly
1703   impt <- implicitTokenPending
1704   case (ctxt,expc) of
1705     ([],Nothing) -> return impt
1706     _other       -> return True
1707
1708 setAlrLastLoc :: RealSrcSpan -> P ()
1709 setAlrLastLoc l = P $ \s -> POk (s {alr_last_loc = l}) ()
1710
1711 getAlrLastLoc :: P RealSrcSpan
1712 getAlrLastLoc = P $ \s@(PState {alr_last_loc = l}) -> POk s l
1713
1714 getALRContext :: P [ALRContext]
1715 getALRContext = P $ \s@(PState {alr_context = cs}) -> POk s cs
1716
1717 setALRContext :: [ALRContext] -> P ()
1718 setALRContext cs = P $ \s -> POk (s {alr_context = cs}) ()
1719
1720 getJustClosedExplicitLetBlock :: P Bool
1721 getJustClosedExplicitLetBlock
1722  = P $ \s@(PState {alr_justClosedExplicitLetBlock = b}) -> POk s b
1723
1724 setJustClosedExplicitLetBlock :: Bool -> P ()
1725 setJustClosedExplicitLetBlock b
1726  = P $ \s -> POk (s {alr_justClosedExplicitLetBlock = b}) ()
1727
1728 setNextToken :: RealLocated Token -> P ()
1729 setNextToken t = P $ \s -> POk (s {alr_next_token = Just t}) ()
1730
1731 implicitTokenPending :: P Bool
1732 implicitTokenPending
1733     = P $ \s@PState{ alr_pending_implicit_tokens = ts } ->
1734               case ts of
1735               [] -> POk s False
1736               _  -> POk s True
1737
1738 popPendingImplicitToken :: P (Maybe (RealLocated Token))
1739 popPendingImplicitToken
1740     = P $ \s@PState{ alr_pending_implicit_tokens = ts } ->
1741               case ts of
1742               [] -> POk s Nothing
1743               (t : ts') -> POk (s {alr_pending_implicit_tokens = ts'}) (Just t)
1744
1745 setPendingImplicitTokens :: [RealLocated Token] -> P ()
1746 setPendingImplicitTokens ts = P $ \s -> POk (s {alr_pending_implicit_tokens = ts}) ()
1747
1748 getAlrExpectingOCurly :: P (Maybe ALRLayout)
1749 getAlrExpectingOCurly = P $ \s@(PState {alr_expecting_ocurly = b}) -> POk s b
1750
1751 setAlrExpectingOCurly :: Maybe ALRLayout -> P ()
1752 setAlrExpectingOCurly b = P $ \s -> POk (s {alr_expecting_ocurly = b}) ()
1753
1754 -- for reasons of efficiency, flags indicating language extensions (eg,
1755 -- -fglasgow-exts or -XParallelArrays) are represented by a bitmap stored in an unboxed
1756 -- integer
1757
1758 -- The "genericsBit" is now unused, available for others
1759 -- genericsBit :: Int
1760 -- genericsBit = 0 -- {|, |} and "generic"
1761
1762 ffiBit :: Int
1763 ffiBit     = 1
1764 parrBit :: Int
1765 parrBit    = 2
1766 arrowsBit :: Int
1767 arrowsBit  = 4
1768 thBit :: Int
1769 thBit      = 5
1770 ipBit :: Int
1771 ipBit      = 6
1772 explicitForallBit :: Int
1773 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1774 bangPatBit :: Int
1775 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1776                 -- (doesn't affect the lexer)
1777 tyFamBit :: Int
1778 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1779 haddockBit :: Int
1780 haddockBit = 10 -- Lex and parse Haddock comments
1781 magicHashBit :: Int
1782 magicHashBit = 11 -- "#" in both functions and operators
1783 kindSigsBit :: Int
1784 kindSigsBit = 12 -- Kind signatures on type variables
1785 recursiveDoBit :: Int
1786 recursiveDoBit = 13 -- mdo
1787 unicodeSyntaxBit :: Int
1788 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1789 unboxedTuplesBit :: Int
1790 unboxedTuplesBit = 15 -- (# and #)
1791 datatypeContextsBit :: Int
1792 datatypeContextsBit = 16
1793 transformComprehensionsBit :: Int
1794 transformComprehensionsBit = 17
1795 qqBit :: Int
1796 qqBit      = 18 -- enable quasiquoting
1797 inRulePragBit :: Int
1798 inRulePragBit = 19
1799 rawTokenStreamBit :: Int
1800 rawTokenStreamBit = 20 -- producing a token stream with all comments included
1801 recBit :: Int
1802 recBit = 22 -- rec
1803 alternativeLayoutRuleBit :: Int
1804 alternativeLayoutRuleBit = 23
1805 relaxedLayoutBit :: Int
1806 relaxedLayoutBit = 24
1807 nondecreasingIndentationBit :: Int
1808 nondecreasingIndentationBit = 25
1809
1810 always :: Int -> Bool
1811 always           _     = True
1812 parrEnabled :: Int -> Bool
1813 parrEnabled      flags = testBit flags parrBit
1814 arrowsEnabled :: Int -> Bool
1815 arrowsEnabled    flags = testBit flags arrowsBit
1816 thEnabled :: Int -> Bool
1817 thEnabled        flags = testBit flags thBit
1818 ipEnabled :: Int -> Bool
1819 ipEnabled        flags = testBit flags ipBit
1820 explicitForallEnabled :: Int -> Bool
1821 explicitForallEnabled flags = testBit flags explicitForallBit
1822 bangPatEnabled :: Int -> Bool
1823 bangPatEnabled   flags = testBit flags bangPatBit
1824 -- tyFamEnabled :: Int -> Bool
1825 -- tyFamEnabled     flags = testBit flags tyFamBit
1826 haddockEnabled :: Int -> Bool
1827 haddockEnabled   flags = testBit flags haddockBit
1828 magicHashEnabled :: Int -> Bool
1829 magicHashEnabled flags = testBit flags magicHashBit
1830 -- kindSigsEnabled :: Int -> Bool
1831 -- kindSigsEnabled  flags = testBit flags kindSigsBit
1832 unicodeSyntaxEnabled :: Int -> Bool
1833 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1834 unboxedTuplesEnabled :: Int -> Bool
1835 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1836 datatypeContextsEnabled :: Int -> Bool
1837 datatypeContextsEnabled flags = testBit flags datatypeContextsBit
1838 qqEnabled :: Int -> Bool
1839 qqEnabled        flags = testBit flags qqBit
1840 -- inRulePrag :: Int -> Bool
1841 -- inRulePrag       flags = testBit flags inRulePragBit
1842 rawTokenStreamEnabled :: Int -> Bool
1843 rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
1844 alternativeLayoutRule :: Int -> Bool
1845 alternativeLayoutRule flags = testBit flags alternativeLayoutRuleBit
1846 relaxedLayout :: Int -> Bool
1847 relaxedLayout flags = testBit flags relaxedLayoutBit
1848 nondecreasingIndentation :: Int -> Bool
1849 nondecreasingIndentation flags = testBit flags nondecreasingIndentationBit
1850
1851 -- PState for parsing options pragmas
1852 --
1853 pragState :: DynFlags -> StringBuffer -> RealSrcLoc -> PState
1854 pragState dynflags buf loc = (mkPState dynflags buf loc) {
1855                                  lex_state = [bol, option_prags, 0]
1856                              }
1857
1858 -- create a parse state
1859 --
1860 mkPState :: DynFlags -> StringBuffer -> RealSrcLoc -> PState
1861 mkPState flags buf loc =
1862   PState {
1863       buffer        = buf,
1864       dflags        = flags,
1865       messages      = emptyMessages,
1866       last_loc      = mkRealSrcSpan loc loc,
1867       last_len      = 0,
1868       loc           = loc,
1869       extsBitmap    = fromIntegral bitmap,
1870       context       = [],
1871       lex_state     = [bol, 0],
1872       alr_pending_implicit_tokens = [],
1873       alr_next_token = Nothing,
1874       alr_last_loc = alrInitialLoc (fsLit "<no file>"),
1875       alr_context = [],
1876       alr_expecting_ocurly = Nothing,
1877       alr_justClosedExplicitLetBlock = False
1878     }
1879     where
1880       bitmap =     ffiBit            `setBitIf` xopt Opt_ForeignFunctionInterface flags
1881                .|. parrBit           `setBitIf` xopt Opt_ParallelArrays  flags
1882                .|. arrowsBit         `setBitIf` xopt Opt_Arrows          flags
1883                .|. thBit             `setBitIf` xopt Opt_TemplateHaskell flags
1884                .|. qqBit             `setBitIf` xopt Opt_QuasiQuotes     flags
1885                .|. ipBit             `setBitIf` xopt Opt_ImplicitParams  flags
1886                .|. explicitForallBit `setBitIf` xopt Opt_ExplicitForAll  flags
1887                .|. bangPatBit        `setBitIf` xopt Opt_BangPatterns    flags
1888                .|. tyFamBit          `setBitIf` xopt Opt_TypeFamilies    flags
1889                .|. haddockBit        `setBitIf` dopt Opt_Haddock         flags
1890                .|. magicHashBit      `setBitIf` xopt Opt_MagicHash       flags
1891                .|. kindSigsBit       `setBitIf` xopt Opt_KindSignatures  flags
1892                .|. recursiveDoBit    `setBitIf` xopt Opt_RecursiveDo     flags
1893                .|. recBit            `setBitIf` xopt Opt_DoRec           flags
1894                .|. recBit            `setBitIf` xopt Opt_Arrows          flags
1895                .|. unicodeSyntaxBit  `setBitIf` xopt Opt_UnicodeSyntax   flags
1896                .|. unboxedTuplesBit  `setBitIf` xopt Opt_UnboxedTuples   flags
1897                .|. datatypeContextsBit `setBitIf` xopt Opt_DatatypeContexts flags
1898                .|. transformComprehensionsBit `setBitIf` xopt Opt_TransformListComp flags
1899                .|. transformComprehensionsBit `setBitIf` xopt Opt_MonadComprehensions flags
1900                .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
1901                .|. alternativeLayoutRuleBit `setBitIf` xopt Opt_AlternativeLayoutRule flags
1902                .|. relaxedLayoutBit  `setBitIf` xopt Opt_RelaxedLayout flags
1903                .|. nondecreasingIndentationBit `setBitIf` xopt Opt_NondecreasingIndentation flags
1904       --
1905       setBitIf :: Int -> Bool -> Int
1906       b `setBitIf` cond | cond      = bit b
1907                         | otherwise = 0
1908
1909 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1910 addWarning option srcspan warning
1911  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1912        let warning' = mkWarnMsg srcspan alwaysQualify warning
1913            ws' = if dopt option d then ws `snocBag` warning' else ws
1914        in POk s{messages=(ws', es)} ()
1915
1916 getMessages :: PState -> Messages
1917 getMessages PState{messages=ms} = ms
1918
1919 getContext :: P [LayoutContext]
1920 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1921
1922 setContext :: [LayoutContext] -> P ()
1923 setContext ctx = P $ \s -> POk s{context=ctx} ()
1924
1925 popContext :: P ()
1926 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1927                               last_len = len, last_loc = last_loc }) ->
1928   case ctx of
1929         (_:tl) -> POk s{ context = tl } ()
1930         []     -> PFailed (RealSrcSpan last_loc) (srcParseErr buf len)
1931
1932 -- Push a new layout context at the indentation of the last token read.
1933 -- This is only used at the outer level of a module when the 'module'
1934 -- keyword is missing.
1935 pushCurrentContext :: P ()
1936 pushCurrentContext = P $ \ s@PState{ last_loc=loc, context=ctx } ->
1937     POk s{context = Layout (srcSpanStartCol loc) : ctx} ()
1938
1939 getOffside :: P Ordering
1940 getOffside = P $ \s@PState{last_loc=loc, context=stk} ->
1941                 let offs = srcSpanStartCol loc in
1942                 let ord = case stk of
1943                         (Layout n:_) -> --trace ("layout: " ++ show n ++ ", offs: " ++ show offs) $
1944                                         compare offs n
1945                         _            -> GT
1946                 in POk s ord
1947
1948 -- ---------------------------------------------------------------------------
1949 -- Construct a parse error
1950
1951 srcParseErr
1952   :: StringBuffer       -- current buffer (placed just after the last token)
1953   -> Int                -- length of the previous token
1954   -> Message
1955 srcParseErr buf len
1956   = hcat [ if null token
1957              then ptext (sLit "parse error (possibly incorrect indentation)")
1958              else hcat [ptext (sLit "parse error on input "),
1959                         char '`', text token, char '\'']
1960     ]
1961   where token = lexemeToString (offsetBytes (-len) buf) len
1962
1963 -- Report a parse failure, giving the span of the previous token as
1964 -- the location of the error.  This is the entry point for errors
1965 -- detected during parsing.
1966 srcParseFail :: P a
1967 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1968                             last_loc = last_loc } ->
1969     PFailed (RealSrcSpan last_loc) (srcParseErr buf len)
1970
1971 -- A lexical error is reported at a particular position in the source file,
1972 -- not over a token range.
1973 lexError :: String -> P a
1974 lexError str = do
1975   loc <- getSrcLoc
1976   (AI end buf) <- getInput
1977   reportLexError loc end buf str
1978
1979 -- -----------------------------------------------------------------------------
1980 -- This is the top-level function: called from the parser each time a
1981 -- new token is to be read from the input.
1982
1983 lexer :: (Located Token -> P a) -> P a
1984 lexer cont = do
1985   alr <- extension alternativeLayoutRule
1986   let lexTokenFun = if alr then lexTokenAlr else lexToken
1987   (L span tok) <- lexTokenFun
1988   --trace ("token: " ++ show tok) $ do
1989   cont (L (RealSrcSpan span) tok)
1990
1991 lexTokenAlr :: P (RealLocated Token)
1992 lexTokenAlr = do mPending <- popPendingImplicitToken
1993                  t <- case mPending of
1994                       Nothing ->
1995                           do mNext <- popNextToken
1996                              t <- case mNext of
1997                                   Nothing -> lexToken
1998                                   Just next -> return next
1999                              alternativeLayoutRuleToken t
2000                       Just t ->
2001                           return t
2002                  setAlrLastLoc (getLoc t)
2003                  case unLoc t of
2004                      ITwhere -> setAlrExpectingOCurly (Just ALRLayoutWhere)
2005                      ITlet   -> setAlrExpectingOCurly (Just ALRLayoutLet)
2006                      ITof    -> setAlrExpectingOCurly (Just ALRLayoutOf)
2007                      ITdo    -> setAlrExpectingOCurly (Just ALRLayoutDo)
2008                      ITmdo   -> setAlrExpectingOCurly (Just ALRLayoutDo)
2009                      ITrec   -> setAlrExpectingOCurly (Just ALRLayoutDo)
2010                      _       -> return ()
2011                  return t
2012
2013 alternativeLayoutRuleToken :: RealLocated Token -> P (RealLocated Token)
2014 alternativeLayoutRuleToken t
2015     = do context <- getALRContext
2016          lastLoc <- getAlrLastLoc
2017          mExpectingOCurly <- getAlrExpectingOCurly
2018          justClosedExplicitLetBlock <- getJustClosedExplicitLetBlock
2019          setJustClosedExplicitLetBlock False
2020          dflags <- getDynFlags
2021          let transitional = xopt Opt_AlternativeLayoutRuleTransitional dflags
2022              thisLoc = getLoc t
2023              thisCol = srcSpanStartCol thisLoc
2024              newLine = srcSpanStartLine thisLoc > srcSpanEndLine lastLoc
2025          case (unLoc t, context, mExpectingOCurly) of
2026              -- This case handles a GHC extension to the original H98
2027              -- layout rule...
2028              (ITocurly, _, Just alrLayout) ->
2029                  do setAlrExpectingOCurly Nothing
2030                     let isLet = case alrLayout of
2031                                 ALRLayoutLet -> True
2032                                 _ -> False
2033                     setALRContext (ALRNoLayout (containsCommas ITocurly) isLet : context)
2034                     return t
2035              -- ...and makes this case unnecessary
2036              {-
2037              -- I think our implicit open-curly handling is slightly
2038              -- different to John's, in how it interacts with newlines
2039              -- and "in"
2040              (ITocurly, _, Just _) ->
2041                  do setAlrExpectingOCurly Nothing
2042                     setNextToken t
2043                     lexTokenAlr
2044              -}
2045              (_, ALRLayout _ col : ls, Just expectingOCurly)
2046               | (thisCol > col) ||
2047                 (thisCol == col &&
2048                  isNonDecreasingIntentation expectingOCurly) ->
2049                  do setAlrExpectingOCurly Nothing
2050                     setALRContext (ALRLayout expectingOCurly thisCol : context)
2051                     setNextToken t
2052                     return (L thisLoc ITocurly)
2053               | otherwise ->
2054                  do setAlrExpectingOCurly Nothing
2055                     setPendingImplicitTokens [L lastLoc ITccurly]
2056                     setNextToken t
2057                     return (L lastLoc ITocurly)
2058              (_, _, Just expectingOCurly) ->
2059                  do setAlrExpectingOCurly Nothing
2060                     setALRContext (ALRLayout expectingOCurly thisCol : context)
2061                     setNextToken t
2062                     return (L thisLoc ITocurly)
2063              -- We do the [] cases earlier than in the spec, as we
2064              -- have an actual EOF token
2065              (ITeof, ALRLayout _ _ : ls, _) ->
2066                  do setALRContext ls
2067                     setNextToken t
2068                     return (L thisLoc ITccurly)
2069              (ITeof, _, _) ->
2070                  return t
2071              -- the other ITeof case omitted; general case below covers it
2072              (ITin, _, _)
2073               | justClosedExplicitLetBlock ->
2074                  return t
2075              (ITin, ALRLayout ALRLayoutLet _ : ls, _)
2076               | newLine ->
2077                  do setPendingImplicitTokens [t]
2078                     setALRContext ls
2079                     return (L thisLoc ITccurly)
2080              -- This next case is to handle a transitional issue:
2081              (ITwhere, ALRLayout _ col : ls, _)
2082               | newLine && thisCol == col && transitional ->
2083                  do addWarning Opt_WarnAlternativeLayoutRuleTransitional
2084                                (RealSrcSpan thisLoc)
2085                                (transitionalAlternativeLayoutWarning
2086                                     "`where' clause at the same depth as implicit layout block")
2087                     setALRContext ls
2088                     setNextToken t
2089                     -- Note that we use lastLoc, as we may need to close
2090                     -- more layouts, or give a semicolon
2091                     return (L lastLoc ITccurly)
2092              -- This next case is to handle a transitional issue:
2093              (ITvbar, ALRLayout _ col : ls, _)
2094               | newLine && thisCol == col && transitional ->
2095                  do addWarning Opt_WarnAlternativeLayoutRuleTransitional
2096                                (RealSrcSpan thisLoc)
2097                                (transitionalAlternativeLayoutWarning
2098                                     "`|' at the same depth as implicit layout block")
2099                     setALRContext ls
2100                     setNextToken t
2101                     -- Note that we use lastLoc, as we may need to close
2102                     -- more layouts, or give a semicolon
2103                     return (L lastLoc ITccurly)
2104              (_, ALRLayout _ col : ls, _)
2105               | newLine && thisCol == col ->
2106                  do setNextToken t
2107                     return (L thisLoc ITsemi)
2108               | newLine && thisCol < col ->
2109                  do setALRContext ls
2110                     setNextToken t
2111                     -- Note that we use lastLoc, as we may need to close
2112                     -- more layouts, or give a semicolon
2113                     return (L lastLoc ITccurly)
2114              -- We need to handle close before open, as 'then' is both
2115              -- an open and a close
2116              (u, _, _)
2117               | isALRclose u ->
2118                  case context of
2119                  ALRLayout _ _ : ls ->
2120                      do setALRContext ls
2121                         setNextToken t
2122                         return (L thisLoc ITccurly)
2123                  ALRNoLayout _ isLet : ls ->
2124                      do let ls' = if isALRopen u
2125                                      then ALRNoLayout (containsCommas u) False : ls
2126                                      else ls
2127                         setALRContext ls'
2128                         when isLet $ setJustClosedExplicitLetBlock True
2129                         return t
2130                  [] ->
2131                      do let ls = if isALRopen u
2132                                     then [ALRNoLayout (containsCommas u) False]
2133                                     else ls
2134                         setALRContext ls
2135                         -- XXX This is an error in John's code, but
2136                         -- it looks reachable to me at first glance
2137                         return t
2138              (u, _, _)
2139               | isALRopen u ->
2140                  do setALRContext (ALRNoLayout (containsCommas u) False : context)
2141                     return t
2142              (ITin, ALRLayout ALRLayoutLet _ : ls, _) ->
2143                  do setALRContext ls
2144                     setPendingImplicitTokens [t]
2145                     return (L thisLoc ITccurly)
2146              (ITin, ALRLayout _ _ : ls, _) ->
2147                  do setALRContext ls
2148                     setNextToken t
2149                     return (L thisLoc ITccurly)
2150              -- the other ITin case omitted; general case below covers it
2151              (ITcomma, ALRLayout _ _ : ls, _)
2152               | topNoLayoutContainsCommas ls ->
2153                  do setALRContext ls
2154                     setNextToken t
2155                     return (L thisLoc ITccurly)
2156              (ITwhere, ALRLayout ALRLayoutDo _ : ls, _) ->
2157                  do setALRContext ls
2158                     setPendingImplicitTokens [t]
2159                     return (L thisLoc ITccurly)
2160              -- the other ITwhere case omitted; general case below covers it
2161              (_, _, _) -> return t
2162
2163 transitionalAlternativeLayoutWarning :: String -> SDoc
2164 transitionalAlternativeLayoutWarning msg
2165     = text "transitional layout will not be accepted in the future:"
2166    $$ text msg
2167
2168 isALRopen :: Token -> Bool
2169 isALRopen ITcase        = True
2170 isALRopen ITif          = True
2171 isALRopen ITthen        = True
2172 isALRopen IToparen      = True
2173 isALRopen ITobrack      = True
2174 isALRopen ITocurly      = True
2175 -- GHC Extensions:
2176 isALRopen IToubxparen   = True
2177 isALRopen ITparenEscape = True
2178 isALRopen _             = False
2179
2180 isALRclose :: Token -> Bool
2181 isALRclose ITof     = True
2182 isALRclose ITthen   = True
2183 isALRclose ITelse   = True
2184 isALRclose ITcparen = True
2185 isALRclose ITcbrack = True
2186 isALRclose ITccurly = True
2187 -- GHC Extensions:
2188 isALRclose ITcubxparen = True
2189 isALRclose _        = False
2190
2191 isNonDecreasingIntentation :: ALRLayout -> Bool
2192 isNonDecreasingIntentation ALRLayoutDo = True
2193 isNonDecreasingIntentation _           = False
2194
2195 containsCommas :: Token -> Bool
2196 containsCommas IToparen = True
2197 containsCommas ITobrack = True
2198 -- John doesn't have {} as containing commas, but records contain them,
2199 -- which caused a problem parsing Cabal's Distribution.Simple.InstallDirs
2200 -- (defaultInstallDirs).
2201 containsCommas ITocurly = True
2202 -- GHC Extensions:
2203 containsCommas IToubxparen = True
2204 containsCommas _        = False
2205
2206 topNoLayoutContainsCommas :: [ALRContext] -> Bool
2207 topNoLayoutContainsCommas [] = False
2208 topNoLayoutContainsCommas (ALRLayout _ _ : ls) = topNoLayoutContainsCommas ls
2209 topNoLayoutContainsCommas (ALRNoLayout b _ : _) = b
2210
2211 lexToken :: P (RealLocated Token)
2212 lexToken = do
2213   inp@(AI loc1 buf) <- getInput
2214   sc <- getLexState
2215   exts <- getExts
2216   case alexScanUser exts inp sc of
2217     AlexEOF -> do
2218         let span = mkRealSrcSpan loc1 loc1
2219         setLastToken span 0
2220         return (L span ITeof)
2221     AlexError (AI loc2 buf) ->
2222         reportLexError loc1 loc2 buf "lexical error"
2223     AlexSkip inp2 _ -> do
2224         setInput inp2
2225         lexToken
2226     AlexToken inp2@(AI end buf2) _ t -> do
2227         setInput inp2
2228         let span = mkRealSrcSpan loc1 end
2229         let bytes = byteDiff buf buf2
2230         span `seq` setLastToken span bytes
2231         t span buf bytes
2232
2233 reportLexError :: RealSrcLoc -> RealSrcLoc -> StringBuffer -> [Char] -> P a
2234 reportLexError loc1 loc2 buf str
2235   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
2236   | otherwise =
2237   let
2238         c = fst (nextChar buf)
2239   in
2240   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
2241     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
2242     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
2243
2244 lexTokenStream :: StringBuffer -> RealSrcLoc -> DynFlags -> ParseResult [Located Token]
2245 lexTokenStream buf loc dflags = unP go initState
2246     where dflags' = dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream
2247           initState = mkPState dflags' buf loc
2248           go = do
2249             ltok <- lexer return
2250             case ltok of
2251               L _ ITeof -> return []
2252               _ -> liftM (ltok:) go
2253
2254 linePrags = Map.singleton "line" (begin line_prag2)
2255
2256 fileHeaderPrags = Map.fromList([("options", lex_string_prag IToptions_prag),
2257                                  ("options_ghc", lex_string_prag IToptions_prag),
2258                                  ("options_haddock", lex_string_prag ITdocOptions),
2259                                  ("language", token ITlanguage_prag),
2260                                  ("include", lex_string_prag ITinclude_prag)])
2261
2262 ignoredPrags = Map.fromList (map ignored pragmas)
2263                where ignored opt = (opt, nested_comment lexToken)
2264                      impls = ["hugs", "nhc98", "jhc", "yhc", "catch", "derive"]
2265                      options_pragmas = map ("options_" ++) impls
2266                      -- CFILES is a hugs-only thing.
2267                      pragmas = options_pragmas ++ ["cfiles", "contract"]
2268
2269 oneWordPrags = Map.fromList([("rules", rulePrag),
2270                            ("inline", token (ITinline_prag Inline FunLike)),
2271                            ("inlinable", token (ITinline_prag Inlinable FunLike)),
2272                            ("inlineable", token (ITinline_prag Inlinable FunLike)),
2273                                           -- Spelling variant
2274                            ("notinline", token (ITinline_prag NoInline FunLike)),
2275                            ("specialize", token ITspec_prag),
2276                            ("source", token ITsource_prag),
2277                            ("warning", token ITwarning_prag),
2278                            ("deprecated", token ITdeprecated_prag),
2279                            ("scc", token ITscc_prag),
2280                            ("generated", token ITgenerated_prag),
2281                            ("core", token ITcore_prag),
2282                            ("unpack", token ITunpack_prag),
2283                            ("ann", token ITann_prag),
2284                            ("vectorize", token ITvect_prag)])
2285
2286 twoWordPrags = Map.fromList([("inline conlike", token (ITinline_prag Inline ConLike)),
2287                              ("notinline conlike", token (ITinline_prag NoInline ConLike)),
2288                              ("specialize inline", token (ITspec_inline_prag True)),
2289                              ("specialize notinline", token (ITspec_inline_prag False)),
2290                              ("vectorize scalar", token ITvect_scalar_prag)])
2291
2292 dispatch_pragmas :: Map String Action -> Action
2293 dispatch_pragmas prags span buf len = case Map.lookup (clean_pragma (lexemeToString buf len)) prags of
2294                                        Just found -> found span buf len
2295                                        Nothing -> lexError "unknown pragma"
2296
2297 known_pragma :: Map String Action -> AlexAccPred Int
2298 known_pragma prags _ _ len (AI _ buf) = (isJust $ Map.lookup (clean_pragma (lexemeToString (offsetBytes (- len) buf) len)) prags)
2299                                           && (nextCharIs buf (\c -> not (isAlphaNum c || c == '_')))
2300
2301 clean_pragma :: String -> String
2302 clean_pragma prag = canon_ws (map toLower (unprefix prag))
2303                     where unprefix prag' = case stripPrefix "{-#" prag' of
2304                                              Just rest -> rest
2305                                              Nothing -> prag'
2306                           canonical prag' = case prag' of
2307                                               "noinline" -> "notinline"
2308                                               "specialise" -> "specialize"
2309                                               "vectorise" -> "vectorize"
2310                                               "constructorlike" -> "conlike"
2311                                               _ -> prag'
2312                           canon_ws s = unwords (map canonical (words s))
2313 }