compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - parsing integers is a bit slow
  16 --    - readRational is a bit slow
  17 --
  18 --   Known bugs, that were also in the previous version:
  19 --    - M... should be 3 tokens, not 1.
  20 --    - pragma-end should be only valid in a pragma
  21
  22 --   qualified operator NOTES.
  23 --
  24 --   - If M.(+) is a single lexeme, then..
  25 --     - Probably (+) should be a single lexeme too, for consistency.
  26 --       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
  27 --     - But we have to rule out reserved operators, otherwise (..) becomes
  28 --       a different lexeme.
  29 --     - Should we therefore also rule out reserved operators in the qualified
  30 --       form?  This is quite difficult to achieve.  We don't do it for
  31 --       qualified varids.
  32
  33 {
  34 -- XXX The above flags turn off warnings in the generated code:
  35 {-# LANGUAGE BangPatterns #-}
  36 {-# OPTIONS_GHC -fno-warn-unused-matches #-}
  37 {-# OPTIONS_GHC -fno-warn-unused-binds #-}
  38 {-# OPTIONS_GHC -fno-warn-unused-imports #-}
  39 {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
  40 -- But alex still generates some code that causes the "lazy unlifted bindings"
  41 -- warning, and old compilers don't know about it so we can't easily turn
  42 -- it off, so for now we use the sledge hammer:
  43 {-# OPTIONS_GHC -w #-}
  44
  45 {-# OPTIONS_GHC -funbox-strict-fields #-}
  46
  47 module Lexer (
  48    Token(..), lexer, pragState, mkPState, PState(..),
  49    P(..), ParseResult(..), getSrcLoc,
  50    getPState, getDynFlags, withThisPackage,
  51    failLocMsgP, failSpanMsgP, srcParseFail,
  52    getMessages,
  53    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  54    activeContext, nextIsEOF,
  55    getLexState, popLexState, pushLexState,
  56    extension, bangPatEnabled, datatypeContextsEnabled,
  57    addWarning,
  58    lexTokenStream
  59   ) where
  60
  61 import Bag
  62 import ErrUtils
  63 import Outputable
  64 import StringBuffer
  65 import FastString
  66 import SrcLoc
  67 import UniqFM
  68 import DynFlags
  69 import Module
  70 import Ctype
  71 import BasicTypes       ( InlineSpec(..), RuleMatchInfo(..) )
  72 import Util             ( readRational )
  73
  74 import Control.Monad
  75 import Data.Bits
  76 import Data.Char
  77 import Data.List
  78 import Data.Maybe
  79 import Data.Map (Map)
  80 import qualified Data.Map as Map
  81 import Data.Ratio
  82 }
  83
  84 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  85 $whitechar   = [\ \n\r\f\v $unispace]
  86 $white_no_nl = $whitechar # \n
  87 $tab         = \t
  88
  89 $ascdigit  = 0-9
  90 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  91 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  92 $digit     = [$ascdigit $unidigit]
  93
  94 $special   = [\(\)\,\;\[\]\`\{\}]
  95 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  96 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  97 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  98
  99 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
 100 $asclarge  = [A-Z]
 101 $large     = [$asclarge $unilarge]
 102
 103 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
 104 $ascsmall  = [a-z]
 105 $small     = [$ascsmall $unismall \_]
 106
 107 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
 108 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
 109
 110 $octit     = 0-7
 111 $hexit     = [$decdigit A-F a-f]
 112 $symchar   = [$symbol \:]
 113 $nl        = [\n\r]
 114 $idchar    = [$small $large $digit \']
 115
 116 $pragmachar = [$small $large $digit]
 117
 118 $docsym    = [\| \^ \* \$]
 119
 120 @varid     = $small $idchar*
 121 @conid     = $large $idchar*
 122
 123 @varsym    = $symbol $symchar*
 124 @consym    = \: $symchar*
 125
 126 @decimal     = $decdigit+
 127 @octal       = $octit+
 128 @hexadecimal = $hexit+
 129 @exponent    = [eE] [\-\+]? @decimal
 130
 131 -- we support the hierarchical module name extension:
 132 @qual = (@conid \.)+
 133
 134 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 135
 136 -- normal signed numerical literals can only be explicitly negative,
 137 -- not explicitly positive (contrast @exponent)
 138 @negative = \-
 139 @signed = @negative ?
 140
 141 haskell :-
 142
 143 -- everywhere: skip whitespace and comments
 144 $white_no_nl+                           ;
 145 $tab+         { warn Opt_WarnTabs (text "Warning: Tab character") }
 146
 147 -- Everywhere: deal with nested comments.  We explicitly rule out
 148 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 149 -- (this can happen even though pragmas will normally take precedence due to
 150 -- longest-match, because pragmas aren't valid in every state, but comments
 151 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 152 -- set.
 153
 154 "{-" / { isNormalComment } { nested_comment lexToken }
 155
 156 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 157 -- more dashes followed by a symbol should be parsed as a varsym, so we
 158 -- have to exclude those.
 159
 160 -- Since Haddock comments aren't valid in every state, we need to rule them
 161 -- out here.
 162
 163 -- The following two rules match comments that begin with two dashes, but
 164 -- continue with a different character. The rules test that this character
 165 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 166 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 167 -- have a Haddock comment). The rules then munch the rest of the line.
 168
 169 "-- " ~[$docsym \#] .* { lineCommentToken }
 170 "--" [^$symbol : \ ] .* { lineCommentToken }
 171
 172 -- Next, match Haddock comments if no -haddock flag
 173
 174 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
 175
 176 -- Now, when we've matched comments that begin with 2 dashes and continue
 177 -- with a different character, we need to match comments that begin with three
 178 -- or more dashes (which clearly can't be Haddock comments). We only need to
 179 -- make sure that the first non-dash character isn't a symbol, and munch the
 180 -- rest of the line.
 181
 182 "---"\-* [^$symbol :] .* { lineCommentToken }
 183
 184 -- Since the previous rules all match dashes followed by at least one
 185 -- character, we also need to match a whole line filled with just dashes.
 186
 187 "--"\-* / { atEOL } { lineCommentToken }
 188
 189 -- We need this rule since none of the other single line comment rules
 190 -- actually match this case.
 191
 192 "-- " / { atEOL } { lineCommentToken }
 193
 194 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 195 -- blank lines) until we find a non-whitespace character, then do layout
 196 -- processing.
 197 --
 198 -- One slight wibble here: what if the line begins with {-#? In
 199 -- theory, we have to lex the pragma to see if it's one we recognise,
 200 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 201 -- as a nested comment.  We don't bother with this: if the line begins
 202 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 203 <bol> {
 204   \n                                    ;
 205   ^\# (line)?                           { begin line_prag1 }
 206   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 207   ^\# \! .* \n                          ; -- #!, for scripts
 208   ()                                    { do_bol }
 209 }
 210
 211 -- after a layout keyword (let, where, do, of), we begin a new layout
 212 -- context if the curly brace is missing.
 213 -- Careful! This stuff is quite delicate.
 214 <layout, layout_do> {
 215   \{ / { notFollowedBy '-' }            { hopefully_open_brace }
 216         -- we might encounter {-# here, but {- has been handled already
 217   \n                                    ;
 218   ^\# (line)?                           { begin line_prag1 }
 219 }
 220
 221 -- do is treated in a subtly different way, see new_layout_context
 222 <layout>    ()                          { new_layout_context True }
 223 <layout_do> ()                          { new_layout_context False }
 224
 225 -- after a new layout context which was found to be to the left of the
 226 -- previous context, we have generated a '{' token, and we now need to
 227 -- generate a matching '}' token.
 228 <layout_left>  ()                       { do_layout_left }
 229
 230 <0,option_prags> \n                             { begin bol }
 231
 232 "{-#" $whitechar* $pragmachar+ / { known_pragma linePrags }
 233                                 { dispatch_pragmas linePrags }
 234
 235 -- single-line line pragmas, of the form
 236 --    # <line> "<file>" <extra-stuff> \n
 237 <line_prag1> $decdigit+                 { setLine line_prag1a }
 238 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 239 <line_prag1b> .*                        { pop }
 240
 241 -- Haskell-style line pragmas, of the form
 242 --    {-# LINE <line> "<file>" #-}
 243 <line_prag2> $decdigit+                 { setLine line_prag2a }
 244 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 245 <line_prag2b> "#-}"|"-}"                { pop }
 246    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 247    -- with older versions of GHC which generated these.
 248
 249 <0,option_prags> {
 250   "{-#" $whitechar* $pragmachar+
 251         $whitechar+ $pragmachar+ / { known_pragma twoWordPrags }
 252                                  { dispatch_pragmas twoWordPrags }
 253
 254   "{-#" $whitechar* $pragmachar+ / { known_pragma oneWordPrags }
 255                                  { dispatch_pragmas oneWordPrags }
 256
 257   -- We ignore all these pragmas, but don't generate a warning for them
 258   "{-#" $whitechar* $pragmachar+ / { known_pragma ignoredPrags }
 259                                  { dispatch_pragmas ignoredPrags }
 260
 261   -- ToDo: should only be valid inside a pragma:
 262   "#-}"                                 { endPrag }
 263 }
 264
 265 <option_prags> {
 266   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 267                                    { dispatch_pragmas fileHeaderPrags }
 268
 269   "-- #"                                 { multiline_doc_comment }
 270 }
 271
 272 <0> {
 273   -- In the "0" mode we ignore these pragmas
 274   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 275                      { nested_comment lexToken }
 276 }
 277
 278 <0> {
 279   "-- #" .* { lineCommentToken }
 280 }
 281
 282 <0,option_prags> {
 283   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 284                     (nested_comment lexToken) }
 285 }
 286
 287 -- '0' state: ordinary lexemes
 288
 289 -- Haddock comments
 290
 291 <0,option_prags> {
 292   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 293   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 294 }
 295
 296 -- "special" symbols
 297
 298 <0> {
 299   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 300   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 301 }
 302
 303 <0> {
 304   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 305   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 306   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 307   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 308   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 309   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 310   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 311   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 312
 313 -- For backward compatibility, accept the old dollar syntax
 314   "[$" @varid "|"  / { ifExtension qqEnabled }
 315                      { lex_quasiquote_tok }
 316
 317   "[" @varid "|"  / { ifExtension qqEnabled }
 318                      { lex_quasiquote_tok }
 319 }
 320
 321 <0> {
 322   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 323                                         { special IToparenbar }
 324   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 325 }
 326
 327 <0> {
 328   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 329 }
 330
 331 <0> {
 332   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 333          { token IToubxparen }
 334   "#)" / { ifExtension unboxedTuplesEnabled }
 335          { token ITcubxparen }
 336 }
 337
 338 <0,option_prags> {
 339   \(                                    { special IToparen }
 340   \)                                    { special ITcparen }
 341   \[                                    { special ITobrack }
 342   \]                                    { special ITcbrack }
 343   \,                                    { special ITcomma }
 344   \;                                    { special ITsemi }
 345   \`                                    { special ITbackquote }
 346
 347   \{                                    { open_brace }
 348   \}                                    { close_brace }
 349 }
 350
 351 <0,option_prags> {
 352   @qual @varid                  { idtoken qvarid }
 353   @qual @conid                  { idtoken qconid }
 354   @varid                        { varid }
 355   @conid                        { idtoken conid }
 356 }
 357
 358 <0> {
 359   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 360   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 361   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 362   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 363 }
 364
 365 -- ToDo: - move `var` and (sym) into lexical syntax?
 366 --       - remove backquote from $special?
 367 <0> {
 368   @qual @varsym                                    { idtoken qvarsym }
 369   @qual @consym                                    { idtoken qconsym }
 370   @varsym                                          { varsym }
 371   @consym                                          { consym }
 372 }
 373
 374 -- For the normal boxed literals we need to be careful
 375 -- when trying to be close to Haskell98
 376 <0> {
 377   -- Normal integral literals (:: Num a => a, from Integer)
 378   @decimal           { tok_num positive 0 0 decimal }
 379   0[oO] @octal       { tok_num positive 2 2 octal }
 380   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 381
 382   -- Normal rational literals (:: Fractional a => a, from Rational)
 383   @floating_point    { strtoken tok_float }
 384 }
 385
 386 <0> {
 387   -- Unboxed ints (:: Int#) and words (:: Word#)
 388   -- It's simpler (and faster?) to give separate cases to the negatives,
 389   -- especially considering octal/hexadecimal prefixes.
 390   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 391   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 392   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 393   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 394   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 395   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 396
 397   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 398   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 399   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 400
 401   -- Unboxed floats and doubles (:: Float#, :: Double#)
 402   -- prim_{float,double} work with signed literals
 403   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 404   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 405 }
 406
 407 -- Strings and chars are lexed by hand-written code.  The reason is
 408 -- that even if we recognise the string or char here in the regex
 409 -- lexer, we would still have to parse the string afterward in order
 410 -- to convert it to a String.
 411 <0> {
 412   \'                            { lex_char_tok }
 413   \"                            { lex_string_tok }
 414 }
 415
 416 {
 417 -- -----------------------------------------------------------------------------
 418 -- The token type
 419
 420 data Token
 421   = ITas                        -- Haskell keywords
 422   | ITcase
 423   | ITclass
 424   | ITdata
 425   | ITdefault
 426   | ITderiving
 427   | ITdo
 428   | ITelse
 429   | IThiding
 430   | ITif
 431   | ITimport
 432   | ITin
 433   | ITinfix
 434   | ITinfixl
 435   | ITinfixr
 436   | ITinstance
 437   | ITlet
 438   | ITmodule
 439   | ITnewtype
 440   | ITof
 441   | ITqualified
 442   | ITthen
 443   | ITtype
 444   | ITwhere
 445   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 446
 447   | ITforall                    -- GHC extension keywords
 448   | ITforeign
 449   | ITexport
 450   | ITlabel
 451   | ITdynamic
 452   | ITsafe
 453   | ITthreadsafe
 454   | ITinterruptible
 455   | ITunsafe
 456   | ITstdcallconv
 457   | ITccallconv
 458   | ITprimcallconv
 459   | ITmdo
 460   | ITfamily
 461   | ITgroup
 462   | ITby
 463   | ITusing
 464
 465         -- Pragmas
 466   | ITinline_prag InlineSpec RuleMatchInfo
 467   | ITspec_prag                 -- SPECIALISE
 468   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 469   | ITsource_prag
 470   | ITrules_prag
 471   | ITwarning_prag
 472   | ITdeprecated_prag
 473   | ITline_prag
 474   | ITscc_prag
 475   | ITgenerated_prag
 476   | ITcore_prag                 -- hdaume: core annotations
 477   | ITunpack_prag
 478   | ITann_prag
 479   | ITclose_prag
 480   | IToptions_prag String
 481   | ITinclude_prag String
 482   | ITlanguage_prag
 483   | ITvect_prag
 484   | ITvect_scalar_prag
 485
 486   | ITdotdot                    -- reserved symbols
 487   | ITcolon
 488   | ITdcolon
 489   | ITequal
 490   | ITlam
 491   | ITvbar
 492   | ITlarrow
 493   | ITrarrow
 494   | ITat
 495   | ITtilde
 496   | ITdarrow
 497   | ITminus
 498   | ITbang
 499   | ITstar
 500   | ITdot
 501
 502   | ITbiglam                    -- GHC-extension symbols
 503
 504   | ITocurly                    -- special symbols
 505   | ITccurly
 506   | ITocurlybar                 -- {|, for type applications
 507   | ITccurlybar                 -- |}, for type applications
 508   | ITvocurly
 509   | ITvccurly
 510   | ITobrack
 511   | ITopabrack                  -- [:, for parallel arrays with -XParallelArrays
 512   | ITcpabrack                  -- :], for parallel arrays with -XParallelArrays
 513   | ITcbrack
 514   | IToparen
 515   | ITcparen
 516   | IToubxparen
 517   | ITcubxparen
 518   | ITsemi
 519   | ITcomma
 520   | ITunderscore
 521   | ITbackquote
 522
 523   | ITvarid   FastString        -- identifiers
 524   | ITconid   FastString
 525   | ITvarsym  FastString
 526   | ITconsym  FastString
 527   | ITqvarid  (FastString,FastString)
 528   | ITqconid  (FastString,FastString)
 529   | ITqvarsym (FastString,FastString)
 530   | ITqconsym (FastString,FastString)
 531   | ITprefixqvarsym (FastString,FastString)
 532   | ITprefixqconsym (FastString,FastString)
 533
 534   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 535
 536   | ITchar       Char
 537   | ITstring     FastString
 538   | ITinteger    Integer
 539   | ITrational   Rational
 540
 541   | ITprimchar   Char
 542   | ITprimstring FastString
 543   | ITprimint    Integer
 544   | ITprimword   Integer
 545   | ITprimfloat  Rational
 546   | ITprimdouble Rational
 547
 548   -- Template Haskell extension tokens
 549   | ITopenExpQuote              --  [| or [e|
 550   | ITopenPatQuote              --  [p|
 551   | ITopenDecQuote              --  [d|
 552   | ITopenTypQuote              --  [t|
 553   | ITcloseQuote                --  |]
 554   | ITidEscape   FastString     --  $x
 555   | ITparenEscape               --  $(
 556   | ITvarQuote                  --  '
 557   | ITtyQuote                   --  ''
 558   | ITquasiQuote (FastString,FastString,SrcSpan) --  [:...|...|]
 559
 560   -- Arrow notation extension
 561   | ITproc
 562   | ITrec
 563   | IToparenbar                 --  (|
 564   | ITcparenbar                 --  |)
 565   | ITlarrowtail                --  -<
 566   | ITrarrowtail                --  >-
 567   | ITLarrowtail                --  -<<
 568   | ITRarrowtail                --  >>-
 569
 570   | ITunknown String            -- Used when the lexer can't make sense of it
 571   | ITeof                       -- end of file token
 572
 573   -- Documentation annotations
 574   | ITdocCommentNext  String     -- something beginning '-- |'
 575   | ITdocCommentPrev  String     -- something beginning '-- ^'
 576   | ITdocCommentNamed String     -- something beginning '-- $'
 577   | ITdocSection      Int String -- a section heading
 578   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 579   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 580   | ITlineComment     String     -- comment starting by "--"
 581   | ITblockComment    String     -- comment in {- -}
 582
 583 #ifdef DEBUG
 584   deriving Show -- debugging
 585 #endif
 586
 587 {-
 588 isSpecial :: Token -> Bool
 589 -- If we see M.x, where x is a keyword, but
 590 -- is special, we treat is as just plain M.x,
 591 -- not as a keyword.
 592 isSpecial ITas          = True
 593 isSpecial IThiding      = True
 594 isSpecial ITqualified   = True
 595 isSpecial ITforall      = True
 596 isSpecial ITexport      = True
 597 isSpecial ITlabel       = True
 598 isSpecial ITdynamic     = True
 599 isSpecial ITsafe        = True
 600 isSpecial ITthreadsafe  = True
 601 isSpecial ITinterruptible = True
 602 isSpecial ITunsafe      = True
 603 isSpecial ITccallconv   = True
 604 isSpecial ITstdcallconv = True
 605 isSpecial ITprimcallconv = True
 606 isSpecial ITmdo         = True
 607 isSpecial ITfamily      = True
 608 isSpecial ITgroup   = True
 609 isSpecial ITby      = True
 610 isSpecial ITusing   = True
 611 isSpecial _             = False
 612 -}
 613
 614 -- the bitmap provided as the third component indicates whether the
 615 -- corresponding extension keyword is valid under the extension options
 616 -- provided to the compiler; if the extension corresponding to *any* of the
 617 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 618 -- facilitates using a keyword in two different extensions that can be
 619 -- activated independently)
 620 --
 621 reservedWordsFM :: UniqFM (Token, Int)
 622 reservedWordsFM = listToUFM $
 623         map (\(x, y, z) -> (mkFastString x, (y, z)))
 624        [( "_",          ITunderscore,   0 ),
 625         ( "as",         ITas,           0 ),
 626         ( "case",       ITcase,         0 ),
 627         ( "class",      ITclass,        0 ),
 628         ( "data",       ITdata,         0 ),
 629         ( "default",    ITdefault,      0 ),
 630         ( "deriving",   ITderiving,     0 ),
 631         ( "do",         ITdo,           0 ),
 632         ( "else",       ITelse,         0 ),
 633         ( "hiding",     IThiding,       0 ),
 634         ( "if",         ITif,           0 ),
 635         ( "import",     ITimport,       0 ),
 636         ( "in",         ITin,           0 ),
 637         ( "infix",      ITinfix,        0 ),
 638         ( "infixl",     ITinfixl,       0 ),
 639         ( "infixr",     ITinfixr,       0 ),
 640         ( "instance",   ITinstance,     0 ),
 641         ( "let",        ITlet,          0 ),
 642         ( "module",     ITmodule,       0 ),
 643         ( "newtype",    ITnewtype,      0 ),
 644         ( "of",         ITof,           0 ),
 645         ( "qualified",  ITqualified,    0 ),
 646         ( "then",       ITthen,         0 ),
 647         ( "type",       ITtype,         0 ),
 648         ( "where",      ITwhere,        0 ),
 649         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 650
 651     ( "forall", ITforall,        bit explicitForallBit .|. bit inRulePragBit),
 652         ( "mdo",        ITmdo,           bit recursiveDoBit),
 653         ( "family",     ITfamily,        bit tyFamBit),
 654     ( "group",  ITgroup,     bit transformComprehensionsBit),
 655     ( "by",     ITby,        bit transformComprehensionsBit),
 656     ( "using",  ITusing,     bit transformComprehensionsBit),
 657
 658         ( "foreign",    ITforeign,       bit ffiBit),
 659         ( "export",     ITexport,        bit ffiBit),
 660         ( "label",      ITlabel,         bit ffiBit),
 661         ( "dynamic",    ITdynamic,       bit ffiBit),
 662         ( "safe",       ITsafe,          bit ffiBit),
 663         ( "threadsafe", ITthreadsafe,    bit ffiBit),  -- ToDo: remove
 664         ( "interruptible", ITinterruptible, bit ffiBit),
 665         ( "unsafe",     ITunsafe,        bit ffiBit),
 666         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 667         ( "ccall",      ITccallconv,     bit ffiBit),
 668         ( "prim",       ITprimcallconv,  bit ffiBit),
 669
 670         ( "rec",        ITrec,           bit recBit),
 671         ( "proc",       ITproc,          bit arrowsBit)
 672      ]
 673
 674 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 675 reservedSymsFM = listToUFM $
 676     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 677       [ ("..",  ITdotdot,   always)
 678         -- (:) is a reserved op, meaning only list cons
 679        ,(":",   ITcolon,    always)
 680        ,("::",  ITdcolon,   always)
 681        ,("=",   ITequal,    always)
 682        ,("\\",  ITlam,      always)
 683        ,("|",   ITvbar,     always)
 684        ,("<-",  ITlarrow,   always)
 685        ,("->",  ITrarrow,   always)
 686        ,("@",   ITat,       always)
 687        ,("~",   ITtilde,    always)
 688        ,("=>",  ITdarrow,   always)
 689        ,("-",   ITminus,    always)
 690        ,("!",   ITbang,     always)
 691
 692         -- For data T (a::*) = MkT
 693        ,("*", ITstar, always) -- \i -> kindSigsEnabled i || tyFamEnabled i)
 694         -- For 'forall a . t'
 695        ,(".", ITdot,  always) -- \i -> explicitForallEnabled i || inRulePrag i)
 696
 697        ,("-<",  ITlarrowtail, arrowsEnabled)
 698        ,(">-",  ITrarrowtail, arrowsEnabled)
 699        ,("-<<", ITLarrowtail, arrowsEnabled)
 700        ,(">>-", ITRarrowtail, arrowsEnabled)
 701
 702        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 703        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 704        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 705                                 explicitForallEnabled i)
 706        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 707        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 708
 709        ,("⤙",   ITlarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 710        ,("⤚",   ITrarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 711        ,("⤛",   ITLarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 712        ,("⤜",   ITRarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 713
 714        ,("★", ITstar, unicodeSyntaxEnabled)
 715
 716         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 717         -- form part of a large operator.  This would let us have a better
 718         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 719        ]
 720
 721 -- -----------------------------------------------------------------------------
 722 -- Lexer actions
 723
 724 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 725
 726 special :: Token -> Action
 727 special tok span _buf _len = return (L span tok)
 728
 729 token, layout_token :: Token -> Action
 730 token t span _buf _len = return (L span t)
 731 layout_token t span _buf _len = pushLexState layout >> return (L span t)
 732
 733 idtoken :: (StringBuffer -> Int -> Token) -> Action
 734 idtoken f span buf len = return (L span $! (f buf len))
 735
 736 skip_one_varid :: (FastString -> Token) -> Action
 737 skip_one_varid f span buf len
 738   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 739
 740 strtoken :: (String -> Token) -> Action
 741 strtoken f span buf len =
 742   return (L span $! (f $! lexemeToString buf len))
 743
 744 init_strtoken :: Int -> (String -> Token) -> Action
 745 -- like strtoken, but drops the last N character(s)
 746 init_strtoken drop f span buf len =
 747   return (L span $! (f $! lexemeToString buf (len-drop)))
 748
 749 begin :: Int -> Action
 750 begin code _span _str _len = do pushLexState code; lexToken
 751
 752 pop :: Action
 753 pop _span _buf _len = do _ <- popLexState
 754                          lexToken
 755
 756 hopefully_open_brace :: Action
 757 hopefully_open_brace span buf len
 758  = do relaxed <- extension relaxedLayout
 759       ctx <- getContext
 760       (AI l _) <- getInput
 761       let offset = srcLocCol l
 762           isOK = relaxed ||
 763                  case ctx of
 764                  Layout prev_off : _ -> prev_off < offset
 765                  _                   -> True
 766       if isOK then pop_and open_brace span buf len
 767               else failSpanMsgP span (text "Missing block")
 768
 769 pop_and :: Action -> Action
 770 pop_and act span buf len = do _ <- popLexState
 771                               act span buf len
 772
 773 {-# INLINE nextCharIs #-}
 774 nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
 775 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 776
 777 notFollowedBy :: Char -> AlexAccPred Int
 778 notFollowedBy char _ _ _ (AI _ buf)
 779   = nextCharIs buf (/=char)
 780
 781 notFollowedBySymbol :: AlexAccPred Int
 782 notFollowedBySymbol _ _ _ (AI _ buf)
 783   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 784
 785 -- We must reject doc comments as being ordinary comments everywhere.
 786 -- In some cases the doc comment will be selected as the lexeme due to
 787 -- maximal munch, but not always, because the nested comment rule is
 788 -- valid in all states, but the doc-comment rules are only valid in
 789 -- the non-layout states.
 790 isNormalComment :: AlexAccPred Int
 791 isNormalComment bits _ _ (AI _ buf)
 792   | haddockEnabled bits = notFollowedByDocOrPragma
 793   | otherwise           = nextCharIs buf (/='#')
 794   where
 795     notFollowedByDocOrPragma
 796        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 797
 798 spaceAndP :: StringBuffer -> (StringBuffer -> Bool) -> Bool
 799 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 800
 801 {-
 802 haddockDisabledAnd p bits _ _ (AI _ buf)
 803   = if haddockEnabled bits then False else (p buf)
 804 -}
 805
 806 atEOL :: AlexAccPred Int
 807 atEOL _ _ _ (AI _ buf) = atEnd buf || currentChar buf == '\n'
 808
 809 ifExtension :: (Int -> Bool) -> AlexAccPred Int
 810 ifExtension pred bits _ _ _ = pred bits
 811
 812 multiline_doc_comment :: Action
 813 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 814   where
 815     worker commentAcc input docType oneLine = case alexGetChar input of
 816       Just ('\n', input')
 817         | oneLine -> docCommentEnd input commentAcc docType buf span
 818         | otherwise -> case checkIfCommentLine input' of
 819           Just input -> worker ('\n':commentAcc) input docType False
 820           Nothing -> docCommentEnd input commentAcc docType buf span
 821       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 822       Nothing -> docCommentEnd input commentAcc docType buf span
 823
 824     checkIfCommentLine input = check (dropNonNewlineSpace input)
 825       where
 826         check input = case alexGetChar input of
 827           Just ('-', input) -> case alexGetChar input of
 828             Just ('-', input) -> case alexGetChar input of
 829               Just (c, _) | c /= '-' -> Just input
 830               _ -> Nothing
 831             _ -> Nothing
 832           _ -> Nothing
 833
 834         dropNonNewlineSpace input = case alexGetChar input of
 835           Just (c, input')
 836             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 837             | otherwise -> input
 838           Nothing -> input
 839
 840 lineCommentToken :: Action
 841 lineCommentToken span buf len = do
 842   b <- extension rawTokenStreamEnabled
 843   if b then strtoken ITlineComment span buf len else lexToken
 844
 845 {-
 846   nested comments require traversing by hand, they can't be parsed
 847   using regular expressions.
 848 -}
 849 nested_comment :: P (Located Token) -> Action
 850 nested_comment cont span _str _len = do
 851   input <- getInput
 852   go "" (1::Int) input
 853   where
 854     go commentAcc 0 input = do setInput input
 855                                b <- extension rawTokenStreamEnabled
 856                                if b
 857                                  then docCommentEnd input commentAcc ITblockComment _str span
 858                                  else cont
 859     go commentAcc n input = case alexGetChar input of
 860       Nothing -> errBrace input span
 861       Just ('-',input) -> case alexGetChar input of
 862         Nothing  -> errBrace input span
 863         Just ('\125',input) -> go commentAcc (n-1) input
 864         Just (_,_)          -> go ('-':commentAcc) n input
 865       Just ('\123',input) -> case alexGetChar input of
 866         Nothing  -> errBrace input span
 867         Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
 868         Just (_,_)       -> go ('\123':commentAcc) n input
 869       Just (c,input) -> go (c:commentAcc) n input
 870
 871 nested_doc_comment :: Action
 872 nested_doc_comment span buf _len = withLexedDocType (go "")
 873   where
 874     go commentAcc input docType _ = case alexGetChar input of
 875       Nothing -> errBrace input span
 876       Just ('-',input) -> case alexGetChar input of
 877         Nothing -> errBrace input span
 878         Just ('\125',input) ->
 879           docCommentEnd input commentAcc docType buf span
 880         Just (_,_) -> go ('-':commentAcc) input docType False
 881       Just ('\123', input) -> case alexGetChar input of
 882         Nothing  -> errBrace input span
 883         Just ('-',input) -> do
 884           setInput input
 885           let cont = do input <- getInput; go commentAcc input docType False
 886           nested_comment cont span buf _len
 887         Just (_,_) -> go ('\123':commentAcc) input docType False
 888       Just (c,input) -> go (c:commentAcc) input docType False
 889
 890 withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (Located Token))
 891                  -> P (Located Token)
 892 withLexedDocType lexDocComment = do
 893   input@(AI _ buf) <- getInput
 894   case prevChar buf ' ' of
 895     '|' -> lexDocComment input ITdocCommentNext False
 896     '^' -> lexDocComment input ITdocCommentPrev False
 897     '$' -> lexDocComment input ITdocCommentNamed False
 898     '*' -> lexDocSection 1 input
 899     '#' -> lexDocComment input ITdocOptionsOld False
 900     _ -> panic "withLexedDocType: Bad doc type"
 901  where
 902     lexDocSection n input = case alexGetChar input of
 903       Just ('*', input) -> lexDocSection (n+1) input
 904       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 905       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 906
 907 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 908 -- off again at the end of the pragma.
 909 rulePrag :: Action
 910 rulePrag span _buf _len = do
 911   setExts (.|. bit inRulePragBit)
 912   return (L span ITrules_prag)
 913
 914 endPrag :: Action
 915 endPrag span _buf _len = do
 916   setExts (.&. complement (bit inRulePragBit))
 917   return (L span ITclose_prag)
 918
 919 -- docCommentEnd
 920 -------------------------------------------------------------------------------
 921 -- This function is quite tricky. We can't just return a new token, we also
 922 -- need to update the state of the parser. Why? Because the token is longer
 923 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 924 -- it writes the wrong token length to the parser state. This function is
 925 -- called afterwards, so it can just update the state.
 926
 927 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 928                  SrcSpan -> P (Located Token)
 929 docCommentEnd input commentAcc docType buf span = do
 930   setInput input
 931   let (AI loc nextBuf) = input
 932       comment = reverse commentAcc
 933       span' = mkSrcSpan (srcSpanStart span) loc
 934       last_len = byteDiff buf nextBuf
 935
 936   span `seq` setLastToken span' last_len
 937   return (L span' (docType comment))
 938
 939 errBrace :: AlexInput -> SrcSpan -> P a
 940 errBrace (AI end _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 941
 942 open_brace, close_brace :: Action
 943 open_brace span _str _len = do
 944   ctx <- getContext
 945   setContext (NoLayout:ctx)
 946   return (L span ITocurly)
 947 close_brace span _str _len = do
 948   popContext
 949   return (L span ITccurly)
 950
 951 qvarid, qconid :: StringBuffer -> Int -> Token
 952 qvarid buf len = ITqvarid $! splitQualName buf len False
 953 qconid buf len = ITqconid $! splitQualName buf len False
 954
 955 splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
 956 -- takes a StringBuffer and a length, and returns the module name
 957 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 958 -- because of hierarchical module names.
 959 splitQualName orig_buf len parens = split orig_buf orig_buf
 960   where
 961     split buf dot_buf
 962         | orig_buf `byteDiff` buf >= len  = done dot_buf
 963         | c == '.'                        = found_dot buf'
 964         | otherwise                       = split buf' dot_buf
 965       where
 966        (c,buf') = nextChar buf
 967
 968     -- careful, we might get names like M....
 969     -- so, if the character after the dot is not upper-case, this is
 970     -- the end of the qualifier part.
 971     found_dot buf -- buf points after the '.'
 972         | isUpper c    = split buf' buf
 973         | otherwise    = done buf
 974       where
 975        (c,buf') = nextChar buf
 976
 977     done dot_buf =
 978         (lexemeToFastString orig_buf (qual_size - 1),
 979          if parens -- Prelude.(+)
 980             then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
 981             else lexemeToFastString dot_buf (len - qual_size))
 982       where
 983         qual_size = orig_buf `byteDiff` dot_buf
 984
 985 varid :: Action
 986 varid span buf len =
 987   fs `seq`
 988   case lookupUFM reservedWordsFM fs of
 989         Just (keyword,0)    -> do
 990                 maybe_layout keyword
 991                 return (L span keyword)
 992         Just (keyword,exts) -> do
 993                 b <- extension (\i -> exts .&. i /= 0)
 994                 if b then do maybe_layout keyword
 995                              return (L span keyword)
 996                      else return (L span (ITvarid fs))
 997         _other -> return (L span (ITvarid fs))
 998   where
 999         fs = lexemeToFastString buf len
1000
1001 conid :: StringBuffer -> Int -> Token
1002 conid buf len = ITconid fs
1003   where fs = lexemeToFastString buf len
1004
1005 qvarsym, qconsym, prefixqvarsym, prefixqconsym :: StringBuffer -> Int -> Token
1006 qvarsym buf len = ITqvarsym $! splitQualName buf len False
1007 qconsym buf len = ITqconsym $! splitQualName buf len False
1008 prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
1009 prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
1010
1011 varsym, consym :: Action
1012 varsym = sym ITvarsym
1013 consym = sym ITconsym
1014
1015 sym :: (FastString -> Token) -> SrcSpan -> StringBuffer -> Int
1016     -> P (Located Token)
1017 sym con span buf len =
1018   case lookupUFM reservedSymsFM fs of
1019         Just (keyword,exts) -> do
1020                 b <- extension exts
1021                 if b then return (L span keyword)
1022                      else return (L span $! con fs)
1023         _other -> return (L span $! con fs)
1024   where
1025         fs = lexemeToFastString buf len
1026
1027 -- Variations on the integral numeric literal.
1028 tok_integral :: (Integer -> Token)
1029      -> (Integer -> Integer)
1030  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
1031      -> Int -> Int
1032      -> (Integer, (Char->Int)) -> Action
1033 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
1034   return $ L span $ itint $! transint $ parseUnsignedInteger
1035      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
1036
1037 -- some conveniences for use with tok_integral
1038 tok_num :: (Integer -> Integer)
1039         -> Int -> Int
1040         -> (Integer, (Char->Int)) -> Action
1041 tok_num = tok_integral ITinteger
1042 tok_primint :: (Integer -> Integer)
1043             -> Int -> Int
1044             -> (Integer, (Char->Int)) -> Action
1045 tok_primint = tok_integral ITprimint
1046 tok_primword :: Int -> Int
1047              -> (Integer, (Char->Int)) -> Action
1048 tok_primword = tok_integral ITprimword positive
1049 positive, negative :: (Integer -> Integer)
1050 positive = id
1051 negative = negate
1052 decimal, octal, hexadecimal :: (Integer, Char -> Int)
1053 decimal = (10,octDecDigit)
1054 octal = (8,octDecDigit)
1055 hexadecimal = (16,hexDigit)
1056
1057 -- readRational can understand negative rationals, exponents, everything.
1058 tok_float, tok_primfloat, tok_primdouble :: String -> Token
1059 tok_float        str = ITrational   $! readRational str
1060 tok_primfloat    str = ITprimfloat  $! readRational str
1061 tok_primdouble   str = ITprimdouble $! readRational str
1062
1063 -- -----------------------------------------------------------------------------
1064 -- Layout processing
1065
1066 -- we're at the first token on a line, insert layout tokens if necessary
1067 do_bol :: Action
1068 do_bol span _str _len = do
1069         pos <- getOffside
1070         case pos of
1071             LT -> do
1072                 --trace "layout: inserting '}'" $ do
1073                 popContext
1074                 -- do NOT pop the lex state, we might have a ';' to insert
1075                 return (L span ITvccurly)
1076             EQ -> do
1077                 --trace "layout: inserting ';'" $ do
1078                 _ <- popLexState
1079                 return (L span ITsemi)
1080             GT -> do
1081                 _ <- popLexState
1082                 lexToken
1083
1084 -- certain keywords put us in the "layout" state, where we might
1085 -- add an opening curly brace.
1086 maybe_layout :: Token -> P ()
1087 maybe_layout t = do -- If the alternative layout rule is enabled then
1088                     -- we never create an implicit layout context here.
1089                     -- Layout is handled XXX instead.
1090                     -- The code for closing implicit contexts, or
1091                     -- inserting implicit semi-colons, is therefore
1092                     -- irrelevant as it only applies in an implicit
1093                     -- context.
1094                     alr <- extension alternativeLayoutRule
1095                     unless alr $ f t
1096     where f ITdo    = pushLexState layout_do
1097           f ITmdo   = pushLexState layout_do
1098           f ITof    = pushLexState layout
1099           f ITlet   = pushLexState layout
1100           f ITwhere = pushLexState layout
1101           f ITrec   = pushLexState layout
1102           f _       = return ()
1103
1104 -- Pushing a new implicit layout context.  If the indentation of the
1105 -- next token is not greater than the previous layout context, then
1106 -- Haskell 98 says that the new layout context should be empty; that is
1107 -- the lexer must generate {}.
1108 --
1109 -- We are slightly more lenient than this: when the new context is started
1110 -- by a 'do', then we allow the new context to be at the same indentation as
1111 -- the previous context.  This is what the 'strict' argument is for.
1112 --
1113 new_layout_context :: Bool -> Action
1114 new_layout_context strict span _buf _len = do
1115     _ <- popLexState
1116     (AI l _) <- getInput
1117     let offset = srcLocCol l
1118     ctx <- getContext
1119     nondecreasing <- extension nondecreasingIndentation
1120     let strict' = strict || not nondecreasing
1121     case ctx of
1122         Layout prev_off : _  |
1123            (strict'     && prev_off >= offset  ||
1124             not strict' && prev_off > offset) -> do
1125                 -- token is indented to the left of the previous context.
1126                 -- we must generate a {} sequence now.
1127                 pushLexState layout_left
1128                 return (L span ITvocurly)
1129         _ -> do
1130                 setContext (Layout offset : ctx)
1131                 return (L span ITvocurly)
1132
1133 do_layout_left :: Action
1134 do_layout_left span _buf _len = do
1135     _ <- popLexState
1136     pushLexState bol  -- we must be at the start of a line
1137     return (L span ITvccurly)
1138
1139 -- -----------------------------------------------------------------------------
1140 -- LINE pragmas
1141
1142 setLine :: Int -> Action
1143 setLine code span buf len = do
1144   let line = parseUnsignedInteger buf len 10 octDecDigit
1145   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 1)
1146         -- subtract one: the line number refers to the *following* line
1147   _ <- popLexState
1148   pushLexState code
1149   lexToken
1150
1151 setFile :: Int -> Action
1152 setFile code span buf len = do
1153   let file = lexemeToFastString (stepOn buf) (len-2)
1154   setAlrLastLoc noSrcSpan
1155   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1156   _ <- popLexState
1157   pushLexState code
1158   lexToken
1159
1160
1161 -- -----------------------------------------------------------------------------
1162 -- Options, includes and language pragmas.
1163
1164 lex_string_prag :: (String -> Token) -> Action
1165 lex_string_prag mkTok span _buf _len
1166     = do input <- getInput
1167          start <- getSrcLoc
1168          tok <- go [] input
1169          end <- getSrcLoc
1170          return (L (mkSrcSpan start end) tok)
1171     where go acc input
1172               = if isString input "#-}"
1173                    then do setInput input
1174                            return (mkTok (reverse acc))
1175                    else case alexGetChar input of
1176                           Just (c,i) -> go (c:acc) i
1177                           Nothing -> err input
1178           isString _ [] = True
1179           isString i (x:xs)
1180               = case alexGetChar i of
1181                   Just (c,i') | c == x    -> isString i' xs
1182                   _other -> False
1183           err (AI end _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1184
1185
1186 -- -----------------------------------------------------------------------------
1187 -- Strings & Chars
1188
1189 -- This stuff is horrible.  I hates it.
1190
1191 lex_string_tok :: Action
1192 lex_string_tok span _buf _len = do
1193   tok <- lex_string ""
1194   end <- getSrcLoc
1195   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1196
1197 lex_string :: String -> P Token
1198 lex_string s = do
1199   i <- getInput
1200   case alexGetChar' i of
1201     Nothing -> lit_error i
1202
1203     Just ('"',i)  -> do
1204         setInput i
1205         magicHash <- extension magicHashEnabled
1206         if magicHash
1207           then do
1208             i <- getInput
1209             case alexGetChar' i of
1210               Just ('#',i) -> do
1211                    setInput i
1212                    if any (> '\xFF') s
1213                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1214                     else let s' = mkZFastString (reverse s) in
1215                          return (ITprimstring s')
1216                         -- mkZFastString is a hack to avoid encoding the
1217                         -- string in UTF-8.  We just want the exact bytes.
1218               _other ->
1219                 return (ITstring (mkFastString (reverse s)))
1220           else
1221                 return (ITstring (mkFastString (reverse s)))
1222
1223     Just ('\\',i)
1224         | Just ('&',i) <- next -> do
1225                 setInput i; lex_string s
1226         | Just (c,i) <- next, c <= '\x7f' && is_space c -> do
1227                            -- is_space only works for <= '\x7f' (#3751)
1228                 setInput i; lex_stringgap s
1229         where next = alexGetChar' i
1230
1231     Just (c, i1) -> do
1232         case c of
1233           '\\' -> do setInput i1; c' <- lex_escape; lex_string (c':s)
1234           c | isAny c -> do setInput i1; lex_string (c:s)
1235           _other -> lit_error i
1236
1237 lex_stringgap :: String -> P Token
1238 lex_stringgap s = do
1239   i <- getInput
1240   c <- getCharOrFail i
1241   case c of
1242     '\\' -> lex_string s
1243     c | is_space c -> lex_stringgap s
1244     _other -> lit_error i
1245
1246
1247 lex_char_tok :: Action
1248 -- Here we are basically parsing character literals, such as 'x' or '\n'
1249 -- but, when Template Haskell is on, we additionally spot
1250 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1251 -- but WITHOUT CONSUMING the x or T part  (the parser does that).
1252 -- So we have to do two characters of lookahead: when we see 'x we need to
1253 -- see if there's a trailing quote
1254 lex_char_tok span _buf _len = do        -- We've seen '
1255    i1 <- getInput       -- Look ahead to first character
1256    let loc = srcSpanStart span
1257    case alexGetChar' i1 of
1258         Nothing -> lit_error  i1
1259
1260         Just ('\'', i2@(AI end2 _)) -> do       -- We've seen ''
1261                   th_exts <- extension thEnabled
1262                   if th_exts then do
1263                         setInput i2
1264                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1265                    else lit_error i1
1266
1267         Just ('\\', i2@(AI _end2 _)) -> do      -- We've seen 'backslash
1268                   setInput i2
1269                   lit_ch <- lex_escape
1270                   i3 <- getInput
1271                   mc <- getCharOrFail i3 -- Trailing quote
1272                   if mc == '\'' then finish_char_tok loc lit_ch
1273                                 else lit_error i3
1274
1275         Just (c, i2@(AI _end2 _))
1276                 | not (isAny c) -> lit_error i1
1277                 | otherwise ->
1278
1279                 -- We've seen 'x, where x is a valid character
1280                 --  (i.e. not newline etc) but not a quote or backslash
1281            case alexGetChar' i2 of      -- Look ahead one more character
1282                 Just ('\'', i3) -> do   -- We've seen 'x'
1283                         setInput i3
1284                         finish_char_tok loc c
1285                 _other -> do            -- We've seen 'x not followed by quote
1286                                         -- (including the possibility of EOF)
1287                                         -- If TH is on, just parse the quote only
1288                         th_exts <- extension thEnabled
1289                         let (AI end _) = i1
1290                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1291                                    else lit_error i2
1292
1293 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1294 finish_char_tok loc ch  -- We've already seen the closing quote
1295                         -- Just need to check for trailing #
1296   = do  magicHash <- extension magicHashEnabled
1297         i@(AI end _) <- getInput
1298         if magicHash then do
1299                 case alexGetChar' i of
1300                         Just ('#',i@(AI end _)) -> do
1301                                 setInput i
1302                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1303                         _other ->
1304                                 return (L (mkSrcSpan loc end) (ITchar ch))
1305             else do
1306                    return (L (mkSrcSpan loc end) (ITchar ch))
1307
1308 isAny :: Char -> Bool
1309 isAny c | c > '\x7f' = isPrint c
1310         | otherwise  = is_any c
1311
1312 lex_escape :: P Char
1313 lex_escape = do
1314   i0 <- getInput
1315   c <- getCharOrFail i0
1316   case c of
1317         'a'   -> return '\a'
1318         'b'   -> return '\b'
1319         'f'   -> return '\f'
1320         'n'   -> return '\n'
1321         'r'   -> return '\r'
1322         't'   -> return '\t'
1323         'v'   -> return '\v'
1324         '\\'  -> return '\\'
1325         '"'   -> return '\"'
1326         '\''  -> return '\''
1327         '^'   -> do i1 <- getInput
1328                     c <- getCharOrFail i1
1329                     if c >= '@' && c <= '_'
1330                         then return (chr (ord c - ord '@'))
1331                         else lit_error i1
1332
1333         'x'   -> readNum is_hexdigit 16 hexDigit
1334         'o'   -> readNum is_octdigit  8 octDecDigit
1335         x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1336
1337         c1 ->  do
1338            i <- getInput
1339            case alexGetChar' i of
1340             Nothing -> lit_error i0
1341             Just (c2,i2) ->
1342               case alexGetChar' i2 of
1343                 Nothing -> do lit_error i0
1344                 Just (c3,i3) ->
1345                    let str = [c1,c2,c3] in
1346                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1347                                      Just rest <- [stripPrefix p str] ] of
1348                           (escape_char,[]):_ -> do
1349                                 setInput i3
1350                                 return escape_char
1351                           (escape_char,_:_):_ -> do
1352                                 setInput i2
1353                                 return escape_char
1354                           [] -> lit_error i0
1355
1356 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1357 readNum is_digit base conv = do
1358   i <- getInput
1359   c <- getCharOrFail i
1360   if is_digit c
1361         then readNum2 is_digit base conv (conv c)
1362         else lit_error i
1363
1364 readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char
1365 readNum2 is_digit base conv i = do
1366   input <- getInput
1367   read i input
1368   where read i input = do
1369           case alexGetChar' input of
1370             Just (c,input') | is_digit c -> do
1371                let i' = i*base + conv c
1372                if i' > 0x10ffff
1373                   then setInput input >> lexError "numeric escape sequence out of range"
1374                   else read i' input'
1375             _other -> do
1376               setInput input; return (chr i)
1377
1378
1379 silly_escape_chars :: [(String, Char)]
1380 silly_escape_chars = [
1381         ("NUL", '\NUL'),
1382         ("SOH", '\SOH'),
1383         ("STX", '\STX'),
1384         ("ETX", '\ETX'),
1385         ("EOT", '\EOT'),
1386         ("ENQ", '\ENQ'),
1387         ("ACK", '\ACK'),
1388         ("BEL", '\BEL'),
1389         ("BS", '\BS'),
1390         ("HT", '\HT'),
1391         ("LF", '\LF'),
1392         ("VT", '\VT'),
1393         ("FF", '\FF'),
1394         ("CR", '\CR'),
1395         ("SO", '\SO'),
1396         ("SI", '\SI'),
1397         ("DLE", '\DLE'),
1398         ("DC1", '\DC1'),
1399         ("DC2", '\DC2'),
1400         ("DC3", '\DC3'),
1401         ("DC4", '\DC4'),
1402         ("NAK", '\NAK'),
1403         ("SYN", '\SYN'),
1404         ("ETB", '\ETB'),
1405         ("CAN", '\CAN'),
1406         ("EM", '\EM'),
1407         ("SUB", '\SUB'),
1408         ("ESC", '\ESC'),
1409         ("FS", '\FS'),
1410         ("GS", '\GS'),
1411         ("RS", '\RS'),
1412         ("US", '\US'),
1413         ("SP", '\SP'),
1414         ("DEL", '\DEL')
1415         ]
1416
1417 -- before calling lit_error, ensure that the current input is pointing to
1418 -- the position of the error in the buffer.  This is so that we can report
1419 -- a correct location to the user, but also so we can detect UTF-8 decoding
1420 -- errors if they occur.
1421 lit_error :: AlexInput -> P a
1422 lit_error i = do setInput i; lexError "lexical error in string/character literal"
1423
1424 getCharOrFail :: AlexInput -> P Char
1425 getCharOrFail i =  do
1426   case alexGetChar' i of
1427         Nothing -> lexError "unexpected end-of-file in string/character literal"
1428         Just (c,i)  -> do setInput i; return c
1429
1430 -- -----------------------------------------------------------------------------
1431 -- QuasiQuote
1432
1433 lex_quasiquote_tok :: Action
1434 lex_quasiquote_tok span buf len = do
1435   let quoter = tail (lexemeToString buf (len - 1))
1436                 -- 'tail' drops the initial '[',
1437                 -- while the -1 drops the trailing '|'
1438   quoteStart <- getSrcLoc
1439   quote <- lex_quasiquote ""
1440   end <- getSrcLoc
1441   return (L (mkSrcSpan (srcSpanStart span) end)
1442            (ITquasiQuote (mkFastString quoter,
1443                           mkFastString (reverse quote),
1444                           mkSrcSpan quoteStart end)))
1445
1446 lex_quasiquote :: String -> P String
1447 lex_quasiquote s = do
1448   i <- getInput
1449   case alexGetChar' i of
1450     Nothing -> lit_error i
1451
1452     Just ('\\',i)
1453         | Just ('|',i) <- next -> do
1454                 setInput i; lex_quasiquote ('|' : s)
1455         | Just (']',i) <- next -> do
1456                 setInput i; lex_quasiquote (']' : s)
1457         where next = alexGetChar' i
1458
1459     Just ('|',i)
1460         | Just (']',i) <- next -> do
1461                 setInput i; return s
1462         where next = alexGetChar' i
1463
1464     Just (c, i) -> do
1465          setInput i; lex_quasiquote (c : s)
1466
1467 -- -----------------------------------------------------------------------------
1468 -- Warnings
1469
1470 warn :: DynFlag -> SDoc -> Action
1471 warn option warning srcspan _buf _len = do
1472     addWarning option srcspan warning
1473     lexToken
1474
1475 warnThen :: DynFlag -> SDoc -> Action -> Action
1476 warnThen option warning action srcspan buf len = do
1477     addWarning option srcspan warning
1478     action srcspan buf len
1479
1480 -- -----------------------------------------------------------------------------
1481 -- The Parse Monad
1482
1483 data LayoutContext
1484   = NoLayout
1485   | Layout !Int
1486   deriving Show
1487
1488 data ParseResult a
1489   = POk PState a
1490   | PFailed
1491         SrcSpan         -- The start and end of the text span related to
1492                         -- the error.  Might be used in environments which can
1493                         -- show this span, e.g. by highlighting it.
1494         Message         -- The error message
1495
1496 data PState = PState {
1497         buffer     :: StringBuffer,
1498         dflags     :: DynFlags,
1499         messages   :: Messages,
1500         last_loc   :: SrcSpan,  -- pos of previous token
1501         last_len   :: !Int,     -- len of previous token
1502         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1503         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1504         context    :: [LayoutContext],
1505         lex_state  :: [Int],
1506         -- Used in the alternative layout rule:
1507         -- These tokens are the next ones to be sent out. They are
1508         -- just blindly emitted, without the rule looking at them again:
1509         alr_pending_implicit_tokens :: [Located Token],
1510         -- This is the next token to be considered or, if it is Nothing,
1511         -- we need to get the next token from the input stream:
1512         alr_next_token :: Maybe (Located Token),
1513         -- This is what we consider to be the locatino of the last token
1514         -- emitted:
1515         alr_last_loc :: SrcSpan,
1516         -- The stack of layout contexts:
1517         alr_context :: [ALRContext],
1518         -- Are we expecting a '{'? If it's Just, then the ALRLayout tells
1519         -- us what sort of layout the '{' will open:
1520         alr_expecting_ocurly :: Maybe ALRLayout,
1521         -- Have we just had the '}' for a let block? If so, than an 'in'
1522         -- token doesn't need to close anything:
1523         alr_justClosedExplicitLetBlock :: Bool
1524      }
1525         -- last_loc and last_len are used when generating error messages,
1526         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1527         -- current token to happyError, we could at least get rid of last_len.
1528         -- Getting rid of last_loc would require finding another way to
1529         -- implement pushCurrentContext (which is only called from one place).
1530
1531 data ALRContext = ALRNoLayout Bool{- does it contain commas? -}
1532                               Bool{- is it a 'let' block? -}
1533                 | ALRLayout ALRLayout Int
1534 data ALRLayout = ALRLayoutLet
1535                | ALRLayoutWhere
1536                | ALRLayoutOf
1537                | ALRLayoutDo
1538
1539 newtype P a = P { unP :: PState -> ParseResult a }
1540
1541 instance Monad P where
1542   return = returnP
1543   (>>=) = thenP
1544   fail = failP
1545
1546 returnP :: a -> P a
1547 returnP a = a `seq` (P $ \s -> POk s a)
1548
1549 thenP :: P a -> (a -> P b) -> P b
1550 (P m) `thenP` k = P $ \ s ->
1551         case m s of
1552                 POk s1 a         -> (unP (k a)) s1
1553                 PFailed span err -> PFailed span err
1554
1555 failP :: String -> P a
1556 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1557
1558 failMsgP :: String -> P a
1559 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1560
1561 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1562 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (mkSrcSpan loc1 loc2) (text str)
1563
1564 failSpanMsgP :: SrcSpan -> SDoc -> P a
1565 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1566
1567 getPState :: P PState
1568 getPState = P $ \s -> POk s s
1569
1570 getDynFlags :: P DynFlags
1571 getDynFlags = P $ \s -> POk s (dflags s)
1572
1573 withThisPackage :: (PackageId -> a) -> P a
1574 withThisPackage f
1575  = do   pkg     <- liftM thisPackage getDynFlags
1576         return  $ f pkg
1577
1578 extension :: (Int -> Bool) -> P Bool
1579 extension p = P $ \s -> POk s (p $! extsBitmap s)
1580
1581 getExts :: P Int
1582 getExts = P $ \s -> POk s (extsBitmap s)
1583
1584 setExts :: (Int -> Int) -> P ()
1585 setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
1586
1587 setSrcLoc :: SrcLoc -> P ()
1588 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1589
1590 getSrcLoc :: P SrcLoc
1591 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1592
1593 setLastToken :: SrcSpan -> Int -> P ()
1594 setLastToken loc len = P $ \s -> POk s {
1595   last_loc=loc,
1596   last_len=len
1597   } ()
1598
1599 data AlexInput = AI SrcLoc StringBuffer
1600
1601 alexInputPrevChar :: AlexInput -> Char
1602 alexInputPrevChar (AI _ buf) = prevChar buf '\n'
1603
1604 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1605 alexGetChar (AI loc s)
1606   | atEnd s   = Nothing
1607   | otherwise = adj_c `seq` loc' `seq` s' `seq`
1608                 --trace (show (ord c)) $
1609                 Just (adj_c, (AI loc' s'))
1610   where (c,s') = nextChar s
1611         loc'   = advanceSrcLoc loc c
1612
1613         non_graphic     = '\x0'
1614         upper           = '\x1'
1615         lower           = '\x2'
1616         digit           = '\x3'
1617         symbol          = '\x4'
1618         space           = '\x5'
1619         other_graphic   = '\x6'
1620
1621         adj_c
1622           | c <= '\x06' = non_graphic
1623           | c <= '\x7f' = c
1624           -- Alex doesn't handle Unicode, so when Unicode
1625           -- character is encountered we output these values
1626           -- with the actual character value hidden in the state.
1627           | otherwise =
1628                 case generalCategory c of
1629                   UppercaseLetter       -> upper
1630                   LowercaseLetter       -> lower
1631                   TitlecaseLetter       -> upper
1632                   ModifierLetter        -> other_graphic
1633                   OtherLetter           -> lower -- see #1103
1634                   NonSpacingMark        -> other_graphic
1635                   SpacingCombiningMark  -> other_graphic
1636                   EnclosingMark         -> other_graphic
1637                   DecimalNumber         -> digit
1638                   LetterNumber          -> other_graphic
1639                   OtherNumber           -> digit -- see #4373
1640                   ConnectorPunctuation  -> symbol
1641                   DashPunctuation       -> symbol
1642                   OpenPunctuation       -> other_graphic
1643                   ClosePunctuation      -> other_graphic
1644                   InitialQuote          -> other_graphic
1645                   FinalQuote            -> other_graphic
1646                   OtherPunctuation      -> symbol
1647                   MathSymbol            -> symbol
1648                   CurrencySymbol        -> symbol
1649                   ModifierSymbol        -> symbol
1650                   OtherSymbol           -> symbol
1651                   Space                 -> space
1652                   _other                -> non_graphic
1653
1654 -- This version does not squash unicode characters, it is used when
1655 -- lexing strings.
1656 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1657 alexGetChar' (AI loc s)
1658   | atEnd s   = Nothing
1659   | otherwise = c `seq` loc' `seq` s' `seq`
1660                 --trace (show (ord c)) $
1661                 Just (c, (AI loc' s'))
1662   where (c,s') = nextChar s
1663         loc'   = advanceSrcLoc loc c
1664
1665 getInput :: P AlexInput
1666 getInput = P $ \s@PState{ loc=l, buffer=b } -> POk s (AI l b)
1667
1668 setInput :: AlexInput -> P ()
1669 setInput (AI l b) = P $ \s -> POk s{ loc=l, buffer=b } ()
1670
1671 nextIsEOF :: P Bool
1672 nextIsEOF = do
1673   AI _ s <- getInput
1674   return $ atEnd s
1675
1676 pushLexState :: Int -> P ()
1677 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1678
1679 popLexState :: P Int
1680 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1681
1682 getLexState :: P Int
1683 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1684
1685 popNextToken :: P (Maybe (Located Token))
1686 popNextToken
1687     = P $ \s@PState{ alr_next_token = m } ->
1688               POk (s {alr_next_token = Nothing}) m
1689
1690 activeContext :: P Bool
1691 activeContext = do
1692   ctxt <- getALRContext
1693   expc <- getAlrExpectingOCurly
1694   impt <- implicitTokenPending
1695   case (ctxt,expc) of
1696     ([],Nothing) -> return impt
1697     _other       -> return True
1698
1699 setAlrLastLoc :: SrcSpan -> P ()
1700 setAlrLastLoc l = P $ \s -> POk (s {alr_last_loc = l}) ()
1701
1702 getAlrLastLoc :: P SrcSpan
1703 getAlrLastLoc = P $ \s@(PState {alr_last_loc = l}) -> POk s l
1704
1705 getALRContext :: P [ALRContext]
1706 getALRContext = P $ \s@(PState {alr_context = cs}) -> POk s cs
1707
1708 setALRContext :: [ALRContext] -> P ()
1709 setALRContext cs = P $ \s -> POk (s {alr_context = cs}) ()
1710
1711 getJustClosedExplicitLetBlock :: P Bool
1712 getJustClosedExplicitLetBlock
1713  = P $ \s@(PState {alr_justClosedExplicitLetBlock = b}) -> POk s b
1714
1715 setJustClosedExplicitLetBlock :: Bool -> P ()
1716 setJustClosedExplicitLetBlock b
1717  = P $ \s -> POk (s {alr_justClosedExplicitLetBlock = b}) ()
1718
1719 setNextToken :: Located Token -> P ()
1720 setNextToken t = P $ \s -> POk (s {alr_next_token = Just t}) ()
1721
1722 implicitTokenPending :: P Bool
1723 implicitTokenPending
1724     = P $ \s@PState{ alr_pending_implicit_tokens = ts } ->
1725               case ts of
1726               [] -> POk s False
1727               _  -> POk s True
1728
1729 popPendingImplicitToken :: P (Maybe (Located Token))
1730 popPendingImplicitToken
1731     = P $ \s@PState{ alr_pending_implicit_tokens = ts } ->
1732               case ts of
1733               [] -> POk s Nothing
1734               (t : ts') -> POk (s {alr_pending_implicit_tokens = ts'}) (Just t)
1735
1736 setPendingImplicitTokens :: [Located Token] -> P ()
1737 setPendingImplicitTokens ts = P $ \s -> POk (s {alr_pending_implicit_tokens = ts}) ()
1738
1739 getAlrExpectingOCurly :: P (Maybe ALRLayout)
1740 getAlrExpectingOCurly = P $ \s@(PState {alr_expecting_ocurly = b}) -> POk s b
1741
1742 setAlrExpectingOCurly :: Maybe ALRLayout -> P ()
1743 setAlrExpectingOCurly b = P $ \s -> POk (s {alr_expecting_ocurly = b}) ()
1744
1745 -- for reasons of efficiency, flags indicating language extensions (eg,
1746 -- -fglasgow-exts or -XParallelArrays) are represented by a bitmap stored in an unboxed
1747 -- integer
1748
1749 -- The "genericsBit" is now unused, available for others
1750 -- genericsBit :: Int
1751 -- genericsBit = 0 -- {|, |} and "generic"
1752
1753 ffiBit :: Int
1754 ffiBit     = 1
1755 parrBit :: Int
1756 parrBit    = 2
1757 arrowsBit :: Int
1758 arrowsBit  = 4
1759 thBit :: Int
1760 thBit      = 5
1761 ipBit :: Int
1762 ipBit      = 6
1763 explicitForallBit :: Int
1764 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1765 bangPatBit :: Int
1766 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1767                 -- (doesn't affect the lexer)
1768 tyFamBit :: Int
1769 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1770 haddockBit :: Int
1771 haddockBit = 10 -- Lex and parse Haddock comments
1772 magicHashBit :: Int
1773 magicHashBit = 11 -- "#" in both functions and operators
1774 kindSigsBit :: Int
1775 kindSigsBit = 12 -- Kind signatures on type variables
1776 recursiveDoBit :: Int
1777 recursiveDoBit = 13 -- mdo
1778 unicodeSyntaxBit :: Int
1779 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1780 unboxedTuplesBit :: Int
1781 unboxedTuplesBit = 15 -- (# and #)
1782 datatypeContextsBit :: Int
1783 datatypeContextsBit = 16
1784 transformComprehensionsBit :: Int
1785 transformComprehensionsBit = 17
1786 qqBit :: Int
1787 qqBit      = 18 -- enable quasiquoting
1788 inRulePragBit :: Int
1789 inRulePragBit = 19
1790 rawTokenStreamBit :: Int
1791 rawTokenStreamBit = 20 -- producing a token stream with all comments included
1792 recBit :: Int
1793 recBit = 22 -- rec
1794 alternativeLayoutRuleBit :: Int
1795 alternativeLayoutRuleBit = 23
1796 relaxedLayoutBit :: Int
1797 relaxedLayoutBit = 24
1798 nondecreasingIndentationBit :: Int
1799 nondecreasingIndentationBit = 25
1800
1801 always :: Int -> Bool
1802 always           _     = True
1803 parrEnabled :: Int -> Bool
1804 parrEnabled      flags = testBit flags parrBit
1805 arrowsEnabled :: Int -> Bool
1806 arrowsEnabled    flags = testBit flags arrowsBit
1807 thEnabled :: Int -> Bool
1808 thEnabled        flags = testBit flags thBit
1809 ipEnabled :: Int -> Bool
1810 ipEnabled        flags = testBit flags ipBit
1811 explicitForallEnabled :: Int -> Bool
1812 explicitForallEnabled flags = testBit flags explicitForallBit
1813 bangPatEnabled :: Int -> Bool
1814 bangPatEnabled   flags = testBit flags bangPatBit
1815 -- tyFamEnabled :: Int -> Bool
1816 -- tyFamEnabled     flags = testBit flags tyFamBit
1817 haddockEnabled :: Int -> Bool
1818 haddockEnabled   flags = testBit flags haddockBit
1819 magicHashEnabled :: Int -> Bool
1820 magicHashEnabled flags = testBit flags magicHashBit
1821 -- kindSigsEnabled :: Int -> Bool
1822 -- kindSigsEnabled  flags = testBit flags kindSigsBit
1823 unicodeSyntaxEnabled :: Int -> Bool
1824 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1825 unboxedTuplesEnabled :: Int -> Bool
1826 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1827 datatypeContextsEnabled :: Int -> Bool
1828 datatypeContextsEnabled flags = testBit flags datatypeContextsBit
1829 qqEnabled :: Int -> Bool
1830 qqEnabled        flags = testBit flags qqBit
1831 -- inRulePrag :: Int -> Bool
1832 -- inRulePrag       flags = testBit flags inRulePragBit
1833 rawTokenStreamEnabled :: Int -> Bool
1834 rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
1835 alternativeLayoutRule :: Int -> Bool
1836 alternativeLayoutRule flags = testBit flags alternativeLayoutRuleBit
1837 relaxedLayout :: Int -> Bool
1838 relaxedLayout flags = testBit flags relaxedLayoutBit
1839 nondecreasingIndentation :: Int -> Bool
1840 nondecreasingIndentation flags = testBit flags nondecreasingIndentationBit
1841
1842 -- PState for parsing options pragmas
1843 --
1844 pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1845 pragState dynflags buf loc = (mkPState dynflags buf loc) {
1846                                  lex_state = [bol, option_prags, 0]
1847                              }
1848
1849 -- create a parse state
1850 --
1851 mkPState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1852 mkPState flags buf loc =
1853   PState {
1854       buffer        = buf,
1855       dflags        = flags,
1856       messages      = emptyMessages,
1857       last_loc      = mkSrcSpan loc loc,
1858       last_len      = 0,
1859       loc           = loc,
1860       extsBitmap    = fromIntegral bitmap,
1861       context       = [],
1862       lex_state     = [bol, 0],
1863       alr_pending_implicit_tokens = [],
1864       alr_next_token = Nothing,
1865       alr_last_loc = noSrcSpan,
1866       alr_context = [],
1867       alr_expecting_ocurly = Nothing,
1868       alr_justClosedExplicitLetBlock = False
1869     }
1870     where
1871       bitmap =     ffiBit            `setBitIf` xopt Opt_ForeignFunctionInterface flags
1872                .|. parrBit           `setBitIf` xopt Opt_ParallelArrays  flags
1873                .|. arrowsBit         `setBitIf` xopt Opt_Arrows          flags
1874                .|. thBit             `setBitIf` xopt Opt_TemplateHaskell flags
1875                .|. qqBit             `setBitIf` xopt Opt_QuasiQuotes     flags
1876                .|. ipBit             `setBitIf` xopt Opt_ImplicitParams  flags
1877                .|. explicitForallBit `setBitIf` xopt Opt_ExplicitForAll  flags
1878                .|. bangPatBit        `setBitIf` xopt Opt_BangPatterns    flags
1879                .|. tyFamBit          `setBitIf` xopt Opt_TypeFamilies    flags
1880                .|. haddockBit        `setBitIf` dopt Opt_Haddock         flags
1881                .|. magicHashBit      `setBitIf` xopt Opt_MagicHash       flags
1882                .|. kindSigsBit       `setBitIf` xopt Opt_KindSignatures  flags
1883                .|. recursiveDoBit    `setBitIf` xopt Opt_RecursiveDo     flags
1884                .|. recBit            `setBitIf` xopt Opt_DoRec           flags
1885                .|. recBit            `setBitIf` xopt Opt_Arrows          flags
1886                .|. unicodeSyntaxBit  `setBitIf` xopt Opt_UnicodeSyntax   flags
1887                .|. unboxedTuplesBit  `setBitIf` xopt Opt_UnboxedTuples   flags
1888                .|. datatypeContextsBit `setBitIf` xopt Opt_DatatypeContexts flags
1889                .|. transformComprehensionsBit `setBitIf` xopt Opt_TransformListComp flags
1890                .|. transformComprehensionsBit `setBitIf` xopt Opt_MonadComprehensions flags
1891                .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
1892                .|. alternativeLayoutRuleBit `setBitIf` xopt Opt_AlternativeLayoutRule flags
1893                .|. relaxedLayoutBit  `setBitIf` xopt Opt_RelaxedLayout flags
1894                .|. nondecreasingIndentationBit `setBitIf` xopt Opt_NondecreasingIndentation flags
1895       --
1896       setBitIf :: Int -> Bool -> Int
1897       b `setBitIf` cond | cond      = bit b
1898                         | otherwise = 0
1899
1900 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1901 addWarning option srcspan warning
1902  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1903        let warning' = mkWarnMsg srcspan alwaysQualify warning
1904            ws' = if dopt option d then ws `snocBag` warning' else ws
1905        in POk s{messages=(ws', es)} ()
1906
1907 getMessages :: PState -> Messages
1908 getMessages PState{messages=ms} = ms
1909
1910 getContext :: P [LayoutContext]
1911 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1912
1913 setContext :: [LayoutContext] -> P ()
1914 setContext ctx = P $ \s -> POk s{context=ctx} ()
1915
1916 popContext :: P ()
1917 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1918                               last_len = len, last_loc = last_loc }) ->
1919   case ctx of
1920         (_:tl) -> POk s{ context = tl } ()
1921         []     -> PFailed last_loc (srcParseErr buf len)
1922
1923 -- Push a new layout context at the indentation of the last token read.
1924 -- This is only used at the outer level of a module when the 'module'
1925 -- keyword is missing.
1926 pushCurrentContext :: P ()
1927 pushCurrentContext = P $ \ s@PState{ last_loc=loc, context=ctx } ->
1928     POk s{context = Layout (srcSpanStartCol loc) : ctx} ()
1929
1930 getOffside :: P Ordering
1931 getOffside = P $ \s@PState{last_loc=loc, context=stk} ->
1932                 let offs = srcSpanStartCol loc in
1933                 let ord = case stk of
1934                         (Layout n:_) -> --trace ("layout: " ++ show n ++ ", offs: " ++ show offs) $
1935                                         compare offs n
1936                         _            -> GT
1937                 in POk s ord
1938
1939 -- ---------------------------------------------------------------------------
1940 -- Construct a parse error
1941
1942 srcParseErr
1943   :: StringBuffer       -- current buffer (placed just after the last token)
1944   -> Int                -- length of the previous token
1945   -> Message
1946 srcParseErr buf len
1947   = hcat [ if null token
1948              then ptext (sLit "parse error (possibly incorrect indentation)")
1949              else hcat [ptext (sLit "parse error on input "),
1950                         char '`', text token, char '\'']
1951     ]
1952   where token = lexemeToString (offsetBytes (-len) buf) len
1953
1954 -- Report a parse failure, giving the span of the previous token as
1955 -- the location of the error.  This is the entry point for errors
1956 -- detected during parsing.
1957 srcParseFail :: P a
1958 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1959                             last_loc = last_loc } ->
1960     PFailed last_loc (srcParseErr buf len)
1961
1962 -- A lexical error is reported at a particular position in the source file,
1963 -- not over a token range.
1964 lexError :: String -> P a
1965 lexError str = do
1966   loc <- getSrcLoc
1967   (AI end buf) <- getInput
1968   reportLexError loc end buf str
1969
1970 -- -----------------------------------------------------------------------------
1971 -- This is the top-level function: called from the parser each time a
1972 -- new token is to be read from the input.
1973
1974 lexer :: (Located Token -> P a) -> P a
1975 lexer cont = do
1976   alr <- extension alternativeLayoutRule
1977   let lexTokenFun = if alr then lexTokenAlr else lexToken
1978   tok@(L _span _tok__) <- lexTokenFun
1979   --trace ("token: " ++ show _tok__) $ do
1980   cont tok
1981
1982 lexTokenAlr :: P (Located Token)
1983 lexTokenAlr = do mPending <- popPendingImplicitToken
1984                  t <- case mPending of
1985                       Nothing ->
1986                           do mNext <- popNextToken
1987                              t <- case mNext of
1988                                   Nothing -> lexToken
1989                                   Just next -> return next
1990                              alternativeLayoutRuleToken t
1991                       Just t ->
1992                           return t
1993                  setAlrLastLoc (getLoc t)
1994                  case unLoc t of
1995                      ITwhere -> setAlrExpectingOCurly (Just ALRLayoutWhere)
1996                      ITlet   -> setAlrExpectingOCurly (Just ALRLayoutLet)
1997                      ITof    -> setAlrExpectingOCurly (Just ALRLayoutOf)
1998                      ITdo    -> setAlrExpectingOCurly (Just ALRLayoutDo)
1999                      ITmdo   -> setAlrExpectingOCurly (Just ALRLayoutDo)
2000                      ITrec   -> setAlrExpectingOCurly (Just ALRLayoutDo)
2001                      _       -> return ()
2002                  return t
2003
2004 alternativeLayoutRuleToken :: Located Token -> P (Located Token)
2005 alternativeLayoutRuleToken t
2006     = do context <- getALRContext
2007          lastLoc <- getAlrLastLoc
2008          mExpectingOCurly <- getAlrExpectingOCurly
2009          justClosedExplicitLetBlock <- getJustClosedExplicitLetBlock
2010          setJustClosedExplicitLetBlock False
2011          dflags <- getDynFlags
2012          let transitional = xopt Opt_AlternativeLayoutRuleTransitional dflags
2013              thisLoc = getLoc t
2014              thisCol = srcSpanStartCol thisLoc
2015              newLine = (lastLoc == noSrcSpan)
2016                     || (srcSpanStartLine thisLoc > srcSpanEndLine lastLoc)
2017          case (unLoc t, context, mExpectingOCurly) of
2018              -- This case handles a GHC extension to the original H98
2019              -- layout rule...
2020              (ITocurly, _, Just alrLayout) ->
2021                  do setAlrExpectingOCurly Nothing
2022                     let isLet = case alrLayout of
2023                                 ALRLayoutLet -> True
2024                                 _ -> False
2025                     setALRContext (ALRNoLayout (containsCommas ITocurly) isLet : context)
2026                     return t
2027              -- ...and makes this case unnecessary
2028              {-
2029              -- I think our implicit open-curly handling is slightly
2030              -- different to John's, in how it interacts with newlines
2031              -- and "in"
2032              (ITocurly, _, Just _) ->
2033                  do setAlrExpectingOCurly Nothing
2034                     setNextToken t
2035                     lexTokenAlr
2036              -}
2037              (_, ALRLayout _ col : ls, Just expectingOCurly)
2038               | (thisCol > col) ||
2039                 (thisCol == col &&
2040                  isNonDecreasingIntentation expectingOCurly) ->
2041                  do setAlrExpectingOCurly Nothing
2042                     setALRContext (ALRLayout expectingOCurly thisCol : context)
2043                     setNextToken t
2044                     return (L thisLoc ITocurly)
2045               | otherwise ->
2046                  do setAlrExpectingOCurly Nothing
2047                     setPendingImplicitTokens [L lastLoc ITccurly]
2048                     setNextToken t
2049                     return (L lastLoc ITocurly)
2050              (_, _, Just expectingOCurly) ->
2051                  do setAlrExpectingOCurly Nothing
2052                     setALRContext (ALRLayout expectingOCurly thisCol : context)
2053                     setNextToken t
2054                     return (L thisLoc ITocurly)
2055              -- We do the [] cases earlier than in the spec, as we
2056              -- have an actual EOF token
2057              (ITeof, ALRLayout _ _ : ls, _) ->
2058                  do setALRContext ls
2059                     setNextToken t
2060                     return (L thisLoc ITccurly)
2061              (ITeof, _, _) ->
2062                  return t
2063              -- the other ITeof case omitted; general case below covers it
2064              (ITin, _, _)
2065               | justClosedExplicitLetBlock ->
2066                  return t
2067              (ITin, ALRLayout ALRLayoutLet _ : ls, _)
2068               | newLine ->
2069                  do setPendingImplicitTokens [t]
2070                     setALRContext ls
2071                     return (L thisLoc ITccurly)
2072              -- This next case is to handle a transitional issue:
2073              (ITwhere, ALRLayout _ col : ls, _)
2074               | newLine && thisCol == col && transitional ->
2075                  do addWarning Opt_WarnAlternativeLayoutRuleTransitional
2076                                thisLoc
2077                                (transitionalAlternativeLayoutWarning
2078                                     "`where' clause at the same depth as implicit layout block")
2079                     setALRContext ls
2080                     setNextToken t
2081                     -- Note that we use lastLoc, as we may need to close
2082                     -- more layouts, or give a semicolon
2083                     return (L lastLoc ITccurly)
2084              -- This next case is to handle a transitional issue:
2085              (ITvbar, ALRLayout _ col : ls, _)
2086               | newLine && thisCol == col && transitional ->
2087                  do addWarning Opt_WarnAlternativeLayoutRuleTransitional
2088                                thisLoc
2089                                (transitionalAlternativeLayoutWarning
2090                                     "`|' at the same depth as implicit layout block")
2091                     setALRContext ls
2092                     setNextToken t
2093                     -- Note that we use lastLoc, as we may need to close
2094                     -- more layouts, or give a semicolon
2095                     return (L lastLoc ITccurly)
2096              (_, ALRLayout _ col : ls, _)
2097               | newLine && thisCol == col ->
2098                  do setNextToken t
2099                     return (L thisLoc ITsemi)
2100               | newLine && thisCol < col ->
2101                  do setALRContext ls
2102                     setNextToken t
2103                     -- Note that we use lastLoc, as we may need to close
2104                     -- more layouts, or give a semicolon
2105                     return (L lastLoc ITccurly)
2106              -- We need to handle close before open, as 'then' is both
2107              -- an open and a close
2108              (u, _, _)
2109               | isALRclose u ->
2110                  case context of
2111                  ALRLayout _ _ : ls ->
2112                      do setALRContext ls
2113                         setNextToken t
2114                         return (L thisLoc ITccurly)
2115                  ALRNoLayout _ isLet : ls ->
2116                      do let ls' = if isALRopen u
2117                                      then ALRNoLayout (containsCommas u) False : ls
2118                                      else ls
2119                         setALRContext ls'
2120                         when isLet $ setJustClosedExplicitLetBlock True
2121                         return t
2122                  [] ->
2123                      do let ls = if isALRopen u
2124                                     then [ALRNoLayout (containsCommas u) False]
2125                                     else ls
2126                         setALRContext ls
2127                         -- XXX This is an error in John's code, but
2128                         -- it looks reachable to me at first glance
2129                         return t
2130              (u, _, _)
2131               | isALRopen u ->
2132                  do setALRContext (ALRNoLayout (containsCommas u) False : context)
2133                     return t
2134              (ITin, ALRLayout ALRLayoutLet _ : ls, _) ->
2135                  do setALRContext ls
2136                     setPendingImplicitTokens [t]
2137                     return (L thisLoc ITccurly)
2138              (ITin, ALRLayout _ _ : ls, _) ->
2139                  do setALRContext ls
2140                     setNextToken t
2141                     return (L thisLoc ITccurly)
2142              -- the other ITin case omitted; general case below covers it
2143              (ITcomma, ALRLayout _ _ : ls, _)
2144               | topNoLayoutContainsCommas ls ->
2145                  do setALRContext ls
2146                     setNextToken t
2147                     return (L thisLoc ITccurly)
2148              (ITwhere, ALRLayout ALRLayoutDo _ : ls, _) ->
2149                  do setALRContext ls
2150                     setPendingImplicitTokens [t]
2151                     return (L thisLoc ITccurly)
2152              -- the other ITwhere case omitted; general case below covers it
2153              (_, _, _) -> return t
2154
2155 transitionalAlternativeLayoutWarning :: String -> SDoc
2156 transitionalAlternativeLayoutWarning msg
2157     = text "transitional layout will not be accepted in the future:"
2158    $$ text msg
2159
2160 isALRopen :: Token -> Bool
2161 isALRopen ITcase        = True
2162 isALRopen ITif          = True
2163 isALRopen ITthen        = True
2164 isALRopen IToparen      = True
2165 isALRopen ITobrack      = True
2166 isALRopen ITocurly      = True
2167 -- GHC Extensions:
2168 isALRopen IToubxparen   = True
2169 isALRopen ITparenEscape = True
2170 isALRopen _             = False
2171
2172 isALRclose :: Token -> Bool
2173 isALRclose ITof     = True
2174 isALRclose ITthen   = True
2175 isALRclose ITelse   = True
2176 isALRclose ITcparen = True
2177 isALRclose ITcbrack = True
2178 isALRclose ITccurly = True
2179 -- GHC Extensions:
2180 isALRclose ITcubxparen = True
2181 isALRclose _        = False
2182
2183 isNonDecreasingIntentation :: ALRLayout -> Bool
2184 isNonDecreasingIntentation ALRLayoutDo = True
2185 isNonDecreasingIntentation _           = False
2186
2187 containsCommas :: Token -> Bool
2188 containsCommas IToparen = True
2189 containsCommas ITobrack = True
2190 -- John doesn't have {} as containing commas, but records contain them,
2191 -- which caused a problem parsing Cabal's Distribution.Simple.InstallDirs
2192 -- (defaultInstallDirs).
2193 containsCommas ITocurly = True
2194 -- GHC Extensions:
2195 containsCommas IToubxparen = True
2196 containsCommas _        = False
2197
2198 topNoLayoutContainsCommas :: [ALRContext] -> Bool
2199 topNoLayoutContainsCommas [] = False
2200 topNoLayoutContainsCommas (ALRLayout _ _ : ls) = topNoLayoutContainsCommas ls
2201 topNoLayoutContainsCommas (ALRNoLayout b _ : _) = b
2202
2203 lexToken :: P (Located Token)
2204 lexToken = do
2205   inp@(AI loc1 buf) <- getInput
2206   sc <- getLexState
2207   exts <- getExts
2208   case alexScanUser exts inp sc of
2209     AlexEOF -> do
2210         let span = mkSrcSpan loc1 loc1
2211         setLastToken span 0
2212         return (L span ITeof)
2213     AlexError (AI loc2 buf) ->
2214         reportLexError loc1 loc2 buf "lexical error"
2215     AlexSkip inp2 _ -> do
2216         setInput inp2
2217         lexToken
2218     AlexToken inp2@(AI end buf2) _ t -> do
2219         setInput inp2
2220         let span = mkSrcSpan loc1 end
2221         let bytes = byteDiff buf buf2
2222         span `seq` setLastToken span bytes
2223         t span buf bytes
2224
2225 reportLexError :: SrcLoc -> SrcLoc -> StringBuffer -> [Char] -> P a
2226 reportLexError loc1 loc2 buf str
2227   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
2228   | otherwise =
2229   let
2230         c = fst (nextChar buf)
2231   in
2232   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
2233     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
2234     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
2235
2236 lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token]
2237 lexTokenStream buf loc dflags = unP go initState
2238     where dflags' = dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream
2239           initState = mkPState dflags' buf loc
2240           go = do
2241             ltok <- lexer return
2242             case ltok of
2243               L _ ITeof -> return []
2244               _ -> liftM (ltok:) go
2245
2246 linePrags = Map.singleton "line" (begin line_prag2)
2247
2248 fileHeaderPrags = Map.fromList([("options", lex_string_prag IToptions_prag),
2249                                  ("options_ghc", lex_string_prag IToptions_prag),
2250                                  ("options_haddock", lex_string_prag ITdocOptions),
2251                                  ("language", token ITlanguage_prag),
2252                                  ("include", lex_string_prag ITinclude_prag)])
2253
2254 ignoredPrags = Map.fromList (map ignored pragmas)
2255                where ignored opt = (opt, nested_comment lexToken)
2256                      impls = ["hugs", "nhc98", "jhc", "yhc", "catch", "derive"]
2257                      options_pragmas = map ("options_" ++) impls
2258                      -- CFILES is a hugs-only thing.
2259                      pragmas = options_pragmas ++ ["cfiles", "contract"]
2260
2261 oneWordPrags = Map.fromList([("rules", rulePrag),
2262                            ("inline", token (ITinline_prag Inline FunLike)),
2263                            ("inlinable", token (ITinline_prag Inlinable FunLike)),
2264                            ("inlineable", token (ITinline_prag Inlinable FunLike)),
2265                                           -- Spelling variant
2266                            ("notinline", token (ITinline_prag NoInline FunLike)),
2267                            ("specialize", token ITspec_prag),
2268                            ("source", token ITsource_prag),
2269                            ("warning", token ITwarning_prag),
2270                            ("deprecated", token ITdeprecated_prag),
2271                            ("scc", token ITscc_prag),
2272                            ("generated", token ITgenerated_prag),
2273                            ("core", token ITcore_prag),
2274                            ("unpack", token ITunpack_prag),
2275                            ("ann", token ITann_prag),
2276                            ("vectorize", token ITvect_prag)])
2277
2278 twoWordPrags = Map.fromList([("inline conlike", token (ITinline_prag Inline ConLike)),
2279                              ("notinline conlike", token (ITinline_prag NoInline ConLike)),
2280                              ("specialize inline", token (ITspec_inline_prag True)),
2281                              ("specialize notinline", token (ITspec_inline_prag False)),
2282                              ("vectorize scalar", token ITvect_scalar_prag)])
2283
2284 dispatch_pragmas :: Map String Action -> Action
2285 dispatch_pragmas prags span buf len = case Map.lookup (clean_pragma (lexemeToString buf len)) prags of
2286                                        Just found -> found span buf len
2287                                        Nothing -> lexError "unknown pragma"
2288
2289 known_pragma :: Map String Action -> AlexAccPred Int
2290 known_pragma prags _ _ len (AI _ buf) = (isJust $ Map.lookup (clean_pragma (lexemeToString (offsetBytes (- len) buf) len)) prags)
2291                                           && (nextCharIs buf (\c -> not (isAlphaNum c || c == '_')))
2292
2293 clean_pragma :: String -> String
2294 clean_pragma prag = canon_ws (map toLower (unprefix prag))
2295                     where unprefix prag' = case stripPrefix "{-#" prag' of
2296                                              Just rest -> rest
2297                                              Nothing -> prag'
2298                           canonical prag' = case prag' of
2299                                               "noinline" -> "notinline"
2300                                               "specialise" -> "specialize"
2301                                               "vectorise" -> "vectorize"
2302                                               "constructorlike" -> "conlike"
2303                                               _ -> prag'
2304                           canon_ws s = unwords (map canonical (words s))
2305 }