compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - parsing integers is a bit slow
  16 --    - readRational is a bit slow
  17 --
  18 --   Known bugs, that were also in the previous version:
  19 --    - M... should be 3 tokens, not 1.
  20 --    - pragma-end should be only valid in a pragma
  21
  22 --   qualified operator NOTES.
  23 --
  24 --   - If M.(+) is a single lexeme, then..
  25 --     - Probably (+) should be a single lexeme too, for consistency.
  26 --       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
  27 --     - But we have to rule out reserved operators, otherwise (..) becomes
  28 --       a different lexeme.
  29 --     - Should we therefore also rule out reserved operators in the qualified
  30 --       form?  This is quite difficult to achieve.  We don't do it for
  31 --       qualified varids.
  32
  33 {
  34 -- XXX The above flags turn off warnings in the generated code:
  35 {-# LANGUAGE BangPatterns #-}
  36 {-# OPTIONS_GHC -fno-warn-unused-matches #-}
  37 {-# OPTIONS_GHC -fno-warn-unused-binds #-}
  38 {-# OPTIONS_GHC -fno-warn-unused-imports #-}
  39 {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
  40 -- But alex still generates some code that causes the "lazy unlifted bindings"
  41 -- warning, and old compilers don't know about it so we can't easily turn
  42 -- it off, so for now we use the sledge hammer:
  43 {-# OPTIONS_GHC -w #-}
  44
  45 {-# OPTIONS_GHC -funbox-strict-fields #-}
  46
  47 module Lexer (
  48    Token(..), lexer, pragState, mkPState, PState(..),
  49    P(..), ParseResult(..), getSrcLoc,
  50    getPState, getDynFlags, withThisPackage,
  51    failLocMsgP, failSpanMsgP, srcParseFail,
  52    getMessages,
  53    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  54    activeContext, nextIsEOF,
  55    getLexState, popLexState, pushLexState,
  56    extension, bangPatEnabled, datatypeContextsEnabled,
  57    addWarning,
  58    lexTokenStream
  59   ) where
  60
  61 import Bag
  62 import ErrUtils
  63 import Outputable
  64 import StringBuffer
  65 import FastString
  66 import SrcLoc
  67 import UniqFM
  68 import DynFlags
  69 import Module
  70 import Ctype
  71 import BasicTypes       ( InlineSpec(..), RuleMatchInfo(..) )
  72 import Util             ( readRational )
  73
  74 import Control.Monad
  75 import Data.Bits
  76 import Data.Char
  77 import Data.List
  78 import Data.Maybe
  79 import Data.Map (Map)
  80 import qualified Data.Map as Map
  81 import Data.Ratio
  82 }
  83
  84 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  85 $whitechar   = [\ \n\r\f\v $unispace]
  86 $white_no_nl = $whitechar # \n
  87 $tab         = \t
  88
  89 $ascdigit  = 0-9
  90 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  91 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  92 $digit     = [$ascdigit $unidigit]
  93
  94 $special   = [\(\)\,\;\[\]\`\{\}]
  95 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  96 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  97 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  98
  99 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
 100 $asclarge  = [A-Z]
 101 $large     = [$asclarge $unilarge]
 102
 103 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
 104 $ascsmall  = [a-z]
 105 $small     = [$ascsmall $unismall \_]
 106
 107 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
 108 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
 109
 110 $octit     = 0-7
 111 $hexit     = [$decdigit A-F a-f]
 112 $symchar   = [$symbol \:]
 113 $nl        = [\n\r]
 114 $idchar    = [$small $large $digit \']
 115
 116 $pragmachar = [$small $large $digit]
 117
 118 $docsym    = [\| \^ \* \$]
 119
 120 @varid     = $small $idchar*
 121 @conid     = $large $idchar*
 122
 123 @varsym    = $symbol $symchar*
 124 @consym    = \: $symchar*
 125
 126 @decimal     = $decdigit+
 127 @octal       = $octit+
 128 @hexadecimal = $hexit+
 129 @exponent    = [eE] [\-\+]? @decimal
 130
 131 -- we support the hierarchical module name extension:
 132 @qual = (@conid \.)+
 133
 134 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 135
 136 -- normal signed numerical literals can only be explicitly negative,
 137 -- not explicitly positive (contrast @exponent)
 138 @negative = \-
 139 @signed = @negative ?
 140
 141 haskell :-
 142
 143 -- everywhere: skip whitespace and comments
 144 $white_no_nl+                           ;
 145 $tab+         { warn Opt_WarnTabs (text "Warning: Tab character") }
 146
 147 -- Everywhere: deal with nested comments.  We explicitly rule out
 148 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 149 -- (this can happen even though pragmas will normally take precedence due to
 150 -- longest-match, because pragmas aren't valid in every state, but comments
 151 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 152 -- set.
 153
 154 "{-" / { isNormalComment } { nested_comment lexToken }
 155
 156 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 157 -- more dashes followed by a symbol should be parsed as a varsym, so we
 158 -- have to exclude those.
 159
 160 -- Since Haddock comments aren't valid in every state, we need to rule them
 161 -- out here.
 162
 163 -- The following two rules match comments that begin with two dashes, but
 164 -- continue with a different character. The rules test that this character
 165 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 166 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 167 -- have a Haddock comment). The rules then munch the rest of the line.
 168
 169 "-- " ~[$docsym \#] .* { lineCommentToken }
 170 "--" [^$symbol : \ ] .* { lineCommentToken }
 171
 172 -- Next, match Haddock comments if no -haddock flag
 173
 174 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
 175
 176 -- Now, when we've matched comments that begin with 2 dashes and continue
 177 -- with a different character, we need to match comments that begin with three
 178 -- or more dashes (which clearly can't be Haddock comments). We only need to
 179 -- make sure that the first non-dash character isn't a symbol, and munch the
 180 -- rest of the line.
 181
 182 "---"\-* [^$symbol :] .* { lineCommentToken }
 183
 184 -- Since the previous rules all match dashes followed by at least one
 185 -- character, we also need to match a whole line filled with just dashes.
 186
 187 "--"\-* / { atEOL } { lineCommentToken }
 188
 189 -- We need this rule since none of the other single line comment rules
 190 -- actually match this case.
 191
 192 "-- " / { atEOL } { lineCommentToken }
 193
 194 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 195 -- blank lines) until we find a non-whitespace character, then do layout
 196 -- processing.
 197 --
 198 -- One slight wibble here: what if the line begins with {-#? In
 199 -- theory, we have to lex the pragma to see if it's one we recognise,
 200 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 201 -- as a nested comment.  We don't bother with this: if the line begins
 202 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 203 <bol> {
 204   \n                                    ;
 205   ^\# (line)?                           { begin line_prag1 }
 206   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 207   ^\# \! .* \n                          ; -- #!, for scripts
 208   ()                                    { do_bol }
 209 }
 210
 211 -- after a layout keyword (let, where, do, of), we begin a new layout
 212 -- context if the curly brace is missing.
 213 -- Careful! This stuff is quite delicate.
 214 <layout, layout_do> {
 215   \{ / { notFollowedBy '-' }            { hopefully_open_brace }
 216         -- we might encounter {-# here, but {- has been handled already
 217   \n                                    ;
 218   ^\# (line)?                           { begin line_prag1 }
 219 }
 220
 221 -- do is treated in a subtly different way, see new_layout_context
 222 <layout>    ()                          { new_layout_context True }
 223 <layout_do> ()                          { new_layout_context False }
 224
 225 -- after a new layout context which was found to be to the left of the
 226 -- previous context, we have generated a '{' token, and we now need to
 227 -- generate a matching '}' token.
 228 <layout_left>  ()                       { do_layout_left }
 229
 230 <0,option_prags> \n                             { begin bol }
 231
 232 "{-#" $whitechar* $pragmachar+ / { known_pragma linePrags }
 233                                 { dispatch_pragmas linePrags }
 234
 235 -- single-line line pragmas, of the form
 236 --    # <line> "<file>" <extra-stuff> \n
 237 <line_prag1> $decdigit+                 { setLine line_prag1a }
 238 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 239 <line_prag1b> .*                        { pop }
 240
 241 -- Haskell-style line pragmas, of the form
 242 --    {-# LINE <line> "<file>" #-}
 243 <line_prag2> $decdigit+                 { setLine line_prag2a }
 244 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 245 <line_prag2b> "#-}"|"-}"                { pop }
 246    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 247    -- with older versions of GHC which generated these.
 248
 249 <0,option_prags> {
 250   "{-#" $whitechar* $pragmachar+
 251         $whitechar+ $pragmachar+ / { known_pragma twoWordPrags }
 252                                  { dispatch_pragmas twoWordPrags }
 253
 254   "{-#" $whitechar* $pragmachar+ / { known_pragma oneWordPrags }
 255                                  { dispatch_pragmas oneWordPrags }
 256
 257   -- We ignore all these pragmas, but don't generate a warning for them
 258   "{-#" $whitechar* $pragmachar+ / { known_pragma ignoredPrags }
 259                                  { dispatch_pragmas ignoredPrags }
 260
 261   -- ToDo: should only be valid inside a pragma:
 262   "#-}"                                 { endPrag }
 263 }
 264
 265 <option_prags> {
 266   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 267                                    { dispatch_pragmas fileHeaderPrags }
 268
 269   "-- #"                                 { multiline_doc_comment }
 270 }
 271
 272 <0> {
 273   -- In the "0" mode we ignore these pragmas
 274   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 275                      { nested_comment lexToken }
 276 }
 277
 278 <0> {
 279   "-- #" .* { lineCommentToken }
 280 }
 281
 282 <0,option_prags> {
 283   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 284                     (nested_comment lexToken) }
 285 }
 286
 287 -- '0' state: ordinary lexemes
 288
 289 -- Haddock comments
 290
 291 <0,option_prags> {
 292   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 293   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 294 }
 295
 296 -- "special" symbols
 297
 298 <0> {
 299   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 300   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 301 }
 302
 303 <0> {
 304   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 305   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 306   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 307   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 308   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 309   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 310   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 311   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 312
 313 -- For backward compatibility, accept the old dollar syntax
 314   "[$" @varid "|"  / { ifExtension qqEnabled }
 315                      { lex_quasiquote_tok }
 316
 317   "[" @varid "|"  / { ifExtension qqEnabled }
 318                      { lex_quasiquote_tok }
 319 }
 320
 321 <0> {
 322   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 323                                         { special IToparenbar }
 324   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 325 }
 326
 327 <0> {
 328   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 329 }
 330
 331 <0> {
 332   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 333          { token IToubxparen }
 334   "#)" / { ifExtension unboxedTuplesEnabled }
 335          { token ITcubxparen }
 336 }
 337
 338 <0> {
 339   "{|" / { ifExtension genericsEnabled } { token ITocurlybar }
 340   "|}" / { ifExtension genericsEnabled } { token ITccurlybar }
 341 }
 342
 343 <0,option_prags> {
 344   \(                                    { special IToparen }
 345   \)                                    { special ITcparen }
 346   \[                                    { special ITobrack }
 347   \]                                    { special ITcbrack }
 348   \,                                    { special ITcomma }
 349   \;                                    { special ITsemi }
 350   \`                                    { special ITbackquote }
 351
 352   \{                                    { open_brace }
 353   \}                                    { close_brace }
 354 }
 355
 356 <0,option_prags> {
 357   @qual @varid                  { idtoken qvarid }
 358   @qual @conid                  { idtoken qconid }
 359   @varid                        { varid }
 360   @conid                        { idtoken conid }
 361 }
 362
 363 <0> {
 364   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 365   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 366   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 367   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 368 }
 369
 370 -- ToDo: - move `var` and (sym) into lexical syntax?
 371 --       - remove backquote from $special?
 372 <0> {
 373   @qual @varsym                                    { idtoken qvarsym }
 374   @qual @consym                                    { idtoken qconsym }
 375   @varsym                                          { varsym }
 376   @consym                                          { consym }
 377 }
 378
 379 -- For the normal boxed literals we need to be careful
 380 -- when trying to be close to Haskell98
 381 <0> {
 382   -- Normal integral literals (:: Num a => a, from Integer)
 383   @decimal           { tok_num positive 0 0 decimal }
 384   0[oO] @octal       { tok_num positive 2 2 octal }
 385   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 386
 387   -- Normal rational literals (:: Fractional a => a, from Rational)
 388   @floating_point    { strtoken tok_float }
 389 }
 390
 391 <0> {
 392   -- Unboxed ints (:: Int#) and words (:: Word#)
 393   -- It's simpler (and faster?) to give separate cases to the negatives,
 394   -- especially considering octal/hexadecimal prefixes.
 395   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 396   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 397   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 398   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 399   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 400   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 401
 402   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 403   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 404   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 405
 406   -- Unboxed floats and doubles (:: Float#, :: Double#)
 407   -- prim_{float,double} work with signed literals
 408   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 409   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 410 }
 411
 412 -- Strings and chars are lexed by hand-written code.  The reason is
 413 -- that even if we recognise the string or char here in the regex
 414 -- lexer, we would still have to parse the string afterward in order
 415 -- to convert it to a String.
 416 <0> {
 417   \'                            { lex_char_tok }
 418   \"                            { lex_string_tok }
 419 }
 420
 421 {
 422 -- -----------------------------------------------------------------------------
 423 -- The token type
 424
 425 data Token
 426   = ITas                        -- Haskell keywords
 427   | ITcase
 428   | ITclass
 429   | ITdata
 430   | ITdefault
 431   | ITderiving
 432   | ITdo
 433   | ITelse
 434   | ITgeneric
 435   | IThiding
 436   | ITif
 437   | ITimport
 438   | ITin
 439   | ITinfix
 440   | ITinfixl
 441   | ITinfixr
 442   | ITinstance
 443   | ITlet
 444   | ITmodule
 445   | ITnewtype
 446   | ITof
 447   | ITqualified
 448   | ITthen
 449   | ITtype
 450   | ITwhere
 451   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 452
 453   | ITforall                    -- GHC extension keywords
 454   | ITforeign
 455   | ITexport
 456   | ITlabel
 457   | ITdynamic
 458   | ITsafe
 459   | ITthreadsafe
 460   | ITinterruptible
 461   | ITunsafe
 462   | ITstdcallconv
 463   | ITccallconv
 464   | ITprimcallconv
 465   | ITmdo
 466   | ITfamily
 467   | ITgroup
 468   | ITby
 469   | ITusing
 470
 471         -- Pragmas
 472   | ITinline_prag InlineSpec RuleMatchInfo
 473   | ITspec_prag                 -- SPECIALISE
 474   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 475   | ITsource_prag
 476   | ITrules_prag
 477   | ITwarning_prag
 478   | ITdeprecated_prag
 479   | ITline_prag
 480   | ITscc_prag
 481   | ITgenerated_prag
 482   | ITcore_prag                 -- hdaume: core annotations
 483   | ITunpack_prag
 484   | ITann_prag
 485   | ITclose_prag
 486   | IToptions_prag String
 487   | ITinclude_prag String
 488   | ITlanguage_prag
 489   | ITvect_prag
 490   | ITvect_scalar_prag
 491
 492   | ITdotdot                    -- reserved symbols
 493   | ITcolon
 494   | ITdcolon
 495   | ITequal
 496   | ITlam
 497   | ITvbar
 498   | ITlarrow
 499   | ITrarrow
 500   | ITat
 501   | ITtilde
 502   | ITdarrow
 503   | ITminus
 504   | ITbang
 505   | ITstar
 506   | ITdot
 507
 508   | ITbiglam                    -- GHC-extension symbols
 509
 510   | ITocurly                    -- special symbols
 511   | ITccurly
 512   | ITocurlybar                 -- {|, for type applications
 513   | ITccurlybar                 -- |}, for type applications
 514   | ITvocurly
 515   | ITvccurly
 516   | ITobrack
 517   | ITopabrack                  -- [:, for parallel arrays with -XParallelArrays
 518   | ITcpabrack                  -- :], for parallel arrays with -XParallelArrays
 519   | ITcbrack
 520   | IToparen
 521   | ITcparen
 522   | IToubxparen
 523   | ITcubxparen
 524   | ITsemi
 525   | ITcomma
 526   | ITunderscore
 527   | ITbackquote
 528
 529   | ITvarid   FastString        -- identifiers
 530   | ITconid   FastString
 531   | ITvarsym  FastString
 532   | ITconsym  FastString
 533   | ITqvarid  (FastString,FastString)
 534   | ITqconid  (FastString,FastString)
 535   | ITqvarsym (FastString,FastString)
 536   | ITqconsym (FastString,FastString)
 537   | ITprefixqvarsym (FastString,FastString)
 538   | ITprefixqconsym (FastString,FastString)
 539
 540   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 541
 542   | ITchar       Char
 543   | ITstring     FastString
 544   | ITinteger    Integer
 545   | ITrational   Rational
 546
 547   | ITprimchar   Char
 548   | ITprimstring FastString
 549   | ITprimint    Integer
 550   | ITprimword   Integer
 551   | ITprimfloat  Rational
 552   | ITprimdouble Rational
 553
 554   -- Template Haskell extension tokens
 555   | ITopenExpQuote              --  [| or [e|
 556   | ITopenPatQuote              --  [p|
 557   | ITopenDecQuote              --  [d|
 558   | ITopenTypQuote              --  [t|
 559   | ITcloseQuote                --  |]
 560   | ITidEscape   FastString     --  $x
 561   | ITparenEscape               --  $(
 562   | ITvarQuote                  --  '
 563   | ITtyQuote                   --  ''
 564   | ITquasiQuote (FastString,FastString,SrcSpan) --  [:...|...|]
 565
 566   -- Arrow notation extension
 567   | ITproc
 568   | ITrec
 569   | IToparenbar                 --  (|
 570   | ITcparenbar                 --  |)
 571   | ITlarrowtail                --  -<
 572   | ITrarrowtail                --  >-
 573   | ITLarrowtail                --  -<<
 574   | ITRarrowtail                --  >>-
 575
 576   | ITunknown String            -- Used when the lexer can't make sense of it
 577   | ITeof                       -- end of file token
 578
 579   -- Documentation annotations
 580   | ITdocCommentNext  String     -- something beginning '-- |'
 581   | ITdocCommentPrev  String     -- something beginning '-- ^'
 582   | ITdocCommentNamed String     -- something beginning '-- $'
 583   | ITdocSection      Int String -- a section heading
 584   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 585   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 586   | ITlineComment     String     -- comment starting by "--"
 587   | ITblockComment    String     -- comment in {- -}
 588
 589 #ifdef DEBUG
 590   deriving Show -- debugging
 591 #endif
 592
 593 {-
 594 isSpecial :: Token -> Bool
 595 -- If we see M.x, where x is a keyword, but
 596 -- is special, we treat is as just plain M.x,
 597 -- not as a keyword.
 598 isSpecial ITas          = True
 599 isSpecial IThiding      = True
 600 isSpecial ITqualified   = True
 601 isSpecial ITforall      = True
 602 isSpecial ITexport      = True
 603 isSpecial ITlabel       = True
 604 isSpecial ITdynamic     = True
 605 isSpecial ITsafe        = True
 606 isSpecial ITthreadsafe  = True
 607 isSpecial ITinterruptible = True
 608 isSpecial ITunsafe      = True
 609 isSpecial ITccallconv   = True
 610 isSpecial ITstdcallconv = True
 611 isSpecial ITprimcallconv = True
 612 isSpecial ITmdo         = True
 613 isSpecial ITfamily      = True
 614 isSpecial ITgroup   = True
 615 isSpecial ITby      = True
 616 isSpecial ITusing   = True
 617 isSpecial _             = False
 618 -}
 619
 620 -- the bitmap provided as the third component indicates whether the
 621 -- corresponding extension keyword is valid under the extension options
 622 -- provided to the compiler; if the extension corresponding to *any* of the
 623 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 624 -- facilitates using a keyword in two different extensions that can be
 625 -- activated independently)
 626 --
 627 reservedWordsFM :: UniqFM (Token, Int)
 628 reservedWordsFM = listToUFM $
 629         map (\(x, y, z) -> (mkFastString x, (y, z)))
 630        [( "_",          ITunderscore,   0 ),
 631         ( "as",         ITas,           0 ),
 632         ( "case",       ITcase,         0 ),
 633         ( "class",      ITclass,        0 ),
 634         ( "data",       ITdata,         0 ),
 635         ( "default",    ITdefault,      0 ),
 636         ( "deriving",   ITderiving,     0 ),
 637         ( "do",         ITdo,           0 ),
 638         ( "else",       ITelse,         0 ),
 639         ( "generic",    ITgeneric,      bit genericsBit ),
 640         ( "hiding",     IThiding,       0 ),
 641         ( "if",         ITif,           0 ),
 642         ( "import",     ITimport,       0 ),
 643         ( "in",         ITin,           0 ),
 644         ( "infix",      ITinfix,        0 ),
 645         ( "infixl",     ITinfixl,       0 ),
 646         ( "infixr",     ITinfixr,       0 ),
 647         ( "instance",   ITinstance,     0 ),
 648         ( "let",        ITlet,          0 ),
 649         ( "module",     ITmodule,       0 ),
 650         ( "newtype",    ITnewtype,      0 ),
 651         ( "of",         ITof,           0 ),
 652         ( "qualified",  ITqualified,    0 ),
 653         ( "then",       ITthen,         0 ),
 654         ( "type",       ITtype,         0 ),
 655         ( "where",      ITwhere,        0 ),
 656         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 657
 658     ( "forall", ITforall,        bit explicitForallBit .|. bit inRulePragBit),
 659         ( "mdo",        ITmdo,           bit recursiveDoBit),
 660         ( "family",     ITfamily,        bit tyFamBit),
 661     ( "group",  ITgroup,     bit transformComprehensionsBit),
 662     ( "by",     ITby,        bit transformComprehensionsBit),
 663     ( "using",  ITusing,     bit transformComprehensionsBit),
 664
 665         ( "foreign",    ITforeign,       bit ffiBit),
 666         ( "export",     ITexport,        bit ffiBit),
 667         ( "label",      ITlabel,         bit ffiBit),
 668         ( "dynamic",    ITdynamic,       bit ffiBit),
 669         ( "safe",       ITsafe,          bit ffiBit),
 670         ( "threadsafe", ITthreadsafe,    bit ffiBit),  -- ToDo: remove
 671         ( "interruptible", ITinterruptible, bit ffiBit),
 672         ( "unsafe",     ITunsafe,        bit ffiBit),
 673         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 674         ( "ccall",      ITccallconv,     bit ffiBit),
 675         ( "prim",       ITprimcallconv,  bit ffiBit),
 676
 677         ( "rec",        ITrec,           bit recBit),
 678         ( "proc",       ITproc,          bit arrowsBit)
 679      ]
 680
 681 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 682 reservedSymsFM = listToUFM $
 683     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 684       [ ("..",  ITdotdot,   always)
 685         -- (:) is a reserved op, meaning only list cons
 686        ,(":",   ITcolon,    always)
 687        ,("::",  ITdcolon,   always)
 688        ,("=",   ITequal,    always)
 689        ,("\\",  ITlam,      always)
 690        ,("|",   ITvbar,     always)
 691        ,("<-",  ITlarrow,   always)
 692        ,("->",  ITrarrow,   always)
 693        ,("@",   ITat,       always)
 694        ,("~",   ITtilde,    always)
 695        ,("=>",  ITdarrow,   always)
 696        ,("-",   ITminus,    always)
 697        ,("!",   ITbang,     always)
 698
 699         -- For data T (a::*) = MkT
 700        ,("*", ITstar, always) -- \i -> kindSigsEnabled i || tyFamEnabled i)
 701         -- For 'forall a . t'
 702        ,(".", ITdot,  always) -- \i -> explicitForallEnabled i || inRulePrag i)
 703
 704        ,("-<",  ITlarrowtail, arrowsEnabled)
 705        ,(">-",  ITrarrowtail, arrowsEnabled)
 706        ,("-<<", ITLarrowtail, arrowsEnabled)
 707        ,(">>-", ITRarrowtail, arrowsEnabled)
 708
 709        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 710        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 711        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 712                                 explicitForallEnabled i)
 713        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 714        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 715
 716        ,("⤙",   ITlarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 717        ,("⤚",   ITrarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 718        ,("⤛",   ITLarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 719        ,("⤜",   ITRarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 720
 721        ,("★", ITstar, unicodeSyntaxEnabled)
 722
 723         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 724         -- form part of a large operator.  This would let us have a better
 725         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 726        ]
 727
 728 -- -----------------------------------------------------------------------------
 729 -- Lexer actions
 730
 731 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 732
 733 special :: Token -> Action
 734 special tok span _buf _len = return (L span tok)
 735
 736 token, layout_token :: Token -> Action
 737 token t span _buf _len = return (L span t)
 738 layout_token t span _buf _len = pushLexState layout >> return (L span t)
 739
 740 idtoken :: (StringBuffer -> Int -> Token) -> Action
 741 idtoken f span buf len = return (L span $! (f buf len))
 742
 743 skip_one_varid :: (FastString -> Token) -> Action
 744 skip_one_varid f span buf len
 745   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 746
 747 strtoken :: (String -> Token) -> Action
 748 strtoken f span buf len =
 749   return (L span $! (f $! lexemeToString buf len))
 750
 751 init_strtoken :: Int -> (String -> Token) -> Action
 752 -- like strtoken, but drops the last N character(s)
 753 init_strtoken drop f span buf len =
 754   return (L span $! (f $! lexemeToString buf (len-drop)))
 755
 756 begin :: Int -> Action
 757 begin code _span _str _len = do pushLexState code; lexToken
 758
 759 pop :: Action
 760 pop _span _buf _len = do _ <- popLexState
 761                          lexToken
 762
 763 hopefully_open_brace :: Action
 764 hopefully_open_brace span buf len
 765  = do relaxed <- extension relaxedLayout
 766       ctx <- getContext
 767       (AI l _) <- getInput
 768       let offset = srcLocCol l
 769           isOK = relaxed ||
 770                  case ctx of
 771                  Layout prev_off : _ -> prev_off < offset
 772                  _                   -> True
 773       if isOK then pop_and open_brace span buf len
 774               else failSpanMsgP span (text "Missing block")
 775
 776 pop_and :: Action -> Action
 777 pop_and act span buf len = do _ <- popLexState
 778                               act span buf len
 779
 780 {-# INLINE nextCharIs #-}
 781 nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
 782 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 783
 784 notFollowedBy :: Char -> AlexAccPred Int
 785 notFollowedBy char _ _ _ (AI _ buf)
 786   = nextCharIs buf (/=char)
 787
 788 notFollowedBySymbol :: AlexAccPred Int
 789 notFollowedBySymbol _ _ _ (AI _ buf)
 790   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 791
 792 -- We must reject doc comments as being ordinary comments everywhere.
 793 -- In some cases the doc comment will be selected as the lexeme due to
 794 -- maximal munch, but not always, because the nested comment rule is
 795 -- valid in all states, but the doc-comment rules are only valid in
 796 -- the non-layout states.
 797 isNormalComment :: AlexAccPred Int
 798 isNormalComment bits _ _ (AI _ buf)
 799   | haddockEnabled bits = notFollowedByDocOrPragma
 800   | otherwise           = nextCharIs buf (/='#')
 801   where
 802     notFollowedByDocOrPragma
 803        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 804
 805 spaceAndP :: StringBuffer -> (StringBuffer -> Bool) -> Bool
 806 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 807
 808 {-
 809 haddockDisabledAnd p bits _ _ (AI _ buf)
 810   = if haddockEnabled bits then False else (p buf)
 811 -}
 812
 813 atEOL :: AlexAccPred Int
 814 atEOL _ _ _ (AI _ buf) = atEnd buf || currentChar buf == '\n'
 815
 816 ifExtension :: (Int -> Bool) -> AlexAccPred Int
 817 ifExtension pred bits _ _ _ = pred bits
 818
 819 multiline_doc_comment :: Action
 820 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 821   where
 822     worker commentAcc input docType oneLine = case alexGetChar input of
 823       Just ('\n', input')
 824         | oneLine -> docCommentEnd input commentAcc docType buf span
 825         | otherwise -> case checkIfCommentLine input' of
 826           Just input -> worker ('\n':commentAcc) input docType False
 827           Nothing -> docCommentEnd input commentAcc docType buf span
 828       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 829       Nothing -> docCommentEnd input commentAcc docType buf span
 830
 831     checkIfCommentLine input = check (dropNonNewlineSpace input)
 832       where
 833         check input = case alexGetChar input of
 834           Just ('-', input) -> case alexGetChar input of
 835             Just ('-', input) -> case alexGetChar input of
 836               Just (c, _) | c /= '-' -> Just input
 837               _ -> Nothing
 838             _ -> Nothing
 839           _ -> Nothing
 840
 841         dropNonNewlineSpace input = case alexGetChar input of
 842           Just (c, input')
 843             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 844             | otherwise -> input
 845           Nothing -> input
 846
 847 lineCommentToken :: Action
 848 lineCommentToken span buf len = do
 849   b <- extension rawTokenStreamEnabled
 850   if b then strtoken ITlineComment span buf len else lexToken
 851
 852 {-
 853   nested comments require traversing by hand, they can't be parsed
 854   using regular expressions.
 855 -}
 856 nested_comment :: P (Located Token) -> Action
 857 nested_comment cont span _str _len = do
 858   input <- getInput
 859   go "" (1::Int) input
 860   where
 861     go commentAcc 0 input = do setInput input
 862                                b <- extension rawTokenStreamEnabled
 863                                if b
 864                                  then docCommentEnd input commentAcc ITblockComment _str span
 865                                  else cont
 866     go commentAcc n input = case alexGetChar input of
 867       Nothing -> errBrace input span
 868       Just ('-',input) -> case alexGetChar input of
 869         Nothing  -> errBrace input span
 870         Just ('\125',input) -> go commentAcc (n-1) input
 871         Just (_,_)          -> go ('-':commentAcc) n input
 872       Just ('\123',input) -> case alexGetChar input of
 873         Nothing  -> errBrace input span
 874         Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
 875         Just (_,_)       -> go ('\123':commentAcc) n input
 876       Just (c,input) -> go (c:commentAcc) n input
 877
 878 nested_doc_comment :: Action
 879 nested_doc_comment span buf _len = withLexedDocType (go "")
 880   where
 881     go commentAcc input docType _ = case alexGetChar input of
 882       Nothing -> errBrace input span
 883       Just ('-',input) -> case alexGetChar input of
 884         Nothing -> errBrace input span
 885         Just ('\125',input) ->
 886           docCommentEnd input commentAcc docType buf span
 887         Just (_,_) -> go ('-':commentAcc) input docType False
 888       Just ('\123', input) -> case alexGetChar input of
 889         Nothing  -> errBrace input span
 890         Just ('-',input) -> do
 891           setInput input
 892           let cont = do input <- getInput; go commentAcc input docType False
 893           nested_comment cont span buf _len
 894         Just (_,_) -> go ('\123':commentAcc) input docType False
 895       Just (c,input) -> go (c:commentAcc) input docType False
 896
 897 withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (Located Token))
 898                  -> P (Located Token)
 899 withLexedDocType lexDocComment = do
 900   input@(AI _ buf) <- getInput
 901   case prevChar buf ' ' of
 902     '|' -> lexDocComment input ITdocCommentNext False
 903     '^' -> lexDocComment input ITdocCommentPrev False
 904     '$' -> lexDocComment input ITdocCommentNamed False
 905     '*' -> lexDocSection 1 input
 906     '#' -> lexDocComment input ITdocOptionsOld False
 907     _ -> panic "withLexedDocType: Bad doc type"
 908  where
 909     lexDocSection n input = case alexGetChar input of
 910       Just ('*', input) -> lexDocSection (n+1) input
 911       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 912       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 913
 914 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 915 -- off again at the end of the pragma.
 916 rulePrag :: Action
 917 rulePrag span _buf _len = do
 918   setExts (.|. bit inRulePragBit)
 919   return (L span ITrules_prag)
 920
 921 endPrag :: Action
 922 endPrag span _buf _len = do
 923   setExts (.&. complement (bit inRulePragBit))
 924   return (L span ITclose_prag)
 925
 926 -- docCommentEnd
 927 -------------------------------------------------------------------------------
 928 -- This function is quite tricky. We can't just return a new token, we also
 929 -- need to update the state of the parser. Why? Because the token is longer
 930 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 931 -- it writes the wrong token length to the parser state. This function is
 932 -- called afterwards, so it can just update the state.
 933
 934 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 935                  SrcSpan -> P (Located Token)
 936 docCommentEnd input commentAcc docType buf span = do
 937   setInput input
 938   let (AI loc nextBuf) = input
 939       comment = reverse commentAcc
 940       span' = mkSrcSpan (srcSpanStart span) loc
 941       last_len = byteDiff buf nextBuf
 942
 943   span `seq` setLastToken span' last_len
 944   return (L span' (docType comment))
 945
 946 errBrace :: AlexInput -> SrcSpan -> P a
 947 errBrace (AI end _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 948
 949 open_brace, close_brace :: Action
 950 open_brace span _str _len = do
 951   ctx <- getContext
 952   setContext (NoLayout:ctx)
 953   return (L span ITocurly)
 954 close_brace span _str _len = do
 955   popContext
 956   return (L span ITccurly)
 957
 958 qvarid, qconid :: StringBuffer -> Int -> Token
 959 qvarid buf len = ITqvarid $! splitQualName buf len False
 960 qconid buf len = ITqconid $! splitQualName buf len False
 961
 962 splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
 963 -- takes a StringBuffer and a length, and returns the module name
 964 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 965 -- because of hierarchical module names.
 966 splitQualName orig_buf len parens = split orig_buf orig_buf
 967   where
 968     split buf dot_buf
 969         | orig_buf `byteDiff` buf >= len  = done dot_buf
 970         | c == '.'                        = found_dot buf'
 971         | otherwise                       = split buf' dot_buf
 972       where
 973        (c,buf') = nextChar buf
 974
 975     -- careful, we might get names like M....
 976     -- so, if the character after the dot is not upper-case, this is
 977     -- the end of the qualifier part.
 978     found_dot buf -- buf points after the '.'
 979         | isUpper c    = split buf' buf
 980         | otherwise    = done buf
 981       where
 982        (c,buf') = nextChar buf
 983
 984     done dot_buf =
 985         (lexemeToFastString orig_buf (qual_size - 1),
 986          if parens -- Prelude.(+)
 987             then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
 988             else lexemeToFastString dot_buf (len - qual_size))
 989       where
 990         qual_size = orig_buf `byteDiff` dot_buf
 991
 992 varid :: Action
 993 varid span buf len =
 994   fs `seq`
 995   case lookupUFM reservedWordsFM fs of
 996         Just (keyword,0)    -> do
 997                 maybe_layout keyword
 998                 return (L span keyword)
 999         Just (keyword,exts) -> do
1000                 b <- extension (\i -> exts .&. i /= 0)
1001                 if b then do maybe_layout keyword
1002                              return (L span keyword)
1003                      else return (L span (ITvarid fs))
1004         _other -> return (L span (ITvarid fs))
1005   where
1006         fs = lexemeToFastString buf len
1007
1008 conid :: StringBuffer -> Int -> Token
1009 conid buf len = ITconid fs
1010   where fs = lexemeToFastString buf len
1011
1012 qvarsym, qconsym, prefixqvarsym, prefixqconsym :: StringBuffer -> Int -> Token
1013 qvarsym buf len = ITqvarsym $! splitQualName buf len False
1014 qconsym buf len = ITqconsym $! splitQualName buf len False
1015 prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
1016 prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
1017
1018 varsym, consym :: Action
1019 varsym = sym ITvarsym
1020 consym = sym ITconsym
1021
1022 sym :: (FastString -> Token) -> SrcSpan -> StringBuffer -> Int
1023     -> P (Located Token)
1024 sym con span buf len =
1025   case lookupUFM reservedSymsFM fs of
1026         Just (keyword,exts) -> do
1027                 b <- extension exts
1028                 if b then return (L span keyword)
1029                      else return (L span $! con fs)
1030         _other -> return (L span $! con fs)
1031   where
1032         fs = lexemeToFastString buf len
1033
1034 -- Variations on the integral numeric literal.
1035 tok_integral :: (Integer -> Token)
1036      -> (Integer -> Integer)
1037  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
1038      -> Int -> Int
1039      -> (Integer, (Char->Int)) -> Action
1040 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
1041   return $ L span $ itint $! transint $ parseUnsignedInteger
1042      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
1043
1044 -- some conveniences for use with tok_integral
1045 tok_num :: (Integer -> Integer)
1046         -> Int -> Int
1047         -> (Integer, (Char->Int)) -> Action
1048 tok_num = tok_integral ITinteger
1049 tok_primint :: (Integer -> Integer)
1050             -> Int -> Int
1051             -> (Integer, (Char->Int)) -> Action
1052 tok_primint = tok_integral ITprimint
1053 tok_primword :: Int -> Int
1054              -> (Integer, (Char->Int)) -> Action
1055 tok_primword = tok_integral ITprimword positive
1056 positive, negative :: (Integer -> Integer)
1057 positive = id
1058 negative = negate
1059 decimal, octal, hexadecimal :: (Integer, Char -> Int)
1060 decimal = (10,octDecDigit)
1061 octal = (8,octDecDigit)
1062 hexadecimal = (16,hexDigit)
1063
1064 -- readRational can understand negative rationals, exponents, everything.
1065 tok_float, tok_primfloat, tok_primdouble :: String -> Token
1066 tok_float        str = ITrational   $! readRational str
1067 tok_primfloat    str = ITprimfloat  $! readRational str
1068 tok_primdouble   str = ITprimdouble $! readRational str
1069
1070 -- -----------------------------------------------------------------------------
1071 -- Layout processing
1072
1073 -- we're at the first token on a line, insert layout tokens if necessary
1074 do_bol :: Action
1075 do_bol span _str _len = do
1076         pos <- getOffside
1077         case pos of
1078             LT -> do
1079                 --trace "layout: inserting '}'" $ do
1080                 popContext
1081                 -- do NOT pop the lex state, we might have a ';' to insert
1082                 return (L span ITvccurly)
1083             EQ -> do
1084                 --trace "layout: inserting ';'" $ do
1085                 _ <- popLexState
1086                 return (L span ITsemi)
1087             GT -> do
1088                 _ <- popLexState
1089                 lexToken
1090
1091 -- certain keywords put us in the "layout" state, where we might
1092 -- add an opening curly brace.
1093 maybe_layout :: Token -> P ()
1094 maybe_layout t = do -- If the alternative layout rule is enabled then
1095                     -- we never create an implicit layout context here.
1096                     -- Layout is handled XXX instead.
1097                     -- The code for closing implicit contexts, or
1098                     -- inserting implicit semi-colons, is therefore
1099                     -- irrelevant as it only applies in an implicit
1100                     -- context.
1101                     alr <- extension alternativeLayoutRule
1102                     unless alr $ f t
1103     where f ITdo    = pushLexState layout_do
1104           f ITmdo   = pushLexState layout_do
1105           f ITof    = pushLexState layout
1106           f ITlet   = pushLexState layout
1107           f ITwhere = pushLexState layout
1108           f ITrec   = pushLexState layout
1109           f _       = return ()
1110
1111 -- Pushing a new implicit layout context.  If the indentation of the
1112 -- next token is not greater than the previous layout context, then
1113 -- Haskell 98 says that the new layout context should be empty; that is
1114 -- the lexer must generate {}.
1115 --
1116 -- We are slightly more lenient than this: when the new context is started
1117 -- by a 'do', then we allow the new context to be at the same indentation as
1118 -- the previous context.  This is what the 'strict' argument is for.
1119 --
1120 new_layout_context :: Bool -> Action
1121 new_layout_context strict span _buf _len = do
1122     _ <- popLexState
1123     (AI l _) <- getInput
1124     let offset = srcLocCol l
1125     ctx <- getContext
1126     nondecreasing <- extension nondecreasingIndentation
1127     let strict' = strict || not nondecreasing
1128     case ctx of
1129         Layout prev_off : _  |
1130            (strict'     && prev_off >= offset  ||
1131             not strict' && prev_off > offset) -> do
1132                 -- token is indented to the left of the previous context.
1133                 -- we must generate a {} sequence now.
1134                 pushLexState layout_left
1135                 return (L span ITvocurly)
1136         _ -> do
1137                 setContext (Layout offset : ctx)
1138                 return (L span ITvocurly)
1139
1140 do_layout_left :: Action
1141 do_layout_left span _buf _len = do
1142     _ <- popLexState
1143     pushLexState bol  -- we must be at the start of a line
1144     return (L span ITvccurly)
1145
1146 -- -----------------------------------------------------------------------------
1147 -- LINE pragmas
1148
1149 setLine :: Int -> Action
1150 setLine code span buf len = do
1151   let line = parseUnsignedInteger buf len 10 octDecDigit
1152   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 1)
1153         -- subtract one: the line number refers to the *following* line
1154   _ <- popLexState
1155   pushLexState code
1156   lexToken
1157
1158 setFile :: Int -> Action
1159 setFile code span buf len = do
1160   let file = lexemeToFastString (stepOn buf) (len-2)
1161   setAlrLastLoc noSrcSpan
1162   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1163   _ <- popLexState
1164   pushLexState code
1165   lexToken
1166
1167
1168 -- -----------------------------------------------------------------------------
1169 -- Options, includes and language pragmas.
1170
1171 lex_string_prag :: (String -> Token) -> Action
1172 lex_string_prag mkTok span _buf _len
1173     = do input <- getInput
1174          start <- getSrcLoc
1175          tok <- go [] input
1176          end <- getSrcLoc
1177          return (L (mkSrcSpan start end) tok)
1178     where go acc input
1179               = if isString input "#-}"
1180                    then do setInput input
1181                            return (mkTok (reverse acc))
1182                    else case alexGetChar input of
1183                           Just (c,i) -> go (c:acc) i
1184                           Nothing -> err input
1185           isString _ [] = True
1186           isString i (x:xs)
1187               = case alexGetChar i of
1188                   Just (c,i') | c == x    -> isString i' xs
1189                   _other -> False
1190           err (AI end _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1191
1192
1193 -- -----------------------------------------------------------------------------
1194 -- Strings & Chars
1195
1196 -- This stuff is horrible.  I hates it.
1197
1198 lex_string_tok :: Action
1199 lex_string_tok span _buf _len = do
1200   tok <- lex_string ""
1201   end <- getSrcLoc
1202   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1203
1204 lex_string :: String -> P Token
1205 lex_string s = do
1206   i <- getInput
1207   case alexGetChar' i of
1208     Nothing -> lit_error i
1209
1210     Just ('"',i)  -> do
1211         setInput i
1212         magicHash <- extension magicHashEnabled
1213         if magicHash
1214           then do
1215             i <- getInput
1216             case alexGetChar' i of
1217               Just ('#',i) -> do
1218                    setInput i
1219                    if any (> '\xFF') s
1220                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1221                     else let s' = mkZFastString (reverse s) in
1222                          return (ITprimstring s')
1223                         -- mkZFastString is a hack to avoid encoding the
1224                         -- string in UTF-8.  We just want the exact bytes.
1225               _other ->
1226                 return (ITstring (mkFastString (reverse s)))
1227           else
1228                 return (ITstring (mkFastString (reverse s)))
1229
1230     Just ('\\',i)
1231         | Just ('&',i) <- next -> do
1232                 setInput i; lex_string s
1233         | Just (c,i) <- next, c <= '\x7f' && is_space c -> do
1234                            -- is_space only works for <= '\x7f' (#3751)
1235                 setInput i; lex_stringgap s
1236         where next = alexGetChar' i
1237
1238     Just (c, i1) -> do
1239         case c of
1240           '\\' -> do setInput i1; c' <- lex_escape; lex_string (c':s)
1241           c | isAny c -> do setInput i1; lex_string (c:s)
1242           _other -> lit_error i
1243
1244 lex_stringgap :: String -> P Token
1245 lex_stringgap s = do
1246   i <- getInput
1247   c <- getCharOrFail i
1248   case c of
1249     '\\' -> lex_string s
1250     c | is_space c -> lex_stringgap s
1251     _other -> lit_error i
1252
1253
1254 lex_char_tok :: Action
1255 -- Here we are basically parsing character literals, such as 'x' or '\n'
1256 -- but, when Template Haskell is on, we additionally spot
1257 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1258 -- but WITHOUT CONSUMING the x or T part  (the parser does that).
1259 -- So we have to do two characters of lookahead: when we see 'x we need to
1260 -- see if there's a trailing quote
1261 lex_char_tok span _buf _len = do        -- We've seen '
1262    i1 <- getInput       -- Look ahead to first character
1263    let loc = srcSpanStart span
1264    case alexGetChar' i1 of
1265         Nothing -> lit_error  i1
1266
1267         Just ('\'', i2@(AI end2 _)) -> do       -- We've seen ''
1268                   th_exts <- extension thEnabled
1269                   if th_exts then do
1270                         setInput i2
1271                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1272                    else lit_error i1
1273
1274         Just ('\\', i2@(AI _end2 _)) -> do      -- We've seen 'backslash
1275                   setInput i2
1276                   lit_ch <- lex_escape
1277                   i3 <- getInput
1278                   mc <- getCharOrFail i3 -- Trailing quote
1279                   if mc == '\'' then finish_char_tok loc lit_ch
1280                                 else lit_error i3
1281
1282         Just (c, i2@(AI _end2 _))
1283                 | not (isAny c) -> lit_error i1
1284                 | otherwise ->
1285
1286                 -- We've seen 'x, where x is a valid character
1287                 --  (i.e. not newline etc) but not a quote or backslash
1288            case alexGetChar' i2 of      -- Look ahead one more character
1289                 Just ('\'', i3) -> do   -- We've seen 'x'
1290                         setInput i3
1291                         finish_char_tok loc c
1292                 _other -> do            -- We've seen 'x not followed by quote
1293                                         -- (including the possibility of EOF)
1294                                         -- If TH is on, just parse the quote only
1295                         th_exts <- extension thEnabled
1296                         let (AI end _) = i1
1297                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1298                                    else lit_error i2
1299
1300 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1301 finish_char_tok loc ch  -- We've already seen the closing quote
1302                         -- Just need to check for trailing #
1303   = do  magicHash <- extension magicHashEnabled
1304         i@(AI end _) <- getInput
1305         if magicHash then do
1306                 case alexGetChar' i of
1307                         Just ('#',i@(AI end _)) -> do
1308                                 setInput i
1309                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1310                         _other ->
1311                                 return (L (mkSrcSpan loc end) (ITchar ch))
1312             else do
1313                    return (L (mkSrcSpan loc end) (ITchar ch))
1314
1315 isAny :: Char -> Bool
1316 isAny c | c > '\x7f' = isPrint c
1317         | otherwise  = is_any c
1318
1319 lex_escape :: P Char
1320 lex_escape = do
1321   i0 <- getInput
1322   c <- getCharOrFail i0
1323   case c of
1324         'a'   -> return '\a'
1325         'b'   -> return '\b'
1326         'f'   -> return '\f'
1327         'n'   -> return '\n'
1328         'r'   -> return '\r'
1329         't'   -> return '\t'
1330         'v'   -> return '\v'
1331         '\\'  -> return '\\'
1332         '"'   -> return '\"'
1333         '\''  -> return '\''
1334         '^'   -> do i1 <- getInput
1335                     c <- getCharOrFail i1
1336                     if c >= '@' && c <= '_'
1337                         then return (chr (ord c - ord '@'))
1338                         else lit_error i1
1339
1340         'x'   -> readNum is_hexdigit 16 hexDigit
1341         'o'   -> readNum is_octdigit  8 octDecDigit
1342         x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1343
1344         c1 ->  do
1345            i <- getInput
1346            case alexGetChar' i of
1347             Nothing -> lit_error i0
1348             Just (c2,i2) ->
1349               case alexGetChar' i2 of
1350                 Nothing -> do lit_error i0
1351                 Just (c3,i3) ->
1352                    let str = [c1,c2,c3] in
1353                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1354                                      Just rest <- [stripPrefix p str] ] of
1355                           (escape_char,[]):_ -> do
1356                                 setInput i3
1357                                 return escape_char
1358                           (escape_char,_:_):_ -> do
1359                                 setInput i2
1360                                 return escape_char
1361                           [] -> lit_error i0
1362
1363 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1364 readNum is_digit base conv = do
1365   i <- getInput
1366   c <- getCharOrFail i
1367   if is_digit c
1368         then readNum2 is_digit base conv (conv c)
1369         else lit_error i
1370
1371 readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char
1372 readNum2 is_digit base conv i = do
1373   input <- getInput
1374   read i input
1375   where read i input = do
1376           case alexGetChar' input of
1377             Just (c,input') | is_digit c -> do
1378                let i' = i*base + conv c
1379                if i' > 0x10ffff
1380                   then setInput input >> lexError "numeric escape sequence out of range"
1381                   else read i' input'
1382             _other -> do
1383               setInput input; return (chr i)
1384
1385
1386 silly_escape_chars :: [(String, Char)]
1387 silly_escape_chars = [
1388         ("NUL", '\NUL'),
1389         ("SOH", '\SOH'),
1390         ("STX", '\STX'),
1391         ("ETX", '\ETX'),
1392         ("EOT", '\EOT'),
1393         ("ENQ", '\ENQ'),
1394         ("ACK", '\ACK'),
1395         ("BEL", '\BEL'),
1396         ("BS", '\BS'),
1397         ("HT", '\HT'),
1398         ("LF", '\LF'),
1399         ("VT", '\VT'),
1400         ("FF", '\FF'),
1401         ("CR", '\CR'),
1402         ("SO", '\SO'),
1403         ("SI", '\SI'),
1404         ("DLE", '\DLE'),
1405         ("DC1", '\DC1'),
1406         ("DC2", '\DC2'),
1407         ("DC3", '\DC3'),
1408         ("DC4", '\DC4'),
1409         ("NAK", '\NAK'),
1410         ("SYN", '\SYN'),
1411         ("ETB", '\ETB'),
1412         ("CAN", '\CAN'),
1413         ("EM", '\EM'),
1414         ("SUB", '\SUB'),
1415         ("ESC", '\ESC'),
1416         ("FS", '\FS'),
1417         ("GS", '\GS'),
1418         ("RS", '\RS'),
1419         ("US", '\US'),
1420         ("SP", '\SP'),
1421         ("DEL", '\DEL')
1422         ]
1423
1424 -- before calling lit_error, ensure that the current input is pointing to
1425 -- the position of the error in the buffer.  This is so that we can report
1426 -- a correct location to the user, but also so we can detect UTF-8 decoding
1427 -- errors if they occur.
1428 lit_error :: AlexInput -> P a
1429 lit_error i = do setInput i; lexError "lexical error in string/character literal"
1430
1431 getCharOrFail :: AlexInput -> P Char
1432 getCharOrFail i =  do
1433   case alexGetChar' i of
1434         Nothing -> lexError "unexpected end-of-file in string/character literal"
1435         Just (c,i)  -> do setInput i; return c
1436
1437 -- -----------------------------------------------------------------------------
1438 -- QuasiQuote
1439
1440 lex_quasiquote_tok :: Action
1441 lex_quasiquote_tok span buf len = do
1442   let quoter = tail (lexemeToString buf (len - 1))
1443                 -- 'tail' drops the initial '[',
1444                 -- while the -1 drops the trailing '|'
1445   quoteStart <- getSrcLoc
1446   quote <- lex_quasiquote ""
1447   end <- getSrcLoc
1448   return (L (mkSrcSpan (srcSpanStart span) end)
1449            (ITquasiQuote (mkFastString quoter,
1450                           mkFastString (reverse quote),
1451                           mkSrcSpan quoteStart end)))
1452
1453 lex_quasiquote :: String -> P String
1454 lex_quasiquote s = do
1455   i <- getInput
1456   case alexGetChar' i of
1457     Nothing -> lit_error i
1458
1459     Just ('\\',i)
1460         | Just ('|',i) <- next -> do
1461                 setInput i; lex_quasiquote ('|' : s)
1462         | Just (']',i) <- next -> do
1463                 setInput i; lex_quasiquote (']' : s)
1464         where next = alexGetChar' i
1465
1466     Just ('|',i)
1467         | Just (']',i) <- next -> do
1468                 setInput i; return s
1469         where next = alexGetChar' i
1470
1471     Just (c, i) -> do
1472          setInput i; lex_quasiquote (c : s)
1473
1474 -- -----------------------------------------------------------------------------
1475 -- Warnings
1476
1477 warn :: DynFlag -> SDoc -> Action
1478 warn option warning srcspan _buf _len = do
1479     addWarning option srcspan warning
1480     lexToken
1481
1482 warnThen :: DynFlag -> SDoc -> Action -> Action
1483 warnThen option warning action srcspan buf len = do
1484     addWarning option srcspan warning
1485     action srcspan buf len
1486
1487 -- -----------------------------------------------------------------------------
1488 -- The Parse Monad
1489
1490 data LayoutContext
1491   = NoLayout
1492   | Layout !Int
1493   deriving Show
1494
1495 data ParseResult a
1496   = POk PState a
1497   | PFailed
1498         SrcSpan         -- The start and end of the text span related to
1499                         -- the error.  Might be used in environments which can
1500                         -- show this span, e.g. by highlighting it.
1501         Message         -- The error message
1502
1503 data PState = PState {
1504         buffer     :: StringBuffer,
1505         dflags     :: DynFlags,
1506         messages   :: Messages,
1507         last_loc   :: SrcSpan,  -- pos of previous token
1508         last_len   :: !Int,     -- len of previous token
1509         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1510         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1511         context    :: [LayoutContext],
1512         lex_state  :: [Int],
1513         -- Used in the alternative layout rule:
1514         -- These tokens are the next ones to be sent out. They are
1515         -- just blindly emitted, without the rule looking at them again:
1516         alr_pending_implicit_tokens :: [Located Token],
1517         -- This is the next token to be considered or, if it is Nothing,
1518         -- we need to get the next token from the input stream:
1519         alr_next_token :: Maybe (Located Token),
1520         -- This is what we consider to be the locatino of the last token
1521         -- emitted:
1522         alr_last_loc :: SrcSpan,
1523         -- The stack of layout contexts:
1524         alr_context :: [ALRContext],
1525         -- Are we expecting a '{'? If it's Just, then the ALRLayout tells
1526         -- us what sort of layout the '{' will open:
1527         alr_expecting_ocurly :: Maybe ALRLayout,
1528         -- Have we just had the '}' for a let block? If so, than an 'in'
1529         -- token doesn't need to close anything:
1530         alr_justClosedExplicitLetBlock :: Bool
1531      }
1532         -- last_loc and last_len are used when generating error messages,
1533         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1534         -- current token to happyError, we could at least get rid of last_len.
1535         -- Getting rid of last_loc would require finding another way to
1536         -- implement pushCurrentContext (which is only called from one place).
1537
1538 data ALRContext = ALRNoLayout Bool{- does it contain commas? -}
1539                               Bool{- is it a 'let' block? -}
1540                 | ALRLayout ALRLayout Int
1541 data ALRLayout = ALRLayoutLet
1542                | ALRLayoutWhere
1543                | ALRLayoutOf
1544                | ALRLayoutDo
1545
1546 newtype P a = P { unP :: PState -> ParseResult a }
1547
1548 instance Monad P where
1549   return = returnP
1550   (>>=) = thenP
1551   fail = failP
1552
1553 returnP :: a -> P a
1554 returnP a = a `seq` (P $ \s -> POk s a)
1555
1556 thenP :: P a -> (a -> P b) -> P b
1557 (P m) `thenP` k = P $ \ s ->
1558         case m s of
1559                 POk s1 a         -> (unP (k a)) s1
1560                 PFailed span err -> PFailed span err
1561
1562 failP :: String -> P a
1563 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1564
1565 failMsgP :: String -> P a
1566 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1567
1568 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1569 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (mkSrcSpan loc1 loc2) (text str)
1570
1571 failSpanMsgP :: SrcSpan -> SDoc -> P a
1572 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1573
1574 getPState :: P PState
1575 getPState = P $ \s -> POk s s
1576
1577 getDynFlags :: P DynFlags
1578 getDynFlags = P $ \s -> POk s (dflags s)
1579
1580 withThisPackage :: (PackageId -> a) -> P a
1581 withThisPackage f
1582  = do   pkg     <- liftM thisPackage getDynFlags
1583         return  $ f pkg
1584
1585 extension :: (Int -> Bool) -> P Bool
1586 extension p = P $ \s -> POk s (p $! extsBitmap s)
1587
1588 getExts :: P Int
1589 getExts = P $ \s -> POk s (extsBitmap s)
1590
1591 setExts :: (Int -> Int) -> P ()
1592 setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
1593
1594 setSrcLoc :: SrcLoc -> P ()
1595 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1596
1597 getSrcLoc :: P SrcLoc
1598 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1599
1600 setLastToken :: SrcSpan -> Int -> P ()
1601 setLastToken loc len = P $ \s -> POk s {
1602   last_loc=loc,
1603   last_len=len
1604   } ()
1605
1606 data AlexInput = AI SrcLoc StringBuffer
1607
1608 alexInputPrevChar :: AlexInput -> Char
1609 alexInputPrevChar (AI _ buf) = prevChar buf '\n'
1610
1611 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1612 alexGetChar (AI loc s)
1613   | atEnd s   = Nothing
1614   | otherwise = adj_c `seq` loc' `seq` s' `seq`
1615                 --trace (show (ord c)) $
1616                 Just (adj_c, (AI loc' s'))
1617   where (c,s') = nextChar s
1618         loc'   = advanceSrcLoc loc c
1619
1620         non_graphic     = '\x0'
1621         upper           = '\x1'
1622         lower           = '\x2'
1623         digit           = '\x3'
1624         symbol          = '\x4'
1625         space           = '\x5'
1626         other_graphic   = '\x6'
1627
1628         adj_c
1629           | c <= '\x06' = non_graphic
1630           | c <= '\x7f' = c
1631           -- Alex doesn't handle Unicode, so when Unicode
1632           -- character is encountered we output these values
1633           -- with the actual character value hidden in the state.
1634           | otherwise =
1635                 case generalCategory c of
1636                   UppercaseLetter       -> upper
1637                   LowercaseLetter       -> lower
1638                   TitlecaseLetter       -> upper
1639                   ModifierLetter        -> other_graphic
1640                   OtherLetter           -> lower -- see #1103
1641                   NonSpacingMark        -> other_graphic
1642                   SpacingCombiningMark  -> other_graphic
1643                   EnclosingMark         -> other_graphic
1644                   DecimalNumber         -> digit
1645                   LetterNumber          -> other_graphic
1646                   OtherNumber           -> digit -- see #4373
1647                   ConnectorPunctuation  -> symbol
1648                   DashPunctuation       -> symbol
1649                   OpenPunctuation       -> other_graphic
1650                   ClosePunctuation      -> other_graphic
1651                   InitialQuote          -> other_graphic
1652                   FinalQuote            -> other_graphic
1653                   OtherPunctuation      -> symbol
1654                   MathSymbol            -> symbol
1655                   CurrencySymbol        -> symbol
1656                   ModifierSymbol        -> symbol
1657                   OtherSymbol           -> symbol
1658                   Space                 -> space
1659                   _other                -> non_graphic
1660
1661 -- This version does not squash unicode characters, it is used when
1662 -- lexing strings.
1663 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1664 alexGetChar' (AI loc s)
1665   | atEnd s   = Nothing
1666   | otherwise = c `seq` loc' `seq` s' `seq`
1667                 --trace (show (ord c)) $
1668                 Just (c, (AI loc' s'))
1669   where (c,s') = nextChar s
1670         loc'   = advanceSrcLoc loc c
1671
1672 getInput :: P AlexInput
1673 getInput = P $ \s@PState{ loc=l, buffer=b } -> POk s (AI l b)
1674
1675 setInput :: AlexInput -> P ()
1676 setInput (AI l b) = P $ \s -> POk s{ loc=l, buffer=b } ()
1677
1678 nextIsEOF :: P Bool
1679 nextIsEOF = do
1680   AI _ s <- getInput
1681   return $ atEnd s
1682
1683 pushLexState :: Int -> P ()
1684 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1685
1686 popLexState :: P Int
1687 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1688
1689 getLexState :: P Int
1690 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1691
1692 popNextToken :: P (Maybe (Located Token))
1693 popNextToken
1694     = P $ \s@PState{ alr_next_token = m } ->
1695               POk (s {alr_next_token = Nothing}) m
1696
1697 activeContext :: P Bool
1698 activeContext = do
1699   ctxt <- getALRContext
1700   expc <- getAlrExpectingOCurly
1701   impt <- implicitTokenPending
1702   case (ctxt,expc) of
1703     ([],Nothing) -> return impt
1704     _other       -> return True
1705
1706 setAlrLastLoc :: SrcSpan -> P ()
1707 setAlrLastLoc l = P $ \s -> POk (s {alr_last_loc = l}) ()
1708
1709 getAlrLastLoc :: P SrcSpan
1710 getAlrLastLoc = P $ \s@(PState {alr_last_loc = l}) -> POk s l
1711
1712 getALRContext :: P [ALRContext]
1713 getALRContext = P $ \s@(PState {alr_context = cs}) -> POk s cs
1714
1715 setALRContext :: [ALRContext] -> P ()
1716 setALRContext cs = P $ \s -> POk (s {alr_context = cs}) ()
1717
1718 getJustClosedExplicitLetBlock :: P Bool
1719 getJustClosedExplicitLetBlock
1720  = P $ \s@(PState {alr_justClosedExplicitLetBlock = b}) -> POk s b
1721
1722 setJustClosedExplicitLetBlock :: Bool -> P ()
1723 setJustClosedExplicitLetBlock b
1724  = P $ \s -> POk (s {alr_justClosedExplicitLetBlock = b}) ()
1725
1726 setNextToken :: Located Token -> P ()
1727 setNextToken t = P $ \s -> POk (s {alr_next_token = Just t}) ()
1728
1729 implicitTokenPending :: P Bool
1730 implicitTokenPending
1731     = P $ \s@PState{ alr_pending_implicit_tokens = ts } ->
1732               case ts of
1733               [] -> POk s False
1734               _  -> POk s True
1735
1736 popPendingImplicitToken :: P (Maybe (Located Token))
1737 popPendingImplicitToken
1738     = P $ \s@PState{ alr_pending_implicit_tokens = ts } ->
1739               case ts of
1740               [] -> POk s Nothing
1741               (t : ts') -> POk (s {alr_pending_implicit_tokens = ts'}) (Just t)
1742
1743 setPendingImplicitTokens :: [Located Token] -> P ()
1744 setPendingImplicitTokens ts = P $ \s -> POk (s {alr_pending_implicit_tokens = ts}) ()
1745
1746 getAlrExpectingOCurly :: P (Maybe ALRLayout)
1747 getAlrExpectingOCurly = P $ \s@(PState {alr_expecting_ocurly = b}) -> POk s b
1748
1749 setAlrExpectingOCurly :: Maybe ALRLayout -> P ()
1750 setAlrExpectingOCurly b = P $ \s -> POk (s {alr_expecting_ocurly = b}) ()
1751
1752 -- for reasons of efficiency, flags indicating language extensions (eg,
1753 -- -fglasgow-exts or -XParallelArrays) are represented by a bitmap stored in an unboxed
1754 -- integer
1755
1756 genericsBit :: Int
1757 genericsBit = 0 -- {|, |} and "generic"
1758 ffiBit :: Int
1759 ffiBit     = 1
1760 parrBit :: Int
1761 parrBit    = 2
1762 arrowsBit :: Int
1763 arrowsBit  = 4
1764 thBit :: Int
1765 thBit      = 5
1766 ipBit :: Int
1767 ipBit      = 6
1768 explicitForallBit :: Int
1769 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1770 bangPatBit :: Int
1771 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1772                 -- (doesn't affect the lexer)
1773 tyFamBit :: Int
1774 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1775 haddockBit :: Int
1776 haddockBit = 10 -- Lex and parse Haddock comments
1777 magicHashBit :: Int
1778 magicHashBit = 11 -- "#" in both functions and operators
1779 kindSigsBit :: Int
1780 kindSigsBit = 12 -- Kind signatures on type variables
1781 recursiveDoBit :: Int
1782 recursiveDoBit = 13 -- mdo
1783 unicodeSyntaxBit :: Int
1784 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1785 unboxedTuplesBit :: Int
1786 unboxedTuplesBit = 15 -- (# and #)
1787 datatypeContextsBit :: Int
1788 datatypeContextsBit = 16
1789 transformComprehensionsBit :: Int
1790 transformComprehensionsBit = 17
1791 qqBit :: Int
1792 qqBit      = 18 -- enable quasiquoting
1793 inRulePragBit :: Int
1794 inRulePragBit = 19
1795 rawTokenStreamBit :: Int
1796 rawTokenStreamBit = 20 -- producing a token stream with all comments included
1797 recBit :: Int
1798 recBit = 22 -- rec
1799 alternativeLayoutRuleBit :: Int
1800 alternativeLayoutRuleBit = 23
1801 relaxedLayoutBit :: Int
1802 relaxedLayoutBit = 24
1803 nondecreasingIndentationBit :: Int
1804 nondecreasingIndentationBit = 25
1805
1806 always :: Int -> Bool
1807 always           _     = True
1808 genericsEnabled :: Int -> Bool
1809 genericsEnabled  flags = testBit flags genericsBit
1810 parrEnabled :: Int -> Bool
1811 parrEnabled      flags = testBit flags parrBit
1812 arrowsEnabled :: Int -> Bool
1813 arrowsEnabled    flags = testBit flags arrowsBit
1814 thEnabled :: Int -> Bool
1815 thEnabled        flags = testBit flags thBit
1816 ipEnabled :: Int -> Bool
1817 ipEnabled        flags = testBit flags ipBit
1818 explicitForallEnabled :: Int -> Bool
1819 explicitForallEnabled flags = testBit flags explicitForallBit
1820 bangPatEnabled :: Int -> Bool
1821 bangPatEnabled   flags = testBit flags bangPatBit
1822 -- tyFamEnabled :: Int -> Bool
1823 -- tyFamEnabled     flags = testBit flags tyFamBit
1824 haddockEnabled :: Int -> Bool
1825 haddockEnabled   flags = testBit flags haddockBit
1826 magicHashEnabled :: Int -> Bool
1827 magicHashEnabled flags = testBit flags magicHashBit
1828 -- kindSigsEnabled :: Int -> Bool
1829 -- kindSigsEnabled  flags = testBit flags kindSigsBit
1830 unicodeSyntaxEnabled :: Int -> Bool
1831 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1832 unboxedTuplesEnabled :: Int -> Bool
1833 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1834 datatypeContextsEnabled :: Int -> Bool
1835 datatypeContextsEnabled flags = testBit flags datatypeContextsBit
1836 qqEnabled :: Int -> Bool
1837 qqEnabled        flags = testBit flags qqBit
1838 -- inRulePrag :: Int -> Bool
1839 -- inRulePrag       flags = testBit flags inRulePragBit
1840 rawTokenStreamEnabled :: Int -> Bool
1841 rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
1842 alternativeLayoutRule :: Int -> Bool
1843 alternativeLayoutRule flags = testBit flags alternativeLayoutRuleBit
1844 relaxedLayout :: Int -> Bool
1845 relaxedLayout flags = testBit flags relaxedLayoutBit
1846 nondecreasingIndentation :: Int -> Bool
1847 nondecreasingIndentation flags = testBit flags nondecreasingIndentationBit
1848
1849 -- PState for parsing options pragmas
1850 --
1851 pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1852 pragState dynflags buf loc = (mkPState dynflags buf loc) {
1853                                  lex_state = [bol, option_prags, 0]
1854                              }
1855
1856 -- create a parse state
1857 --
1858 mkPState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1859 mkPState flags buf loc =
1860   PState {
1861       buffer        = buf,
1862       dflags        = flags,
1863       messages      = emptyMessages,
1864       last_loc      = mkSrcSpan loc loc,
1865       last_len      = 0,
1866       loc           = loc,
1867       extsBitmap    = fromIntegral bitmap,
1868       context       = [],
1869       lex_state     = [bol, 0],
1870       alr_pending_implicit_tokens = [],
1871       alr_next_token = Nothing,
1872       alr_last_loc = noSrcSpan,
1873       alr_context = [],
1874       alr_expecting_ocurly = Nothing,
1875       alr_justClosedExplicitLetBlock = False
1876     }
1877     where
1878       bitmap =     genericsBit       `setBitIf` xopt Opt_Generics flags
1879                .|. ffiBit            `setBitIf` xopt Opt_ForeignFunctionInterface flags
1880                .|. parrBit           `setBitIf` xopt Opt_ParallelArrays  flags
1881                .|. arrowsBit         `setBitIf` xopt Opt_Arrows          flags
1882                .|. thBit             `setBitIf` xopt Opt_TemplateHaskell flags
1883                .|. qqBit             `setBitIf` xopt Opt_QuasiQuotes     flags
1884                .|. ipBit             `setBitIf` xopt Opt_ImplicitParams  flags
1885                .|. explicitForallBit `setBitIf` xopt Opt_ExplicitForAll  flags
1886                .|. bangPatBit        `setBitIf` xopt Opt_BangPatterns    flags
1887                .|. tyFamBit          `setBitIf` xopt Opt_TypeFamilies    flags
1888                .|. haddockBit        `setBitIf` dopt Opt_Haddock         flags
1889                .|. magicHashBit      `setBitIf` xopt Opt_MagicHash       flags
1890                .|. kindSigsBit       `setBitIf` xopt Opt_KindSignatures  flags
1891                .|. recursiveDoBit    `setBitIf` xopt Opt_RecursiveDo     flags
1892                .|. recBit            `setBitIf` xopt Opt_DoRec           flags
1893                .|. recBit            `setBitIf` xopt Opt_Arrows          flags
1894                .|. unicodeSyntaxBit  `setBitIf` xopt Opt_UnicodeSyntax   flags
1895                .|. unboxedTuplesBit  `setBitIf` xopt Opt_UnboxedTuples   flags
1896                .|. datatypeContextsBit `setBitIf` xopt Opt_DatatypeContexts flags
1897                .|. transformComprehensionsBit `setBitIf` xopt Opt_TransformListComp flags
1898                .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
1899                .|. alternativeLayoutRuleBit `setBitIf` xopt Opt_AlternativeLayoutRule flags
1900                .|. relaxedLayoutBit  `setBitIf` xopt Opt_RelaxedLayout flags
1901                .|. nondecreasingIndentationBit `setBitIf` xopt Opt_NondecreasingIndentation flags
1902       --
1903       setBitIf :: Int -> Bool -> Int
1904       b `setBitIf` cond | cond      = bit b
1905                         | otherwise = 0
1906
1907 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1908 addWarning option srcspan warning
1909  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1910        let warning' = mkWarnMsg srcspan alwaysQualify warning
1911            ws' = if dopt option d then ws `snocBag` warning' else ws
1912        in POk s{messages=(ws', es)} ()
1913
1914 getMessages :: PState -> Messages
1915 getMessages PState{messages=ms} = ms
1916
1917 getContext :: P [LayoutContext]
1918 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1919
1920 setContext :: [LayoutContext] -> P ()
1921 setContext ctx = P $ \s -> POk s{context=ctx} ()
1922
1923 popContext :: P ()
1924 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1925                               last_len = len, last_loc = last_loc }) ->
1926   case ctx of
1927         (_:tl) -> POk s{ context = tl } ()
1928         []     -> PFailed last_loc (srcParseErr buf len)
1929
1930 -- Push a new layout context at the indentation of the last token read.
1931 -- This is only used at the outer level of a module when the 'module'
1932 -- keyword is missing.
1933 pushCurrentContext :: P ()
1934 pushCurrentContext = P $ \ s@PState{ last_loc=loc, context=ctx } ->
1935     POk s{context = Layout (srcSpanStartCol loc) : ctx} ()
1936
1937 getOffside :: P Ordering
1938 getOffside = P $ \s@PState{last_loc=loc, context=stk} ->
1939                 let offs = srcSpanStartCol loc in
1940                 let ord = case stk of
1941                         (Layout n:_) -> --trace ("layout: " ++ show n ++ ", offs: " ++ show offs) $
1942                                         compare offs n
1943                         _            -> GT
1944                 in POk s ord
1945
1946 -- ---------------------------------------------------------------------------
1947 -- Construct a parse error
1948
1949 srcParseErr
1950   :: StringBuffer       -- current buffer (placed just after the last token)
1951   -> Int                -- length of the previous token
1952   -> Message
1953 srcParseErr buf len
1954   = hcat [ if null token
1955              then ptext (sLit "parse error (possibly incorrect indentation)")
1956              else hcat [ptext (sLit "parse error on input "),
1957                         char '`', text token, char '\'']
1958     ]
1959   where token = lexemeToString (offsetBytes (-len) buf) len
1960
1961 -- Report a parse failure, giving the span of the previous token as
1962 -- the location of the error.  This is the entry point for errors
1963 -- detected during parsing.
1964 srcParseFail :: P a
1965 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1966                             last_loc = last_loc } ->
1967     PFailed last_loc (srcParseErr buf len)
1968
1969 -- A lexical error is reported at a particular position in the source file,
1970 -- not over a token range.
1971 lexError :: String -> P a
1972 lexError str = do
1973   loc <- getSrcLoc
1974   (AI end buf) <- getInput
1975   reportLexError loc end buf str
1976
1977 -- -----------------------------------------------------------------------------
1978 -- This is the top-level function: called from the parser each time a
1979 -- new token is to be read from the input.
1980
1981 lexer :: (Located Token -> P a) -> P a
1982 lexer cont = do
1983   alr <- extension alternativeLayoutRule
1984   let lexTokenFun = if alr then lexTokenAlr else lexToken
1985   tok@(L _span _tok__) <- lexTokenFun
1986   --trace ("token: " ++ show _tok__) $ do
1987   cont tok
1988
1989 lexTokenAlr :: P (Located Token)
1990 lexTokenAlr = do mPending <- popPendingImplicitToken
1991                  t <- case mPending of
1992                       Nothing ->
1993                           do mNext <- popNextToken
1994                              t <- case mNext of
1995                                   Nothing -> lexToken
1996                                   Just next -> return next
1997                              alternativeLayoutRuleToken t
1998                       Just t ->
1999                           return t
2000                  setAlrLastLoc (getLoc t)
2001                  case unLoc t of
2002                      ITwhere -> setAlrExpectingOCurly (Just ALRLayoutWhere)
2003                      ITlet   -> setAlrExpectingOCurly (Just ALRLayoutLet)
2004                      ITof    -> setAlrExpectingOCurly (Just ALRLayoutOf)
2005                      ITdo    -> setAlrExpectingOCurly (Just ALRLayoutDo)
2006                      ITmdo   -> setAlrExpectingOCurly (Just ALRLayoutDo)
2007                      ITrec   -> setAlrExpectingOCurly (Just ALRLayoutDo)
2008                      _       -> return ()
2009                  return t
2010
2011 alternativeLayoutRuleToken :: Located Token -> P (Located Token)
2012 alternativeLayoutRuleToken t
2013     = do context <- getALRContext
2014          lastLoc <- getAlrLastLoc
2015          mExpectingOCurly <- getAlrExpectingOCurly
2016          justClosedExplicitLetBlock <- getJustClosedExplicitLetBlock
2017          setJustClosedExplicitLetBlock False
2018          dflags <- getDynFlags
2019          let transitional = xopt Opt_AlternativeLayoutRuleTransitional dflags
2020              thisLoc = getLoc t
2021              thisCol = srcSpanStartCol thisLoc
2022              newLine = (lastLoc == noSrcSpan)
2023                     || (srcSpanStartLine thisLoc > srcSpanEndLine lastLoc)
2024          case (unLoc t, context, mExpectingOCurly) of
2025              -- This case handles a GHC extension to the original H98
2026              -- layout rule...
2027              (ITocurly, _, Just alrLayout) ->
2028                  do setAlrExpectingOCurly Nothing
2029                     let isLet = case alrLayout of
2030                                 ALRLayoutLet -> True
2031                                 _ -> False
2032                     setALRContext (ALRNoLayout (containsCommas ITocurly) isLet : context)
2033                     return t
2034              -- ...and makes this case unnecessary
2035              {-
2036              -- I think our implicit open-curly handling is slightly
2037              -- different to John's, in how it interacts with newlines
2038              -- and "in"
2039              (ITocurly, _, Just _) ->
2040                  do setAlrExpectingOCurly Nothing
2041                     setNextToken t
2042                     lexTokenAlr
2043              -}
2044              (_, ALRLayout _ col : ls, Just expectingOCurly)
2045               | (thisCol > col) ||
2046                 (thisCol == col &&
2047                  isNonDecreasingIntentation expectingOCurly) ->
2048                  do setAlrExpectingOCurly Nothing
2049                     setALRContext (ALRLayout expectingOCurly thisCol : context)
2050                     setNextToken t
2051                     return (L thisLoc ITocurly)
2052               | otherwise ->
2053                  do setAlrExpectingOCurly Nothing
2054                     setPendingImplicitTokens [L lastLoc ITccurly]
2055                     setNextToken t
2056                     return (L lastLoc ITocurly)
2057              (_, _, Just expectingOCurly) ->
2058                  do setAlrExpectingOCurly Nothing
2059                     setALRContext (ALRLayout expectingOCurly thisCol : context)
2060                     setNextToken t
2061                     return (L thisLoc ITocurly)
2062              -- We do the [] cases earlier than in the spec, as we
2063              -- have an actual EOF token
2064              (ITeof, ALRLayout _ _ : ls, _) ->
2065                  do setALRContext ls
2066                     setNextToken t
2067                     return (L thisLoc ITccurly)
2068              (ITeof, _, _) ->
2069                  return t
2070              -- the other ITeof case omitted; general case below covers it
2071              (ITin, _, _)
2072               | justClosedExplicitLetBlock ->
2073                  return t
2074              (ITin, ALRLayout ALRLayoutLet _ : ls, _)
2075               | newLine ->
2076                  do setPendingImplicitTokens [t]
2077                     setALRContext ls
2078                     return (L thisLoc ITccurly)
2079              -- This next case is to handle a transitional issue:
2080              (ITwhere, ALRLayout _ col : ls, _)
2081               | newLine && thisCol == col && transitional ->
2082                  do addWarning Opt_WarnAlternativeLayoutRuleTransitional
2083                                thisLoc
2084                                (transitionalAlternativeLayoutWarning
2085                                     "`where' clause at the same depth as implicit layout block")
2086                     setALRContext ls
2087                     setNextToken t
2088                     -- Note that we use lastLoc, as we may need to close
2089                     -- more layouts, or give a semicolon
2090                     return (L lastLoc ITccurly)
2091              -- This next case is to handle a transitional issue:
2092              (ITvbar, ALRLayout _ col : ls, _)
2093               | newLine && thisCol == col && transitional ->
2094                  do addWarning Opt_WarnAlternativeLayoutRuleTransitional
2095                                thisLoc
2096                                (transitionalAlternativeLayoutWarning
2097                                     "`|' at the same depth as implicit layout block")
2098                     setALRContext ls
2099                     setNextToken t
2100                     -- Note that we use lastLoc, as we may need to close
2101                     -- more layouts, or give a semicolon
2102                     return (L lastLoc ITccurly)
2103              (_, ALRLayout _ col : ls, _)
2104               | newLine && thisCol == col ->
2105                  do setNextToken t
2106                     return (L thisLoc ITsemi)
2107               | newLine && thisCol < col ->
2108                  do setALRContext ls
2109                     setNextToken t
2110                     -- Note that we use lastLoc, as we may need to close
2111                     -- more layouts, or give a semicolon
2112                     return (L lastLoc ITccurly)
2113              -- We need to handle close before open, as 'then' is both
2114              -- an open and a close
2115              (u, _, _)
2116               | isALRclose u ->
2117                  case context of
2118                  ALRLayout _ _ : ls ->
2119                      do setALRContext ls
2120                         setNextToken t
2121                         return (L thisLoc ITccurly)
2122                  ALRNoLayout _ isLet : ls ->
2123                      do let ls' = if isALRopen u
2124                                      then ALRNoLayout (containsCommas u) False : ls
2125                                      else ls
2126                         setALRContext ls'
2127                         when isLet $ setJustClosedExplicitLetBlock True
2128                         return t
2129                  [] ->
2130                      do let ls = if isALRopen u
2131                                     then [ALRNoLayout (containsCommas u) False]
2132                                     else ls
2133                         setALRContext ls
2134                         -- XXX This is an error in John's code, but
2135                         -- it looks reachable to me at first glance
2136                         return t
2137              (u, _, _)
2138               | isALRopen u ->
2139                  do setALRContext (ALRNoLayout (containsCommas u) False : context)
2140                     return t
2141              (ITin, ALRLayout ALRLayoutLet _ : ls, _) ->
2142                  do setALRContext ls
2143                     setPendingImplicitTokens [t]
2144                     return (L thisLoc ITccurly)
2145              (ITin, ALRLayout _ _ : ls, _) ->
2146                  do setALRContext ls
2147                     setNextToken t
2148                     return (L thisLoc ITccurly)
2149              -- the other ITin case omitted; general case below covers it
2150              (ITcomma, ALRLayout _ _ : ls, _)
2151               | topNoLayoutContainsCommas ls ->
2152                  do setALRContext ls
2153                     setNextToken t
2154                     return (L thisLoc ITccurly)
2155              (ITwhere, ALRLayout ALRLayoutDo _ : ls, _) ->
2156                  do setALRContext ls
2157                     setPendingImplicitTokens [t]
2158                     return (L thisLoc ITccurly)
2159              -- the other ITwhere case omitted; general case below covers it
2160              (_, _, _) -> return t
2161
2162 transitionalAlternativeLayoutWarning :: String -> SDoc
2163 transitionalAlternativeLayoutWarning msg
2164     = text "transitional layout will not be accepted in the future:"
2165    $$ text msg
2166
2167 isALRopen :: Token -> Bool
2168 isALRopen ITcase        = True
2169 isALRopen ITif          = True
2170 isALRopen ITthen        = True
2171 isALRopen IToparen      = True
2172 isALRopen ITobrack      = True
2173 isALRopen ITocurly      = True
2174 -- GHC Extensions:
2175 isALRopen IToubxparen   = True
2176 isALRopen ITparenEscape = True
2177 isALRopen _             = False
2178
2179 isALRclose :: Token -> Bool
2180 isALRclose ITof     = True
2181 isALRclose ITthen   = True
2182 isALRclose ITelse   = True
2183 isALRclose ITcparen = True
2184 isALRclose ITcbrack = True
2185 isALRclose ITccurly = True
2186 -- GHC Extensions:
2187 isALRclose ITcubxparen = True
2188 isALRclose _        = False
2189
2190 isNonDecreasingIntentation :: ALRLayout -> Bool
2191 isNonDecreasingIntentation ALRLayoutDo = True
2192 isNonDecreasingIntentation _           = False
2193
2194 containsCommas :: Token -> Bool
2195 containsCommas IToparen = True
2196 containsCommas ITobrack = True
2197 -- John doesn't have {} as containing commas, but records contain them,
2198 -- which caused a problem parsing Cabal's Distribution.Simple.InstallDirs
2199 -- (defaultInstallDirs).
2200 containsCommas ITocurly = True
2201 -- GHC Extensions:
2202 containsCommas IToubxparen = True
2203 containsCommas _        = False
2204
2205 topNoLayoutContainsCommas :: [ALRContext] -> Bool
2206 topNoLayoutContainsCommas [] = False
2207 topNoLayoutContainsCommas (ALRLayout _ _ : ls) = topNoLayoutContainsCommas ls
2208 topNoLayoutContainsCommas (ALRNoLayout b _ : _) = b
2209
2210 lexToken :: P (Located Token)
2211 lexToken = do
2212   inp@(AI loc1 buf) <- getInput
2213   sc <- getLexState
2214   exts <- getExts
2215   case alexScanUser exts inp sc of
2216     AlexEOF -> do
2217         let span = mkSrcSpan loc1 loc1
2218         setLastToken span 0
2219         return (L span ITeof)
2220     AlexError (AI loc2 buf) ->
2221         reportLexError loc1 loc2 buf "lexical error"
2222     AlexSkip inp2 _ -> do
2223         setInput inp2
2224         lexToken
2225     AlexToken inp2@(AI end buf2) _ t -> do
2226         setInput inp2
2227         let span = mkSrcSpan loc1 end
2228         let bytes = byteDiff buf buf2
2229         span `seq` setLastToken span bytes
2230         t span buf bytes
2231
2232 reportLexError :: SrcLoc -> SrcLoc -> StringBuffer -> [Char] -> P a
2233 reportLexError loc1 loc2 buf str
2234   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
2235   | otherwise =
2236   let
2237         c = fst (nextChar buf)
2238   in
2239   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
2240     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
2241     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
2242
2243 lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token]
2244 lexTokenStream buf loc dflags = unP go initState
2245     where dflags' = dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream
2246           initState = mkPState dflags' buf loc
2247           go = do
2248             ltok <- lexer return
2249             case ltok of
2250               L _ ITeof -> return []
2251               _ -> liftM (ltok:) go
2252
2253 linePrags = Map.singleton "line" (begin line_prag2)
2254
2255 fileHeaderPrags = Map.fromList([("options", lex_string_prag IToptions_prag),
2256                                  ("options_ghc", lex_string_prag IToptions_prag),
2257                                  ("options_haddock", lex_string_prag ITdocOptions),
2258                                  ("language", token ITlanguage_prag),
2259                                  ("include", lex_string_prag ITinclude_prag)])
2260
2261 ignoredPrags = Map.fromList (map ignored pragmas)
2262                where ignored opt = (opt, nested_comment lexToken)
2263                      impls = ["hugs", "nhc98", "jhc", "yhc", "catch", "derive"]
2264                      options_pragmas = map ("options_" ++) impls
2265                      -- CFILES is a hugs-only thing.
2266                      pragmas = options_pragmas ++ ["cfiles", "contract"]
2267
2268 oneWordPrags = Map.fromList([("rules", rulePrag),
2269                            ("inline", token (ITinline_prag Inline FunLike)),
2270                            ("inlinable", token (ITinline_prag Inlinable FunLike)),
2271                            ("inlineable", token (ITinline_prag Inlinable FunLike)),
2272                                           -- Spelling variant
2273                            ("notinline", token (ITinline_prag NoInline FunLike)),
2274                            ("specialize", token ITspec_prag),
2275                            ("source", token ITsource_prag),
2276                            ("warning", token ITwarning_prag),
2277                            ("deprecated", token ITdeprecated_prag),
2278                            ("scc", token ITscc_prag),
2279                            ("generated", token ITgenerated_prag),
2280                            ("core", token ITcore_prag),
2281                            ("unpack", token ITunpack_prag),
2282                            ("ann", token ITann_prag),
2283                            ("vectorize", token ITvect_prag)])
2284
2285 twoWordPrags = Map.fromList([("inline conlike", token (ITinline_prag Inline ConLike)),
2286                              ("notinline conlike", token (ITinline_prag NoInline ConLike)),
2287                              ("specialize inline", token (ITspec_inline_prag True)),
2288                              ("specialize notinline", token (ITspec_inline_prag False)),
2289                              ("vectorize scalar", token ITvect_scalar_prag)])
2290
2291 dispatch_pragmas :: Map String Action -> Action
2292 dispatch_pragmas prags span buf len = case Map.lookup (clean_pragma (lexemeToString buf len)) prags of
2293                                        Just found -> found span buf len
2294                                        Nothing -> lexError "unknown pragma"
2295
2296 known_pragma :: Map String Action -> AlexAccPred Int
2297 known_pragma prags _ _ len (AI _ buf) = (isJust $ Map.lookup (clean_pragma (lexemeToString (offsetBytes (- len) buf) len)) prags)
2298                                           && (nextCharIs buf (\c -> not (isAlphaNum c || c == '_')))
2299
2300 clean_pragma :: String -> String
2301 clean_pragma prag = canon_ws (map toLower (unprefix prag))
2302                     where unprefix prag' = case stripPrefix "{-#" prag' of
2303                                              Just rest -> rest
2304                                              Nothing -> prag'
2305                           canonical prag' = case prag' of
2306                                               "noinline" -> "notinline"
2307                                               "specialise" -> "specialize"
2308                                               "vectorise" -> "vectorize"
2309                                               "constructorlike" -> "conlike"
2310                                               _ -> prag'
2311                           canon_ws s = unwords (map canonical (words s))
2312 }