compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - parsing integers is a bit slow
  16 --    - readRational is a bit slow
  17 --
  18 --   Known bugs, that were also in the previous version:
  19 --    - M... should be 3 tokens, not 1.
  20 --    - pragma-end should be only valid in a pragma
  21
  22 --   qualified operator NOTES.
  23 --
  24 --   - If M.(+) is a single lexeme, then..
  25 --     - Probably (+) should be a single lexeme too, for consistency.
  26 --       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
  27 --     - But we have to rule out reserved operators, otherwise (..) becomes
  28 --       a different lexeme.
  29 --     - Should we therefore also rule out reserved operators in the qualified
  30 --       form?  This is quite difficult to achieve.  We don't do it for
  31 --       qualified varids.
  32
  33 {
  34 -- XXX The above flags turn off warnings in the generated code:
  35 {-# OPTIONS_GHC -fno-warn-unused-matches #-}
  36 {-# OPTIONS_GHC -fno-warn-unused-binds #-}
  37 {-# OPTIONS_GHC -fno-warn-unused-imports #-}
  38 {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
  39 -- But alex still generates some code that causes the "lazy unlifted bindings"
  40 -- warning, and old compilers don't know about it so we can't easily turn
  41 -- it off, so for now we use the sledge hammer:
  42 {-# OPTIONS_GHC -w #-}
  43
  44 {-# OPTIONS_GHC -funbox-strict-fields #-}
  45
  46 module Lexer (
  47    Token(..), lexer, pragState, mkPState, PState(..),
  48    P(..), ParseResult(..), getSrcLoc,
  49    getPState, getDynFlags, withThisPackage,
  50    failLocMsgP, failSpanMsgP, srcParseFail,
  51    getMessages,
  52    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  53    getLexState, popLexState, pushLexState,
  54    extension, standaloneDerivingEnabled, bangPatEnabled,
  55    addWarning,
  56    lexTokenStream
  57   ) where
  58
  59 import Bag
  60 import ErrUtils
  61 import Outputable
  62 import StringBuffer
  63 import FastString
  64 import SrcLoc
  65 import UniqFM
  66 import DynFlags
  67 import Module
  68 import Ctype
  69 import Util             ( readRational )
  70
  71 import Control.Monad
  72 import Data.Bits
  73 import Data.Char
  74 import Data.List
  75 import Data.Maybe
  76 import Data.Map (Map)
  77 import qualified Data.Map as Map
  78 import Data.Ratio
  79 }
  80
  81 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  82 $whitechar   = [\ \n\r\f\v $unispace]
  83 $white_no_nl = $whitechar # \n
  84 $tab         = \t
  85
  86 $ascdigit  = 0-9
  87 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  88 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  89 $digit     = [$ascdigit $unidigit]
  90
  91 $special   = [\(\)\,\;\[\]\`\{\}]
  92 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  93 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  94 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  95
  96 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
  97 $asclarge  = [A-Z]
  98 $large     = [$asclarge $unilarge]
  99
 100 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
 101 $ascsmall  = [a-z]
 102 $small     = [$ascsmall $unismall \_]
 103
 104 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
 105 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
 106
 107 $octit     = 0-7
 108 $hexit     = [$decdigit A-F a-f]
 109 $symchar   = [$symbol \:]
 110 $nl        = [\n\r]
 111 $idchar    = [$small $large $digit \']
 112
 113 $pragmachar = [$small $large $digit]
 114
 115 $docsym    = [\| \^ \* \$]
 116
 117 @varid     = $small $idchar*
 118 @conid     = $large $idchar*
 119
 120 @varsym    = $symbol $symchar*
 121 @consym    = \: $symchar*
 122
 123 @decimal     = $decdigit+
 124 @octal       = $octit+
 125 @hexadecimal = $hexit+
 126 @exponent    = [eE] [\-\+]? @decimal
 127
 128 -- we support the hierarchical module name extension:
 129 @qual = (@conid \.)+
 130
 131 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 132
 133 -- normal signed numerical literals can only be explicitly negative,
 134 -- not explicitly positive (contrast @exponent)
 135 @negative = \-
 136 @signed = @negative ?
 137
 138 haskell :-
 139
 140 -- everywhere: skip whitespace and comments
 141 $white_no_nl+                           ;
 142 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 143
 144 -- Everywhere: deal with nested comments.  We explicitly rule out
 145 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 146 -- (this can happen even though pragmas will normally take precedence due to
 147 -- longest-match, because pragmas aren't valid in every state, but comments
 148 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 149 -- set.
 150
 151 "{-" / { isNormalComment } { nested_comment lexToken }
 152
 153 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 154 -- more dashes followed by a symbol should be parsed as a varsym, so we
 155 -- have to exclude those.
 156
 157 -- Since Haddock comments aren't valid in every state, we need to rule them
 158 -- out here.
 159
 160 -- The following two rules match comments that begin with two dashes, but
 161 -- continue with a different character. The rules test that this character
 162 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 163 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 164 -- have a Haddock comment). The rules then munch the rest of the line.
 165
 166 "-- " ~[$docsym \#] .* { lineCommentToken }
 167 "--" [^$symbol : \ ] .* { lineCommentToken }
 168
 169 -- Next, match Haddock comments if no -haddock flag
 170
 171 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
 172
 173 -- Now, when we've matched comments that begin with 2 dashes and continue
 174 -- with a different character, we need to match comments that begin with three
 175 -- or more dashes (which clearly can't be Haddock comments). We only need to
 176 -- make sure that the first non-dash character isn't a symbol, and munch the
 177 -- rest of the line.
 178
 179 "---"\-* [^$symbol :] .* { lineCommentToken }
 180
 181 -- Since the previous rules all match dashes followed by at least one
 182 -- character, we also need to match a whole line filled with just dashes.
 183
 184 "--"\-* / { atEOL } { lineCommentToken }
 185
 186 -- We need this rule since none of the other single line comment rules
 187 -- actually match this case.
 188
 189 "-- " / { atEOL } { lineCommentToken }
 190
 191 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 192 -- blank lines) until we find a non-whitespace character, then do layout
 193 -- processing.
 194 --
 195 -- One slight wibble here: what if the line begins with {-#? In
 196 -- theory, we have to lex the pragma to see if it's one we recognise,
 197 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 198 -- as a nested comment.  We don't bother with this: if the line begins
 199 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 200 <bol> {
 201   \n                                    ;
 202   ^\# (line)?                           { begin line_prag1 }
 203   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 204   ^\# \! .* \n                          ; -- #!, for scripts
 205   ()                                    { do_bol }
 206 }
 207
 208 -- after a layout keyword (let, where, do, of), we begin a new layout
 209 -- context if the curly brace is missing.
 210 -- Careful! This stuff is quite delicate.
 211 <layout, layout_do> {
 212   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 213         -- we might encounter {-# here, but {- has been handled already
 214   \n                                    ;
 215   ^\# (line)?                           { begin line_prag1 }
 216 }
 217
 218 -- do is treated in a subtly different way, see new_layout_context
 219 <layout>    ()                          { new_layout_context True }
 220 <layout_do> ()                          { new_layout_context False }
 221
 222 -- after a new layout context which was found to be to the left of the
 223 -- previous context, we have generated a '{' token, and we now need to
 224 -- generate a matching '}' token.
 225 <layout_left>  ()                       { do_layout_left }
 226
 227 <0,option_prags> \n                             { begin bol }
 228
 229 "{-#" $whitechar* $pragmachar+ / { known_pragma linePrags }
 230                                 { dispatch_pragmas linePrags }
 231
 232 -- single-line line pragmas, of the form
 233 --    # <line> "<file>" <extra-stuff> \n
 234 <line_prag1> $decdigit+                 { setLine line_prag1a }
 235 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 236 <line_prag1b> .*                        { pop }
 237
 238 -- Haskell-style line pragmas, of the form
 239 --    {-# LINE <line> "<file>" #-}
 240 <line_prag2> $decdigit+                 { setLine line_prag2a }
 241 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 242 <line_prag2b> "#-}"|"-}"                { pop }
 243    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 244    -- with older versions of GHC which generated these.
 245
 246 <0,option_prags> {
 247   "{-#" $whitechar* $pragmachar+
 248         $whitechar+ $pragmachar+ / { known_pragma twoWordPrags }
 249                                  { dispatch_pragmas twoWordPrags }
 250
 251   "{-#" $whitechar* $pragmachar+ / { known_pragma oneWordPrags }
 252                                  { dispatch_pragmas oneWordPrags }
 253
 254   -- We ignore all these pragmas, but don't generate a warning for them
 255   "{-#" $whitechar* $pragmachar+ / { known_pragma ignoredPrags }
 256                                  { dispatch_pragmas ignoredPrags }
 257
 258   -- ToDo: should only be valid inside a pragma:
 259   "#-}"                                 { endPrag }
 260 }
 261
 262 <option_prags> {
 263   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 264                                    { dispatch_pragmas fileHeaderPrags }
 265
 266   "-- #"                                 { multiline_doc_comment }
 267 }
 268
 269 <0> {
 270   -- In the "0" mode we ignore these pragmas
 271   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 272                      { nested_comment lexToken }
 273 }
 274
 275 <0> {
 276   "-- #" .* { lineCommentToken }
 277 }
 278
 279 <0,option_prags> {
 280   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 281                     (nested_comment lexToken) }
 282 }
 283
 284 -- '0' state: ordinary lexemes
 285
 286 -- Haddock comments
 287
 288 <0> {
 289   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 290   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 291 }
 292
 293 -- "special" symbols
 294
 295 <0> {
 296   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 297   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 298 }
 299
 300 <0> {
 301   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 302   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 303   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 304   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 305   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 306   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 307   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 308   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 309
 310   "[" @varid "|"  / { ifExtension qqEnabled }
 311                      { lex_quasiquote_tok }
 312 }
 313
 314 <0> {
 315   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 316                                         { special IToparenbar }
 317   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 318 }
 319
 320 <0> {
 321   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 322 }
 323
 324 <0> {
 325   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 326          { token IToubxparen }
 327   "#)" / { ifExtension unboxedTuplesEnabled }
 328          { token ITcubxparen }
 329 }
 330
 331 <0> {
 332   "{|" / { ifExtension genericsEnabled } { token ITocurlybar }
 333   "|}" / { ifExtension genericsEnabled } { token ITccurlybar }
 334 }
 335
 336 <0,option_prags> {
 337   \(                                    { special IToparen }
 338   \)                                    { special ITcparen }
 339   \[                                    { special ITobrack }
 340   \]                                    { special ITcbrack }
 341   \,                                    { special ITcomma }
 342   \;                                    { special ITsemi }
 343   \`                                    { special ITbackquote }
 344
 345   \{                                    { open_brace }
 346   \}                                    { close_brace }
 347 }
 348
 349 <0,option_prags> {
 350   @qual @varid                  { idtoken qvarid }
 351   @qual @conid                  { idtoken qconid }
 352   @varid                        { varid }
 353   @conid                        { idtoken conid }
 354 }
 355
 356 <0> {
 357   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 358   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 359   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 360   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 361 }
 362
 363 -- ToDo: - move `var` and (sym) into lexical syntax?
 364 --       - remove backquote from $special?
 365 <0> {
 366   @qual @varsym       / { ifExtension oldQualOps } { idtoken qvarsym }
 367   @qual @consym       / { ifExtension oldQualOps } { idtoken qconsym }
 368   @qual \( @varsym \) / { ifExtension newQualOps } { idtoken prefixqvarsym }
 369   @qual \( @consym \) / { ifExtension newQualOps } { idtoken prefixqconsym }
 370   @varsym                                          { varsym }
 371   @consym                                          { consym }
 372 }
 373
 374 -- For the normal boxed literals we need to be careful
 375 -- when trying to be close to Haskell98
 376 <0> {
 377   -- Normal integral literals (:: Num a => a, from Integer)
 378   @decimal           { tok_num positive 0 0 decimal }
 379   0[oO] @octal       { tok_num positive 2 2 octal }
 380   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 381
 382   -- Normal rational literals (:: Fractional a => a, from Rational)
 383   @floating_point    { strtoken tok_float }
 384 }
 385
 386 <0> {
 387   -- Unboxed ints (:: Int#) and words (:: Word#)
 388   -- It's simpler (and faster?) to give separate cases to the negatives,
 389   -- especially considering octal/hexadecimal prefixes.
 390   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 391   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 392   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 393   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 394   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 395   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 396
 397   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 398   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 399   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 400
 401   -- Unboxed floats and doubles (:: Float#, :: Double#)
 402   -- prim_{float,double} work with signed literals
 403   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 404   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 405 }
 406
 407 -- Strings and chars are lexed by hand-written code.  The reason is
 408 -- that even if we recognise the string or char here in the regex
 409 -- lexer, we would still have to parse the string afterward in order
 410 -- to convert it to a String.
 411 <0> {
 412   \'                            { lex_char_tok }
 413   \"                            { lex_string_tok }
 414 }
 415
 416 {
 417 -- -----------------------------------------------------------------------------
 418 -- The token type
 419
 420 data Token
 421   = ITas                        -- Haskell keywords
 422   | ITcase
 423   | ITclass
 424   | ITdata
 425   | ITdefault
 426   | ITderiving
 427   | ITdo
 428   | ITelse
 429   | IThiding
 430   | ITif
 431   | ITimport
 432   | ITin
 433   | ITinfix
 434   | ITinfixl
 435   | ITinfixr
 436   | ITinstance
 437   | ITlet
 438   | ITmodule
 439   | ITnewtype
 440   | ITof
 441   | ITqualified
 442   | ITthen
 443   | ITtype
 444   | ITwhere
 445   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 446
 447   | ITforall                    -- GHC extension keywords
 448   | ITforeign
 449   | ITexport
 450   | ITlabel
 451   | ITdynamic
 452   | ITsafe
 453   | ITthreadsafe
 454   | ITunsafe
 455   | ITstdcallconv
 456   | ITccallconv
 457   | ITprimcallconv
 458   | ITmdo
 459   | ITfamily
 460   | ITgroup
 461   | ITby
 462   | ITusing
 463
 464         -- Pragmas
 465   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 466   | ITinline_conlike_prag Bool  -- same
 467   | ITspec_prag                 -- SPECIALISE
 468   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 469   | ITsource_prag
 470   | ITrules_prag
 471   | ITwarning_prag
 472   | ITdeprecated_prag
 473   | ITline_prag
 474   | ITscc_prag
 475   | ITgenerated_prag
 476   | ITcore_prag                 -- hdaume: core annotations
 477   | ITunpack_prag
 478   | ITann_prag
 479   | ITclose_prag
 480   | IToptions_prag String
 481   | ITinclude_prag String
 482   | ITlanguage_prag
 483
 484   | ITdotdot                    -- reserved symbols
 485   | ITcolon
 486   | ITdcolon
 487   | ITequal
 488   | ITlam
 489   | ITvbar
 490   | ITlarrow
 491   | ITrarrow
 492   | ITat
 493   | ITtilde
 494   | ITdarrow
 495   | ITminus
 496   | ITbang
 497   | ITstar
 498   | ITdot
 499
 500   | ITbiglam                    -- GHC-extension symbols
 501
 502   | ITocurly                    -- special symbols
 503   | ITccurly
 504   | ITocurlybar                 -- {|, for type applications
 505   | ITccurlybar                 -- |}, for type applications
 506   | ITvocurly
 507   | ITvccurly
 508   | ITobrack
 509   | ITopabrack                  -- [:, for parallel arrays with -XParr
 510   | ITcpabrack                  -- :], for parallel arrays with -XParr
 511   | ITcbrack
 512   | IToparen
 513   | ITcparen
 514   | IToubxparen
 515   | ITcubxparen
 516   | ITsemi
 517   | ITcomma
 518   | ITunderscore
 519   | ITbackquote
 520
 521   | ITvarid   FastString        -- identifiers
 522   | ITconid   FastString
 523   | ITvarsym  FastString
 524   | ITconsym  FastString
 525   | ITqvarid  (FastString,FastString)
 526   | ITqconid  (FastString,FastString)
 527   | ITqvarsym (FastString,FastString)
 528   | ITqconsym (FastString,FastString)
 529   | ITprefixqvarsym (FastString,FastString)
 530   | ITprefixqconsym (FastString,FastString)
 531
 532   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 533
 534   | ITchar       Char
 535   | ITstring     FastString
 536   | ITinteger    Integer
 537   | ITrational   Rational
 538
 539   | ITprimchar   Char
 540   | ITprimstring FastString
 541   | ITprimint    Integer
 542   | ITprimword   Integer
 543   | ITprimfloat  Rational
 544   | ITprimdouble Rational
 545
 546   -- Template Haskell extension tokens
 547   | ITopenExpQuote              --  [| or [e|
 548   | ITopenPatQuote              --  [p|
 549   | ITopenDecQuote              --  [d|
 550   | ITopenTypQuote              --  [t|
 551   | ITcloseQuote                --  |]
 552   | ITidEscape   FastString     --  $x
 553   | ITparenEscape               --  $(
 554   | ITvarQuote                  --  '
 555   | ITtyQuote                   --  ''
 556   | ITquasiQuote (FastString,FastString,SrcSpan) --  [:...|...|]
 557
 558   -- Arrow notation extension
 559   | ITproc
 560   | ITrec
 561   | IToparenbar                 --  (|
 562   | ITcparenbar                 --  |)
 563   | ITlarrowtail                --  -<
 564   | ITrarrowtail                --  >-
 565   | ITLarrowtail                --  -<<
 566   | ITRarrowtail                --  >>-
 567
 568   | ITunknown String            -- Used when the lexer can't make sense of it
 569   | ITeof                       -- end of file token
 570
 571   -- Documentation annotations
 572   | ITdocCommentNext  String     -- something beginning '-- |'
 573   | ITdocCommentPrev  String     -- something beginning '-- ^'
 574   | ITdocCommentNamed String     -- something beginning '-- $'
 575   | ITdocSection      Int String -- a section heading
 576   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 577   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 578   | ITlineComment     String     -- comment starting by "--"
 579   | ITblockComment    String     -- comment in {- -}
 580
 581 #ifdef DEBUG
 582   deriving Show -- debugging
 583 #endif
 584
 585 {-
 586 isSpecial :: Token -> Bool
 587 -- If we see M.x, where x is a keyword, but
 588 -- is special, we treat is as just plain M.x,
 589 -- not as a keyword.
 590 isSpecial ITas          = True
 591 isSpecial IThiding      = True
 592 isSpecial ITqualified   = True
 593 isSpecial ITforall      = True
 594 isSpecial ITexport      = True
 595 isSpecial ITlabel       = True
 596 isSpecial ITdynamic     = True
 597 isSpecial ITsafe        = True
 598 isSpecial ITthreadsafe  = True
 599 isSpecial ITunsafe      = True
 600 isSpecial ITccallconv   = True
 601 isSpecial ITstdcallconv = True
 602 isSpecial ITprimcallconv = True
 603 isSpecial ITmdo         = True
 604 isSpecial ITfamily      = True
 605 isSpecial ITgroup   = True
 606 isSpecial ITby      = True
 607 isSpecial ITusing   = True
 608 isSpecial _             = False
 609 -}
 610
 611 -- the bitmap provided as the third component indicates whether the
 612 -- corresponding extension keyword is valid under the extension options
 613 -- provided to the compiler; if the extension corresponding to *any* of the
 614 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 615 -- facilitates using a keyword in two different extensions that can be
 616 -- activated independently)
 617 --
 618 reservedWordsFM :: UniqFM (Token, Int)
 619 reservedWordsFM = listToUFM $
 620         map (\(x, y, z) -> (mkFastString x, (y, z)))
 621        [( "_",          ITunderscore,   0 ),
 622         ( "as",         ITas,           0 ),
 623         ( "case",       ITcase,         0 ),
 624         ( "class",      ITclass,        0 ),
 625         ( "data",       ITdata,         0 ),
 626         ( "default",    ITdefault,      0 ),
 627         ( "deriving",   ITderiving,     0 ),
 628         ( "do",         ITdo,           0 ),
 629         ( "else",       ITelse,         0 ),
 630         ( "hiding",     IThiding,       0 ),
 631         ( "if",         ITif,           0 ),
 632         ( "import",     ITimport,       0 ),
 633         ( "in",         ITin,           0 ),
 634         ( "infix",      ITinfix,        0 ),
 635         ( "infixl",     ITinfixl,       0 ),
 636         ( "infixr",     ITinfixr,       0 ),
 637         ( "instance",   ITinstance,     0 ),
 638         ( "let",        ITlet,          0 ),
 639         ( "module",     ITmodule,       0 ),
 640         ( "newtype",    ITnewtype,      0 ),
 641         ( "of",         ITof,           0 ),
 642         ( "qualified",  ITqualified,    0 ),
 643         ( "then",       ITthen,         0 ),
 644         ( "type",       ITtype,         0 ),
 645         ( "where",      ITwhere,        0 ),
 646         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 647
 648     ( "forall", ITforall,        bit explicitForallBit .|. bit inRulePragBit),
 649         ( "mdo",        ITmdo,           bit recursiveDoBit),
 650         ( "family",     ITfamily,        bit tyFamBit),
 651     ( "group",  ITgroup,     bit transformComprehensionsBit),
 652     ( "by",     ITby,        bit transformComprehensionsBit),
 653     ( "using",  ITusing,     bit transformComprehensionsBit),
 654
 655         ( "foreign",    ITforeign,       bit ffiBit),
 656         ( "export",     ITexport,        bit ffiBit),
 657         ( "label",      ITlabel,         bit ffiBit),
 658         ( "dynamic",    ITdynamic,       bit ffiBit),
 659         ( "safe",       ITsafe,          bit ffiBit),
 660         ( "threadsafe", ITthreadsafe,    bit ffiBit),  -- ToDo: remove
 661         ( "unsafe",     ITunsafe,        bit ffiBit),
 662         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 663         ( "ccall",      ITccallconv,     bit ffiBit),
 664         ( "prim",       ITprimcallconv,  bit ffiBit),
 665
 666         ( "rec",        ITrec,           bit recBit),
 667         ( "proc",       ITproc,          bit arrowsBit)
 668      ]
 669
 670 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 671 reservedSymsFM = listToUFM $
 672     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 673       [ ("..",  ITdotdot,   always)
 674         -- (:) is a reserved op, meaning only list cons
 675        ,(":",   ITcolon,    always)
 676        ,("::",  ITdcolon,   always)
 677        ,("=",   ITequal,    always)
 678        ,("\\",  ITlam,      always)
 679        ,("|",   ITvbar,     always)
 680        ,("<-",  ITlarrow,   always)
 681        ,("->",  ITrarrow,   always)
 682        ,("@",   ITat,       always)
 683        ,("~",   ITtilde,    always)
 684        ,("=>",  ITdarrow,   always)
 685        ,("-",   ITminus,    always)
 686        ,("!",   ITbang,     always)
 687
 688         -- For data T (a::*) = MkT
 689        ,("*", ITstar, always) -- \i -> kindSigsEnabled i || tyFamEnabled i)
 690         -- For 'forall a . t'
 691        ,(".", ITdot,  always) -- \i -> explicitForallEnabled i || inRulePrag i)
 692
 693        ,("-<",  ITlarrowtail, arrowsEnabled)
 694        ,(">-",  ITrarrowtail, arrowsEnabled)
 695        ,("-<<", ITLarrowtail, arrowsEnabled)
 696        ,(">>-", ITRarrowtail, arrowsEnabled)
 697
 698        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 699        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 700        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 701                                 explicitForallEnabled i)
 702        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 703        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 704
 705        ,("⤙",   ITlarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 706        ,("⤚",   ITrarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 707        ,("⤛",   ITLarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 708        ,("⤜",   ITRarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 709
 710        ,("★", ITstar, unicodeSyntaxEnabled)
 711
 712         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 713         -- form part of a large operator.  This would let us have a better
 714         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 715        ]
 716
 717 -- -----------------------------------------------------------------------------
 718 -- Lexer actions
 719
 720 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 721
 722 special :: Token -> Action
 723 special tok span _buf _len = return (L span tok)
 724
 725 token, layout_token :: Token -> Action
 726 token t span _buf _len = return (L span t)
 727 layout_token t span _buf _len = pushLexState layout >> return (L span t)
 728
 729 idtoken :: (StringBuffer -> Int -> Token) -> Action
 730 idtoken f span buf len = return (L span $! (f buf len))
 731
 732 skip_one_varid :: (FastString -> Token) -> Action
 733 skip_one_varid f span buf len
 734   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 735
 736 strtoken :: (String -> Token) -> Action
 737 strtoken f span buf len =
 738   return (L span $! (f $! lexemeToString buf len))
 739
 740 init_strtoken :: Int -> (String -> Token) -> Action
 741 -- like strtoken, but drops the last N character(s)
 742 init_strtoken drop f span buf len =
 743   return (L span $! (f $! lexemeToString buf (len-drop)))
 744
 745 begin :: Int -> Action
 746 begin code _span _str _len = do pushLexState code; lexToken
 747
 748 pop :: Action
 749 pop _span _buf _len = do _ <- popLexState
 750                          lexToken
 751
 752 pop_and :: Action -> Action
 753 pop_and act span buf len = do _ <- popLexState
 754                               act span buf len
 755
 756 {-# INLINE nextCharIs #-}
 757 nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
 758 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 759
 760 notFollowedBy :: Char -> AlexAccPred Int
 761 notFollowedBy char _ _ _ (AI _ buf)
 762   = nextCharIs buf (/=char)
 763
 764 notFollowedBySymbol :: AlexAccPred Int
 765 notFollowedBySymbol _ _ _ (AI _ buf)
 766   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 767
 768 -- We must reject doc comments as being ordinary comments everywhere.
 769 -- In some cases the doc comment will be selected as the lexeme due to
 770 -- maximal munch, but not always, because the nested comment rule is
 771 -- valid in all states, but the doc-comment rules are only valid in
 772 -- the non-layout states.
 773 isNormalComment :: AlexAccPred Int
 774 isNormalComment bits _ _ (AI _ buf)
 775   | haddockEnabled bits = notFollowedByDocOrPragma
 776   | otherwise           = nextCharIs buf (/='#')
 777   where
 778     notFollowedByDocOrPragma
 779        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 780
 781 spaceAndP :: StringBuffer -> (StringBuffer -> Bool) -> Bool
 782 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 783
 784 {-
 785 haddockDisabledAnd p bits _ _ (AI _ buf)
 786   = if haddockEnabled bits then False else (p buf)
 787 -}
 788
 789 atEOL :: AlexAccPred Int
 790 atEOL _ _ _ (AI _ buf) = atEnd buf || currentChar buf == '\n'
 791
 792 ifExtension :: (Int -> Bool) -> AlexAccPred Int
 793 ifExtension pred bits _ _ _ = pred bits
 794
 795 multiline_doc_comment :: Action
 796 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 797   where
 798     worker commentAcc input docType oneLine = case alexGetChar input of
 799       Just ('\n', input')
 800         | oneLine -> docCommentEnd input commentAcc docType buf span
 801         | otherwise -> case checkIfCommentLine input' of
 802           Just input -> worker ('\n':commentAcc) input docType False
 803           Nothing -> docCommentEnd input commentAcc docType buf span
 804       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 805       Nothing -> docCommentEnd input commentAcc docType buf span
 806
 807     checkIfCommentLine input = check (dropNonNewlineSpace input)
 808       where
 809         check input = case alexGetChar input of
 810           Just ('-', input) -> case alexGetChar input of
 811             Just ('-', input) -> case alexGetChar input of
 812               Just (c, _) | c /= '-' -> Just input
 813               _ -> Nothing
 814             _ -> Nothing
 815           _ -> Nothing
 816
 817         dropNonNewlineSpace input = case alexGetChar input of
 818           Just (c, input')
 819             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 820             | otherwise -> input
 821           Nothing -> input
 822
 823 lineCommentToken :: Action
 824 lineCommentToken span buf len = do
 825   b <- extension rawTokenStreamEnabled
 826   if b then strtoken ITlineComment span buf len else lexToken
 827
 828 {-
 829   nested comments require traversing by hand, they can't be parsed
 830   using regular expressions.
 831 -}
 832 nested_comment :: P (Located Token) -> Action
 833 nested_comment cont span _str _len = do
 834   input <- getInput
 835   go "" (1::Int) input
 836   where
 837     go commentAcc 0 input = do setInput input
 838                                b <- extension rawTokenStreamEnabled
 839                                if b
 840                                  then docCommentEnd input commentAcc ITblockComment _str span
 841                                  else cont
 842     go commentAcc n input = case alexGetChar input of
 843       Nothing -> errBrace input span
 844       Just ('-',input) -> case alexGetChar input of
 845         Nothing  -> errBrace input span
 846         Just ('\125',input) -> go commentAcc (n-1) input
 847         Just (_,_)          -> go ('-':commentAcc) n input
 848       Just ('\123',input) -> case alexGetChar input of
 849         Nothing  -> errBrace input span
 850         Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
 851         Just (_,_)       -> go ('\123':commentAcc) n input
 852       Just (c,input) -> go (c:commentAcc) n input
 853
 854 nested_doc_comment :: Action
 855 nested_doc_comment span buf _len = withLexedDocType (go "")
 856   where
 857     go commentAcc input docType _ = case alexGetChar input of
 858       Nothing -> errBrace input span
 859       Just ('-',input) -> case alexGetChar input of
 860         Nothing -> errBrace input span
 861         Just ('\125',input) ->
 862           docCommentEnd input commentAcc docType buf span
 863         Just (_,_) -> go ('-':commentAcc) input docType False
 864       Just ('\123', input) -> case alexGetChar input of
 865         Nothing  -> errBrace input span
 866         Just ('-',input) -> do
 867           setInput input
 868           let cont = do input <- getInput; go commentAcc input docType False
 869           nested_comment cont span buf _len
 870         Just (_,_) -> go ('\123':commentAcc) input docType False
 871       Just (c,input) -> go (c:commentAcc) input docType False
 872
 873 withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (Located Token))
 874                  -> P (Located Token)
 875 withLexedDocType lexDocComment = do
 876   input@(AI _ buf) <- getInput
 877   case prevChar buf ' ' of
 878     '|' -> lexDocComment input ITdocCommentNext False
 879     '^' -> lexDocComment input ITdocCommentPrev False
 880     '$' -> lexDocComment input ITdocCommentNamed False
 881     '*' -> lexDocSection 1 input
 882     '#' -> lexDocComment input ITdocOptionsOld False
 883     _ -> panic "withLexedDocType: Bad doc type"
 884  where
 885     lexDocSection n input = case alexGetChar input of
 886       Just ('*', input) -> lexDocSection (n+1) input
 887       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 888       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 889
 890 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 891 -- off again at the end of the pragma.
 892 rulePrag :: Action
 893 rulePrag span _buf _len = do
 894   setExts (.|. bit inRulePragBit)
 895   return (L span ITrules_prag)
 896
 897 endPrag :: Action
 898 endPrag span _buf _len = do
 899   setExts (.&. complement (bit inRulePragBit))
 900   return (L span ITclose_prag)
 901
 902 -- docCommentEnd
 903 -------------------------------------------------------------------------------
 904 -- This function is quite tricky. We can't just return a new token, we also
 905 -- need to update the state of the parser. Why? Because the token is longer
 906 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 907 -- it writes the wrong token length to the parser state. This function is
 908 -- called afterwards, so it can just update the state.
 909
 910 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 911                  SrcSpan -> P (Located Token)
 912 docCommentEnd input commentAcc docType buf span = do
 913   setInput input
 914   let (AI loc nextBuf) = input
 915       comment = reverse commentAcc
 916       span' = mkSrcSpan (srcSpanStart span) loc
 917       last_len = byteDiff buf nextBuf
 918
 919   span `seq` setLastToken span' last_len
 920   return (L span' (docType comment))
 921
 922 errBrace :: AlexInput -> SrcSpan -> P a
 923 errBrace (AI end _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 924
 925 open_brace, close_brace :: Action
 926 open_brace span _str _len = do
 927   ctx <- getContext
 928   setContext (NoLayout:ctx)
 929   return (L span ITocurly)
 930 close_brace span _str _len = do
 931   popContext
 932   return (L span ITccurly)
 933
 934 qvarid, qconid :: StringBuffer -> Int -> Token
 935 qvarid buf len = ITqvarid $! splitQualName buf len False
 936 qconid buf len = ITqconid $! splitQualName buf len False
 937
 938 splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
 939 -- takes a StringBuffer and a length, and returns the module name
 940 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 941 -- because of hierarchical module names.
 942 splitQualName orig_buf len parens = split orig_buf orig_buf
 943   where
 944     split buf dot_buf
 945         | orig_buf `byteDiff` buf >= len  = done dot_buf
 946         | c == '.'                        = found_dot buf'
 947         | otherwise                       = split buf' dot_buf
 948       where
 949        (c,buf') = nextChar buf
 950
 951     -- careful, we might get names like M....
 952     -- so, if the character after the dot is not upper-case, this is
 953     -- the end of the qualifier part.
 954     found_dot buf -- buf points after the '.'
 955         | isUpper c    = split buf' buf
 956         | otherwise    = done buf
 957       where
 958        (c,buf') = nextChar buf
 959
 960     done dot_buf =
 961         (lexemeToFastString orig_buf (qual_size - 1),
 962          if parens -- Prelude.(+)
 963             then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
 964             else lexemeToFastString dot_buf (len - qual_size))
 965       where
 966         qual_size = orig_buf `byteDiff` dot_buf
 967
 968 varid :: Action
 969 varid span buf len =
 970   fs `seq`
 971   case lookupUFM reservedWordsFM fs of
 972         Just (keyword,0)    -> do
 973                 maybe_layout keyword
 974                 return (L span keyword)
 975         Just (keyword,exts) -> do
 976                 b <- extension (\i -> exts .&. i /= 0)
 977                 if b then do maybe_layout keyword
 978                              return (L span keyword)
 979                      else return (L span (ITvarid fs))
 980         _other -> return (L span (ITvarid fs))
 981   where
 982         fs = lexemeToFastString buf len
 983
 984 conid :: StringBuffer -> Int -> Token
 985 conid buf len = ITconid fs
 986   where fs = lexemeToFastString buf len
 987
 988 qvarsym, qconsym, prefixqvarsym, prefixqconsym :: StringBuffer -> Int -> Token
 989 qvarsym buf len = ITqvarsym $! splitQualName buf len False
 990 qconsym buf len = ITqconsym $! splitQualName buf len False
 991 prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
 992 prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
 993
 994 varsym, consym :: Action
 995 varsym = sym ITvarsym
 996 consym = sym ITconsym
 997
 998 sym :: (FastString -> Token) -> SrcSpan -> StringBuffer -> Int
 999     -> P (Located Token)
1000 sym con span buf len =
1001   case lookupUFM reservedSymsFM fs of
1002         Just (keyword,exts) -> do
1003                 b <- extension exts
1004                 if b then return (L span keyword)
1005                      else return (L span $! con fs)
1006         _other -> return (L span $! con fs)
1007   where
1008         fs = lexemeToFastString buf len
1009
1010 -- Variations on the integral numeric literal.
1011 tok_integral :: (Integer -> Token)
1012      -> (Integer -> Integer)
1013  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
1014      -> Int -> Int
1015      -> (Integer, (Char->Int)) -> Action
1016 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
1017   return $ L span $ itint $! transint $ parseUnsignedInteger
1018      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
1019
1020 -- some conveniences for use with tok_integral
1021 tok_num :: (Integer -> Integer)
1022         -> Int -> Int
1023         -> (Integer, (Char->Int)) -> Action
1024 tok_num = tok_integral ITinteger
1025 tok_primint :: (Integer -> Integer)
1026             -> Int -> Int
1027             -> (Integer, (Char->Int)) -> Action
1028 tok_primint = tok_integral ITprimint
1029 tok_primword :: Int -> Int
1030              -> (Integer, (Char->Int)) -> Action
1031 tok_primword = tok_integral ITprimword positive
1032 positive, negative :: (Integer -> Integer)
1033 positive = id
1034 negative = negate
1035 decimal, octal, hexadecimal :: (Integer, Char -> Int)
1036 decimal = (10,octDecDigit)
1037 octal = (8,octDecDigit)
1038 hexadecimal = (16,hexDigit)
1039
1040 -- readRational can understand negative rationals, exponents, everything.
1041 tok_float, tok_primfloat, tok_primdouble :: String -> Token
1042 tok_float        str = ITrational   $! readRational str
1043 tok_primfloat    str = ITprimfloat  $! readRational str
1044 tok_primdouble   str = ITprimdouble $! readRational str
1045
1046 -- -----------------------------------------------------------------------------
1047 -- Layout processing
1048
1049 -- we're at the first token on a line, insert layout tokens if necessary
1050 do_bol :: Action
1051 do_bol span _str _len = do
1052         pos <- getOffside
1053         case pos of
1054             LT -> do
1055                 --trace "layout: inserting '}'" $ do
1056                 popContext
1057                 -- do NOT pop the lex state, we might have a ';' to insert
1058                 return (L span ITvccurly)
1059             EQ -> do
1060                 --trace "layout: inserting ';'" $ do
1061                 _ <- popLexState
1062                 return (L span ITsemi)
1063             GT -> do
1064                 _ <- popLexState
1065                 lexToken
1066
1067 -- certain keywords put us in the "layout" state, where we might
1068 -- add an opening curly brace.
1069 maybe_layout :: Token -> P ()
1070 maybe_layout t = do -- If the alternative layout rule is enabled then
1071                     -- we never create an implicit layout context here.
1072                     -- Layout is handled XXX instead.
1073                     -- The code for closing implicit contexts, or
1074                     -- inserting implicit semi-colons, is therefore
1075                     -- irrelevant as it only applies in an implicit
1076                     -- context.
1077                     alr <- extension alternativeLayoutRule
1078                     unless alr $ f t
1079     where f ITdo    = pushLexState layout_do
1080           f ITmdo   = pushLexState layout_do
1081           f ITof    = pushLexState layout
1082           f ITlet   = pushLexState layout
1083           f ITwhere = pushLexState layout
1084           f ITrec   = pushLexState layout
1085           f _       = return ()
1086
1087 -- Pushing a new implicit layout context.  If the indentation of the
1088 -- next token is not greater than the previous layout context, then
1089 -- Haskell 98 says that the new layout context should be empty; that is
1090 -- the lexer must generate {}.
1091 --
1092 -- We are slightly more lenient than this: when the new context is started
1093 -- by a 'do', then we allow the new context to be at the same indentation as
1094 -- the previous context.  This is what the 'strict' argument is for.
1095 --
1096 new_layout_context :: Bool -> Action
1097 new_layout_context strict span _buf _len = do
1098     _ <- popLexState
1099     (AI l _) <- getInput
1100     let offset = srcLocCol l
1101     ctx <- getContext
1102     case ctx of
1103         Layout prev_off : _  |
1104            (strict     && prev_off >= offset  ||
1105             not strict && prev_off > offset) -> do
1106                 -- token is indented to the left of the previous context.
1107                 -- we must generate a {} sequence now.
1108                 pushLexState layout_left
1109                 return (L span ITvocurly)
1110         _ -> do
1111                 setContext (Layout offset : ctx)
1112                 return (L span ITvocurly)
1113
1114 do_layout_left :: Action
1115 do_layout_left span _buf _len = do
1116     _ <- popLexState
1117     pushLexState bol  -- we must be at the start of a line
1118     return (L span ITvccurly)
1119
1120 -- -----------------------------------------------------------------------------
1121 -- LINE pragmas
1122
1123 setLine :: Int -> Action
1124 setLine code span buf len = do
1125   let line = parseUnsignedInteger buf len 10 octDecDigit
1126   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 1)
1127         -- subtract one: the line number refers to the *following* line
1128   _ <- popLexState
1129   pushLexState code
1130   lexToken
1131
1132 setFile :: Int -> Action
1133 setFile code span buf len = do
1134   let file = lexemeToFastString (stepOn buf) (len-2)
1135   setAlrLastLoc noSrcSpan
1136   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1137   _ <- popLexState
1138   pushLexState code
1139   lexToken
1140
1141
1142 -- -----------------------------------------------------------------------------
1143 -- Options, includes and language pragmas.
1144
1145 lex_string_prag :: (String -> Token) -> Action
1146 lex_string_prag mkTok span _buf _len
1147     = do input <- getInput
1148          start <- getSrcLoc
1149          tok <- go [] input
1150          end <- getSrcLoc
1151          return (L (mkSrcSpan start end) tok)
1152     where go acc input
1153               = if isString input "#-}"
1154                    then do setInput input
1155                            return (mkTok (reverse acc))
1156                    else case alexGetChar input of
1157                           Just (c,i) -> go (c:acc) i
1158                           Nothing -> err input
1159           isString _ [] = True
1160           isString i (x:xs)
1161               = case alexGetChar i of
1162                   Just (c,i') | c == x    -> isString i' xs
1163                   _other -> False
1164           err (AI end _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1165
1166
1167 -- -----------------------------------------------------------------------------
1168 -- Strings & Chars
1169
1170 -- This stuff is horrible.  I hates it.
1171
1172 lex_string_tok :: Action
1173 lex_string_tok span _buf _len = do
1174   tok <- lex_string ""
1175   end <- getSrcLoc
1176   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1177
1178 lex_string :: String -> P Token
1179 lex_string s = do
1180   i <- getInput
1181   case alexGetChar' i of
1182     Nothing -> lit_error i
1183
1184     Just ('"',i)  -> do
1185         setInput i
1186         magicHash <- extension magicHashEnabled
1187         if magicHash
1188           then do
1189             i <- getInput
1190             case alexGetChar' i of
1191               Just ('#',i) -> do
1192                    setInput i
1193                    if any (> '\xFF') s
1194                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1195                     else let s' = mkZFastString (reverse s) in
1196                          return (ITprimstring s')
1197                         -- mkZFastString is a hack to avoid encoding the
1198                         -- string in UTF-8.  We just want the exact bytes.
1199               _other ->
1200                 return (ITstring (mkFastString (reverse s)))
1201           else
1202                 return (ITstring (mkFastString (reverse s)))
1203
1204     Just ('\\',i)
1205         | Just ('&',i) <- next -> do
1206                 setInput i; lex_string s
1207         | Just (c,i) <- next, c <= '\x7f' && is_space c -> do
1208                            -- is_space only works for <= '\x7f' (#3751)
1209                 setInput i; lex_stringgap s
1210         where next = alexGetChar' i
1211
1212     Just (c, i1) -> do
1213         case c of
1214           '\\' -> do setInput i1; c' <- lex_escape; lex_string (c':s)
1215           c | isAny c -> do setInput i1; lex_string (c:s)
1216           _other -> lit_error i
1217
1218 lex_stringgap :: String -> P Token
1219 lex_stringgap s = do
1220   i <- getInput
1221   c <- getCharOrFail i
1222   case c of
1223     '\\' -> lex_string s
1224     c | is_space c -> lex_stringgap s
1225     _other -> lit_error i
1226
1227
1228 lex_char_tok :: Action
1229 -- Here we are basically parsing character literals, such as 'x' or '\n'
1230 -- but, when Template Haskell is on, we additionally spot
1231 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1232 -- but WITHOUT CONSUMING the x or T part  (the parser does that).
1233 -- So we have to do two characters of lookahead: when we see 'x we need to
1234 -- see if there's a trailing quote
1235 lex_char_tok span _buf _len = do        -- We've seen '
1236    i1 <- getInput       -- Look ahead to first character
1237    let loc = srcSpanStart span
1238    case alexGetChar' i1 of
1239         Nothing -> lit_error  i1
1240
1241         Just ('\'', i2@(AI end2 _)) -> do       -- We've seen ''
1242                   th_exts <- extension thEnabled
1243                   if th_exts then do
1244                         setInput i2
1245                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1246                    else lit_error i1
1247
1248         Just ('\\', i2@(AI _end2 _)) -> do      -- We've seen 'backslash
1249                   setInput i2
1250                   lit_ch <- lex_escape
1251                   i3 <- getInput
1252                   mc <- getCharOrFail i3 -- Trailing quote
1253                   if mc == '\'' then finish_char_tok loc lit_ch
1254                                 else lit_error i3
1255
1256         Just (c, i2@(AI _end2 _))
1257                 | not (isAny c) -> lit_error i1
1258                 | otherwise ->
1259
1260                 -- We've seen 'x, where x is a valid character
1261                 --  (i.e. not newline etc) but not a quote or backslash
1262            case alexGetChar' i2 of      -- Look ahead one more character
1263                 Just ('\'', i3) -> do   -- We've seen 'x'
1264                         setInput i3
1265                         finish_char_tok loc c
1266                 _other -> do            -- We've seen 'x not followed by quote
1267                                         -- (including the possibility of EOF)
1268                                         -- If TH is on, just parse the quote only
1269                         th_exts <- extension thEnabled
1270                         let (AI end _) = i1
1271                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1272                                    else lit_error i2
1273
1274 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1275 finish_char_tok loc ch  -- We've already seen the closing quote
1276                         -- Just need to check for trailing #
1277   = do  magicHash <- extension magicHashEnabled
1278         i@(AI end _) <- getInput
1279         if magicHash then do
1280                 case alexGetChar' i of
1281                         Just ('#',i@(AI end _)) -> do
1282                                 setInput i
1283                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1284                         _other ->
1285                                 return (L (mkSrcSpan loc end) (ITchar ch))
1286             else do
1287                    return (L (mkSrcSpan loc end) (ITchar ch))
1288
1289 isAny :: Char -> Bool
1290 isAny c | c > '\x7f' = isPrint c
1291         | otherwise  = is_any c
1292
1293 lex_escape :: P Char
1294 lex_escape = do
1295   i0 <- getInput
1296   c <- getCharOrFail i0
1297   case c of
1298         'a'   -> return '\a'
1299         'b'   -> return '\b'
1300         'f'   -> return '\f'
1301         'n'   -> return '\n'
1302         'r'   -> return '\r'
1303         't'   -> return '\t'
1304         'v'   -> return '\v'
1305         '\\'  -> return '\\'
1306         '"'   -> return '\"'
1307         '\''  -> return '\''
1308         '^'   -> do i1 <- getInput
1309                     c <- getCharOrFail i1
1310                     if c >= '@' && c <= '_'
1311                         then return (chr (ord c - ord '@'))
1312                         else lit_error i1
1313
1314         'x'   -> readNum is_hexdigit 16 hexDigit
1315         'o'   -> readNum is_octdigit  8 octDecDigit
1316         x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1317
1318         c1 ->  do
1319            i <- getInput
1320            case alexGetChar' i of
1321             Nothing -> lit_error i0
1322             Just (c2,i2) ->
1323               case alexGetChar' i2 of
1324                 Nothing -> do lit_error i0
1325                 Just (c3,i3) ->
1326                    let str = [c1,c2,c3] in
1327                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1328                                      Just rest <- [stripPrefix p str] ] of
1329                           (escape_char,[]):_ -> do
1330                                 setInput i3
1331                                 return escape_char
1332                           (escape_char,_:_):_ -> do
1333                                 setInput i2
1334                                 return escape_char
1335                           [] -> lit_error i0
1336
1337 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1338 readNum is_digit base conv = do
1339   i <- getInput
1340   c <- getCharOrFail i
1341   if is_digit c
1342         then readNum2 is_digit base conv (conv c)
1343         else lit_error i
1344
1345 readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char
1346 readNum2 is_digit base conv i = do
1347   input <- getInput
1348   read i input
1349   where read i input = do
1350           case alexGetChar' input of
1351             Just (c,input') | is_digit c -> do
1352                 read (i*base + conv c) input'
1353             _other -> do
1354                 if i >= 0 && i <= 0x10FFFF
1355                    then do setInput input; return (chr i)
1356                    else lit_error input
1357
1358 silly_escape_chars :: [(String, Char)]
1359 silly_escape_chars = [
1360         ("NUL", '\NUL'),
1361         ("SOH", '\SOH'),
1362         ("STX", '\STX'),
1363         ("ETX", '\ETX'),
1364         ("EOT", '\EOT'),
1365         ("ENQ", '\ENQ'),
1366         ("ACK", '\ACK'),
1367         ("BEL", '\BEL'),
1368         ("BS", '\BS'),
1369         ("HT", '\HT'),
1370         ("LF", '\LF'),
1371         ("VT", '\VT'),
1372         ("FF", '\FF'),
1373         ("CR", '\CR'),
1374         ("SO", '\SO'),
1375         ("SI", '\SI'),
1376         ("DLE", '\DLE'),
1377         ("DC1", '\DC1'),
1378         ("DC2", '\DC2'),
1379         ("DC3", '\DC3'),
1380         ("DC4", '\DC4'),
1381         ("NAK", '\NAK'),
1382         ("SYN", '\SYN'),
1383         ("ETB", '\ETB'),
1384         ("CAN", '\CAN'),
1385         ("EM", '\EM'),
1386         ("SUB", '\SUB'),
1387         ("ESC", '\ESC'),
1388         ("FS", '\FS'),
1389         ("GS", '\GS'),
1390         ("RS", '\RS'),
1391         ("US", '\US'),
1392         ("SP", '\SP'),
1393         ("DEL", '\DEL')
1394         ]
1395
1396 -- before calling lit_error, ensure that the current input is pointing to
1397 -- the position of the error in the buffer.  This is so that we can report
1398 -- a correct location to the user, but also so we can detect UTF-8 decoding
1399 -- errors if they occur.
1400 lit_error :: AlexInput -> P a
1401 lit_error i = do setInput i; lexError "lexical error in string/character literal"
1402
1403 getCharOrFail :: AlexInput -> P Char
1404 getCharOrFail i =  do
1405   case alexGetChar' i of
1406         Nothing -> lexError "unexpected end-of-file in string/character literal"
1407         Just (c,i)  -> do setInput i; return c
1408
1409 -- -----------------------------------------------------------------------------
1410 -- QuasiQuote
1411
1412 lex_quasiquote_tok :: Action
1413 lex_quasiquote_tok span buf len = do
1414   let quoter = tail (lexemeToString buf (len - 1))
1415                 -- 'tail' drops the initial '[',
1416                 -- while the -1 drops the trailing '|'
1417   quoteStart <- getSrcLoc
1418   quote <- lex_quasiquote ""
1419   end <- getSrcLoc
1420   return (L (mkSrcSpan (srcSpanStart span) end)
1421            (ITquasiQuote (mkFastString quoter,
1422                           mkFastString (reverse quote),
1423                           mkSrcSpan quoteStart end)))
1424
1425 lex_quasiquote :: String -> P String
1426 lex_quasiquote s = do
1427   i <- getInput
1428   case alexGetChar' i of
1429     Nothing -> lit_error i
1430
1431     Just ('\\',i)
1432         | Just ('|',i) <- next -> do
1433                 setInput i; lex_quasiquote ('|' : s)
1434         | Just (']',i) <- next -> do
1435                 setInput i; lex_quasiquote (']' : s)
1436         where next = alexGetChar' i
1437
1438     Just ('|',i)
1439         | Just (']',i) <- next -> do
1440                 setInput i; return s
1441         where next = alexGetChar' i
1442
1443     Just (c, i) -> do
1444          setInput i; lex_quasiquote (c : s)
1445
1446 -- -----------------------------------------------------------------------------
1447 -- Warnings
1448
1449 warn :: DynFlag -> SDoc -> Action
1450 warn option warning srcspan _buf _len = do
1451     addWarning option srcspan warning
1452     lexToken
1453
1454 warnThen :: DynFlag -> SDoc -> Action -> Action
1455 warnThen option warning action srcspan buf len = do
1456     addWarning option srcspan warning
1457     action srcspan buf len
1458
1459 -- -----------------------------------------------------------------------------
1460 -- The Parse Monad
1461
1462 data LayoutContext
1463   = NoLayout
1464   | Layout !Int
1465   deriving Show
1466
1467 data ParseResult a
1468   = POk PState a
1469   | PFailed
1470         SrcSpan         -- The start and end of the text span related to
1471                         -- the error.  Might be used in environments which can
1472                         -- show this span, e.g. by highlighting it.
1473         Message         -- The error message
1474
1475 data PState = PState {
1476         buffer     :: StringBuffer,
1477         dflags     :: DynFlags,
1478         messages   :: Messages,
1479         last_loc   :: SrcSpan,  -- pos of previous token
1480         last_len   :: !Int,     -- len of previous token
1481         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1482         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1483         context    :: [LayoutContext],
1484         lex_state  :: [Int],
1485         -- Used in the alternative layout rule:
1486         -- These tokens are the next ones to be sent out. They are
1487         -- just blindly emitted, without the rule looking at them again:
1488         alr_pending_implicit_tokens :: [Located Token],
1489         -- This is the next token to be considered or, if it is Nothing,
1490         -- we need to get the next token from the input stream:
1491         alr_next_token :: Maybe (Located Token),
1492         -- This is what we consider to be the locatino of the last token
1493         -- emitted:
1494         alr_last_loc :: SrcSpan,
1495         -- The stack of layout contexts:
1496         alr_context :: [ALRContext],
1497         -- Are we expecting a '{'? If it's Just, then the ALRLayout tells
1498         -- us what sort of layout the '{' will open:
1499         alr_expecting_ocurly :: Maybe ALRLayout,
1500         -- Have we just had the '}' for a let block? If so, than an 'in'
1501         -- token doesn't need to close anything:
1502         alr_justClosedExplicitLetBlock :: Bool
1503      }
1504         -- last_loc and last_len are used when generating error messages,
1505         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1506         -- current token to happyError, we could at least get rid of last_len.
1507         -- Getting rid of last_loc would require finding another way to
1508         -- implement pushCurrentContext (which is only called from one place).
1509
1510 data ALRContext = ALRNoLayout Bool{- does it contain commas? -}
1511                               Bool{- is it a 'let' block? -}
1512                 | ALRLayout ALRLayout Int
1513 data ALRLayout = ALRLayoutLet
1514                | ALRLayoutWhere
1515                | ALRLayoutOf
1516                | ALRLayoutDo
1517
1518 newtype P a = P { unP :: PState -> ParseResult a }
1519
1520 instance Monad P where
1521   return = returnP
1522   (>>=) = thenP
1523   fail = failP
1524
1525 returnP :: a -> P a
1526 returnP a = a `seq` (P $ \s -> POk s a)
1527
1528 thenP :: P a -> (a -> P b) -> P b
1529 (P m) `thenP` k = P $ \ s ->
1530         case m s of
1531                 POk s1 a         -> (unP (k a)) s1
1532                 PFailed span err -> PFailed span err
1533
1534 failP :: String -> P a
1535 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1536
1537 failMsgP :: String -> P a
1538 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1539
1540 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1541 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (mkSrcSpan loc1 loc2) (text str)
1542
1543 failSpanMsgP :: SrcSpan -> SDoc -> P a
1544 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1545
1546 getPState :: P PState
1547 getPState = P $ \s -> POk s s
1548
1549 getDynFlags :: P DynFlags
1550 getDynFlags = P $ \s -> POk s (dflags s)
1551
1552 withThisPackage :: (PackageId -> a) -> P a
1553 withThisPackage f
1554  = do   pkg     <- liftM thisPackage getDynFlags
1555         return  $ f pkg
1556
1557 extension :: (Int -> Bool) -> P Bool
1558 extension p = P $ \s -> POk s (p $! extsBitmap s)
1559
1560 getExts :: P Int
1561 getExts = P $ \s -> POk s (extsBitmap s)
1562
1563 setExts :: (Int -> Int) -> P ()
1564 setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
1565
1566 setSrcLoc :: SrcLoc -> P ()
1567 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1568
1569 getSrcLoc :: P SrcLoc
1570 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1571
1572 setLastToken :: SrcSpan -> Int -> P ()
1573 setLastToken loc len = P $ \s -> POk s {
1574   last_loc=loc,
1575   last_len=len
1576   } ()
1577
1578 data AlexInput = AI SrcLoc StringBuffer
1579
1580 alexInputPrevChar :: AlexInput -> Char
1581 alexInputPrevChar (AI _ buf) = prevChar buf '\n'
1582
1583 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1584 alexGetChar (AI loc s)
1585   | atEnd s   = Nothing
1586   | otherwise = adj_c `seq` loc' `seq` s' `seq`
1587                 --trace (show (ord c)) $
1588                 Just (adj_c, (AI loc' s'))
1589   where (c,s') = nextChar s
1590         loc'   = advanceSrcLoc loc c
1591
1592         non_graphic     = '\x0'
1593         upper           = '\x1'
1594         lower           = '\x2'
1595         digit           = '\x3'
1596         symbol          = '\x4'
1597         space           = '\x5'
1598         other_graphic   = '\x6'
1599
1600         adj_c
1601           | c <= '\x06' = non_graphic
1602           | c <= '\x7f' = c
1603           -- Alex doesn't handle Unicode, so when Unicode
1604           -- character is encountered we output these values
1605           -- with the actual character value hidden in the state.
1606           | otherwise =
1607                 case generalCategory c of
1608                   UppercaseLetter       -> upper
1609                   LowercaseLetter       -> lower
1610                   TitlecaseLetter       -> upper
1611                   ModifierLetter        -> other_graphic
1612                   OtherLetter           -> lower -- see #1103
1613                   NonSpacingMark        -> other_graphic
1614                   SpacingCombiningMark  -> other_graphic
1615                   EnclosingMark         -> other_graphic
1616                   DecimalNumber         -> digit
1617                   LetterNumber          -> other_graphic
1618                   OtherNumber           -> other_graphic
1619                   ConnectorPunctuation  -> symbol
1620                   DashPunctuation       -> symbol
1621                   OpenPunctuation       -> other_graphic
1622                   ClosePunctuation      -> other_graphic
1623                   InitialQuote          -> other_graphic
1624                   FinalQuote            -> other_graphic
1625                   OtherPunctuation      -> symbol
1626                   MathSymbol            -> symbol
1627                   CurrencySymbol        -> symbol
1628                   ModifierSymbol        -> symbol
1629                   OtherSymbol           -> symbol
1630                   Space                 -> space
1631                   _other                -> non_graphic
1632
1633 -- This version does not squash unicode characters, it is used when
1634 -- lexing strings.
1635 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1636 alexGetChar' (AI loc s)
1637   | atEnd s   = Nothing
1638   | otherwise = c `seq` loc' `seq` s' `seq`
1639                 --trace (show (ord c)) $
1640                 Just (c, (AI loc' s'))
1641   where (c,s') = nextChar s
1642         loc'   = advanceSrcLoc loc c
1643
1644 getInput :: P AlexInput
1645 getInput = P $ \s@PState{ loc=l, buffer=b } -> POk s (AI l b)
1646
1647 setInput :: AlexInput -> P ()
1648 setInput (AI l b) = P $ \s -> POk s{ loc=l, buffer=b } ()
1649
1650 pushLexState :: Int -> P ()
1651 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1652
1653 popLexState :: P Int
1654 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1655
1656 getLexState :: P Int
1657 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1658
1659 popNextToken :: P (Maybe (Located Token))
1660 popNextToken
1661     = P $ \s@PState{ alr_next_token = m } ->
1662               POk (s {alr_next_token = Nothing}) m
1663
1664 setAlrLastLoc :: SrcSpan -> P ()
1665 setAlrLastLoc l = P $ \s -> POk (s {alr_last_loc = l}) ()
1666
1667 getAlrLastLoc :: P SrcSpan
1668 getAlrLastLoc = P $ \s@(PState {alr_last_loc = l}) -> POk s l
1669
1670 getALRContext :: P [ALRContext]
1671 getALRContext = P $ \s@(PState {alr_context = cs}) -> POk s cs
1672
1673 setALRContext :: [ALRContext] -> P ()
1674 setALRContext cs = P $ \s -> POk (s {alr_context = cs}) ()
1675
1676 getJustClosedExplicitLetBlock :: P Bool
1677 getJustClosedExplicitLetBlock
1678  = P $ \s@(PState {alr_justClosedExplicitLetBlock = b}) -> POk s b
1679
1680 setJustClosedExplicitLetBlock :: Bool -> P ()
1681 setJustClosedExplicitLetBlock b
1682  = P $ \s -> POk (s {alr_justClosedExplicitLetBlock = b}) ()
1683
1684 setNextToken :: Located Token -> P ()
1685 setNextToken t = P $ \s -> POk (s {alr_next_token = Just t}) ()
1686
1687 popPendingImplicitToken :: P (Maybe (Located Token))
1688 popPendingImplicitToken
1689     = P $ \s@PState{ alr_pending_implicit_tokens = ts } ->
1690               case ts of
1691               [] -> POk s Nothing
1692               (t : ts') -> POk (s {alr_pending_implicit_tokens = ts'}) (Just t)
1693
1694 setPendingImplicitTokens :: [Located Token] -> P ()
1695 setPendingImplicitTokens ts = P $ \s -> POk (s {alr_pending_implicit_tokens = ts}) ()
1696
1697 getAlrExpectingOCurly :: P (Maybe ALRLayout)
1698 getAlrExpectingOCurly = P $ \s@(PState {alr_expecting_ocurly = b}) -> POk s b
1699
1700 setAlrExpectingOCurly :: Maybe ALRLayout -> P ()
1701 setAlrExpectingOCurly b = P $ \s -> POk (s {alr_expecting_ocurly = b}) ()
1702
1703 -- for reasons of efficiency, flags indicating language extensions (eg,
1704 -- -fglasgow-exts or -XParr) are represented by a bitmap stored in an unboxed
1705 -- integer
1706
1707 genericsBit :: Int
1708 genericsBit = 0 -- {| and |}
1709 ffiBit :: Int
1710 ffiBit     = 1
1711 parrBit :: Int
1712 parrBit    = 2
1713 arrowsBit :: Int
1714 arrowsBit  = 4
1715 thBit :: Int
1716 thBit      = 5
1717 ipBit :: Int
1718 ipBit      = 6
1719 explicitForallBit :: Int
1720 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1721 bangPatBit :: Int
1722 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1723                 -- (doesn't affect the lexer)
1724 tyFamBit :: Int
1725 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1726 haddockBit :: Int
1727 haddockBit = 10 -- Lex and parse Haddock comments
1728 magicHashBit :: Int
1729 magicHashBit = 11 -- "#" in both functions and operators
1730 kindSigsBit :: Int
1731 kindSigsBit = 12 -- Kind signatures on type variables
1732 recursiveDoBit :: Int
1733 recursiveDoBit = 13 -- mdo
1734 unicodeSyntaxBit :: Int
1735 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1736 unboxedTuplesBit :: Int
1737 unboxedTuplesBit = 15 -- (# and #)
1738 standaloneDerivingBit :: Int
1739 standaloneDerivingBit = 16 -- standalone instance deriving declarations
1740 transformComprehensionsBit :: Int
1741 transformComprehensionsBit = 17
1742 qqBit :: Int
1743 qqBit      = 18 -- enable quasiquoting
1744 inRulePragBit :: Int
1745 inRulePragBit = 19
1746 rawTokenStreamBit :: Int
1747 rawTokenStreamBit = 20 -- producing a token stream with all comments included
1748 newQualOpsBit :: Int
1749 newQualOpsBit = 21 -- Haskell' qualified operator syntax, e.g. Prelude.(+)
1750 recBit :: Int
1751 recBit = 22 -- rec
1752 alternativeLayoutRuleBit :: Int
1753 alternativeLayoutRuleBit = 23
1754
1755 always :: Int -> Bool
1756 always           _     = True
1757 genericsEnabled :: Int -> Bool
1758 genericsEnabled  flags = testBit flags genericsBit
1759 parrEnabled :: Int -> Bool
1760 parrEnabled      flags = testBit flags parrBit
1761 arrowsEnabled :: Int -> Bool
1762 arrowsEnabled    flags = testBit flags arrowsBit
1763 thEnabled :: Int -> Bool
1764 thEnabled        flags = testBit flags thBit
1765 ipEnabled :: Int -> Bool
1766 ipEnabled        flags = testBit flags ipBit
1767 explicitForallEnabled :: Int -> Bool
1768 explicitForallEnabled flags = testBit flags explicitForallBit
1769 bangPatEnabled :: Int -> Bool
1770 bangPatEnabled   flags = testBit flags bangPatBit
1771 -- tyFamEnabled :: Int -> Bool
1772 -- tyFamEnabled     flags = testBit flags tyFamBit
1773 haddockEnabled :: Int -> Bool
1774 haddockEnabled   flags = testBit flags haddockBit
1775 magicHashEnabled :: Int -> Bool
1776 magicHashEnabled flags = testBit flags magicHashBit
1777 -- kindSigsEnabled :: Int -> Bool
1778 -- kindSigsEnabled  flags = testBit flags kindSigsBit
1779 unicodeSyntaxEnabled :: Int -> Bool
1780 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1781 unboxedTuplesEnabled :: Int -> Bool
1782 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1783 standaloneDerivingEnabled :: Int -> Bool
1784 standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
1785 qqEnabled :: Int -> Bool
1786 qqEnabled        flags = testBit flags qqBit
1787 -- inRulePrag :: Int -> Bool
1788 -- inRulePrag       flags = testBit flags inRulePragBit
1789 rawTokenStreamEnabled :: Int -> Bool
1790 rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
1791 newQualOps :: Int -> Bool
1792 newQualOps       flags = testBit flags newQualOpsBit
1793 oldQualOps :: Int -> Bool
1794 oldQualOps flags = not (newQualOps flags)
1795 alternativeLayoutRule :: Int -> Bool
1796 alternativeLayoutRule flags = testBit flags alternativeLayoutRuleBit
1797
1798 -- PState for parsing options pragmas
1799 --
1800 pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1801 pragState dynflags buf loc =
1802   PState {
1803       buffer        = buf,
1804       messages      = emptyMessages,
1805       dflags        = dynflags,
1806       last_loc      = mkSrcSpan loc loc,
1807       last_len      = 0,
1808       loc           = loc,
1809       extsBitmap    = 0,
1810       context       = [],
1811       lex_state     = [bol, option_prags, 0],
1812       alr_pending_implicit_tokens = [],
1813       alr_next_token = Nothing,
1814       alr_last_loc = noSrcSpan,
1815       alr_context = [],
1816       alr_expecting_ocurly = Nothing,
1817       alr_justClosedExplicitLetBlock = False
1818     }
1819
1820
1821 -- create a parse state
1822 --
1823 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1824 mkPState buf loc flags  =
1825   PState {
1826       buffer          = buf,
1827       dflags        = flags,
1828       messages      = emptyMessages,
1829       last_loc      = mkSrcSpan loc loc,
1830       last_len      = 0,
1831       loc           = loc,
1832       extsBitmap    = fromIntegral bitmap,
1833       context       = [],
1834       lex_state     = [bol, 0],
1835         -- we begin in the layout state if toplev_layout is set
1836       alr_pending_implicit_tokens = [],
1837       alr_next_token = Nothing,
1838       alr_last_loc = noSrcSpan,
1839       alr_context = [],
1840       alr_expecting_ocurly = Nothing,
1841       alr_justClosedExplicitLetBlock = False
1842     }
1843     where
1844       bitmap = genericsBit `setBitIf` dopt Opt_Generics flags
1845                .|. ffiBit            `setBitIf` dopt Opt_ForeignFunctionInterface flags
1846                .|. parrBit           `setBitIf` dopt Opt_PArr         flags
1847                .|. arrowsBit         `setBitIf` dopt Opt_Arrows       flags
1848                .|. thBit             `setBitIf` dopt Opt_TemplateHaskell flags
1849                .|. qqBit             `setBitIf` dopt Opt_QuasiQuotes flags
1850                .|. ipBit             `setBitIf` dopt Opt_ImplicitParams flags
1851                .|. explicitForallBit `setBitIf` dopt Opt_ExplicitForAll flags
1852                .|. bangPatBit        `setBitIf` dopt Opt_BangPatterns flags
1853                .|. tyFamBit          `setBitIf` dopt Opt_TypeFamilies flags
1854                .|. haddockBit        `setBitIf` dopt Opt_Haddock      flags
1855                .|. magicHashBit      `setBitIf` dopt Opt_MagicHash    flags
1856                .|. kindSigsBit       `setBitIf` dopt Opt_KindSignatures flags
1857                .|. recursiveDoBit    `setBitIf` dopt Opt_RecursiveDo flags
1858                .|. recBit            `setBitIf` dopt Opt_DoRec  flags
1859                .|. recBit            `setBitIf` dopt Opt_Arrows flags
1860                .|. unicodeSyntaxBit  `setBitIf` dopt Opt_UnicodeSyntax flags
1861                .|. unboxedTuplesBit  `setBitIf` dopt Opt_UnboxedTuples flags
1862                .|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags
1863                .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags
1864                .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
1865                .|. newQualOpsBit `setBitIf` dopt Opt_NewQualifiedOperators flags
1866                .|. alternativeLayoutRuleBit `setBitIf` dopt Opt_AlternativeLayoutRule flags
1867       --
1868       setBitIf :: Int -> Bool -> Int
1869       b `setBitIf` cond | cond      = bit b
1870                         | otherwise = 0
1871
1872 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1873 addWarning option srcspan warning
1874  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1875        let warning' = mkWarnMsg srcspan alwaysQualify warning
1876            ws' = if dopt option d then ws `snocBag` warning' else ws
1877        in POk s{messages=(ws', es)} ()
1878
1879 getMessages :: PState -> Messages
1880 getMessages PState{messages=ms} = ms
1881
1882 getContext :: P [LayoutContext]
1883 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1884
1885 setContext :: [LayoutContext] -> P ()
1886 setContext ctx = P $ \s -> POk s{context=ctx} ()
1887
1888 popContext :: P ()
1889 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1890                               last_len = len, last_loc = last_loc }) ->
1891   case ctx of
1892         (_:tl) -> POk s{ context = tl } ()
1893         []     -> PFailed last_loc (srcParseErr buf len)
1894
1895 -- Push a new layout context at the indentation of the last token read.
1896 -- This is only used at the outer level of a module when the 'module'
1897 -- keyword is missing.
1898 pushCurrentContext :: P ()
1899 pushCurrentContext = P $ \ s@PState{ last_loc=loc, context=ctx } ->
1900     POk s{context = Layout (srcSpanStartCol loc) : ctx} ()
1901
1902 getOffside :: P Ordering
1903 getOffside = P $ \s@PState{last_loc=loc, context=stk} ->
1904                 let offs = srcSpanStartCol loc in
1905                 let ord = case stk of
1906                         (Layout n:_) -> --trace ("layout: " ++ show n ++ ", offs: " ++ show offs) $
1907                                         compare offs n
1908                         _            -> GT
1909                 in POk s ord
1910
1911 -- ---------------------------------------------------------------------------
1912 -- Construct a parse error
1913
1914 srcParseErr
1915   :: StringBuffer       -- current buffer (placed just after the last token)
1916   -> Int                -- length of the previous token
1917   -> Message
1918 srcParseErr buf len
1919   = hcat [ if null token
1920              then ptext (sLit "parse error (possibly incorrect indentation)")
1921              else hcat [ptext (sLit "parse error on input "),
1922                         char '`', text token, char '\'']
1923     ]
1924   where token = lexemeToString (offsetBytes (-len) buf) len
1925
1926 -- Report a parse failure, giving the span of the previous token as
1927 -- the location of the error.  This is the entry point for errors
1928 -- detected during parsing.
1929 srcParseFail :: P a
1930 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1931                             last_loc = last_loc } ->
1932     PFailed last_loc (srcParseErr buf len)
1933
1934 -- A lexical error is reported at a particular position in the source file,
1935 -- not over a token range.
1936 lexError :: String -> P a
1937 lexError str = do
1938   loc <- getSrcLoc
1939   (AI end buf) <- getInput
1940   reportLexError loc end buf str
1941
1942 -- -----------------------------------------------------------------------------
1943 -- This is the top-level function: called from the parser each time a
1944 -- new token is to be read from the input.
1945
1946 lexer :: (Located Token -> P a) -> P a
1947 lexer cont = do
1948   alr <- extension alternativeLayoutRule
1949   let lexTokenFun = if alr then lexTokenAlr else lexToken
1950   tok@(L _span _tok__) <- lexTokenFun
1951   --trace ("token: " ++ show _tok__) $ do
1952   cont tok
1953
1954 lexTokenAlr :: P (Located Token)
1955 lexTokenAlr = do mPending <- popPendingImplicitToken
1956                  t <- case mPending of
1957                       Nothing ->
1958                           do mNext <- popNextToken
1959                              t <- case mNext of
1960                                   Nothing -> lexToken
1961                                   Just next -> return next
1962                              alternativeLayoutRuleToken t
1963                       Just t ->
1964                           return t
1965                  setAlrLastLoc (getLoc t)
1966                  case unLoc t of
1967                      ITwhere -> setAlrExpectingOCurly (Just ALRLayoutWhere)
1968                      ITlet   -> setAlrExpectingOCurly (Just ALRLayoutLet)
1969                      ITof    -> setAlrExpectingOCurly (Just ALRLayoutOf)
1970                      ITdo    -> setAlrExpectingOCurly (Just ALRLayoutDo)
1971                      ITmdo   -> setAlrExpectingOCurly (Just ALRLayoutDo)
1972                      ITrec   -> setAlrExpectingOCurly (Just ALRLayoutDo)
1973                      _       -> return ()
1974                  return t
1975
1976 alternativeLayoutRuleToken :: Located Token -> P (Located Token)
1977 alternativeLayoutRuleToken t
1978     = do context <- getALRContext
1979          lastLoc <- getAlrLastLoc
1980          mExpectingOCurly <- getAlrExpectingOCurly
1981          justClosedExplicitLetBlock <- getJustClosedExplicitLetBlock
1982          setJustClosedExplicitLetBlock False
1983          dflags <- getDynFlags
1984          let transitional = dopt Opt_AlternativeLayoutRuleTransitional dflags
1985              thisLoc = getLoc t
1986              thisCol = srcSpanStartCol thisLoc
1987              newLine = (lastLoc == noSrcSpan)
1988                     || (srcSpanStartLine thisLoc > srcSpanEndLine lastLoc)
1989          case (unLoc t, context, mExpectingOCurly) of
1990              -- This case handles a GHC extension to the original H98
1991              -- layout rule...
1992              (ITocurly, _, Just alrLayout) ->
1993                  do setAlrExpectingOCurly Nothing
1994                     let isLet = case alrLayout of
1995                                 ALRLayoutLet -> True
1996                                 _ -> False
1997                     setALRContext (ALRNoLayout (containsCommas ITocurly) isLet : context)
1998                     return t
1999              -- ...and makes this case unnecessary
2000              {-
2001              -- I think our implicit open-curly handling is slightly
2002              -- different to John's, in how it interacts with newlines
2003              -- and "in"
2004              (ITocurly, _, Just _) ->
2005                  do setAlrExpectingOCurly Nothing
2006                     setNextToken t
2007                     lexTokenAlr
2008              -}
2009              (_, ALRLayout _ col : ls, Just expectingOCurly)
2010               | (thisCol > col) ||
2011                 (thisCol == col &&
2012                  isNonDecreasingIntentation expectingOCurly) ->
2013                  do setAlrExpectingOCurly Nothing
2014                     setALRContext (ALRLayout expectingOCurly thisCol : context)
2015                     setNextToken t
2016                     return (L thisLoc ITocurly)
2017               | otherwise ->
2018                  do setAlrExpectingOCurly Nothing
2019                     setPendingImplicitTokens [L lastLoc ITccurly]
2020                     setNextToken t
2021                     return (L lastLoc ITocurly)
2022              (_, _, Just expectingOCurly) ->
2023                  do setAlrExpectingOCurly Nothing
2024                     setALRContext (ALRLayout expectingOCurly thisCol : context)
2025                     setNextToken t
2026                     return (L thisLoc ITocurly)
2027              -- We do the [] cases earlier than in the spec, as we
2028              -- have an actual EOF token
2029              (ITeof, ALRLayout _ _ : ls, _) ->
2030                  do setALRContext ls
2031                     setNextToken t
2032                     return (L thisLoc ITccurly)
2033              (ITeof, _, _) ->
2034                  return t
2035              -- the other ITeof case omitted; general case below covers it
2036              (ITin, _, _)
2037               | justClosedExplicitLetBlock ->
2038                  return t
2039              (ITin, ALRLayout ALRLayoutLet _ : ls, _)
2040               | newLine ->
2041                  do setPendingImplicitTokens [t]
2042                     setALRContext ls
2043                     return (L thisLoc ITccurly)
2044              -- This next case is to handle a transitional issue:
2045              (ITwhere, ALRLayout _ col : ls, _)
2046               | newLine && thisCol == col && transitional ->
2047                  do addWarning Opt_WarnAlternativeLayoutRuleTransitional
2048                                thisLoc
2049                                (transitionalAlternativeLayoutWarning
2050                                     "`where' clause at the same depth as implicit layout block")
2051                     setALRContext ls
2052                     setNextToken t
2053                     -- Note that we use lastLoc, as we may need to close
2054                     -- more layouts, or give a semicolon
2055                     return (L lastLoc ITccurly)
2056              -- This next case is to handle a transitional issue:
2057              (ITvbar, ALRLayout _ col : ls, _)
2058               | newLine && thisCol == col && transitional ->
2059                  do addWarning Opt_WarnAlternativeLayoutRuleTransitional
2060                                thisLoc
2061                                (transitionalAlternativeLayoutWarning
2062                                     "`|' at the same depth as implicit layout block")
2063                     setALRContext ls
2064                     setNextToken t
2065                     -- Note that we use lastLoc, as we may need to close
2066                     -- more layouts, or give a semicolon
2067                     return (L lastLoc ITccurly)
2068              (_, ALRLayout _ col : ls, _)
2069               | newLine && thisCol == col ->
2070                  do setNextToken t
2071                     return (L thisLoc ITsemi)
2072               | newLine && thisCol < col ->
2073                  do setALRContext ls
2074                     setNextToken t
2075                     -- Note that we use lastLoc, as we may need to close
2076                     -- more layouts, or give a semicolon
2077                     return (L lastLoc ITccurly)
2078              -- We need to handle close before open, as 'then' is both
2079              -- an open and a close
2080              (u, _, _)
2081               | isALRclose u ->
2082                  case context of
2083                  ALRLayout _ _ : ls ->
2084                      do setALRContext ls
2085                         setNextToken t
2086                         return (L thisLoc ITccurly)
2087                  ALRNoLayout _ isLet : ls ->
2088                      do let ls' = if isALRopen u
2089                                      then ALRNoLayout (containsCommas u) False : ls
2090                                      else ls
2091                         setALRContext ls'
2092                         when isLet $ setJustClosedExplicitLetBlock True
2093                         return t
2094                  [] ->
2095                      do let ls = if isALRopen u
2096                                     then [ALRNoLayout (containsCommas u) False]
2097                                     else ls
2098                         setALRContext ls
2099                         -- XXX This is an error in John's code, but
2100                         -- it looks reachable to me at first glance
2101                         return t
2102              (u, _, _)
2103               | isALRopen u ->
2104                  do setALRContext (ALRNoLayout (containsCommas u) False : context)
2105                     return t
2106              (ITin, ALRLayout ALRLayoutLet _ : ls, _) ->
2107                  do setALRContext ls
2108                     setPendingImplicitTokens [t]
2109                     return (L thisLoc ITccurly)
2110              (ITin, ALRLayout _ _ : ls, _) ->
2111                  do setALRContext ls
2112                     setNextToken t
2113                     return (L thisLoc ITccurly)
2114              -- the other ITin case omitted; general case below covers it
2115              (ITcomma, ALRLayout _ _ : ls, _)
2116               | topNoLayoutContainsCommas ls ->
2117                  do setALRContext ls
2118                     setNextToken t
2119                     return (L thisLoc ITccurly)
2120              (ITwhere, ALRLayout ALRLayoutDo _ : ls, _) ->
2121                  do setALRContext ls
2122                     setPendingImplicitTokens [t]
2123                     return (L thisLoc ITccurly)
2124              -- the other ITwhere case omitted; general case below covers it
2125              (_, _, _) -> return t
2126
2127 transitionalAlternativeLayoutWarning :: String -> SDoc
2128 transitionalAlternativeLayoutWarning msg
2129     = text "transitional layout will not be accepted in the future:"
2130    $$ text msg
2131
2132 isALRopen :: Token -> Bool
2133 isALRopen ITcase        = True
2134 isALRopen ITif          = True
2135 isALRopen ITthen        = True
2136 isALRopen IToparen      = True
2137 isALRopen ITobrack      = True
2138 isALRopen ITocurly      = True
2139 -- GHC Extensions:
2140 isALRopen IToubxparen   = True
2141 isALRopen ITparenEscape = True
2142 isALRopen _             = False
2143
2144 isALRclose :: Token -> Bool
2145 isALRclose ITof     = True
2146 isALRclose ITthen   = True
2147 isALRclose ITelse   = True
2148 isALRclose ITcparen = True
2149 isALRclose ITcbrack = True
2150 isALRclose ITccurly = True
2151 -- GHC Extensions:
2152 isALRclose ITcubxparen = True
2153 isALRclose _        = False
2154
2155 isNonDecreasingIntentation :: ALRLayout -> Bool
2156 isNonDecreasingIntentation ALRLayoutDo = True
2157 isNonDecreasingIntentation _           = False
2158
2159 containsCommas :: Token -> Bool
2160 containsCommas IToparen = True
2161 containsCommas ITobrack = True
2162 -- John doesn't have {} as containing commas, but records contain them,
2163 -- which caused a problem parsing Cabal's Distribution.Simple.InstallDirs
2164 -- (defaultInstallDirs).
2165 containsCommas ITocurly = True
2166 -- GHC Extensions:
2167 containsCommas IToubxparen = True
2168 containsCommas _        = False
2169
2170 topNoLayoutContainsCommas :: [ALRContext] -> Bool
2171 topNoLayoutContainsCommas [] = False
2172 topNoLayoutContainsCommas (ALRLayout _ _ : ls) = topNoLayoutContainsCommas ls
2173 topNoLayoutContainsCommas (ALRNoLayout b _ : _) = b
2174
2175 lexToken :: P (Located Token)
2176 lexToken = do
2177   inp@(AI loc1 buf) <- getInput
2178   sc <- getLexState
2179   exts <- getExts
2180   case alexScanUser exts inp sc of
2181     AlexEOF -> do
2182         let span = mkSrcSpan loc1 loc1
2183         setLastToken span 0
2184         return (L span ITeof)
2185     AlexError (AI loc2 buf) ->
2186         reportLexError loc1 loc2 buf "lexical error"
2187     AlexSkip inp2 _ -> do
2188         setInput inp2
2189         lexToken
2190     AlexToken inp2@(AI end buf2) _ t -> do
2191         setInput inp2
2192         let span = mkSrcSpan loc1 end
2193         let bytes = byteDiff buf buf2
2194         span `seq` setLastToken span bytes
2195         t span buf bytes
2196
2197 reportLexError :: SrcLoc -> SrcLoc -> StringBuffer -> [Char] -> P a
2198 reportLexError loc1 loc2 buf str
2199   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
2200   | otherwise =
2201   let
2202         c = fst (nextChar buf)
2203   in
2204   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
2205     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
2206     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
2207
2208 lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token]
2209 lexTokenStream buf loc dflags = unP go initState
2210     where initState = mkPState buf loc (dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream)
2211           go = do
2212             ltok <- lexer return
2213             case ltok of
2214               L _ ITeof -> return []
2215               _ -> liftM (ltok:) go
2216
2217 linePrags = Map.singleton "line" (begin line_prag2)
2218
2219 fileHeaderPrags = Map.fromList([("options", lex_string_prag IToptions_prag),
2220                                  ("options_ghc", lex_string_prag IToptions_prag),
2221                                  ("options_haddock", lex_string_prag ITdocOptions),
2222                                  ("language", token ITlanguage_prag),
2223                                  ("include", lex_string_prag ITinclude_prag)])
2224
2225 ignoredPrags = Map.fromList (map ignored pragmas)
2226                where ignored opt = (opt, nested_comment lexToken)
2227                      impls = ["hugs", "nhc98", "jhc", "yhc", "catch", "derive"]
2228                      options_pragmas = map ("options_" ++) impls
2229                      -- CFILES is a hugs-only thing.
2230                      pragmas = options_pragmas ++ ["cfiles", "contract"]
2231
2232 oneWordPrags = Map.fromList([("rules", rulePrag),
2233                            ("inline", token (ITinline_prag True)),
2234                            ("notinline", token (ITinline_prag False)),
2235                            ("specialize", token ITspec_prag),
2236                            ("source", token ITsource_prag),
2237                            ("warning", token ITwarning_prag),
2238                            ("deprecated", token ITdeprecated_prag),
2239                            ("scc", token ITscc_prag),
2240                            ("generated", token ITgenerated_prag),
2241                            ("core", token ITcore_prag),
2242                            ("unpack", token ITunpack_prag),
2243                            ("ann", token ITann_prag)])
2244
2245 twoWordPrags = Map.fromList([("inline conlike", token (ITinline_conlike_prag True)),
2246                              ("notinline conlike", token (ITinline_conlike_prag False)),
2247                              ("specialize inline", token (ITspec_inline_prag True)),
2248                              ("specialize notinline", token (ITspec_inline_prag False))])
2249
2250
2251 dispatch_pragmas :: Map String Action -> Action
2252 dispatch_pragmas prags span buf len = case Map.lookup (clean_pragma (lexemeToString buf len)) prags of
2253                                        Just found -> found span buf len
2254                                        Nothing -> lexError "unknown pragma"
2255
2256 known_pragma :: Map String Action -> AlexAccPred Int
2257 known_pragma prags _ _ len (AI _ buf) = (isJust $ Map.lookup (clean_pragma (lexemeToString (offsetBytes (- len) buf) len)) prags)
2258                                           && (nextCharIs buf (\c -> not (isAlphaNum c || c == '_')))
2259
2260 clean_pragma :: String -> String
2261 clean_pragma prag = canon_ws (map toLower (unprefix prag))
2262                     where unprefix prag' = case stripPrefix "{-#" prag' of
2263                                              Just rest -> rest
2264                                              Nothing -> prag'
2265                           canonical prag' = case prag' of
2266                                               "noinline" -> "notinline"
2267                                               "specialise" -> "specialize"
2268                                               "constructorlike" -> "conlike"
2269                                               _ -> prag'
2270                           canon_ws s = unwords (map canonical (words s))
2271 }