compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - parsing integers is a bit slow
  16 --    - readRational is a bit slow
  17 --
  18 --   Known bugs, that were also in the previous version:
  19 --    - M... should be 3 tokens, not 1.
  20 --    - pragma-end should be only valid in a pragma
  21
  22 --   qualified operator NOTES.
  23 --
  24 --   - If M.(+) is a single lexeme, then..
  25 --     - Probably (+) should be a single lexeme too, for consistency.
  26 --       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
  27 --     - But we have to rule out reserved operators, otherwise (..) becomes
  28 --       a different lexeme.
  29 --     - Should we therefore also rule out reserved operators in the qualified
  30 --       form?  This is quite difficult to achieve.  We don't do it for
  31 --       qualified varids.
  32
  33 {
  34 -- XXX The above flags turn off warnings in the generated code:
  35 {-# LANGUAGE BangPatterns #-}
  36 {-# OPTIONS_GHC -fno-warn-unused-matches #-}
  37 {-# OPTIONS_GHC -fno-warn-unused-binds #-}
  38 {-# OPTIONS_GHC -fno-warn-unused-imports #-}
  39 {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
  40 -- But alex still generates some code that causes the "lazy unlifted bindings"
  41 -- warning, and old compilers don't know about it so we can't easily turn
  42 -- it off, so for now we use the sledge hammer:
  43 {-# OPTIONS_GHC -w #-}
  44
  45 {-# OPTIONS_GHC -funbox-strict-fields #-}
  46
  47 module Lexer (
  48    Token(..), lexer, pragState, mkPState, PState(..),
  49    P(..), ParseResult(..), getSrcLoc,
  50    getPState, getDynFlags, withThisPackage,
  51    failLocMsgP, failSpanMsgP, srcParseFail,
  52    getMessages,
  53    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  54    activeContext, nextIsEOF,
  55    getLexState, popLexState, pushLexState,
  56    extension, bangPatEnabled, datatypeContextsEnabled,
  57    addWarning,
  58    incrBracketDepth, decrBracketDepth, getParserBrakDepth,
  59    lexTokenStream
  60   ) where
  61
  62 import Bag
  63 import ErrUtils
  64 import Outputable
  65 import StringBuffer
  66 import FastString
  67 import SrcLoc
  68 import UniqFM
  69 import DynFlags
  70 import Module
  71 import Ctype
  72 import BasicTypes       ( InlineSpec(..), RuleMatchInfo(..) )
  73 import Util             ( readRational )
  74
  75 import Control.Monad
  76 import Data.Bits
  77 import Data.Char
  78 import Data.List
  79 import Data.Maybe
  80 import Data.Map (Map)
  81 import qualified Data.Map as Map
  82 import Data.Ratio
  83 }
  84
  85 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  86 $whitechar   = [\ \n\r\f\v $unispace]
  87 $white_no_nl = $whitechar # \n
  88 $tab         = \t
  89
  90 $ascdigit  = 0-9
  91 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  92 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  93 $digit     = [$ascdigit $unidigit]
  94
  95 $special   = [\(\)\,\;\[\]\`\{\}]
  96 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  97 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  98 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  99
 100 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
 101 $asclarge  = [A-Z]
 102 $large     = [$asclarge $unilarge]
 103
 104 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
 105 $ascsmall  = [a-z]
 106 $small     = [$ascsmall $unismall \_]
 107
 108 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
 109 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
 110
 111 $octit     = 0-7
 112 $hexit     = [$decdigit A-F a-f]
 113 $symchar   = [$symbol \:]
 114 $nl        = [\n\r]
 115 $idchar    = [$small $large $digit \']
 116
 117 $pragmachar = [$small $large $digit]
 118
 119 $docsym    = [\| \^ \* \$]
 120
 121 @varid     = $small $idchar*
 122 @conid     = $large $idchar*
 123
 124 @varsym    = $symbol $symchar*
 125 @consym    = \: $symchar*
 126
 127 @decimal     = $decdigit+
 128 @octal       = $octit+
 129 @hexadecimal = $hexit+
 130 @exponent    = [eE] [\-\+]? @decimal
 131
 132 -- we support the hierarchical module name extension:
 133 @qual = (@conid \.)+
 134
 135 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 136
 137 -- normal signed numerical literals can only be explicitly negative,
 138 -- not explicitly positive (contrast @exponent)
 139 @negative = \-
 140 @signed = @negative ?
 141
 142 haskell :-
 143
 144 -- everywhere: skip whitespace and comments
 145 $white_no_nl+                           ;
 146 $tab+         { warn Opt_WarnTabs (text "Warning: Tab character") }
 147
 148 -- Everywhere: deal with nested comments.  We explicitly rule out
 149 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 150 -- (this can happen even though pragmas will normally take precedence due to
 151 -- longest-match, because pragmas aren't valid in every state, but comments
 152 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 153 -- set.
 154
 155 "{-" / { isNormalComment } { nested_comment lexToken }
 156
 157 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 158 -- more dashes followed by a symbol should be parsed as a varsym, so we
 159 -- have to exclude those.
 160
 161 -- Since Haddock comments aren't valid in every state, we need to rule them
 162 -- out here.
 163
 164 -- The following two rules match comments that begin with two dashes, but
 165 -- continue with a different character. The rules test that this character
 166 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 167 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 168 -- have a Haddock comment). The rules then munch the rest of the line.
 169
 170 "-- " ~[$docsym \#] .* { lineCommentToken }
 171 "--" [^$symbol : \ ] .* { lineCommentToken }
 172
 173 -- Next, match Haddock comments if no -haddock flag
 174
 175 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
 176
 177 -- Now, when we've matched comments that begin with 2 dashes and continue
 178 -- with a different character, we need to match comments that begin with three
 179 -- or more dashes (which clearly can't be Haddock comments). We only need to
 180 -- make sure that the first non-dash character isn't a symbol, and munch the
 181 -- rest of the line.
 182
 183 "---"\-* [^$symbol :] .* { lineCommentToken }
 184
 185 -- Since the previous rules all match dashes followed by at least one
 186 -- character, we also need to match a whole line filled with just dashes.
 187
 188 "--"\-* / { atEOL } { lineCommentToken }
 189
 190 -- We need this rule since none of the other single line comment rules
 191 -- actually match this case.
 192
 193 "-- " / { atEOL } { lineCommentToken }
 194
 195 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 196 -- blank lines) until we find a non-whitespace character, then do layout
 197 -- processing.
 198 --
 199 -- One slight wibble here: what if the line begins with {-#? In
 200 -- theory, we have to lex the pragma to see if it's one we recognise,
 201 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 202 -- as a nested comment.  We don't bother with this: if the line begins
 203 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 204 <bol> {
 205   \n                                    ;
 206   ^\# (line)?                           { begin line_prag1 }
 207   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 208   ^\# \! .* \n                          ; -- #!, for scripts
 209   ()                                    { do_bol }
 210 }
 211
 212 -- after a layout keyword (let, where, do, of), we begin a new layout
 213 -- context if the curly brace is missing.
 214 -- Careful! This stuff is quite delicate.
 215 <layout, layout_do> {
 216   \{ / { notFollowedBy '-' }            { hopefully_open_brace }
 217         -- we might encounter {-# here, but {- has been handled already
 218   \n                                    ;
 219   ^\# (line)?                           { begin line_prag1 }
 220 }
 221
 222 -- do is treated in a subtly different way, see new_layout_context
 223 <layout>    ()                          { new_layout_context True }
 224 <layout_do> ()                          { new_layout_context False }
 225
 226 -- after a new layout context which was found to be to the left of the
 227 -- previous context, we have generated a '{' token, and we now need to
 228 -- generate a matching '}' token.
 229 <layout_left>  ()                       { do_layout_left }
 230
 231 <0,option_prags> \n                             { begin bol }
 232
 233 "{-#" $whitechar* $pragmachar+ / { known_pragma linePrags }
 234                                 { dispatch_pragmas linePrags }
 235
 236 -- single-line line pragmas, of the form
 237 --    # <line> "<file>" <extra-stuff> \n
 238 <line_prag1> $decdigit+                 { setLine line_prag1a }
 239 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 240 <line_prag1b> .*                        { pop }
 241
 242 -- Haskell-style line pragmas, of the form
 243 --    {-# LINE <line> "<file>" #-}
 244 <line_prag2> $decdigit+                 { setLine line_prag2a }
 245 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 246 <line_prag2b> "#-}"|"-}"                { pop }
 247    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 248    -- with older versions of GHC which generated these.
 249
 250 <0,option_prags> {
 251   "{-#" $whitechar* $pragmachar+
 252         $whitechar+ $pragmachar+ / { known_pragma twoWordPrags }
 253                                  { dispatch_pragmas twoWordPrags }
 254
 255   "{-#" $whitechar* $pragmachar+ / { known_pragma oneWordPrags }
 256                                  { dispatch_pragmas oneWordPrags }
 257
 258   -- We ignore all these pragmas, but don't generate a warning for them
 259   "{-#" $whitechar* $pragmachar+ / { known_pragma ignoredPrags }
 260                                  { dispatch_pragmas ignoredPrags }
 261
 262   -- ToDo: should only be valid inside a pragma:
 263   "#-}"                                 { endPrag }
 264 }
 265
 266 <option_prags> {
 267   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 268                                    { dispatch_pragmas fileHeaderPrags }
 269
 270   "-- #"                                 { multiline_doc_comment }
 271 }
 272
 273 <0> {
 274   -- In the "0" mode we ignore these pragmas
 275   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 276                      { nested_comment lexToken }
 277 }
 278
 279 <0> {
 280   "-- #" .* { lineCommentToken }
 281 }
 282
 283 <0,option_prags> {
 284   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 285                     (nested_comment lexToken) }
 286 }
 287
 288 -- '0' state: ordinary lexemes
 289
 290 -- Haddock comments
 291
 292 <0,option_prags> {
 293   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 294   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 295 }
 296
 297 -- "special" symbols
 298
 299 <0> {
 300   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 301   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 302 }
 303
 304 <0> {
 305   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 306   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 307   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 308   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 309   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 310   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 311   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 312   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 313
 314 -- For backward compatibility, accept the old dollar syntax
 315   "[$" @varid "|"  / { ifExtension qqEnabled }
 316                      { lex_quasiquote_tok }
 317
 318   "[" @varid "|"  / { ifExtension qqEnabled }
 319                      { lex_quasiquote_tok }
 320 }
 321
 322 <0> {
 323   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 324                                         { special IToparenbar }
 325   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 326 }
 327
 328 <0> {
 329   "<[" / { ifExtension hetMetEnabled `alexAndPred` notFollowedBySymbol }
 330                                         { special ITopenBrak }
 331   "]>" / { ifExtension hetMetEnabled }  { special ITcloseBrak }
 332   "~~" / { ifExtension hetMetEnabled }  { special ITescape }
 333   "%%" / { ifExtension hetMetEnabled }  { special ITdoublePercent }
 334   "~~$" / { ifExtension hetMetEnabled }  { special ITescapeDollar }
 335 }
 336
 337 <0> {
 338   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 339 }
 340
 341 <0> {
 342   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 343          { token IToubxparen }
 344   "#)" / { ifExtension unboxedTuplesEnabled }
 345          { token ITcubxparen }
 346 }
 347
 348 <0> {
 349   "{|" / { ifExtension genericsEnabled } { token ITocurlybar }
 350   "|}" / { ifExtension genericsEnabled } { token ITccurlybar }
 351 }
 352
 353 <0,option_prags> {
 354   \(                                    { special IToparen }
 355   \)                                    { special ITcparen }
 356   \[                                    { special ITobrack }
 357   \]                                    { special ITcbrack }
 358   \,                                    { special ITcomma }
 359   \;                                    { special ITsemi }
 360   \`                                    { special ITbackquote }
 361
 362   \{                                    { open_brace }
 363   \}                                    { close_brace }
 364 }
 365
 366 <0,option_prags> {
 367   @qual @varid                  { idtoken qvarid }
 368   @qual @conid                  { idtoken qconid }
 369   @varid                        { varid }
 370   @conid                        { idtoken conid }
 371 }
 372
 373 <0> {
 374   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 375   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 376   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 377   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 378 }
 379
 380 -- ToDo: - move `var` and (sym) into lexical syntax?
 381 --       - remove backquote from $special?
 382 <0> {
 383   @qual @varsym                                    { idtoken qvarsym }
 384   @qual @consym                                    { idtoken qconsym }
 385   @varsym                                          { varsym }
 386   @consym                                          { consym }
 387 }
 388
 389 -- For the normal boxed literals we need to be careful
 390 -- when trying to be close to Haskell98
 391 <0> {
 392   -- Normal integral literals (:: Num a => a, from Integer)
 393   @decimal           { tok_num positive 0 0 decimal }
 394   0[oO] @octal       { tok_num positive 2 2 octal }
 395   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 396
 397   -- Normal rational literals (:: Fractional a => a, from Rational)
 398   @floating_point    { strtoken tok_float }
 399 }
 400
 401 <0> {
 402   -- Unboxed ints (:: Int#) and words (:: Word#)
 403   -- It's simpler (and faster?) to give separate cases to the negatives,
 404   -- especially considering octal/hexadecimal prefixes.
 405   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 406   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 407   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 408   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 409   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 410   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 411
 412   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 413   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 414   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 415
 416   -- Unboxed floats and doubles (:: Float#, :: Double#)
 417   -- prim_{float,double} work with signed literals
 418   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 419   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 420 }
 421
 422 -- Strings and chars are lexed by hand-written code.  The reason is
 423 -- that even if we recognise the string or char here in the regex
 424 -- lexer, we would still have to parse the string afterward in order
 425 -- to convert it to a String.
 426 <0> {
 427   \'                            { lex_char_tok }
 428   \"                            { lex_string_tok }
 429 }
 430
 431 {
 432 -- -----------------------------------------------------------------------------
 433 -- The token type
 434
 435 data Token
 436   = ITas                        -- Haskell keywords
 437   | ITcase
 438   | ITclass
 439   | ITdata
 440   | ITdefault
 441   | ITderiving
 442   | ITdo
 443   | ITelse
 444   | IThiding
 445   | ITif
 446   | ITimport
 447   | ITin
 448   | ITinfix
 449   | ITinfixl
 450   | ITinfixr
 451   | ITinstance
 452   | ITlet
 453   | ITmodule
 454   | ITnewtype
 455   | ITof
 456   | ITqualified
 457   | ITthen
 458   | ITtype
 459   | ITwhere
 460   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 461
 462   | ITforall                    -- GHC extension keywords
 463   | ITforeign
 464   | ITexport
 465   | ITlabel
 466   | ITdynamic
 467   | ITsafe
 468   | ITthreadsafe
 469   | ITinterruptible
 470   | ITunsafe
 471   | ITstdcallconv
 472   | ITccallconv
 473   | ITprimcallconv
 474   | ITmdo
 475   | ITfamily
 476   | ITgroup
 477   | ITby
 478   | ITusing
 479
 480         -- Pragmas
 481   | ITinline_prag InlineSpec RuleMatchInfo
 482   | ITspec_prag                 -- SPECIALISE
 483   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 484   | ITsource_prag
 485   | ITrules_prag
 486   | ITwarning_prag
 487   | ITdeprecated_prag
 488   | ITline_prag
 489   | ITscc_prag
 490   | ITgenerated_prag
 491   | ITcore_prag                 -- hdaume: core annotations
 492   | ITunpack_prag
 493   | ITann_prag
 494   | ITclose_prag
 495   | IToptions_prag String
 496   | ITinclude_prag String
 497   | ITlanguage_prag
 498
 499   | ITdotdot                    -- reserved symbols
 500   | ITcolon
 501   | ITdcolon
 502   | ITequal
 503   | ITlam
 504   | ITvbar
 505   | ITlarrow
 506   | ITrarrow
 507   | ITat
 508   | ITtilde
 509   | ITdarrow
 510   | ITminus
 511   | ITbang
 512   | ITstar
 513   | ITdot
 514
 515   | ITbiglam                    -- GHC-extension symbols
 516
 517   | ITocurly                    -- special symbols
 518   | ITccurly
 519   | ITocurlybar                 -- {|, for type applications
 520   | ITccurlybar                 -- |}, for type applications
 521   | ITvocurly
 522   | ITvccurly
 523   | ITobrack
 524   | ITopabrack                  -- [:, for parallel arrays with -XParallelArrays
 525   | ITcpabrack                  -- :], for parallel arrays with -XParallelArrays
 526   | ITcbrack
 527   | IToparen
 528   | ITcparen
 529   | IToubxparen
 530   | ITcubxparen
 531   | ITsemi
 532   | ITcomma
 533   | ITunderscore
 534   | ITbackquote
 535
 536   | ITvarid   FastString        -- identifiers
 537   | ITconid   FastString
 538   | ITvarsym  FastString
 539   | ITconsym  FastString
 540   | ITqvarid  (FastString,FastString)
 541   | ITqconid  (FastString,FastString)
 542   | ITqvarsym (FastString,FastString)
 543   | ITqconsym (FastString,FastString)
 544   | ITprefixqvarsym (FastString,FastString)
 545   | ITprefixqconsym (FastString,FastString)
 546
 547   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 548
 549   | ITchar       Char
 550   | ITstring     FastString
 551   | ITinteger    Integer
 552   | ITrational   Rational
 553
 554   | ITprimchar   Char
 555   | ITprimstring FastString
 556   | ITprimint    Integer
 557   | ITprimword   Integer
 558   | ITprimfloat  Rational
 559   | ITprimdouble Rational
 560
 561   -- Template Haskell extension tokens
 562   | ITopenExpQuote              --  [| or [e|
 563   | ITopenPatQuote              --  [p|
 564   | ITopenDecQuote              --  [d|
 565   | ITopenTypQuote              --  [t|
 566   | ITcloseQuote                --  |]
 567   | ITidEscape   FastString     --  $x
 568   | ITparenEscape               --  $(
 569   | ITvarQuote                  --  '
 570   | ITtyQuote                   --  ''
 571   | ITquasiQuote (FastString,FastString,SrcSpan) --  [:...|...|]
 572
 573   -- Arrow notation extension
 574   | ITproc
 575   | ITrec
 576   | IToparenbar                 --  (|
 577   | ITcparenbar                 --  |)
 578   | ITlarrowtail                --  -<
 579   | ITrarrowtail                --  >-
 580   | ITLarrowtail                --  -<<
 581   | ITRarrowtail                --  >>-
 582
 583   -- Heterogeneous Metaprogramming extension
 584   | ITopenBrak                  --  <[
 585   | ITcloseBrak                 --  ]>
 586   | ITescape                    --  ~~
 587   | ITescapeDollar              --  ~~$
 588   | ITdoublePercent             --  %%
 589
 590   | ITunknown String            -- Used when the lexer can't make sense of it
 591   | ITeof                       -- end of file token
 592
 593   -- Documentation annotations
 594   | ITdocCommentNext  String     -- something beginning '-- |'
 595   | ITdocCommentPrev  String     -- something beginning '-- ^'
 596   | ITdocCommentNamed String     -- something beginning '-- $'
 597   | ITdocSection      Int String -- a section heading
 598   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 599   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 600   | ITlineComment     String     -- comment starting by "--"
 601   | ITblockComment    String     -- comment in {- -}
 602
 603 #ifdef DEBUG
 604   deriving Show -- debugging
 605 #endif
 606
 607 {-
 608 isSpecial :: Token -> Bool
 609 -- If we see M.x, where x is a keyword, but
 610 -- is special, we treat is as just plain M.x,
 611 -- not as a keyword.
 612 isSpecial ITas          = True
 613 isSpecial IThiding      = True
 614 isSpecial ITqualified   = True
 615 isSpecial ITforall      = True
 616 isSpecial ITexport      = True
 617 isSpecial ITlabel       = True
 618 isSpecial ITdynamic     = True
 619 isSpecial ITsafe        = True
 620 isSpecial ITthreadsafe  = True
 621 isSpecial ITinterruptible = True
 622 isSpecial ITunsafe      = True
 623 isSpecial ITccallconv   = True
 624 isSpecial ITstdcallconv = True
 625 isSpecial ITprimcallconv = True
 626 isSpecial ITmdo         = True
 627 isSpecial ITfamily      = True
 628 isSpecial ITgroup   = True
 629 isSpecial ITby      = True
 630 isSpecial ITusing   = True
 631 isSpecial _             = False
 632 -}
 633
 634 -- the bitmap provided as the third component indicates whether the
 635 -- corresponding extension keyword is valid under the extension options
 636 -- provided to the compiler; if the extension corresponding to *any* of the
 637 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 638 -- facilitates using a keyword in two different extensions that can be
 639 -- activated independently)
 640 --
 641 reservedWordsFM :: UniqFM (Token, Int)
 642 reservedWordsFM = listToUFM $
 643         map (\(x, y, z) -> (mkFastString x, (y, z)))
 644        [( "_",          ITunderscore,   0 ),
 645         ( "as",         ITas,           0 ),
 646         ( "case",       ITcase,         0 ),
 647         ( "class",      ITclass,        0 ),
 648         ( "data",       ITdata,         0 ),
 649         ( "default",    ITdefault,      0 ),
 650         ( "deriving",   ITderiving,     0 ),
 651         ( "do",         ITdo,           0 ),
 652         ( "else",       ITelse,         0 ),
 653         ( "hiding",     IThiding,       0 ),
 654         ( "if",         ITif,           0 ),
 655         ( "import",     ITimport,       0 ),
 656         ( "in",         ITin,           0 ),
 657         ( "infix",      ITinfix,        0 ),
 658         ( "infixl",     ITinfixl,       0 ),
 659         ( "infixr",     ITinfixr,       0 ),
 660         ( "instance",   ITinstance,     0 ),
 661         ( "let",        ITlet,          0 ),
 662         ( "module",     ITmodule,       0 ),
 663         ( "newtype",    ITnewtype,      0 ),
 664         ( "of",         ITof,           0 ),
 665         ( "qualified",  ITqualified,    0 ),
 666         ( "then",       ITthen,         0 ),
 667         ( "type",       ITtype,         0 ),
 668         ( "where",      ITwhere,        0 ),
 669         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 670
 671     ( "forall", ITforall,        bit explicitForallBit .|. bit inRulePragBit),
 672         ( "mdo",        ITmdo,           bit recursiveDoBit),
 673         ( "family",     ITfamily,        bit tyFamBit),
 674     ( "group",  ITgroup,     bit transformComprehensionsBit),
 675     ( "by",     ITby,        bit transformComprehensionsBit),
 676     ( "using",  ITusing,     bit transformComprehensionsBit),
 677
 678         ( "foreign",    ITforeign,       bit ffiBit),
 679         ( "export",     ITexport,        bit ffiBit),
 680         ( "label",      ITlabel,         bit ffiBit),
 681         ( "dynamic",    ITdynamic,       bit ffiBit),
 682         ( "safe",       ITsafe,          bit ffiBit),
 683         ( "threadsafe", ITthreadsafe,    bit ffiBit),  -- ToDo: remove
 684         ( "interruptible", ITinterruptible, bit ffiBit),
 685         ( "unsafe",     ITunsafe,        bit ffiBit),
 686         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 687         ( "ccall",      ITccallconv,     bit ffiBit),
 688         ( "prim",       ITprimcallconv,  bit ffiBit),
 689
 690         ( "rec",        ITrec,           bit recBit),
 691         ( "proc",       ITproc,          bit arrowsBit)
 692      ]
 693
 694 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 695 reservedSymsFM = listToUFM $
 696     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 697       [ ("..",  ITdotdot,   always)
 698         -- (:) is a reserved op, meaning only list cons
 699        ,(":",   ITcolon,    always)
 700        ,("::",  ITdcolon,   always)
 701        ,("=",   ITequal,    always)
 702        ,("\\",  ITlam,      always)
 703        ,("|",   ITvbar,     always)
 704        ,("<-",  ITlarrow,   always)
 705        ,("->",  ITrarrow,   always)
 706        ,("@",   ITat,       always)
 707        ,("~",   ITtilde,    always)
 708        ,("=>",  ITdarrow,   always)
 709        ,("-",   ITminus,    always)
 710        ,("!",   ITbang,     always)
 711
 712         -- For data T (a::*) = MkT
 713        ,("*", ITstar, always) -- \i -> kindSigsEnabled i || tyFamEnabled i)
 714         -- For 'forall a . t'
 715        ,(".", ITdot,  always) -- \i -> explicitForallEnabled i || inRulePrag i)
 716
 717        ,("-<",  ITlarrowtail, arrowsEnabled)
 718        ,(">-",  ITrarrowtail, arrowsEnabled)
 719        ,("-<<", ITLarrowtail, arrowsEnabled)
 720        ,(">>-", ITRarrowtail, arrowsEnabled)
 721
 722        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 723        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 724        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 725                                 explicitForallEnabled i)
 726        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 727        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 728
 729        ,("⤙",   ITlarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 730        ,("⤚",   ITrarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 731        ,("⤛",   ITLarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 732        ,("⤜",   ITRarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 733
 734        ,("★", ITstar, unicodeSyntaxEnabled)
 735
 736         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 737         -- form part of a large operator.  This would let us have a better
 738         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 739        ]
 740
 741 -- -----------------------------------------------------------------------------
 742 -- Lexer actions
 743
 744 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 745
 746 special :: Token -> Action
 747 special tok span _buf _len = return (L span tok)
 748
 749 token, layout_token :: Token -> Action
 750 token t span _buf _len = return (L span t)
 751 layout_token t span _buf _len = pushLexState layout >> return (L span t)
 752
 753 idtoken :: (StringBuffer -> Int -> Token) -> Action
 754 idtoken f span buf len = return (L span $! (f buf len))
 755
 756 skip_one_varid :: (FastString -> Token) -> Action
 757 skip_one_varid f span buf len
 758   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 759
 760 strtoken :: (String -> Token) -> Action
 761 strtoken f span buf len =
 762   return (L span $! (f $! lexemeToString buf len))
 763
 764 init_strtoken :: Int -> (String -> Token) -> Action
 765 -- like strtoken, but drops the last N character(s)
 766 init_strtoken drop f span buf len =
 767   return (L span $! (f $! lexemeToString buf (len-drop)))
 768
 769 begin :: Int -> Action
 770 begin code _span _str _len = do pushLexState code; lexToken
 771
 772 pop :: Action
 773 pop _span _buf _len = do _ <- popLexState
 774                          lexToken
 775
 776 hopefully_open_brace :: Action
 777 hopefully_open_brace span buf len
 778  = do relaxed <- extension relaxedLayout
 779       ctx <- getContext
 780       (AI l _) <- getInput
 781       let offset = srcLocCol l
 782           isOK = relaxed ||
 783                  case ctx of
 784                  Layout prev_off : _ -> prev_off < offset
 785                  _                   -> True
 786       if isOK then pop_and open_brace span buf len
 787               else failSpanMsgP span (text "Missing block")
 788
 789 pop_and :: Action -> Action
 790 pop_and act span buf len = do _ <- popLexState
 791                               act span buf len
 792
 793 {-# INLINE nextCharIs #-}
 794 nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
 795 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 796
 797 notFollowedBy :: Char -> AlexAccPred Int
 798 notFollowedBy char _ _ _ (AI _ buf)
 799   = nextCharIs buf (/=char)
 800
 801 notFollowedBySymbol :: AlexAccPred Int
 802 notFollowedBySymbol _ _ _ (AI _ buf)
 803   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 804
 805 -- We must reject doc comments as being ordinary comments everywhere.
 806 -- In some cases the doc comment will be selected as the lexeme due to
 807 -- maximal munch, but not always, because the nested comment rule is
 808 -- valid in all states, but the doc-comment rules are only valid in
 809 -- the non-layout states.
 810 isNormalComment :: AlexAccPred Int
 811 isNormalComment bits _ _ (AI _ buf)
 812   | haddockEnabled bits = notFollowedByDocOrPragma
 813   | otherwise           = nextCharIs buf (/='#')
 814   where
 815     notFollowedByDocOrPragma
 816        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 817
 818 spaceAndP :: StringBuffer -> (StringBuffer -> Bool) -> Bool
 819 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 820
 821 {-
 822 haddockDisabledAnd p bits _ _ (AI _ buf)
 823   = if haddockEnabled bits then False else (p buf)
 824 -}
 825
 826 atEOL :: AlexAccPred Int
 827 atEOL _ _ _ (AI _ buf) = atEnd buf || currentChar buf == '\n'
 828
 829 ifExtension :: (Int -> Bool) -> AlexAccPred Int
 830 ifExtension pred bits _ _ _ = pred bits
 831
 832 multiline_doc_comment :: Action
 833 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 834   where
 835     worker commentAcc input docType oneLine = case alexGetChar input of
 836       Just ('\n', input')
 837         | oneLine -> docCommentEnd input commentAcc docType buf span
 838         | otherwise -> case checkIfCommentLine input' of
 839           Just input -> worker ('\n':commentAcc) input docType False
 840           Nothing -> docCommentEnd input commentAcc docType buf span
 841       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 842       Nothing -> docCommentEnd input commentAcc docType buf span
 843
 844     checkIfCommentLine input = check (dropNonNewlineSpace input)
 845       where
 846         check input = case alexGetChar input of
 847           Just ('-', input) -> case alexGetChar input of
 848             Just ('-', input) -> case alexGetChar input of
 849               Just (c, _) | c /= '-' -> Just input
 850               _ -> Nothing
 851             _ -> Nothing
 852           _ -> Nothing
 853
 854         dropNonNewlineSpace input = case alexGetChar input of
 855           Just (c, input')
 856             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 857             | otherwise -> input
 858           Nothing -> input
 859
 860 lineCommentToken :: Action
 861 lineCommentToken span buf len = do
 862   b <- extension rawTokenStreamEnabled
 863   if b then strtoken ITlineComment span buf len else lexToken
 864
 865 {-
 866   nested comments require traversing by hand, they can't be parsed
 867   using regular expressions.
 868 -}
 869 nested_comment :: P (Located Token) -> Action
 870 nested_comment cont span _str _len = do
 871   input <- getInput
 872   go "" (1::Int) input
 873   where
 874     go commentAcc 0 input = do setInput input
 875                                b <- extension rawTokenStreamEnabled
 876                                if b
 877                                  then docCommentEnd input commentAcc ITblockComment _str span
 878                                  else cont
 879     go commentAcc n input = case alexGetChar input of
 880       Nothing -> errBrace input span
 881       Just ('-',input) -> case alexGetChar input of
 882         Nothing  -> errBrace input span
 883         Just ('\125',input) -> go commentAcc (n-1) input
 884         Just (_,_)          -> go ('-':commentAcc) n input
 885       Just ('\123',input) -> case alexGetChar input of
 886         Nothing  -> errBrace input span
 887         Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
 888         Just (_,_)       -> go ('\123':commentAcc) n input
 889       Just (c,input) -> go (c:commentAcc) n input
 890
 891 nested_doc_comment :: Action
 892 nested_doc_comment span buf _len = withLexedDocType (go "")
 893   where
 894     go commentAcc input docType _ = case alexGetChar input of
 895       Nothing -> errBrace input span
 896       Just ('-',input) -> case alexGetChar input of
 897         Nothing -> errBrace input span
 898         Just ('\125',input) ->
 899           docCommentEnd input commentAcc docType buf span
 900         Just (_,_) -> go ('-':commentAcc) input docType False
 901       Just ('\123', input) -> case alexGetChar input of
 902         Nothing  -> errBrace input span
 903         Just ('-',input) -> do
 904           setInput input
 905           let cont = do input <- getInput; go commentAcc input docType False
 906           nested_comment cont span buf _len
 907         Just (_,_) -> go ('\123':commentAcc) input docType False
 908       Just (c,input) -> go (c:commentAcc) input docType False
 909
 910 withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (Located Token))
 911                  -> P (Located Token)
 912 withLexedDocType lexDocComment = do
 913   input@(AI _ buf) <- getInput
 914   case prevChar buf ' ' of
 915     '|' -> lexDocComment input ITdocCommentNext False
 916     '^' -> lexDocComment input ITdocCommentPrev False
 917     '$' -> lexDocComment input ITdocCommentNamed False
 918     '*' -> lexDocSection 1 input
 919     '#' -> lexDocComment input ITdocOptionsOld False
 920     _ -> panic "withLexedDocType: Bad doc type"
 921  where
 922     lexDocSection n input = case alexGetChar input of
 923       Just ('*', input) -> lexDocSection (n+1) input
 924       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 925       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 926
 927 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 928 -- off again at the end of the pragma.
 929 rulePrag :: Action
 930 rulePrag span _buf _len = do
 931   setExts (.|. bit inRulePragBit)
 932   return (L span ITrules_prag)
 933
 934 endPrag :: Action
 935 endPrag span _buf _len = do
 936   setExts (.&. complement (bit inRulePragBit))
 937   return (L span ITclose_prag)
 938
 939 -- docCommentEnd
 940 -------------------------------------------------------------------------------
 941 -- This function is quite tricky. We can't just return a new token, we also
 942 -- need to update the state of the parser. Why? Because the token is longer
 943 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 944 -- it writes the wrong token length to the parser state. This function is
 945 -- called afterwards, so it can just update the state.
 946
 947 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 948                  SrcSpan -> P (Located Token)
 949 docCommentEnd input commentAcc docType buf span = do
 950   setInput input
 951   let (AI loc nextBuf) = input
 952       comment = reverse commentAcc
 953       span' = mkSrcSpan (srcSpanStart span) loc
 954       last_len = byteDiff buf nextBuf
 955
 956   span `seq` setLastToken span' last_len
 957   return (L span' (docType comment))
 958
 959 errBrace :: AlexInput -> SrcSpan -> P a
 960 errBrace (AI end _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 961
 962 open_brace, close_brace :: Action
 963 open_brace span _str _len = do
 964   ctx <- getContext
 965   setContext (NoLayout:ctx)
 966   return (L span ITocurly)
 967 close_brace span _str _len = do
 968   popContext
 969   return (L span ITccurly)
 970
 971 qvarid, qconid :: StringBuffer -> Int -> Token
 972 qvarid buf len = ITqvarid $! splitQualName buf len False
 973 qconid buf len = ITqconid $! splitQualName buf len False
 974
 975 splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
 976 -- takes a StringBuffer and a length, and returns the module name
 977 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 978 -- because of hierarchical module names.
 979 splitQualName orig_buf len parens = split orig_buf orig_buf
 980   where
 981     split buf dot_buf
 982         | orig_buf `byteDiff` buf >= len  = done dot_buf
 983         | c == '.'                        = found_dot buf'
 984         | otherwise                       = split buf' dot_buf
 985       where
 986        (c,buf') = nextChar buf
 987
 988     -- careful, we might get names like M....
 989     -- so, if the character after the dot is not upper-case, this is
 990     -- the end of the qualifier part.
 991     found_dot buf -- buf points after the '.'
 992         | isUpper c    = split buf' buf
 993         | otherwise    = done buf
 994       where
 995        (c,buf') = nextChar buf
 996
 997     done dot_buf =
 998         (lexemeToFastString orig_buf (qual_size - 1),
 999          if parens -- Prelude.(+)
1000             then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
1001             else lexemeToFastString dot_buf (len - qual_size))
1002       where
1003         qual_size = orig_buf `byteDiff` dot_buf
1004
1005 varid :: Action
1006 varid span buf len =
1007   fs `seq`
1008   case lookupUFM reservedWordsFM fs of
1009         Just (keyword,0)    -> do
1010                 maybe_layout keyword
1011                 return (L span keyword)
1012         Just (keyword,exts) -> do
1013                 b <- extension (\i -> exts .&. i /= 0)
1014                 if b then do maybe_layout keyword
1015                              return (L span keyword)
1016                      else return (L span (ITvarid fs))
1017         _other -> return (L span (ITvarid fs))
1018   where
1019         fs = lexemeToFastString buf len
1020
1021 conid :: StringBuffer -> Int -> Token
1022 conid buf len = ITconid fs
1023   where fs = lexemeToFastString buf len
1024
1025 qvarsym, qconsym, prefixqvarsym, prefixqconsym :: StringBuffer -> Int -> Token
1026 qvarsym buf len = ITqvarsym $! splitQualName buf len False
1027 qconsym buf len = ITqconsym $! splitQualName buf len False
1028 prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
1029 prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
1030
1031 varsym, consym :: Action
1032 varsym = sym ITvarsym
1033 consym = sym ITconsym
1034
1035 sym :: (FastString -> Token) -> SrcSpan -> StringBuffer -> Int
1036     -> P (Located Token)
1037 sym con span buf len =
1038   case lookupUFM reservedSymsFM fs of
1039         Just (keyword,exts) -> do
1040                 b <- extension exts
1041                 if b then return (L span keyword)
1042                      else return (L span $! con fs)
1043         _other -> return (L span $! con fs)
1044   where
1045         fs = lexemeToFastString buf len
1046
1047 -- Variations on the integral numeric literal.
1048 tok_integral :: (Integer -> Token)
1049      -> (Integer -> Integer)
1050  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
1051      -> Int -> Int
1052      -> (Integer, (Char->Int)) -> Action
1053 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
1054   return $ L span $ itint $! transint $ parseUnsignedInteger
1055      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
1056
1057 -- some conveniences for use with tok_integral
1058 tok_num :: (Integer -> Integer)
1059         -> Int -> Int
1060         -> (Integer, (Char->Int)) -> Action
1061 tok_num = tok_integral ITinteger
1062 tok_primint :: (Integer -> Integer)
1063             -> Int -> Int
1064             -> (Integer, (Char->Int)) -> Action
1065 tok_primint = tok_integral ITprimint
1066 tok_primword :: Int -> Int
1067              -> (Integer, (Char->Int)) -> Action
1068 tok_primword = tok_integral ITprimword positive
1069 positive, negative :: (Integer -> Integer)
1070 positive = id
1071 negative = negate
1072 decimal, octal, hexadecimal :: (Integer, Char -> Int)
1073 decimal = (10,octDecDigit)
1074 octal = (8,octDecDigit)
1075 hexadecimal = (16,hexDigit)
1076
1077 -- readRational can understand negative rationals, exponents, everything.
1078 tok_float, tok_primfloat, tok_primdouble :: String -> Token
1079 tok_float        str = ITrational   $! readRational str
1080 tok_primfloat    str = ITprimfloat  $! readRational str
1081 tok_primdouble   str = ITprimdouble $! readRational str
1082
1083 -- -----------------------------------------------------------------------------
1084 -- Layout processing
1085
1086 -- we're at the first token on a line, insert layout tokens if necessary
1087 do_bol :: Action
1088 do_bol span _str _len = do
1089         pos <- getOffside
1090         case pos of
1091             LT -> do
1092                 --trace "layout: inserting '}'" $ do
1093                 popContext
1094                 -- do NOT pop the lex state, we might have a ';' to insert
1095                 return (L span ITvccurly)
1096             EQ -> do
1097                 --trace "layout: inserting ';'" $ do
1098                 _ <- popLexState
1099                 return (L span ITsemi)
1100             GT -> do
1101                 _ <- popLexState
1102                 lexToken
1103
1104 -- certain keywords put us in the "layout" state, where we might
1105 -- add an opening curly brace.
1106 maybe_layout :: Token -> P ()
1107 maybe_layout t = do -- If the alternative layout rule is enabled then
1108                     -- we never create an implicit layout context here.
1109                     -- Layout is handled XXX instead.
1110                     -- The code for closing implicit contexts, or
1111                     -- inserting implicit semi-colons, is therefore
1112                     -- irrelevant as it only applies in an implicit
1113                     -- context.
1114                     alr <- extension alternativeLayoutRule
1115                     unless alr $ f t
1116     where f ITdo    = pushLexState layout_do
1117           f ITmdo   = pushLexState layout_do
1118           f ITof    = pushLexState layout
1119           f ITlet   = pushLexState layout
1120           f ITwhere = pushLexState layout
1121           f ITrec   = pushLexState layout
1122           f _       = return ()
1123
1124 -- Pushing a new implicit layout context.  If the indentation of the
1125 -- next token is not greater than the previous layout context, then
1126 -- Haskell 98 says that the new layout context should be empty; that is
1127 -- the lexer must generate {}.
1128 --
1129 -- We are slightly more lenient than this: when the new context is started
1130 -- by a 'do', then we allow the new context to be at the same indentation as
1131 -- the previous context.  This is what the 'strict' argument is for.
1132 --
1133 new_layout_context :: Bool -> Action
1134 new_layout_context strict span _buf _len = do
1135     _ <- popLexState
1136     (AI l _) <- getInput
1137     let offset = srcLocCol l
1138     ctx <- getContext
1139     nondecreasing <- extension nondecreasingIndentation
1140     let strict' = strict || not nondecreasing
1141     case ctx of
1142         Layout prev_off : _  |
1143            (strict'     && prev_off >= offset  ||
1144             not strict' && prev_off > offset) -> do
1145                 -- token is indented to the left of the previous context.
1146                 -- we must generate a {} sequence now.
1147                 pushLexState layout_left
1148                 return (L span ITvocurly)
1149         _ -> do
1150                 setContext (Layout offset : ctx)
1151                 return (L span ITvocurly)
1152
1153 do_layout_left :: Action
1154 do_layout_left span _buf _len = do
1155     _ <- popLexState
1156     pushLexState bol  -- we must be at the start of a line
1157     return (L span ITvccurly)
1158
1159 -- -----------------------------------------------------------------------------
1160 -- LINE pragmas
1161
1162 setLine :: Int -> Action
1163 setLine code span buf len = do
1164   let line = parseUnsignedInteger buf len 10 octDecDigit
1165   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 1)
1166         -- subtract one: the line number refers to the *following* line
1167   _ <- popLexState
1168   pushLexState code
1169   lexToken
1170
1171 setFile :: Int -> Action
1172 setFile code span buf len = do
1173   let file = lexemeToFastString (stepOn buf) (len-2)
1174   setAlrLastLoc noSrcSpan
1175   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1176   _ <- popLexState
1177   pushLexState code
1178   lexToken
1179
1180
1181 -- -----------------------------------------------------------------------------
1182 -- Options, includes and language pragmas.
1183
1184 lex_string_prag :: (String -> Token) -> Action
1185 lex_string_prag mkTok span _buf _len
1186     = do input <- getInput
1187          start <- getSrcLoc
1188          tok <- go [] input
1189          end <- getSrcLoc
1190          return (L (mkSrcSpan start end) tok)
1191     where go acc input
1192               = if isString input "#-}"
1193                    then do setInput input
1194                            return (mkTok (reverse acc))
1195                    else case alexGetChar input of
1196                           Just (c,i) -> go (c:acc) i
1197                           Nothing -> err input
1198           isString _ [] = True
1199           isString i (x:xs)
1200               = case alexGetChar i of
1201                   Just (c,i') | c == x    -> isString i' xs
1202                   _other -> False
1203           err (AI end _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1204
1205
1206 -- -----------------------------------------------------------------------------
1207 -- Strings & Chars
1208
1209 -- This stuff is horrible.  I hates it.
1210
1211 lex_string_tok :: Action
1212 lex_string_tok span _buf _len = do
1213   tok <- lex_string ""
1214   end <- getSrcLoc
1215   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1216
1217 lex_string :: String -> P Token
1218 lex_string s = do
1219   i <- getInput
1220   case alexGetChar' i of
1221     Nothing -> lit_error i
1222
1223     Just ('"',i)  -> do
1224         setInput i
1225         magicHash <- extension magicHashEnabled
1226         if magicHash
1227           then do
1228             i <- getInput
1229             case alexGetChar' i of
1230               Just ('#',i) -> do
1231                    setInput i
1232                    if any (> '\xFF') s
1233                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1234                     else let s' = mkZFastString (reverse s) in
1235                          return (ITprimstring s')
1236                         -- mkZFastString is a hack to avoid encoding the
1237                         -- string in UTF-8.  We just want the exact bytes.
1238               _other ->
1239                 return (ITstring (mkFastString (reverse s)))
1240           else
1241                 return (ITstring (mkFastString (reverse s)))
1242
1243     Just ('\\',i)
1244         | Just ('&',i) <- next -> do
1245                 setInput i; lex_string s
1246         | Just (c,i) <- next, c <= '\x7f' && is_space c -> do
1247                            -- is_space only works for <= '\x7f' (#3751)
1248                 setInput i; lex_stringgap s
1249         where next = alexGetChar' i
1250
1251     Just (c, i1) -> do
1252         case c of
1253           '\\' -> do setInput i1; c' <- lex_escape; lex_string (c':s)
1254           c | isAny c -> do setInput i1; lex_string (c:s)
1255           _other -> lit_error i
1256
1257 lex_stringgap :: String -> P Token
1258 lex_stringgap s = do
1259   i <- getInput
1260   c <- getCharOrFail i
1261   case c of
1262     '\\' -> lex_string s
1263     c | is_space c -> lex_stringgap s
1264     _other -> lit_error i
1265
1266
1267 lex_char_tok :: Action
1268 -- Here we are basically parsing character literals, such as 'x' or '\n'
1269 -- but, when Template Haskell is on, we additionally spot
1270 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1271 -- but WITHOUT CONSUMING the x or T part  (the parser does that).
1272 -- So we have to do two characters of lookahead: when we see 'x we need to
1273 -- see if there's a trailing quote
1274 lex_char_tok span _buf _len = do        -- We've seen '
1275    i1 <- getInput       -- Look ahead to first character
1276    let loc = srcSpanStart span
1277    case alexGetChar' i1 of
1278         Nothing -> lit_error  i1
1279
1280         Just ('\'', i2@(AI end2 _)) -> do       -- We've seen ''
1281                   th_exts <- extension thEnabled
1282                   if th_exts then do
1283                         setInput i2
1284                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1285                    else lit_error i1
1286
1287         Just ('\\', i2@(AI _end2 _)) -> do      -- We've seen 'backslash
1288                   setInput i2
1289                   lit_ch <- lex_escape
1290                   i3 <- getInput
1291                   mc <- getCharOrFail i3 -- Trailing quote
1292                   if mc == '\'' then finish_char_tok loc lit_ch
1293                                 else lit_error i3
1294
1295         Just (c, i2@(AI _end2 _))
1296                 | not (isAny c) -> lit_error i1
1297                 | otherwise ->
1298
1299                 -- We've seen 'x, where x is a valid character
1300                 --  (i.e. not newline etc) but not a quote or backslash
1301            case alexGetChar' i2 of      -- Look ahead one more character
1302                 Just ('\'', i3) -> do   -- We've seen 'x'
1303                         setInput i3
1304                         finish_char_tok loc c
1305                 _other -> do            -- We've seen 'x not followed by quote
1306                                         -- (including the possibility of EOF)
1307                                         -- If TH is on, just parse the quote only
1308                         th_exts <- extension thEnabled
1309                         let (AI end _) = i1
1310                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1311                                    else lit_error i2
1312
1313 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1314 finish_char_tok loc ch  -- We've already seen the closing quote
1315                         -- Just need to check for trailing #
1316   = do  magicHash <- extension magicHashEnabled
1317         i@(AI end _) <- getInput
1318         if magicHash then do
1319                 case alexGetChar' i of
1320                         Just ('#',i@(AI end _)) -> do
1321                                 setInput i
1322                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1323                         _other ->
1324                                 return (L (mkSrcSpan loc end) (ITchar ch))
1325             else do
1326                    return (L (mkSrcSpan loc end) (ITchar ch))
1327
1328 isAny :: Char -> Bool
1329 isAny c | c > '\x7f' = isPrint c
1330         | otherwise  = is_any c
1331
1332 lex_escape :: P Char
1333 lex_escape = do
1334   i0 <- getInput
1335   c <- getCharOrFail i0
1336   case c of
1337         'a'   -> return '\a'
1338         'b'   -> return '\b'
1339         'f'   -> return '\f'
1340         'n'   -> return '\n'
1341         'r'   -> return '\r'
1342         't'   -> return '\t'
1343         'v'   -> return '\v'
1344         '\\'  -> return '\\'
1345         '"'   -> return '\"'
1346         '\''  -> return '\''
1347         '^'   -> do i1 <- getInput
1348                     c <- getCharOrFail i1
1349                     if c >= '@' && c <= '_'
1350                         then return (chr (ord c - ord '@'))
1351                         else lit_error i1
1352
1353         'x'   -> readNum is_hexdigit 16 hexDigit
1354         'o'   -> readNum is_octdigit  8 octDecDigit
1355         x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1356
1357         c1 ->  do
1358            i <- getInput
1359            case alexGetChar' i of
1360             Nothing -> lit_error i0
1361             Just (c2,i2) ->
1362               case alexGetChar' i2 of
1363                 Nothing -> do lit_error i0
1364                 Just (c3,i3) ->
1365                    let str = [c1,c2,c3] in
1366                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1367                                      Just rest <- [stripPrefix p str] ] of
1368                           (escape_char,[]):_ -> do
1369                                 setInput i3
1370                                 return escape_char
1371                           (escape_char,_:_):_ -> do
1372                                 setInput i2
1373                                 return escape_char
1374                           [] -> lit_error i0
1375
1376 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1377 readNum is_digit base conv = do
1378   i <- getInput
1379   c <- getCharOrFail i
1380   if is_digit c
1381         then readNum2 is_digit base conv (conv c)
1382         else lit_error i
1383
1384 readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char
1385 readNum2 is_digit base conv i = do
1386   input <- getInput
1387   read i input
1388   where read i input = do
1389           case alexGetChar' input of
1390             Just (c,input') | is_digit c -> do
1391                let i' = i*base + conv c
1392                if i' > 0x10ffff
1393                   then setInput input >> lexError "numeric escape sequence out of range"
1394                   else read i' input'
1395             _other -> do
1396               setInput input; return (chr i)
1397
1398
1399 silly_escape_chars :: [(String, Char)]
1400 silly_escape_chars = [
1401         ("NUL", '\NUL'),
1402         ("SOH", '\SOH'),
1403         ("STX", '\STX'),
1404         ("ETX", '\ETX'),
1405         ("EOT", '\EOT'),
1406         ("ENQ", '\ENQ'),
1407         ("ACK", '\ACK'),
1408         ("BEL", '\BEL'),
1409         ("BS", '\BS'),
1410         ("HT", '\HT'),
1411         ("LF", '\LF'),
1412         ("VT", '\VT'),
1413         ("FF", '\FF'),
1414         ("CR", '\CR'),
1415         ("SO", '\SO'),
1416         ("SI", '\SI'),
1417         ("DLE", '\DLE'),
1418         ("DC1", '\DC1'),
1419         ("DC2", '\DC2'),
1420         ("DC3", '\DC3'),
1421         ("DC4", '\DC4'),
1422         ("NAK", '\NAK'),
1423         ("SYN", '\SYN'),
1424         ("ETB", '\ETB'),
1425         ("CAN", '\CAN'),
1426         ("EM", '\EM'),
1427         ("SUB", '\SUB'),
1428         ("ESC", '\ESC'),
1429         ("FS", '\FS'),
1430         ("GS", '\GS'),
1431         ("RS", '\RS'),
1432         ("US", '\US'),
1433         ("SP", '\SP'),
1434         ("DEL", '\DEL')
1435         ]
1436
1437 -- before calling lit_error, ensure that the current input is pointing to
1438 -- the position of the error in the buffer.  This is so that we can report
1439 -- a correct location to the user, but also so we can detect UTF-8 decoding
1440 -- errors if they occur.
1441 lit_error :: AlexInput -> P a
1442 lit_error i = do setInput i; lexError "lexical error in string/character literal"
1443
1444 getCharOrFail :: AlexInput -> P Char
1445 getCharOrFail i =  do
1446   case alexGetChar' i of
1447         Nothing -> lexError "unexpected end-of-file in string/character literal"
1448         Just (c,i)  -> do setInput i; return c
1449
1450 -- -----------------------------------------------------------------------------
1451 -- QuasiQuote
1452
1453 lex_quasiquote_tok :: Action
1454 lex_quasiquote_tok span buf len = do
1455   let quoter = tail (lexemeToString buf (len - 1))
1456                 -- 'tail' drops the initial '[',
1457                 -- while the -1 drops the trailing '|'
1458   quoteStart <- getSrcLoc
1459   quote <- lex_quasiquote ""
1460   end <- getSrcLoc
1461   return (L (mkSrcSpan (srcSpanStart span) end)
1462            (ITquasiQuote (mkFastString quoter,
1463                           mkFastString (reverse quote),
1464                           mkSrcSpan quoteStart end)))
1465
1466 lex_quasiquote :: String -> P String
1467 lex_quasiquote s = do
1468   i <- getInput
1469   case alexGetChar' i of
1470     Nothing -> lit_error i
1471
1472     Just ('\\',i)
1473         | Just ('|',i) <- next -> do
1474                 setInput i; lex_quasiquote ('|' : s)
1475         | Just (']',i) <- next -> do
1476                 setInput i; lex_quasiquote (']' : s)
1477         where next = alexGetChar' i
1478
1479     Just ('|',i)
1480         | Just (']',i) <- next -> do
1481                 setInput i; return s
1482         where next = alexGetChar' i
1483
1484     Just (c, i) -> do
1485          setInput i; lex_quasiquote (c : s)
1486
1487 -- -----------------------------------------------------------------------------
1488 -- Warnings
1489
1490 warn :: DynFlag -> SDoc -> Action
1491 warn option warning srcspan _buf _len = do
1492     addWarning option srcspan warning
1493     lexToken
1494
1495 warnThen :: DynFlag -> SDoc -> Action -> Action
1496 warnThen option warning action srcspan buf len = do
1497     addWarning option srcspan warning
1498     action srcspan buf len
1499
1500 -- -----------------------------------------------------------------------------
1501 -- The Parse Monad
1502
1503 data LayoutContext
1504   = NoLayout
1505   | Layout !Int
1506   deriving Show
1507
1508 data ParseResult a
1509   = POk PState a
1510   | PFailed
1511         SrcSpan         -- The start and end of the text span related to
1512                         -- the error.  Might be used in environments which can
1513                         -- show this span, e.g. by highlighting it.
1514         Message         -- The error message
1515
1516 data PState = PState {
1517         buffer     :: StringBuffer,
1518         dflags     :: DynFlags,
1519         messages   :: Messages,
1520         last_loc   :: SrcSpan,  -- pos of previous token
1521         last_len   :: !Int,     -- len of previous token
1522         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1523         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1524         context    :: [LayoutContext],
1525         lex_state  :: [Int],
1526         -- Used in the alternative layout rule:
1527         -- These tokens are the next ones to be sent out. They are
1528         -- just blindly emitted, without the rule looking at them again:
1529         alr_pending_implicit_tokens :: [Located Token],
1530         -- This is the next token to be considered or, if it is Nothing,
1531         -- we need to get the next token from the input stream:
1532         alr_next_token :: Maybe (Located Token),
1533         -- This is what we consider to be the locatino of the last token
1534         -- emitted:
1535         alr_last_loc :: SrcSpan,
1536         -- The stack of layout contexts:
1537         alr_context :: [ALRContext],
1538         -- Are we expecting a '{'? If it's Just, then the ALRLayout tells
1539         -- us what sort of layout the '{' will open:
1540         alr_expecting_ocurly :: Maybe ALRLayout,
1541         -- Have we just had the '}' for a let block? If so, than an 'in'
1542         -- token doesn't need to close anything:
1543         alr_justClosedExplicitLetBlock :: Bool,
1544         code_type_bracket_depth :: Int
1545      }
1546         -- last_loc and last_len are used when generating error messages,
1547         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1548         -- current token to happyError, we could at least get rid of last_len.
1549         -- Getting rid of last_loc would require finding another way to
1550         -- implement pushCurrentContext (which is only called from one place).
1551
1552 data ALRContext = ALRNoLayout Bool{- does it contain commas? -}
1553                               Bool{- is it a 'let' block? -}
1554                 | ALRLayout ALRLayout Int
1555 data ALRLayout = ALRLayoutLet
1556                | ALRLayoutWhere
1557                | ALRLayoutOf
1558                | ALRLayoutDo
1559
1560 newtype P a = P { unP :: PState -> ParseResult a }
1561
1562 instance Monad P where
1563   return = returnP
1564   (>>=) = thenP
1565   fail = failP
1566
1567 returnP :: a -> P a
1568 returnP a = a `seq` (P $ \s -> POk s a)
1569
1570 thenP :: P a -> (a -> P b) -> P b
1571 (P m) `thenP` k = P $ \ s ->
1572         case m s of
1573                 POk s1 a         -> (unP (k a)) s1
1574                 PFailed span err -> PFailed span err
1575
1576 failP :: String -> P a
1577 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1578
1579 failMsgP :: String -> P a
1580 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1581
1582 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1583 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (mkSrcSpan loc1 loc2) (text str)
1584
1585 failSpanMsgP :: SrcSpan -> SDoc -> P a
1586 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1587
1588 getPState :: P PState
1589 getPState = P $ \s -> POk s s
1590
1591 getDynFlags :: P DynFlags
1592 getDynFlags = P $ \s -> POk s (dflags s)
1593
1594 withThisPackage :: (PackageId -> a) -> P a
1595 withThisPackage f
1596  = do   pkg     <- liftM thisPackage getDynFlags
1597         return  $ f pkg
1598
1599 extension :: (Int -> Bool) -> P Bool
1600 extension p = P $ \s -> POk s (p $! extsBitmap s)
1601
1602 getExts :: P Int
1603 getExts = P $ \s -> POk s (extsBitmap s)
1604
1605 setExts :: (Int -> Int) -> P ()
1606 setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
1607
1608 setSrcLoc :: SrcLoc -> P ()
1609 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1610
1611 incrBracketDepth :: P ()
1612 incrBracketDepth = P $ \s -> POk (s{code_type_bracket_depth = (code_type_bracket_depth s)+1}) ()
1613 decrBracketDepth :: P ()
1614 decrBracketDepth = P $ \s -> POk (s{code_type_bracket_depth = (code_type_bracket_depth s)-1}) ()
1615 getParserBrakDepth :: P Int
1616 getParserBrakDepth = P $ \s -> POk s (code_type_bracket_depth s)
1617
1618 getSrcLoc :: P SrcLoc
1619 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1620
1621 setLastToken :: SrcSpan -> Int -> P ()
1622 setLastToken loc len = P $ \s -> POk s {
1623   last_loc=loc,
1624   last_len=len
1625   } ()
1626
1627 data AlexInput = AI SrcLoc StringBuffer
1628
1629 alexInputPrevChar :: AlexInput -> Char
1630 alexInputPrevChar (AI _ buf) = prevChar buf '\n'
1631
1632 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1633 alexGetChar (AI loc s)
1634   | atEnd s   = Nothing
1635   | otherwise = adj_c `seq` loc' `seq` s' `seq`
1636                 --trace (show (ord c)) $
1637                 Just (adj_c, (AI loc' s'))
1638   where (c,s') = nextChar s
1639         loc'   = advanceSrcLoc loc c
1640
1641         non_graphic     = '\x0'
1642         upper           = '\x1'
1643         lower           = '\x2'
1644         digit           = '\x3'
1645         symbol          = '\x4'
1646         space           = '\x5'
1647         other_graphic   = '\x6'
1648
1649         adj_c
1650           | c <= '\x06' = non_graphic
1651           | c <= '\x7f' = c
1652           -- Alex doesn't handle Unicode, so when Unicode
1653           -- character is encountered we output these values
1654           -- with the actual character value hidden in the state.
1655           | otherwise =
1656                 case generalCategory c of
1657                   UppercaseLetter       -> upper
1658                   LowercaseLetter       -> lower
1659                   TitlecaseLetter       -> upper
1660                   ModifierLetter        -> other_graphic
1661                   OtherLetter           -> lower -- see #1103
1662                   NonSpacingMark        -> other_graphic
1663                   SpacingCombiningMark  -> other_graphic
1664                   EnclosingMark         -> other_graphic
1665                   DecimalNumber         -> digit
1666                   LetterNumber          -> other_graphic
1667                   OtherNumber           -> digit -- see #4373
1668                   ConnectorPunctuation  -> symbol
1669                   DashPunctuation       -> symbol
1670                   OpenPunctuation       -> other_graphic
1671                   ClosePunctuation      -> other_graphic
1672                   InitialQuote          -> other_graphic
1673                   FinalQuote            -> other_graphic
1674                   OtherPunctuation      -> symbol
1675                   MathSymbol            -> symbol
1676                   CurrencySymbol        -> symbol
1677                   ModifierSymbol        -> symbol
1678                   OtherSymbol           -> symbol
1679                   Space                 -> space
1680                   _other                -> non_graphic
1681
1682 -- This version does not squash unicode characters, it is used when
1683 -- lexing strings.
1684 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1685 alexGetChar' (AI loc s)
1686   | atEnd s   = Nothing
1687   | otherwise = c `seq` loc' `seq` s' `seq`
1688                 --trace (show (ord c)) $
1689                 Just (c, (AI loc' s'))
1690   where (c,s') = nextChar s
1691         loc'   = advanceSrcLoc loc c
1692
1693 getInput :: P AlexInput
1694 getInput = P $ \s@PState{ loc=l, buffer=b } -> POk s (AI l b)
1695
1696 setInput :: AlexInput -> P ()
1697 setInput (AI l b) = P $ \s -> POk s{ loc=l, buffer=b } ()
1698
1699 nextIsEOF :: P Bool
1700 nextIsEOF = do
1701   AI _ s <- getInput
1702   return $ atEnd s
1703
1704 pushLexState :: Int -> P ()
1705 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1706
1707 popLexState :: P Int
1708 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1709
1710 getLexState :: P Int
1711 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1712
1713 popNextToken :: P (Maybe (Located Token))
1714 popNextToken
1715     = P $ \s@PState{ alr_next_token = m } ->
1716               POk (s {alr_next_token = Nothing}) m
1717
1718 activeContext :: P Bool
1719 activeContext = do
1720   ctxt <- getALRContext
1721   expc <- getAlrExpectingOCurly
1722   impt <- implicitTokenPending
1723   case (ctxt,expc) of
1724     ([],Nothing) -> return impt
1725     _other       -> return True
1726
1727 setAlrLastLoc :: SrcSpan -> P ()
1728 setAlrLastLoc l = P $ \s -> POk (s {alr_last_loc = l}) ()
1729
1730 getAlrLastLoc :: P SrcSpan
1731 getAlrLastLoc = P $ \s@(PState {alr_last_loc = l}) -> POk s l
1732
1733 getALRContext :: P [ALRContext]
1734 getALRContext = P $ \s@(PState {alr_context = cs}) -> POk s cs
1735
1736 setALRContext :: [ALRContext] -> P ()
1737 setALRContext cs = P $ \s -> POk (s {alr_context = cs}) ()
1738
1739 getJustClosedExplicitLetBlock :: P Bool
1740 getJustClosedExplicitLetBlock
1741  = P $ \s@(PState {alr_justClosedExplicitLetBlock = b}) -> POk s b
1742
1743 setJustClosedExplicitLetBlock :: Bool -> P ()
1744 setJustClosedExplicitLetBlock b
1745  = P $ \s -> POk (s {alr_justClosedExplicitLetBlock = b}) ()
1746
1747 setNextToken :: Located Token -> P ()
1748 setNextToken t = P $ \s -> POk (s {alr_next_token = Just t}) ()
1749
1750 implicitTokenPending :: P Bool
1751 implicitTokenPending
1752     = P $ \s@PState{ alr_pending_implicit_tokens = ts } ->
1753               case ts of
1754               [] -> POk s False
1755               _  -> POk s True
1756
1757 popPendingImplicitToken :: P (Maybe (Located Token))
1758 popPendingImplicitToken
1759     = P $ \s@PState{ alr_pending_implicit_tokens = ts } ->
1760               case ts of
1761               [] -> POk s Nothing
1762               (t : ts') -> POk (s {alr_pending_implicit_tokens = ts'}) (Just t)
1763
1764 setPendingImplicitTokens :: [Located Token] -> P ()
1765 setPendingImplicitTokens ts = P $ \s -> POk (s {alr_pending_implicit_tokens = ts}) ()
1766
1767 getAlrExpectingOCurly :: P (Maybe ALRLayout)
1768 getAlrExpectingOCurly = P $ \s@(PState {alr_expecting_ocurly = b}) -> POk s b
1769
1770 setAlrExpectingOCurly :: Maybe ALRLayout -> P ()
1771 setAlrExpectingOCurly b = P $ \s -> POk (s {alr_expecting_ocurly = b}) ()
1772
1773 -- for reasons of efficiency, flags indicating language extensions (eg,
1774 -- -fglasgow-exts or -XParallelArrays) are represented by a bitmap stored in an unboxed
1775 -- integer
1776
1777 genericsBit :: Int
1778 genericsBit = 0 -- {| and |}
1779 ffiBit :: Int
1780 ffiBit     = 1
1781 parrBit :: Int
1782 parrBit    = 2
1783 arrowsBit :: Int
1784 arrowsBit  = 4
1785 thBit :: Int
1786 thBit      = 5
1787 ipBit :: Int
1788 ipBit      = 6
1789 explicitForallBit :: Int
1790 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1791 bangPatBit :: Int
1792 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1793                 -- (doesn't affect the lexer)
1794 tyFamBit :: Int
1795 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1796 haddockBit :: Int
1797 haddockBit = 10 -- Lex and parse Haddock comments
1798 magicHashBit :: Int
1799 magicHashBit = 11 -- "#" in both functions and operators
1800 kindSigsBit :: Int
1801 kindSigsBit = 12 -- Kind signatures on type variables
1802 recursiveDoBit :: Int
1803 recursiveDoBit = 13 -- mdo
1804 unicodeSyntaxBit :: Int
1805 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1806 unboxedTuplesBit :: Int
1807 unboxedTuplesBit = 15 -- (# and #)
1808 datatypeContextsBit :: Int
1809 datatypeContextsBit = 16
1810 transformComprehensionsBit :: Int
1811 transformComprehensionsBit = 17
1812 qqBit :: Int
1813 qqBit      = 18 -- enable quasiquoting
1814 inRulePragBit :: Int
1815 inRulePragBit = 19
1816 rawTokenStreamBit :: Int
1817 rawTokenStreamBit = 20 -- producing a token stream with all comments included
1818 recBit :: Int
1819 recBit = 22 -- rec
1820 alternativeLayoutRuleBit :: Int
1821 alternativeLayoutRuleBit = 23
1822 relaxedLayoutBit :: Int
1823 relaxedLayoutBit = 24
1824 nondecreasingIndentationBit :: Int
1825 nondecreasingIndentationBit = 25
1826 hetMetBit :: Int
1827 hetMetBit = 31
1828
1829 always :: Int -> Bool
1830 always           _     = True
1831 genericsEnabled :: Int -> Bool
1832 genericsEnabled  flags = testBit flags genericsBit
1833 parrEnabled :: Int -> Bool
1834 parrEnabled      flags = testBit flags parrBit
1835 arrowsEnabled :: Int -> Bool
1836 arrowsEnabled    flags = testBit flags arrowsBit
1837 hetMetEnabled :: Int -> Bool
1838 hetMetEnabled    flags = testBit flags hetMetBit
1839 thEnabled :: Int -> Bool
1840 thEnabled        flags = testBit flags thBit
1841 ipEnabled :: Int -> Bool
1842 ipEnabled        flags = testBit flags ipBit
1843 explicitForallEnabled :: Int -> Bool
1844 explicitForallEnabled flags = testBit flags explicitForallBit
1845 bangPatEnabled :: Int -> Bool
1846 bangPatEnabled   flags = testBit flags bangPatBit
1847 -- tyFamEnabled :: Int -> Bool
1848 -- tyFamEnabled     flags = testBit flags tyFamBit
1849 haddockEnabled :: Int -> Bool
1850 haddockEnabled   flags = testBit flags haddockBit
1851 magicHashEnabled :: Int -> Bool
1852 magicHashEnabled flags = testBit flags magicHashBit
1853 -- kindSigsEnabled :: Int -> Bool
1854 -- kindSigsEnabled  flags = testBit flags kindSigsBit
1855 unicodeSyntaxEnabled :: Int -> Bool
1856 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1857 unboxedTuplesEnabled :: Int -> Bool
1858 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1859 datatypeContextsEnabled :: Int -> Bool
1860 datatypeContextsEnabled flags = testBit flags datatypeContextsBit
1861 qqEnabled :: Int -> Bool
1862 qqEnabled        flags = testBit flags qqBit
1863 -- inRulePrag :: Int -> Bool
1864 -- inRulePrag       flags = testBit flags inRulePragBit
1865 rawTokenStreamEnabled :: Int -> Bool
1866 rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
1867 alternativeLayoutRule :: Int -> Bool
1868 alternativeLayoutRule flags = testBit flags alternativeLayoutRuleBit
1869 relaxedLayout :: Int -> Bool
1870 relaxedLayout flags = testBit flags relaxedLayoutBit
1871 nondecreasingIndentation :: Int -> Bool
1872 nondecreasingIndentation flags = testBit flags nondecreasingIndentationBit
1873
1874 -- PState for parsing options pragmas
1875 --
1876 pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1877 pragState dynflags buf loc = (mkPState dynflags buf loc) {
1878                                  lex_state = [bol, option_prags, 0]
1879                              }
1880
1881 -- create a parse state
1882 --
1883 mkPState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1884 mkPState flags buf loc =
1885   PState {
1886       buffer          = buf,
1887       dflags        = flags,
1888       messages      = emptyMessages,
1889       last_loc      = mkSrcSpan loc loc,
1890       last_len      = 0,
1891       loc           = loc,
1892       extsBitmap    = fromIntegral bitmap,
1893       context       = [],
1894       lex_state     = [bol, 0],
1895       alr_pending_implicit_tokens = [],
1896       alr_next_token = Nothing,
1897       alr_last_loc = noSrcSpan,
1898       alr_context = [],
1899       alr_expecting_ocurly = Nothing,
1900       alr_justClosedExplicitLetBlock = False,
1901       code_type_bracket_depth = 0
1902     }
1903     where
1904       bitmap = genericsBit `setBitIf` xopt Opt_Generics flags
1905                .|. ffiBit            `setBitIf` xopt Opt_ForeignFunctionInterface flags
1906                .|. parrBit           `setBitIf` xopt Opt_ParallelArrays  flags
1907                .|. arrowsBit         `setBitIf` xopt Opt_Arrows          flags
1908                .|. hetMetBit        `setBitIf` xopt Opt_ModalTypes         flags
1909                .|. thBit             `setBitIf` xopt Opt_TemplateHaskell flags
1910                .|. qqBit             `setBitIf` xopt Opt_QuasiQuotes     flags
1911                .|. ipBit             `setBitIf` xopt Opt_ImplicitParams  flags
1912                .|. explicitForallBit `setBitIf` xopt Opt_ExplicitForAll  flags
1913                .|. bangPatBit        `setBitIf` xopt Opt_BangPatterns flags
1914                .|. tyFamBit          `setBitIf` xopt Opt_TypeFamilies flags
1915                .|. haddockBit        `setBitIf` dopt Opt_Haddock      flags
1916                .|. magicHashBit      `setBitIf` xopt Opt_MagicHash    flags
1917                .|. kindSigsBit       `setBitIf` xopt Opt_KindSignatures flags
1918                .|. recursiveDoBit    `setBitIf` xopt Opt_RecursiveDo flags
1919                .|. recBit            `setBitIf` xopt Opt_DoRec  flags
1920                .|. recBit            `setBitIf` xopt Opt_Arrows flags
1921                .|. unicodeSyntaxBit  `setBitIf` xopt Opt_UnicodeSyntax flags
1922                .|. unboxedTuplesBit  `setBitIf` xopt Opt_UnboxedTuples flags
1923                .|. datatypeContextsBit `setBitIf` xopt Opt_DatatypeContexts flags
1924                .|. transformComprehensionsBit `setBitIf` xopt Opt_TransformListComp flags
1925                .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
1926                .|. alternativeLayoutRuleBit `setBitIf` xopt Opt_AlternativeLayoutRule flags
1927                .|. relaxedLayoutBit `setBitIf` xopt Opt_RelaxedLayout flags
1928                .|. nondecreasingIndentationBit `setBitIf` xopt Opt_NondecreasingIndentation flags
1929       --
1930       setBitIf :: Int -> Bool -> Int
1931       b `setBitIf` cond | cond      = bit b
1932                         | otherwise = 0
1933
1934 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1935 addWarning option srcspan warning
1936  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1937        let warning' = mkWarnMsg srcspan alwaysQualify warning
1938            ws' = if dopt option d then ws `snocBag` warning' else ws
1939        in POk s{messages=(ws', es)} ()
1940
1941 getMessages :: PState -> Messages
1942 getMessages PState{messages=ms} = ms
1943
1944 getContext :: P [LayoutContext]
1945 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1946
1947 setContext :: [LayoutContext] -> P ()
1948 setContext ctx = P $ \s -> POk s{context=ctx} ()
1949
1950 popContext :: P ()
1951 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1952                               last_len = len, last_loc = last_loc }) ->
1953   case ctx of
1954         (_:tl) -> POk s{ context = tl } ()
1955         []     -> PFailed last_loc (srcParseErr buf len)
1956
1957 -- Push a new layout context at the indentation of the last token read.
1958 -- This is only used at the outer level of a module when the 'module'
1959 -- keyword is missing.
1960 pushCurrentContext :: P ()
1961 pushCurrentContext = P $ \ s@PState{ last_loc=loc, context=ctx } ->
1962     POk s{context = Layout (srcSpanStartCol loc) : ctx} ()
1963
1964 getOffside :: P Ordering
1965 getOffside = P $ \s@PState{last_loc=loc, context=stk} ->
1966                 let offs = srcSpanStartCol loc in
1967                 let ord = case stk of
1968                         (Layout n:_) -> --trace ("layout: " ++ show n ++ ", offs: " ++ show offs) $
1969                                         compare offs n
1970                         _            -> GT
1971                 in POk s ord
1972
1973 -- ---------------------------------------------------------------------------
1974 -- Construct a parse error
1975
1976 srcParseErr
1977   :: StringBuffer       -- current buffer (placed just after the last token)
1978   -> Int                -- length of the previous token
1979   -> Message
1980 srcParseErr buf len
1981   = hcat [ if null token
1982              then ptext (sLit "parse error (possibly incorrect indentation)")
1983              else hcat [ptext (sLit "parse error on input "),
1984                         char '`', text token, char '\'']
1985     ]
1986   where token = lexemeToString (offsetBytes (-len) buf) len
1987
1988 -- Report a parse failure, giving the span of the previous token as
1989 -- the location of the error.  This is the entry point for errors
1990 -- detected during parsing.
1991 srcParseFail :: P a
1992 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1993                             last_loc = last_loc } ->
1994     PFailed last_loc (srcParseErr buf len)
1995
1996 -- A lexical error is reported at a particular position in the source file,
1997 -- not over a token range.
1998 lexError :: String -> P a
1999 lexError str = do
2000   loc <- getSrcLoc
2001   (AI end buf) <- getInput
2002   reportLexError loc end buf str
2003
2004 -- -----------------------------------------------------------------------------
2005 -- This is the top-level function: called from the parser each time a
2006 -- new token is to be read from the input.
2007
2008 lexer :: (Located Token -> P a) -> P a
2009 lexer cont = do
2010   alr <- extension alternativeLayoutRule
2011   let lexTokenFun = if alr then lexTokenAlr else lexToken
2012   tok@(L _span _tok__) <- lexTokenFun
2013   --trace ("token: " ++ show _tok__) $ do
2014   cont tok
2015
2016 lexTokenAlr :: P (Located Token)
2017 lexTokenAlr = do mPending <- popPendingImplicitToken
2018                  t <- case mPending of
2019                       Nothing ->
2020                           do mNext <- popNextToken
2021                              t <- case mNext of
2022                                   Nothing -> lexToken
2023                                   Just next -> return next
2024                              alternativeLayoutRuleToken t
2025                       Just t ->
2026                           return t
2027                  setAlrLastLoc (getLoc t)
2028                  case unLoc t of
2029                      ITwhere -> setAlrExpectingOCurly (Just ALRLayoutWhere)
2030                      ITlet   -> setAlrExpectingOCurly (Just ALRLayoutLet)
2031                      ITof    -> setAlrExpectingOCurly (Just ALRLayoutOf)
2032                      ITdo    -> setAlrExpectingOCurly (Just ALRLayoutDo)
2033                      ITmdo   -> setAlrExpectingOCurly (Just ALRLayoutDo)
2034                      ITrec   -> setAlrExpectingOCurly (Just ALRLayoutDo)
2035                      _       -> return ()
2036                  return t
2037
2038 alternativeLayoutRuleToken :: Located Token -> P (Located Token)
2039 alternativeLayoutRuleToken t
2040     = do context <- getALRContext
2041          lastLoc <- getAlrLastLoc
2042          mExpectingOCurly <- getAlrExpectingOCurly
2043          justClosedExplicitLetBlock <- getJustClosedExplicitLetBlock
2044          setJustClosedExplicitLetBlock False
2045          dflags <- getDynFlags
2046          let transitional = xopt Opt_AlternativeLayoutRuleTransitional dflags
2047              thisLoc = getLoc t
2048              thisCol = srcSpanStartCol thisLoc
2049              newLine = (lastLoc == noSrcSpan)
2050                     || (srcSpanStartLine thisLoc > srcSpanEndLine lastLoc)
2051          case (unLoc t, context, mExpectingOCurly) of
2052              -- This case handles a GHC extension to the original H98
2053              -- layout rule...
2054              (ITocurly, _, Just alrLayout) ->
2055                  do setAlrExpectingOCurly Nothing
2056                     let isLet = case alrLayout of
2057                                 ALRLayoutLet -> True
2058                                 _ -> False
2059                     setALRContext (ALRNoLayout (containsCommas ITocurly) isLet : context)
2060                     return t
2061              -- ...and makes this case unnecessary
2062              {-
2063              -- I think our implicit open-curly handling is slightly
2064              -- different to John's, in how it interacts with newlines
2065              -- and "in"
2066              (ITocurly, _, Just _) ->
2067                  do setAlrExpectingOCurly Nothing
2068                     setNextToken t
2069                     lexTokenAlr
2070              -}
2071              (_, ALRLayout _ col : ls, Just expectingOCurly)
2072               | (thisCol > col) ||
2073                 (thisCol == col &&
2074                  isNonDecreasingIntentation expectingOCurly) ->
2075                  do setAlrExpectingOCurly Nothing
2076                     setALRContext (ALRLayout expectingOCurly thisCol : context)
2077                     setNextToken t
2078                     return (L thisLoc ITocurly)
2079               | otherwise ->
2080                  do setAlrExpectingOCurly Nothing
2081                     setPendingImplicitTokens [L lastLoc ITccurly]
2082                     setNextToken t
2083                     return (L lastLoc ITocurly)
2084              (_, _, Just expectingOCurly) ->
2085                  do setAlrExpectingOCurly Nothing
2086                     setALRContext (ALRLayout expectingOCurly thisCol : context)
2087                     setNextToken t
2088                     return (L thisLoc ITocurly)
2089              -- We do the [] cases earlier than in the spec, as we
2090              -- have an actual EOF token
2091              (ITeof, ALRLayout _ _ : ls, _) ->
2092                  do setALRContext ls
2093                     setNextToken t
2094                     return (L thisLoc ITccurly)
2095              (ITeof, _, _) ->
2096                  return t
2097              -- the other ITeof case omitted; general case below covers it
2098              (ITin, _, _)
2099               | justClosedExplicitLetBlock ->
2100                  return t
2101              (ITin, ALRLayout ALRLayoutLet _ : ls, _)
2102               | newLine ->
2103                  do setPendingImplicitTokens [t]
2104                     setALRContext ls
2105                     return (L thisLoc ITccurly)
2106              -- This next case is to handle a transitional issue:
2107              (ITwhere, ALRLayout _ col : ls, _)
2108               | newLine && thisCol == col && transitional ->
2109                  do addWarning Opt_WarnAlternativeLayoutRuleTransitional
2110                                thisLoc
2111                                (transitionalAlternativeLayoutWarning
2112                                     "`where' clause at the same depth as implicit layout block")
2113                     setALRContext ls
2114                     setNextToken t
2115                     -- Note that we use lastLoc, as we may need to close
2116                     -- more layouts, or give a semicolon
2117                     return (L lastLoc ITccurly)
2118              -- This next case is to handle a transitional issue:
2119              (ITvbar, ALRLayout _ col : ls, _)
2120               | newLine && thisCol == col && transitional ->
2121                  do addWarning Opt_WarnAlternativeLayoutRuleTransitional
2122                                thisLoc
2123                                (transitionalAlternativeLayoutWarning
2124                                     "`|' at the same depth as implicit layout block")
2125                     setALRContext ls
2126                     setNextToken t
2127                     -- Note that we use lastLoc, as we may need to close
2128                     -- more layouts, or give a semicolon
2129                     return (L lastLoc ITccurly)
2130              (_, ALRLayout _ col : ls, _)
2131               | newLine && thisCol == col ->
2132                  do setNextToken t
2133                     return (L thisLoc ITsemi)
2134               | newLine && thisCol < col ->
2135                  do setALRContext ls
2136                     setNextToken t
2137                     -- Note that we use lastLoc, as we may need to close
2138                     -- more layouts, or give a semicolon
2139                     return (L lastLoc ITccurly)
2140              -- We need to handle close before open, as 'then' is both
2141              -- an open and a close
2142              (u, _, _)
2143               | isALRclose u ->
2144                  case context of
2145                  ALRLayout _ _ : ls ->
2146                      do setALRContext ls
2147                         setNextToken t
2148                         return (L thisLoc ITccurly)
2149                  ALRNoLayout _ isLet : ls ->
2150                      do let ls' = if isALRopen u
2151                                      then ALRNoLayout (containsCommas u) False : ls
2152                                      else ls
2153                         setALRContext ls'
2154                         when isLet $ setJustClosedExplicitLetBlock True
2155                         return t
2156                  [] ->
2157                      do let ls = if isALRopen u
2158                                     then [ALRNoLayout (containsCommas u) False]
2159                                     else ls
2160                         setALRContext ls
2161                         -- XXX This is an error in John's code, but
2162                         -- it looks reachable to me at first glance
2163                         return t
2164              (u, _, _)
2165               | isALRopen u ->
2166                  do setALRContext (ALRNoLayout (containsCommas u) False : context)
2167                     return t
2168              (ITin, ALRLayout ALRLayoutLet _ : ls, _) ->
2169                  do setALRContext ls
2170                     setPendingImplicitTokens [t]
2171                     return (L thisLoc ITccurly)
2172              (ITin, ALRLayout _ _ : ls, _) ->
2173                  do setALRContext ls
2174                     setNextToken t
2175                     return (L thisLoc ITccurly)
2176              -- the other ITin case omitted; general case below covers it
2177              (ITcomma, ALRLayout _ _ : ls, _)
2178               | topNoLayoutContainsCommas ls ->
2179                  do setALRContext ls
2180                     setNextToken t
2181                     return (L thisLoc ITccurly)
2182              (ITwhere, ALRLayout ALRLayoutDo _ : ls, _) ->
2183                  do setALRContext ls
2184                     setPendingImplicitTokens [t]
2185                     return (L thisLoc ITccurly)
2186              -- the other ITwhere case omitted; general case below covers it
2187              (_, _, _) -> return t
2188
2189 transitionalAlternativeLayoutWarning :: String -> SDoc
2190 transitionalAlternativeLayoutWarning msg
2191     = text "transitional layout will not be accepted in the future:"
2192    $$ text msg
2193
2194 isALRopen :: Token -> Bool
2195 isALRopen ITcase        = True
2196 isALRopen ITif          = True
2197 isALRopen ITthen        = True
2198 isALRopen IToparen      = True
2199 isALRopen ITobrack      = True
2200 isALRopen ITocurly      = True
2201 -- GHC Extensions:
2202 isALRopen IToubxparen   = True
2203 isALRopen ITparenEscape = True
2204 isALRopen _             = False
2205
2206 isALRclose :: Token -> Bool
2207 isALRclose ITof     = True
2208 isALRclose ITthen   = True
2209 isALRclose ITelse   = True
2210 isALRclose ITcparen = True
2211 isALRclose ITcbrack = True
2212 isALRclose ITccurly = True
2213 -- GHC Extensions:
2214 isALRclose ITcubxparen = True
2215 isALRclose _        = False
2216
2217 isNonDecreasingIntentation :: ALRLayout -> Bool
2218 isNonDecreasingIntentation ALRLayoutDo = True
2219 isNonDecreasingIntentation _           = False
2220
2221 containsCommas :: Token -> Bool
2222 containsCommas IToparen = True
2223 containsCommas ITobrack = True
2224 -- John doesn't have {} as containing commas, but records contain them,
2225 -- which caused a problem parsing Cabal's Distribution.Simple.InstallDirs
2226 -- (defaultInstallDirs).
2227 containsCommas ITocurly = True
2228 -- GHC Extensions:
2229 containsCommas IToubxparen = True
2230 containsCommas _        = False
2231
2232 topNoLayoutContainsCommas :: [ALRContext] -> Bool
2233 topNoLayoutContainsCommas [] = False
2234 topNoLayoutContainsCommas (ALRLayout _ _ : ls) = topNoLayoutContainsCommas ls
2235 topNoLayoutContainsCommas (ALRNoLayout b _ : _) = b
2236
2237 lexToken :: P (Located Token)
2238 lexToken = do
2239   inp@(AI loc1 buf) <- getInput
2240   sc <- getLexState
2241   exts <- getExts
2242   case alexScanUser exts inp sc of
2243     AlexEOF -> do
2244         let span = mkSrcSpan loc1 loc1
2245         setLastToken span 0
2246         return (L span ITeof)
2247     AlexError (AI loc2 buf) ->
2248         reportLexError loc1 loc2 buf "lexical error"
2249     AlexSkip inp2 _ -> do
2250         setInput inp2
2251         lexToken
2252     AlexToken inp2@(AI end buf2) _ t -> do
2253         setInput inp2
2254         let span = mkSrcSpan loc1 end
2255         let bytes = byteDiff buf buf2
2256         span `seq` setLastToken span bytes
2257         t span buf bytes
2258
2259 reportLexError :: SrcLoc -> SrcLoc -> StringBuffer -> [Char] -> P a
2260 reportLexError loc1 loc2 buf str
2261   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
2262   | otherwise =
2263   let
2264         c = fst (nextChar buf)
2265   in
2266   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
2267     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
2268     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
2269
2270 lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token]
2271 lexTokenStream buf loc dflags = unP go initState
2272     where dflags' = dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream
2273           initState = mkPState dflags' buf loc
2274           go = do
2275             ltok <- lexer return
2276             case ltok of
2277               L _ ITeof -> return []
2278               _ -> liftM (ltok:) go
2279
2280 linePrags = Map.singleton "line" (begin line_prag2)
2281
2282 fileHeaderPrags = Map.fromList([("options", lex_string_prag IToptions_prag),
2283                                  ("options_ghc", lex_string_prag IToptions_prag),
2284                                  ("options_haddock", lex_string_prag ITdocOptions),
2285                                  ("language", token ITlanguage_prag),
2286                                  ("include", lex_string_prag ITinclude_prag)])
2287
2288 ignoredPrags = Map.fromList (map ignored pragmas)
2289                where ignored opt = (opt, nested_comment lexToken)
2290                      impls = ["hugs", "nhc98", "jhc", "yhc", "catch", "derive"]
2291                      options_pragmas = map ("options_" ++) impls
2292                      -- CFILES is a hugs-only thing.
2293                      pragmas = options_pragmas ++ ["cfiles", "contract"]
2294
2295 oneWordPrags = Map.fromList([("rules", rulePrag),
2296                            ("inline", token (ITinline_prag Inline FunLike)),
2297                            ("inlinable", token (ITinline_prag Inlinable FunLike)),
2298                            ("inlineable", token (ITinline_prag Inlinable FunLike)),
2299                                           -- Spelling variant
2300                            ("notinline", token (ITinline_prag NoInline FunLike)),
2301                            ("specialize", token ITspec_prag),
2302                            ("source", token ITsource_prag),
2303                            ("warning", token ITwarning_prag),
2304                            ("deprecated", token ITdeprecated_prag),
2305                            ("scc", token ITscc_prag),
2306                            ("generated", token ITgenerated_prag),
2307                            ("core", token ITcore_prag),
2308                            ("unpack", token ITunpack_prag),
2309                            ("ann", token ITann_prag)])
2310
2311 twoWordPrags = Map.fromList([("inline conlike", token (ITinline_prag Inline ConLike)),
2312                              ("notinline conlike", token (ITinline_prag NoInline ConLike)),
2313                              ("specialize inline", token (ITspec_inline_prag True)),
2314                              ("specialize notinline", token (ITspec_inline_prag False))])
2315
2316
2317 dispatch_pragmas :: Map String Action -> Action
2318 dispatch_pragmas prags span buf len = case Map.lookup (clean_pragma (lexemeToString buf len)) prags of
2319                                        Just found -> found span buf len
2320                                        Nothing -> lexError "unknown pragma"
2321
2322 known_pragma :: Map String Action -> AlexAccPred Int
2323 known_pragma prags _ _ len (AI _ buf) = (isJust $ Map.lookup (clean_pragma (lexemeToString (offsetBytes (- len) buf) len)) prags)
2324                                           && (nextCharIs buf (\c -> not (isAlphaNum c || c == '_')))
2325
2326 clean_pragma :: String -> String
2327 clean_pragma prag = canon_ws (map toLower (unprefix prag))
2328                     where unprefix prag' = case stripPrefix "{-#" prag' of
2329                                              Just rest -> rest
2330                                              Nothing -> prag'
2331                           canonical prag' = case prag' of
2332                                               "noinline" -> "notinline"
2333                                               "specialise" -> "specialize"
2334                                               "constructorlike" -> "conlike"
2335                                               _ -> prag'
2336                           canon_ws s = unwords (map canonical (words s))
2337 }