compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end RealSrcLoc
  11 -- attached to it.
  12 --
  13 -----------------------------------------------------------------------------
  14
  15 --   ToDo / known bugs:
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 --   qualified operator NOTES.
  24 --
  25 --   - If M.(+) is a single lexeme, then..
  26 --     - Probably (+) should be a single lexeme too, for consistency.
  27 --       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
  28 --     - But we have to rule out reserved operators, otherwise (..) becomes
  29 --       a different lexeme.
  30 --     - Should we therefore also rule out reserved operators in the qualified
  31 --       form?  This is quite difficult to achieve.  We don't do it for
  32 --       qualified varids.
  33
  34 {
  35 -- XXX The above flags turn off warnings in the generated code:
  36 {-# LANGUAGE BangPatterns #-}
  37 {-# OPTIONS_GHC -fno-warn-unused-matches #-}
  38 {-# OPTIONS_GHC -fno-warn-unused-binds #-}
  39 {-# OPTIONS_GHC -fno-warn-unused-imports #-}
  40 {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
  41 -- But alex still generates some code that causes the "lazy unlifted bindings"
  42 -- warning, and old compilers don't know about it so we can't easily turn
  43 -- it off, so for now we use the sledge hammer:
  44 {-# OPTIONS_GHC -w #-}
  45
  46 {-# OPTIONS_GHC -funbox-strict-fields #-}
  47
  48 module Lexer (
  49    Token(..), lexer, pragState, mkPState, PState(..),
  50    P(..), ParseResult(..), getSrcLoc,
  51    getPState, getDynFlags, withThisPackage,
  52    failLocMsgP, failSpanMsgP, srcParseFail,
  53    getMessages,
  54    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  55    activeContext, nextIsEOF,
  56    getLexState, popLexState, pushLexState,
  57    extension, bangPatEnabled, datatypeContextsEnabled,
  58    addWarning,
  59    incrBracketDepth, incrBracketDepth1, decrBracketDepth, getParserBrakDepth, pushBracketDepth, popBracketDepth,
  60    lexTokenStream
  61   ) where
  62
  63 import Bag
  64 import ErrUtils
  65 import Outputable
  66 import StringBuffer
  67 import FastString
  68 import SrcLoc
  69 import UniqFM
  70 import DynFlags
  71 import Module
  72 import Ctype
  73 import BasicTypes       ( InlineSpec(..), RuleMatchInfo(..), FractionalLit(..) )
  74 import Util             ( readRational )
  75 import HsSyn (CodeFlavor(..))
  76
  77 import Control.Monad
  78 import Data.Bits
  79 import Data.Char
  80 import Data.List
  81 import Data.Maybe
  82 import Data.Map (Map)
  83 import qualified Data.Map as Map
  84 import Data.Ratio
  85 }
  86
  87 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  88 $whitechar   = [\ \n\r\f\v $unispace]
  89 $white_no_nl = $whitechar # \n
  90 $tab         = \t
  91
  92 $ascdigit  = 0-9
  93 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  94 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  95 $digit     = [$ascdigit $unidigit]
  96
  97 $special   = [\(\)\,\;\[\]\`\{\}]
  98 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  99 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
 100 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
 101
 102 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
 103 $asclarge  = [A-Z]
 104 $large     = [$asclarge $unilarge]
 105
 106 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
 107 $ascsmall  = [a-z]
 108 $small     = [$ascsmall $unismall \_]
 109
 110 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
 111 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
 112
 113 $octit     = 0-7
 114 $hexit     = [$decdigit A-F a-f]
 115 $symchar   = [$symbol \:]
 116 $nl        = [\n\r]
 117 $idchar    = [$small $large $digit \']
 118
 119 $pragmachar = [$small $large $digit]
 120
 121 $docsym    = [\| \^ \* \$]
 122
 123 @varid     = $small $idchar*
 124 @conid     = $large $idchar*
 125
 126 @varsym    = $symbol $symchar*
 127 @consym    = \: $symchar*
 128
 129 @decimal     = $decdigit+
 130 @octal       = $octit+
 131 @hexadecimal = $hexit+
 132 @exponent    = [eE] [\-\+]? @decimal
 133
 134 -- we support the hierarchical module name extension:
 135 @qual = (@conid \.)+
 136
 137 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 138
 139 -- normal signed numerical literals can only be explicitly negative,
 140 -- not explicitly positive (contrast @exponent)
 141 @negative = \-
 142 @signed = @negative ?
 143
 144 haskell :-
 145
 146 -- everywhere: skip whitespace and comments
 147 $white_no_nl+                           ;
 148 $tab+         { warn Opt_WarnTabs (text "Warning: Tab character") }
 149
 150 -- Everywhere: deal with nested comments.  We explicitly rule out
 151 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 152 -- (this can happen even though pragmas will normally take precedence due to
 153 -- longest-match, because pragmas aren't valid in every state, but comments
 154 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 155 -- set.
 156
 157 "{-" / { isNormalComment } { nested_comment lexToken }
 158
 159 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 160 -- more dashes followed by a symbol should be parsed as a varsym, so we
 161 -- have to exclude those.
 162
 163 -- Since Haddock comments aren't valid in every state, we need to rule them
 164 -- out here.
 165
 166 -- The following two rules match comments that begin with two dashes, but
 167 -- continue with a different character. The rules test that this character
 168 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 169 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 170 -- have a Haddock comment). The rules then munch the rest of the line.
 171
 172 "-- " ~[$docsym \#] .* { lineCommentToken }
 173 "--" [^$symbol : \ ] .* { lineCommentToken }
 174
 175 -- Next, match Haddock comments if no -haddock flag
 176
 177 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
 178
 179 -- Now, when we've matched comments that begin with 2 dashes and continue
 180 -- with a different character, we need to match comments that begin with three
 181 -- or more dashes (which clearly can't be Haddock comments). We only need to
 182 -- make sure that the first non-dash character isn't a symbol, and munch the
 183 -- rest of the line.
 184
 185 "---"\-* [^$symbol :] .* { lineCommentToken }
 186
 187 -- Since the previous rules all match dashes followed by at least one
 188 -- character, we also need to match a whole line filled with just dashes.
 189
 190 "--"\-* / { atEOL } { lineCommentToken }
 191
 192 -- We need this rule since none of the other single line comment rules
 193 -- actually match this case.
 194
 195 "-- " / { atEOL } { lineCommentToken }
 196
 197 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 198 -- blank lines) until we find a non-whitespace character, then do layout
 199 -- processing.
 200 --
 201 -- One slight wibble here: what if the line begins with {-#? In
 202 -- theory, we have to lex the pragma to see if it's one we recognise,
 203 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 204 -- as a nested comment.  We don't bother with this: if the line begins
 205 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 206 <bol> {
 207   \n                                    ;
 208   ^\# (line)?                           { begin line_prag1 }
 209   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 210   ^\# \! .* \n                          ; -- #!, for scripts
 211   ()                                    { do_bol }
 212 }
 213
 214 -- after a layout keyword (let, where, do, of), we begin a new layout
 215 -- context if the curly brace is missing.
 216 -- Careful! This stuff is quite delicate.
 217 <layout, layout_do> {
 218   \{ / { notFollowedBy '-' }            { hopefully_open_brace }
 219         -- we might encounter {-# here, but {- has been handled already
 220   \n                                    ;
 221   ^\# (line)?                           { begin line_prag1 }
 222 }
 223
 224 -- do is treated in a subtly different way, see new_layout_context
 225 <layout>    ()                          { new_layout_context True }
 226 <layout_do> ()                          { new_layout_context False }
 227
 228 -- after a new layout context which was found to be to the left of the
 229 -- previous context, we have generated a '{' token, and we now need to
 230 -- generate a matching '}' token.
 231 <layout_left>  ()                       { do_layout_left }
 232
 233 <0,option_prags> \n                             { begin bol }
 234
 235 "{-#" $whitechar* $pragmachar+ / { known_pragma linePrags }
 236                                 { dispatch_pragmas linePrags }
 237
 238 -- single-line line pragmas, of the form
 239 --    # <line> "<file>" <extra-stuff> \n
 240 <line_prag1> $decdigit+                 { setLine line_prag1a }
 241 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 242 <line_prag1b> .*                        { pop }
 243
 244 -- Haskell-style line pragmas, of the form
 245 --    {-# LINE <line> "<file>" #-}
 246 <line_prag2> $decdigit+                 { setLine line_prag2a }
 247 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 248 <line_prag2b> "#-}"|"-}"                { pop }
 249    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 250    -- with older versions of GHC which generated these.
 251
 252 <0,option_prags> {
 253   "{-#" $whitechar* $pragmachar+
 254         $whitechar+ $pragmachar+ / { known_pragma twoWordPrags }
 255                                  { dispatch_pragmas twoWordPrags }
 256
 257   "{-#" $whitechar* $pragmachar+ / { known_pragma oneWordPrags }
 258                                  { dispatch_pragmas oneWordPrags }
 259
 260   -- We ignore all these pragmas, but don't generate a warning for them
 261   "{-#" $whitechar* $pragmachar+ / { known_pragma ignoredPrags }
 262                                  { dispatch_pragmas ignoredPrags }
 263
 264   -- ToDo: should only be valid inside a pragma:
 265   "#-}"                                 { endPrag }
 266 }
 267
 268 <option_prags> {
 269   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 270                                    { dispatch_pragmas fileHeaderPrags }
 271
 272   "-- #"                                 { multiline_doc_comment }
 273 }
 274
 275 <0> {
 276   -- In the "0" mode we ignore these pragmas
 277   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 278                      { nested_comment lexToken }
 279 }
 280
 281 <0> {
 282   "-- #" .* { lineCommentToken }
 283 }
 284
 285 <0,option_prags> {
 286   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 287                     (nested_comment lexToken) }
 288 }
 289
 290 -- '0' state: ordinary lexemes
 291
 292 -- Haddock comments
 293
 294 <0,option_prags> {
 295   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 296   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 297 }
 298
 299 -- "special" symbols
 300
 301 <0> {
 302   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 303   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 304 }
 305
 306 <0> {
 307   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 308   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 309   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 310   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 311   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 312   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 313   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 314   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 315
 316 -- For backward compatibility, accept the old dollar syntax
 317   "[$" @varid "|"  / { ifExtension qqEnabled }
 318                      { lex_quasiquote_tok }
 319
 320   "[" @varid "|"  / { ifExtension qqEnabled }
 321                      { lex_quasiquote_tok }
 322 }
 323
 324 <0> {
 325   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 326                                         { special IToparenbar }
 327   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 328 }
 329
 330 <0> {
 331   "<[" / { ifExtension hetMetEnabled `alexAndPred` notFollowedBySymbol }
 332                                         { special ITopenBrak }
 333   "]>" / { ifExtension hetMetEnabled }  { special ITcloseBrak }
 334   "<{" / { ifExtension hetMetEnabled `alexAndPred` notFollowedBySymbol }
 335                                         { special ITopenBrak1 }
 336   "}>" / { ifExtension hetMetEnabled }  { special ITcloseBrak1 }
 337   "~~" / { ifExtension hetMetEnabled }  { special ITescape }
 338   "%%" / { ifExtension hetMetEnabled }  { special ITdoublePercent }
 339   "~~$" / { ifExtension hetMetEnabled }  { special ITescapeDollar }
 340 }
 341
 342 <0> {
 343   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 344 }
 345
 346 <0> {
 347   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 348          { token IToubxparen }
 349   "#)" / { ifExtension unboxedTuplesEnabled }
 350          { token ITcubxparen }
 351 }
 352
 353 <0,option_prags> {
 354   \(                                    { special IToparen }
 355   \)                                    { special ITcparen }
 356   \[                                    { special ITobrack }
 357   \]                                    { special ITcbrack }
 358   \,                                    { special ITcomma }
 359   \;                                    { special ITsemi }
 360   \`                                    { special ITbackquote }
 361
 362   \{                                    { open_brace }
 363   \}                                    { close_brace }
 364 }
 365
 366 <0,option_prags> {
 367   @qual @varid                  { idtoken qvarid }
 368   @qual @conid                  { idtoken qconid }
 369   @varid                        { varid }
 370   @conid                        { idtoken conid }
 371 }
 372
 373 <0> {
 374   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 375   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 376   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 377   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 378 }
 379
 380 -- ToDo: - move `var` and (sym) into lexical syntax?
 381 --       - remove backquote from $special?
 382 <0> {
 383   @qual @varsym                                    { idtoken qvarsym }
 384   @qual @consym                                    { idtoken qconsym }
 385   @varsym                                          { varsym }
 386   @consym                                          { consym }
 387 }
 388
 389 -- For the normal boxed literals we need to be careful
 390 -- when trying to be close to Haskell98
 391 <0> {
 392   -- Normal integral literals (:: Num a => a, from Integer)
 393   @decimal           { tok_num positive 0 0 decimal }
 394   0[oO] @octal       { tok_num positive 2 2 octal }
 395   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 396
 397   -- Normal rational literals (:: Fractional a => a, from Rational)
 398   @floating_point    { strtoken tok_float }
 399 }
 400
 401 <0> {
 402   -- Unboxed ints (:: Int#) and words (:: Word#)
 403   -- It's simpler (and faster?) to give separate cases to the negatives,
 404   -- especially considering octal/hexadecimal prefixes.
 405   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 406   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 407   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 408   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 409   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 410   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 411
 412   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 413   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 414   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 415
 416   -- Unboxed floats and doubles (:: Float#, :: Double#)
 417   -- prim_{float,double} work with signed literals
 418   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 419   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 420 }
 421
 422 -- Strings and chars are lexed by hand-written code.  The reason is
 423 -- that even if we recognise the string or char here in the regex
 424 -- lexer, we would still have to parse the string afterward in order
 425 -- to convert it to a String.
 426 <0> {
 427   \'                            { lex_char_tok }
 428   \"                            { lex_string_tok }
 429 }
 430
 431 {
 432 -- -----------------------------------------------------------------------------
 433 -- The token type
 434
 435 data Token
 436   = ITas                        -- Haskell keywords
 437   | ITcase
 438   | ITclass
 439   | ITdata
 440   | ITdefault
 441   | ITderiving
 442   | ITdo
 443   | ITelse
 444   | IThiding
 445   | ITif
 446   | ITimport
 447   | ITin
 448   | ITinfix
 449   | ITinfixl
 450   | ITinfixr
 451   | ITinstance
 452   | ITlet
 453   | ITmodule
 454   | ITnewtype
 455   | ITof
 456   | ITqualified
 457   | ITthen
 458   | ITtype
 459   | ITwhere
 460   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 461
 462   | ITforall                    -- GHC extension keywords
 463   | ITforeign
 464   | ITexport
 465   | ITlabel
 466   | ITdynamic
 467   | ITsafe
 468   | ITthreadsafe
 469   | ITinterruptible
 470   | ITunsafe
 471   | ITstdcallconv
 472   | ITccallconv
 473   | ITprimcallconv
 474   | ITmdo
 475   | ITfamily
 476   | ITgroup
 477   | ITby
 478   | ITusing
 479
 480         -- Pragmas
 481   | ITinline_prag InlineSpec RuleMatchInfo
 482   | ITspec_prag                 -- SPECIALISE
 483   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 484   | ITsource_prag
 485   | ITrules_prag
 486   | ITwarning_prag
 487   | ITdeprecated_prag
 488   | ITline_prag
 489   | ITscc_prag
 490   | ITgenerated_prag
 491   | ITcore_prag                 -- hdaume: core annotations
 492   | ITunpack_prag
 493   | ITann_prag
 494   | ITclose_prag
 495   | IToptions_prag String
 496   | ITinclude_prag String
 497   | ITlanguage_prag
 498   | ITvect_prag
 499   | ITvect_scalar_prag
 500   | ITnovect_prag
 501
 502   | ITdotdot                    -- reserved symbols
 503   | ITcolon
 504   | ITdcolon
 505   | ITequal
 506   | ITlam
 507   | ITvbar
 508   | ITlarrow
 509   | ITrarrow
 510   | ITkappa
 511   | ITat
 512   | ITtilde
 513   | ITdarrow
 514   | ITminus
 515   | ITbang
 516   | ITstar
 517   | ITdot
 518
 519   | ITbiglam                    -- GHC-extension symbols
 520
 521   | ITocurly                    -- special symbols
 522   | ITccurly
 523   | ITocurlybar                 -- {|, for type applications
 524   | ITccurlybar                 -- |}, for type applications
 525   | ITvocurly
 526   | ITvccurly
 527   | ITobrack
 528   | ITopabrack                  -- [:, for parallel arrays with -XParallelArrays
 529   | ITcpabrack                  -- :], for parallel arrays with -XParallelArrays
 530   | ITcbrack
 531   | IToparen
 532   | ITcparen
 533   | IToubxparen
 534   | ITcubxparen
 535   | ITsemi
 536   | ITcomma
 537   | ITunderscore
 538   | ITbackquote
 539
 540   | ITvarid   FastString        -- identifiers
 541   | ITconid   FastString
 542   | ITvarsym  FastString
 543   | ITconsym  FastString
 544   | ITqvarid  (FastString,FastString)
 545   | ITqconid  (FastString,FastString)
 546   | ITqvarsym (FastString,FastString)
 547   | ITqconsym (FastString,FastString)
 548   | ITprefixqvarsym (FastString,FastString)
 549   | ITprefixqconsym (FastString,FastString)
 550
 551   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 552
 553   | ITchar       Char
 554   | ITstring     FastString
 555   | ITinteger    Integer
 556   | ITrational   FractionalLit
 557
 558   | ITprimchar   Char
 559   | ITprimstring FastString
 560   | ITprimint    Integer
 561   | ITprimword   Integer
 562   | ITprimfloat  FractionalLit
 563   | ITprimdouble FractionalLit
 564
 565   -- Template Haskell extension tokens
 566   | ITopenExpQuote              --  [| or [e|
 567   | ITopenPatQuote              --  [p|
 568   | ITopenDecQuote              --  [d|
 569   | ITopenTypQuote              --  [t|
 570   | ITcloseQuote                --  |]
 571   | ITidEscape   FastString     --  $x
 572   | ITparenEscape               --  $(
 573   | ITvarQuote                  --  '
 574   | ITtyQuote                   --  ''
 575   | ITquasiQuote (FastString,FastString,RealSrcSpan) --  [:...|...|]
 576
 577   -- Arrow notation extension
 578   | ITproc
 579   | ITrec
 580   | IToparenbar                 --  (|
 581   | ITcparenbar                 --  |)
 582   | ITlarrowtail                --  -<
 583   | ITrarrowtail                --  >-
 584   | ITLarrowtail                --  -<<
 585   | ITRarrowtail                --  >>-
 586
 587   -- Heterogeneous Metaprogramming extension
 588   | ITopenBrak                  --  <[
 589   | ITcloseBrak                 --  ]>
 590   | ITopenBrak1                 --  <{
 591   | ITcloseBrak1                --  }>
 592   | ITescape                    --  ~~
 593   | ITescapeDollar              --  ~~$
 594   | ITdoublePercent             --  %%
 595
 596   | ITunknown String            -- Used when the lexer can't make sense of it
 597   | ITeof                       -- end of file token
 598
 599   -- Documentation annotations
 600   | ITdocCommentNext  String     -- something beginning '-- |'
 601   | ITdocCommentPrev  String     -- something beginning '-- ^'
 602   | ITdocCommentNamed String     -- something beginning '-- $'
 603   | ITdocSection      Int String -- a section heading
 604   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 605   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 606   | ITlineComment     String     -- comment starting by "--"
 607   | ITblockComment    String     -- comment in {- -}
 608
 609 #ifdef DEBUG
 610   deriving Show -- debugging
 611 #endif
 612
 613 {-
 614 isSpecial :: Token -> Bool
 615 -- If we see M.x, where x is a keyword, but
 616 -- is special, we treat is as just plain M.x,
 617 -- not as a keyword.
 618 isSpecial ITas          = True
 619 isSpecial IThiding      = True
 620 isSpecial ITqualified   = True
 621 isSpecial ITforall      = True
 622 isSpecial ITexport      = True
 623 isSpecial ITlabel       = True
 624 isSpecial ITdynamic     = True
 625 isSpecial ITsafe        = True
 626 isSpecial ITthreadsafe  = True
 627 isSpecial ITinterruptible = True
 628 isSpecial ITunsafe      = True
 629 isSpecial ITccallconv   = True
 630 isSpecial ITstdcallconv = True
 631 isSpecial ITprimcallconv = True
 632 isSpecial ITmdo         = True
 633 isSpecial ITfamily      = True
 634 isSpecial ITgroup   = True
 635 isSpecial ITby      = True
 636 isSpecial ITusing   = True
 637 isSpecial _             = False
 638 -}
 639
 640 -- the bitmap provided as the third component indicates whether the
 641 -- corresponding extension keyword is valid under the extension options
 642 -- provided to the compiler; if the extension corresponding to *any* of the
 643 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 644 -- facilitates using a keyword in two different extensions that can be
 645 -- activated independently)
 646 --
 647 reservedWordsFM :: UniqFM (Token, Int)
 648 reservedWordsFM = listToUFM $
 649         map (\(x, y, z) -> (mkFastString x, (y, z)))
 650        [( "_",          ITunderscore,   0 ),
 651         ( "as",         ITas,           0 ),
 652         ( "case",       ITcase,         0 ),
 653         ( "class",      ITclass,        0 ),
 654         ( "data",       ITdata,         0 ),
 655         ( "default",    ITdefault,      0 ),
 656         ( "deriving",   ITderiving,     0 ),
 657         ( "do",         ITdo,           0 ),
 658         ( "else",       ITelse,         0 ),
 659         ( "hiding",     IThiding,       0 ),
 660         ( "if",         ITif,           0 ),
 661         ( "import",     ITimport,       0 ),
 662         ( "in",         ITin,           0 ),
 663         ( "infix",      ITinfix,        0 ),
 664         ( "infixl",     ITinfixl,       0 ),
 665         ( "infixr",     ITinfixr,       0 ),
 666         ( "instance",   ITinstance,     0 ),
 667         ( "let",        ITlet,          0 ),
 668         ( "module",     ITmodule,       0 ),
 669         ( "newtype",    ITnewtype,      0 ),
 670         ( "of",         ITof,           0 ),
 671         ( "qualified",  ITqualified,    0 ),
 672         ( "then",       ITthen,         0 ),
 673         ( "type",       ITtype,         0 ),
 674         ( "where",      ITwhere,        0 ),
 675         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 676
 677     ( "forall", ITforall,        bit explicitForallBit .|. bit inRulePragBit),
 678         ( "mdo",        ITmdo,           bit recursiveDoBit),
 679         ( "family",     ITfamily,        bit tyFamBit),
 680     ( "group",  ITgroup,     bit transformComprehensionsBit),
 681     ( "by",     ITby,        bit transformComprehensionsBit),
 682     ( "using",  ITusing,     bit transformComprehensionsBit),
 683
 684         ( "foreign",    ITforeign,       bit ffiBit),
 685         ( "export",     ITexport,        bit ffiBit),
 686         ( "label",      ITlabel,         bit ffiBit),
 687         ( "dynamic",    ITdynamic,       bit ffiBit),
 688         ( "safe",       ITsafe,          bit ffiBit),
 689         ( "threadsafe", ITthreadsafe,    bit ffiBit),  -- ToDo: remove
 690         ( "interruptible", ITinterruptible, bit ffiBit),
 691         ( "unsafe",     ITunsafe,        bit ffiBit),
 692         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 693         ( "ccall",      ITccallconv,     bit ffiBit),
 694         ( "prim",       ITprimcallconv,  bit ffiBit),
 695
 696         ( "rec",        ITrec,           bit recBit),
 697         ( "proc",       ITproc,          bit arrowsBit)
 698      ]
 699
 700 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 701 reservedSymsFM = listToUFM $
 702     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 703       [ ("..",  ITdotdot,   always)
 704         -- (:) is a reserved op, meaning only list cons
 705        ,(":",   ITcolon,    always)
 706        ,("::",  ITdcolon,   always)
 707        ,("=",   ITequal,    always)
 708        ,("\\",  ITlam,      always)
 709        ,("|",   ITvbar,     always)
 710        ,("<-",  ITlarrow,   always)
 711        ,("->",  ITrarrow,   always)
 712        ,("~~>",  ITkappa,   always)
 713        ,("@",   ITat,       always)
 714        ,("~",   ITtilde,    always)
 715        ,("=>",  ITdarrow,   always)
 716        ,("-",   ITminus,    always)
 717        ,("!",   ITbang,     always)
 718
 719         -- For data T (a::*) = MkT
 720        ,("*", ITstar, always) -- \i -> kindSigsEnabled i || tyFamEnabled i)
 721         -- For 'forall a . t'
 722        ,(".", ITdot,  always) -- \i -> explicitForallEnabled i || inRulePrag i)
 723
 724        ,("-<",  ITlarrowtail, arrowsEnabled)
 725        ,(">-",  ITrarrowtail, arrowsEnabled)
 726        ,("-<<", ITLarrowtail, arrowsEnabled)
 727        ,(">>-", ITRarrowtail, arrowsEnabled)
 728
 729        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 730        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 731        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 732                                 explicitForallEnabled i)
 733        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 734        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 735
 736        ,("⤙",   ITlarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 737        ,("⤚",   ITrarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 738        ,("⤛",   ITLarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 739        ,("⤜",   ITRarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 740
 741        ,("★", ITstar, unicodeSyntaxEnabled)
 742
 743         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 744         -- form part of a large operator.  This would let us have a better
 745         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 746        ]
 747
 748 -- -----------------------------------------------------------------------------
 749 -- Lexer actions
 750
 751 type Action = RealSrcSpan -> StringBuffer -> Int -> P (RealLocated Token)
 752
 753 special :: Token -> Action
 754 special tok span _buf _len = return (L span tok)
 755
 756 token, layout_token :: Token -> Action
 757 token t span _buf _len = return (L span t)
 758 layout_token t span _buf _len = pushLexState layout >> return (L span t)
 759
 760 idtoken :: (StringBuffer -> Int -> Token) -> Action
 761 idtoken f span buf len = return (L span $! (f buf len))
 762
 763 skip_one_varid :: (FastString -> Token) -> Action
 764 skip_one_varid f span buf len
 765   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 766
 767 strtoken :: (String -> Token) -> Action
 768 strtoken f span buf len =
 769   return (L span $! (f $! lexemeToString buf len))
 770
 771 init_strtoken :: Int -> (String -> Token) -> Action
 772 -- like strtoken, but drops the last N character(s)
 773 init_strtoken drop f span buf len =
 774   return (L span $! (f $! lexemeToString buf (len-drop)))
 775
 776 begin :: Int -> Action
 777 begin code _span _str _len = do pushLexState code; lexToken
 778
 779 pop :: Action
 780 pop _span _buf _len = do _ <- popLexState
 781                          lexToken
 782
 783 hopefully_open_brace :: Action
 784 hopefully_open_brace span buf len
 785  = do relaxed <- extension relaxedLayout
 786       ctx <- getContext
 787       (AI l _) <- getInput
 788       let offset = srcLocCol l
 789           isOK = relaxed ||
 790                  case ctx of
 791                  Layout prev_off : _ -> prev_off < offset
 792                  _                   -> True
 793       if isOK then pop_and open_brace span buf len
 794               else failSpanMsgP (RealSrcSpan span) (text "Missing block")
 795
 796 pop_and :: Action -> Action
 797 pop_and act span buf len = do _ <- popLexState
 798                               act span buf len
 799
 800 {-# INLINE nextCharIs #-}
 801 nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
 802 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 803
 804 notFollowedBy :: Char -> AlexAccPred Int
 805 notFollowedBy char _ _ _ (AI _ buf)
 806   = nextCharIs buf (/=char)
 807
 808 notFollowedBySymbol :: AlexAccPred Int
 809 notFollowedBySymbol _ _ _ (AI _ buf)
 810   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 811
 812 -- We must reject doc comments as being ordinary comments everywhere.
 813 -- In some cases the doc comment will be selected as the lexeme due to
 814 -- maximal munch, but not always, because the nested comment rule is
 815 -- valid in all states, but the doc-comment rules are only valid in
 816 -- the non-layout states.
 817 isNormalComment :: AlexAccPred Int
 818 isNormalComment bits _ _ (AI _ buf)
 819   | haddockEnabled bits = notFollowedByDocOrPragma
 820   | otherwise           = nextCharIs buf (/='#')
 821   where
 822     notFollowedByDocOrPragma
 823        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 824
 825 spaceAndP :: StringBuffer -> (StringBuffer -> Bool) -> Bool
 826 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 827
 828 {-
 829 haddockDisabledAnd p bits _ _ (AI _ buf)
 830   = if haddockEnabled bits then False else (p buf)
 831 -}
 832
 833 atEOL :: AlexAccPred Int
 834 atEOL _ _ _ (AI _ buf) = atEnd buf || currentChar buf == '\n'
 835
 836 ifExtension :: (Int -> Bool) -> AlexAccPred Int
 837 ifExtension pred bits _ _ _ = pred bits
 838
 839 multiline_doc_comment :: Action
 840 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 841   where
 842     worker commentAcc input docType oneLine = case alexGetChar input of
 843       Just ('\n', input')
 844         | oneLine -> docCommentEnd input commentAcc docType buf span
 845         | otherwise -> case checkIfCommentLine input' of
 846           Just input -> worker ('\n':commentAcc) input docType False
 847           Nothing -> docCommentEnd input commentAcc docType buf span
 848       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 849       Nothing -> docCommentEnd input commentAcc docType buf span
 850
 851     checkIfCommentLine input = check (dropNonNewlineSpace input)
 852       where
 853         check input = case alexGetChar input of
 854           Just ('-', input) -> case alexGetChar input of
 855             Just ('-', input) -> case alexGetChar input of
 856               Just (c, _) | c /= '-' -> Just input
 857               _ -> Nothing
 858             _ -> Nothing
 859           _ -> Nothing
 860
 861         dropNonNewlineSpace input = case alexGetChar input of
 862           Just (c, input')
 863             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 864             | otherwise -> input
 865           Nothing -> input
 866
 867 lineCommentToken :: Action
 868 lineCommentToken span buf len = do
 869   b <- extension rawTokenStreamEnabled
 870   if b then strtoken ITlineComment span buf len else lexToken
 871
 872 {-
 873   nested comments require traversing by hand, they can't be parsed
 874   using regular expressions.
 875 -}
 876 nested_comment :: P (RealLocated Token) -> Action
 877 nested_comment cont span _str _len = do
 878   input <- getInput
 879   go "" (1::Int) input
 880   where
 881     go commentAcc 0 input = do setInput input
 882                                b <- extension rawTokenStreamEnabled
 883                                if b
 884                                  then docCommentEnd input commentAcc ITblockComment _str span
 885                                  else cont
 886     go commentAcc n input = case alexGetChar input of
 887       Nothing -> errBrace input span
 888       Just ('-',input) -> case alexGetChar input of
 889         Nothing  -> errBrace input span
 890         Just ('\125',input) -> go commentAcc (n-1) input
 891         Just (_,_)          -> go ('-':commentAcc) n input
 892       Just ('\123',input) -> case alexGetChar input of
 893         Nothing  -> errBrace input span
 894         Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
 895         Just (_,_)       -> go ('\123':commentAcc) n input
 896       Just (c,input) -> go (c:commentAcc) n input
 897
 898 nested_doc_comment :: Action
 899 nested_doc_comment span buf _len = withLexedDocType (go "")
 900   where
 901     go commentAcc input docType _ = case alexGetChar input of
 902       Nothing -> errBrace input span
 903       Just ('-',input) -> case alexGetChar input of
 904         Nothing -> errBrace input span
 905         Just ('\125',input) ->
 906           docCommentEnd input commentAcc docType buf span
 907         Just (_,_) -> go ('-':commentAcc) input docType False
 908       Just ('\123', input) -> case alexGetChar input of
 909         Nothing  -> errBrace input span
 910         Just ('-',input) -> do
 911           setInput input
 912           let cont = do input <- getInput; go commentAcc input docType False
 913           nested_comment cont span buf _len
 914         Just (_,_) -> go ('\123':commentAcc) input docType False
 915       Just (c,input) -> go (c:commentAcc) input docType False
 916
 917 withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (RealLocated Token))
 918                  -> P (RealLocated Token)
 919 withLexedDocType lexDocComment = do
 920   input@(AI _ buf) <- getInput
 921   case prevChar buf ' ' of
 922     '|' -> lexDocComment input ITdocCommentNext False
 923     '^' -> lexDocComment input ITdocCommentPrev False
 924     '$' -> lexDocComment input ITdocCommentNamed False
 925     '*' -> lexDocSection 1 input
 926     '#' -> lexDocComment input ITdocOptionsOld False
 927     _ -> panic "withLexedDocType: Bad doc type"
 928  where
 929     lexDocSection n input = case alexGetChar input of
 930       Just ('*', input) -> lexDocSection (n+1) input
 931       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 932       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 933
 934 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 935 -- off again at the end of the pragma.
 936 rulePrag :: Action
 937 rulePrag span _buf _len = do
 938   setExts (.|. bit inRulePragBit)
 939   return (L span ITrules_prag)
 940
 941 endPrag :: Action
 942 endPrag span _buf _len = do
 943   setExts (.&. complement (bit inRulePragBit))
 944   return (L span ITclose_prag)
 945
 946 -- docCommentEnd
 947 -------------------------------------------------------------------------------
 948 -- This function is quite tricky. We can't just return a new token, we also
 949 -- need to update the state of the parser. Why? Because the token is longer
 950 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 951 -- it writes the wrong token length to the parser state. This function is
 952 -- called afterwards, so it can just update the state.
 953
 954 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 955                  RealSrcSpan -> P (RealLocated Token)
 956 docCommentEnd input commentAcc docType buf span = do
 957   setInput input
 958   let (AI loc nextBuf) = input
 959       comment = reverse commentAcc
 960       span' = mkRealSrcSpan (realSrcSpanStart span) loc
 961       last_len = byteDiff buf nextBuf
 962
 963   span `seq` setLastToken span' last_len
 964   return (L span' (docType comment))
 965
 966 errBrace :: AlexInput -> RealSrcSpan -> P a
 967 errBrace (AI end _) span = failLocMsgP (realSrcSpanStart span) end "unterminated `{-'"
 968
 969 open_brace, close_brace :: Action
 970 open_brace span _str _len = do
 971   ctx <- getContext
 972   setContext (NoLayout:ctx)
 973   return (L span ITocurly)
 974 close_brace span _str _len = do
 975   popContext
 976   return (L span ITccurly)
 977
 978 qvarid, qconid :: StringBuffer -> Int -> Token
 979 qvarid buf len = ITqvarid $! splitQualName buf len False
 980 qconid buf len = ITqconid $! splitQualName buf len False
 981
 982 splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
 983 -- takes a StringBuffer and a length, and returns the module name
 984 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 985 -- because of hierarchical module names.
 986 splitQualName orig_buf len parens = split orig_buf orig_buf
 987   where
 988     split buf dot_buf
 989         | orig_buf `byteDiff` buf >= len  = done dot_buf
 990         | c == '.'                        = found_dot buf'
 991         | otherwise                       = split buf' dot_buf
 992       where
 993        (c,buf') = nextChar buf
 994
 995     -- careful, we might get names like M....
 996     -- so, if the character after the dot is not upper-case, this is
 997     -- the end of the qualifier part.
 998     found_dot buf -- buf points after the '.'
 999         | isUpper c    = split buf' buf
1000         | otherwise    = done buf
1001       where
1002        (c,buf') = nextChar buf
1003
1004     done dot_buf =
1005         (lexemeToFastString orig_buf (qual_size - 1),
1006          if parens -- Prelude.(+)
1007             then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
1008             else lexemeToFastString dot_buf (len - qual_size))
1009       where
1010         qual_size = orig_buf `byteDiff` dot_buf
1011
1012 varid :: Action
1013 varid span buf len =
1014   fs `seq`
1015   case lookupUFM reservedWordsFM fs of
1016         Just (keyword,0)    -> do
1017                 maybe_layout keyword
1018                 return (L span keyword)
1019         Just (keyword,exts) -> do
1020                 b <- extension (\i -> exts .&. i /= 0)
1021                 if b then do maybe_layout keyword
1022                              return (L span keyword)
1023                      else return (L span (ITvarid fs))
1024         _other -> return (L span (ITvarid fs))
1025   where
1026         fs = lexemeToFastString buf len
1027
1028 conid :: StringBuffer -> Int -> Token
1029 conid buf len = ITconid fs
1030   where fs = lexemeToFastString buf len
1031
1032 qvarsym, qconsym, prefixqvarsym, prefixqconsym :: StringBuffer -> Int -> Token
1033 qvarsym buf len = ITqvarsym $! splitQualName buf len False
1034 qconsym buf len = ITqconsym $! splitQualName buf len False
1035 prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
1036 prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
1037
1038 varsym, consym :: Action
1039 varsym = sym ITvarsym
1040 consym = sym ITconsym
1041
1042 sym :: (FastString -> Token) -> RealSrcSpan -> StringBuffer -> Int
1043     -> P (RealLocated Token)
1044 sym con span buf len =
1045   case lookupUFM reservedSymsFM fs of
1046         Just (keyword,exts) -> do
1047                 b <- extension exts
1048                 if b then return (L span keyword)
1049                      else return (L span $! con fs)
1050         _other -> return (L span $! con fs)
1051   where
1052         fs = lexemeToFastString buf len
1053
1054 -- Variations on the integral numeric literal.
1055 tok_integral :: (Integer -> Token)
1056      -> (Integer -> Integer)
1057  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
1058      -> Int -> Int
1059      -> (Integer, (Char->Int)) -> Action
1060 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
1061   return $ L span $ itint $! transint $ parseUnsignedInteger
1062      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
1063
1064 -- some conveniences for use with tok_integral
1065 tok_num :: (Integer -> Integer)
1066         -> Int -> Int
1067         -> (Integer, (Char->Int)) -> Action
1068 tok_num = tok_integral ITinteger
1069 tok_primint :: (Integer -> Integer)
1070             -> Int -> Int
1071             -> (Integer, (Char->Int)) -> Action
1072 tok_primint = tok_integral ITprimint
1073 tok_primword :: Int -> Int
1074              -> (Integer, (Char->Int)) -> Action
1075 tok_primword = tok_integral ITprimword positive
1076 positive, negative :: (Integer -> Integer)
1077 positive = id
1078 negative = negate
1079 decimal, octal, hexadecimal :: (Integer, Char -> Int)
1080 decimal = (10,octDecDigit)
1081 octal = (8,octDecDigit)
1082 hexadecimal = (16,hexDigit)
1083
1084 -- readRational can understand negative rationals, exponents, everything.
1085 tok_float, tok_primfloat, tok_primdouble :: String -> Token
1086 tok_float        str = ITrational   $! readFractionalLit str
1087 tok_primfloat    str = ITprimfloat  $! readFractionalLit str
1088 tok_primdouble   str = ITprimdouble $! readFractionalLit str
1089
1090 readFractionalLit :: String -> FractionalLit
1091 readFractionalLit str = (FL $! str) $! readRational str
1092
1093 -- -----------------------------------------------------------------------------
1094 -- Layout processing
1095
1096 -- we're at the first token on a line, insert layout tokens if necessary
1097 do_bol :: Action
1098 do_bol span _str _len = do
1099         pos <- getOffside
1100         case pos of
1101             LT -> do
1102                 --trace "layout: inserting '}'" $ do
1103                 popContext
1104                 -- do NOT pop the lex state, we might have a ';' to insert
1105                 return (L span ITvccurly)
1106             EQ -> do
1107                 --trace "layout: inserting ';'" $ do
1108                 _ <- popLexState
1109                 return (L span ITsemi)
1110             GT -> do
1111                 _ <- popLexState
1112                 lexToken
1113
1114 -- certain keywords put us in the "layout" state, where we might
1115 -- add an opening curly brace.
1116 maybe_layout :: Token -> P ()
1117 maybe_layout t = do -- If the alternative layout rule is enabled then
1118                     -- we never create an implicit layout context here.
1119                     -- Layout is handled XXX instead.
1120                     -- The code for closing implicit contexts, or
1121                     -- inserting implicit semi-colons, is therefore
1122                     -- irrelevant as it only applies in an implicit
1123                     -- context.
1124                     alr <- extension alternativeLayoutRule
1125                     unless alr $ f t
1126     where f ITdo    = pushLexState layout_do
1127           f ITmdo   = pushLexState layout_do
1128           f ITof    = pushLexState layout
1129           f ITlet   = pushLexState layout
1130           f ITwhere = pushLexState layout
1131           f ITrec   = pushLexState layout
1132           f _       = return ()
1133
1134 -- Pushing a new implicit layout context.  If the indentation of the
1135 -- next token is not greater than the previous layout context, then
1136 -- Haskell 98 says that the new layout context should be empty; that is
1137 -- the lexer must generate {}.
1138 --
1139 -- We are slightly more lenient than this: when the new context is started
1140 -- by a 'do', then we allow the new context to be at the same indentation as
1141 -- the previous context.  This is what the 'strict' argument is for.
1142 --
1143 new_layout_context :: Bool -> Action
1144 new_layout_context strict span _buf _len = do
1145     _ <- popLexState
1146     (AI l _) <- getInput
1147     let offset = srcLocCol l
1148     ctx <- getContext
1149     nondecreasing <- extension nondecreasingIndentation
1150     let strict' = strict || not nondecreasing
1151     case ctx of
1152         Layout prev_off : _  |
1153            (strict'     && prev_off >= offset  ||
1154             not strict' && prev_off > offset) -> do
1155                 -- token is indented to the left of the previous context.
1156                 -- we must generate a {} sequence now.
1157                 pushLexState layout_left
1158                 return (L span ITvocurly)
1159         _ -> do
1160                 setContext (Layout offset : ctx)
1161                 return (L span ITvocurly)
1162
1163 do_layout_left :: Action
1164 do_layout_left span _buf _len = do
1165     _ <- popLexState
1166     pushLexState bol  -- we must be at the start of a line
1167     return (L span ITvccurly)
1168
1169 -- -----------------------------------------------------------------------------
1170 -- LINE pragmas
1171
1172 setLine :: Int -> Action
1173 setLine code span buf len = do
1174   let line = parseUnsignedInteger buf len 10 octDecDigit
1175   setSrcLoc (mkRealSrcLoc (srcSpanFile span) (fromIntegral line - 1) 1)
1176         -- subtract one: the line number refers to the *following* line
1177   _ <- popLexState
1178   pushLexState code
1179   lexToken
1180
1181 setFile :: Int -> Action
1182 setFile code span buf len = do
1183   let file = lexemeToFastString (stepOn buf) (len-2)
1184   setAlrLastLoc $ alrInitialLoc file
1185   setSrcLoc (mkRealSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1186   _ <- popLexState
1187   pushLexState code
1188   lexToken
1189
1190 alrInitialLoc :: FastString -> RealSrcSpan
1191 alrInitialLoc file = mkRealSrcSpan loc loc
1192     where -- This is a hack to ensure that the first line in a file
1193           -- looks like it is after the initial location:
1194           loc = mkRealSrcLoc file (-1) (-1)
1195
1196 -- -----------------------------------------------------------------------------
1197 -- Options, includes and language pragmas.
1198
1199 lex_string_prag :: (String -> Token) -> Action
1200 lex_string_prag mkTok span _buf _len
1201     = do input <- getInput
1202          start <- getSrcLoc
1203          tok <- go [] input
1204          end <- getSrcLoc
1205          return (L (mkRealSrcSpan start end) tok)
1206     where go acc input
1207               = if isString input "#-}"
1208                    then do setInput input
1209                            return (mkTok (reverse acc))
1210                    else case alexGetChar input of
1211                           Just (c,i) -> go (c:acc) i
1212                           Nothing -> err input
1213           isString _ [] = True
1214           isString i (x:xs)
1215               = case alexGetChar i of
1216                   Just (c,i') | c == x    -> isString i' xs
1217                   _other -> False
1218           err (AI end _) = failLocMsgP (realSrcSpanStart span) end "unterminated options pragma"
1219
1220
1221 -- -----------------------------------------------------------------------------
1222 -- Strings & Chars
1223
1224 -- This stuff is horrible.  I hates it.
1225
1226 lex_string_tok :: Action
1227 lex_string_tok span _buf _len = do
1228   tok <- lex_string ""
1229   end <- getSrcLoc
1230   return (L (mkRealSrcSpan (realSrcSpanStart span) end) tok)
1231
1232 lex_string :: String -> P Token
1233 lex_string s = do
1234   i <- getInput
1235   case alexGetChar' i of
1236     Nothing -> lit_error i
1237
1238     Just ('"',i)  -> do
1239         setInput i
1240         magicHash <- extension magicHashEnabled
1241         if magicHash
1242           then do
1243             i <- getInput
1244             case alexGetChar' i of
1245               Just ('#',i) -> do
1246                    setInput i
1247                    if any (> '\xFF') s
1248                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1249                     else let s' = mkZFastString (reverse s) in
1250                          return (ITprimstring s')
1251                         -- mkZFastString is a hack to avoid encoding the
1252                         -- string in UTF-8.  We just want the exact bytes.
1253               _other ->
1254                 return (ITstring (mkFastString (reverse s)))
1255           else
1256                 return (ITstring (mkFastString (reverse s)))
1257
1258     Just ('\\',i)
1259         | Just ('&',i) <- next -> do
1260                 setInput i; lex_string s
1261         | Just (c,i) <- next, c <= '\x7f' && is_space c -> do
1262                            -- is_space only works for <= '\x7f' (#3751)
1263                 setInput i; lex_stringgap s
1264         where next = alexGetChar' i
1265
1266     Just (c, i1) -> do
1267         case c of
1268           '\\' -> do setInput i1; c' <- lex_escape; lex_string (c':s)
1269           c | isAny c -> do setInput i1; lex_string (c:s)
1270           _other -> lit_error i
1271
1272 lex_stringgap :: String -> P Token
1273 lex_stringgap s = do
1274   i <- getInput
1275   c <- getCharOrFail i
1276   case c of
1277     '\\' -> lex_string s
1278     c | is_space c -> lex_stringgap s
1279     _other -> lit_error i
1280
1281
1282 lex_char_tok :: Action
1283 -- Here we are basically parsing character literals, such as 'x' or '\n'
1284 -- but, when Template Haskell is on, we additionally spot
1285 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1286 -- but WITHOUT CONSUMING the x or T part  (the parser does that).
1287 -- So we have to do two characters of lookahead: when we see 'x we need to
1288 -- see if there's a trailing quote
1289 lex_char_tok span _buf _len = do        -- We've seen '
1290    i1 <- getInput       -- Look ahead to first character
1291    let loc = realSrcSpanStart span
1292    case alexGetChar' i1 of
1293         Nothing -> lit_error  i1
1294
1295         Just ('\'', i2@(AI end2 _)) -> do       -- We've seen ''
1296                   th_exts <- extension thEnabled
1297                   if th_exts then do
1298                         setInput i2
1299                         return (L (mkRealSrcSpan loc end2)  ITtyQuote)
1300                    else lit_error i1
1301
1302         Just ('\\', i2@(AI _end2 _)) -> do      -- We've seen 'backslash
1303                   setInput i2
1304                   lit_ch <- lex_escape
1305                   i3 <- getInput
1306                   mc <- getCharOrFail i3 -- Trailing quote
1307                   if mc == '\'' then finish_char_tok loc lit_ch
1308                                 else lit_error i3
1309
1310         Just (c, i2@(AI _end2 _))
1311                 | not (isAny c) -> lit_error i1
1312                 | otherwise ->
1313
1314                 -- We've seen 'x, where x is a valid character
1315                 --  (i.e. not newline etc) but not a quote or backslash
1316            case alexGetChar' i2 of      -- Look ahead one more character
1317                 Just ('\'', i3) -> do   -- We've seen 'x'
1318                         setInput i3
1319                         finish_char_tok loc c
1320                 _other -> do            -- We've seen 'x not followed by quote
1321                                         -- (including the possibility of EOF)
1322                                         -- If TH is on, just parse the quote only
1323                         th_exts <- extension thEnabled
1324                         let (AI end _) = i1
1325                         if th_exts then return (L (mkRealSrcSpan loc end) ITvarQuote)
1326                                    else lit_error i2
1327
1328 finish_char_tok :: RealSrcLoc -> Char -> P (RealLocated Token)
1329 finish_char_tok loc ch  -- We've already seen the closing quote
1330                         -- Just need to check for trailing #
1331   = do  magicHash <- extension magicHashEnabled
1332         i@(AI end _) <- getInput
1333         if magicHash then do
1334                 case alexGetChar' i of
1335                         Just ('#',i@(AI end _)) -> do
1336                                 setInput i
1337                                 return (L (mkRealSrcSpan loc end) (ITprimchar ch))
1338                         _other ->
1339                                 return (L (mkRealSrcSpan loc end) (ITchar ch))
1340             else do
1341                    return (L (mkRealSrcSpan loc end) (ITchar ch))
1342
1343 isAny :: Char -> Bool
1344 isAny c | c > '\x7f' = isPrint c
1345         | otherwise  = is_any c
1346
1347 lex_escape :: P Char
1348 lex_escape = do
1349   i0 <- getInput
1350   c <- getCharOrFail i0
1351   case c of
1352         'a'   -> return '\a'
1353         'b'   -> return '\b'
1354         'f'   -> return '\f'
1355         'n'   -> return '\n'
1356         'r'   -> return '\r'
1357         't'   -> return '\t'
1358         'v'   -> return '\v'
1359         '\\'  -> return '\\'
1360         '"'   -> return '\"'
1361         '\''  -> return '\''
1362         '^'   -> do i1 <- getInput
1363                     c <- getCharOrFail i1
1364                     if c >= '@' && c <= '_'
1365                         then return (chr (ord c - ord '@'))
1366                         else lit_error i1
1367
1368         'x'   -> readNum is_hexdigit 16 hexDigit
1369         'o'   -> readNum is_octdigit  8 octDecDigit
1370         x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1371
1372         c1 ->  do
1373            i <- getInput
1374            case alexGetChar' i of
1375             Nothing -> lit_error i0
1376             Just (c2,i2) ->
1377               case alexGetChar' i2 of
1378                 Nothing -> do lit_error i0
1379                 Just (c3,i3) ->
1380                    let str = [c1,c2,c3] in
1381                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1382                                      Just rest <- [stripPrefix p str] ] of
1383                           (escape_char,[]):_ -> do
1384                                 setInput i3
1385                                 return escape_char
1386                           (escape_char,_:_):_ -> do
1387                                 setInput i2
1388                                 return escape_char
1389                           [] -> lit_error i0
1390
1391 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1392 readNum is_digit base conv = do
1393   i <- getInput
1394   c <- getCharOrFail i
1395   if is_digit c
1396         then readNum2 is_digit base conv (conv c)
1397         else lit_error i
1398
1399 readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char
1400 readNum2 is_digit base conv i = do
1401   input <- getInput
1402   read i input
1403   where read i input = do
1404           case alexGetChar' input of
1405             Just (c,input') | is_digit c -> do
1406                let i' = i*base + conv c
1407                if i' > 0x10ffff
1408                   then setInput input >> lexError "numeric escape sequence out of range"
1409                   else read i' input'
1410             _other -> do
1411               setInput input; return (chr i)
1412
1413
1414 silly_escape_chars :: [(String, Char)]
1415 silly_escape_chars = [
1416         ("NUL", '\NUL'),
1417         ("SOH", '\SOH'),
1418         ("STX", '\STX'),
1419         ("ETX", '\ETX'),
1420         ("EOT", '\EOT'),
1421         ("ENQ", '\ENQ'),
1422         ("ACK", '\ACK'),
1423         ("BEL", '\BEL'),
1424         ("BS", '\BS'),
1425         ("HT", '\HT'),
1426         ("LF", '\LF'),
1427         ("VT", '\VT'),
1428         ("FF", '\FF'),
1429         ("CR", '\CR'),
1430         ("SO", '\SO'),
1431         ("SI", '\SI'),
1432         ("DLE", '\DLE'),
1433         ("DC1", '\DC1'),
1434         ("DC2", '\DC2'),
1435         ("DC3", '\DC3'),
1436         ("DC4", '\DC4'),
1437         ("NAK", '\NAK'),
1438         ("SYN", '\SYN'),
1439         ("ETB", '\ETB'),
1440         ("CAN", '\CAN'),
1441         ("EM", '\EM'),
1442         ("SUB", '\SUB'),
1443         ("ESC", '\ESC'),
1444         ("FS", '\FS'),
1445         ("GS", '\GS'),
1446         ("RS", '\RS'),
1447         ("US", '\US'),
1448         ("SP", '\SP'),
1449         ("DEL", '\DEL')
1450         ]
1451
1452 -- before calling lit_error, ensure that the current input is pointing to
1453 -- the position of the error in the buffer.  This is so that we can report
1454 -- a correct location to the user, but also so we can detect UTF-8 decoding
1455 -- errors if they occur.
1456 lit_error :: AlexInput -> P a
1457 lit_error i = do setInput i; lexError "lexical error in string/character literal"
1458
1459 getCharOrFail :: AlexInput -> P Char
1460 getCharOrFail i =  do
1461   case alexGetChar' i of
1462         Nothing -> lexError "unexpected end-of-file in string/character literal"
1463         Just (c,i)  -> do setInput i; return c
1464
1465 -- -----------------------------------------------------------------------------
1466 -- QuasiQuote
1467
1468 lex_quasiquote_tok :: Action
1469 lex_quasiquote_tok span buf len = do
1470   let quoter = tail (lexemeToString buf (len - 1))
1471                 -- 'tail' drops the initial '[',
1472                 -- while the -1 drops the trailing '|'
1473   quoteStart <- getSrcLoc
1474   quote <- lex_quasiquote ""
1475   end <- getSrcLoc
1476   return (L (mkRealSrcSpan (realSrcSpanStart span) end)
1477            (ITquasiQuote (mkFastString quoter,
1478                           mkFastString (reverse quote),
1479                           mkRealSrcSpan quoteStart end)))
1480
1481 lex_quasiquote :: String -> P String
1482 lex_quasiquote s = do
1483   i <- getInput
1484   case alexGetChar' i of
1485     Nothing -> lit_error i
1486
1487     Just ('\\',i)
1488         | Just ('|',i) <- next -> do
1489                 setInput i; lex_quasiquote ('|' : s)
1490         | Just (']',i) <- next -> do
1491                 setInput i; lex_quasiquote (']' : s)
1492         where next = alexGetChar' i
1493
1494     Just ('|',i)
1495         | Just (']',i) <- next -> do
1496                 setInput i; return s
1497         where next = alexGetChar' i
1498
1499     Just (c, i) -> do
1500          setInput i; lex_quasiquote (c : s)
1501
1502 -- -----------------------------------------------------------------------------
1503 -- Warnings
1504
1505 warn :: DynFlag -> SDoc -> Action
1506 warn option warning srcspan _buf _len = do
1507     addWarning option (RealSrcSpan srcspan) warning
1508     lexToken
1509
1510 warnThen :: DynFlag -> SDoc -> Action -> Action
1511 warnThen option warning action srcspan buf len = do
1512     addWarning option (RealSrcSpan srcspan) warning
1513     action srcspan buf len
1514
1515 -- -----------------------------------------------------------------------------
1516 -- The Parse Monad
1517
1518 data LayoutContext
1519   = NoLayout
1520   | Layout !Int
1521   deriving Show
1522
1523 data ParseResult a
1524   = POk PState a
1525   | PFailed
1526         SrcSpan         -- The start and end of the text span related to
1527                         -- the error.  Might be used in environments which can
1528                         -- show this span, e.g. by highlighting it.
1529         Message         -- The error message
1530
1531 data PState = PState {
1532         buffer     :: StringBuffer,
1533         dflags     :: DynFlags,
1534         messages   :: Messages,
1535         last_loc   :: RealSrcSpan,      -- pos of previous token
1536         last_len   :: !Int,     -- len of previous token
1537         loc        :: RealSrcLoc,   -- current loc (end of prev token + 1)
1538         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1539         context    :: [LayoutContext],
1540         lex_state  :: [Int],
1541         -- Used in the alternative layout rule:
1542         -- These tokens are the next ones to be sent out. They are
1543         -- just blindly emitted, without the rule looking at them again:
1544         alr_pending_implicit_tokens :: [RealLocated Token],
1545         -- This is the next token to be considered or, if it is Nothing,
1546         -- we need to get the next token from the input stream:
1547         alr_next_token :: Maybe (RealLocated Token),
1548         -- This is what we consider to be the locatino of the last token
1549         -- emitted:
1550         alr_last_loc :: RealSrcSpan,
1551         -- The stack of layout contexts:
1552         alr_context :: [ALRContext],
1553         -- Are we expecting a '{'? If it's Just, then the ALRLayout tells
1554         -- us what sort of layout the '{' will open:
1555         alr_expecting_ocurly :: Maybe ALRLayout,
1556         -- Have we just had the '}' for a let block? If so, than an 'in'
1557         -- token doesn't need to close anything:
1558         alr_justClosedExplicitLetBlock :: Bool,
1559         code_type_bracket_depth       :: [CodeFlavor],
1560         code_type_bracket_depth_stack :: [CodeFlavor]
1561      }
1562         -- last_loc and last_len are used when generating error messages,
1563         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1564         -- current token to happyError, we could at least get rid of last_len.
1565         -- Getting rid of last_loc would require finding another way to
1566         -- implement pushCurrentContext (which is only called from one place).
1567
1568 data ALRContext = ALRNoLayout Bool{- does it contain commas? -}
1569                               Bool{- is it a 'let' block? -}
1570                 | ALRLayout ALRLayout Int
1571 data ALRLayout = ALRLayoutLet
1572                | ALRLayoutWhere
1573                | ALRLayoutOf
1574                | ALRLayoutDo
1575
1576 newtype P a = P { unP :: PState -> ParseResult a }
1577
1578 instance Monad P where
1579   return = returnP
1580   (>>=) = thenP
1581   fail = failP
1582
1583 returnP :: a -> P a
1584 returnP a = a `seq` (P $ \s -> POk s a)
1585
1586 thenP :: P a -> (a -> P b) -> P b
1587 (P m) `thenP` k = P $ \ s ->
1588         case m s of
1589                 POk s1 a         -> (unP (k a)) s1
1590                 PFailed span err -> PFailed span err
1591
1592 failP :: String -> P a
1593 failP msg = P $ \s -> PFailed (RealSrcSpan (last_loc s)) (text msg)
1594
1595 failMsgP :: String -> P a
1596 failMsgP msg = P $ \s -> PFailed (RealSrcSpan (last_loc s)) (text msg)
1597
1598 failLocMsgP :: RealSrcLoc -> RealSrcLoc -> String -> P a
1599 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (RealSrcSpan (mkRealSrcSpan loc1 loc2)) (text str)
1600
1601 failSpanMsgP :: SrcSpan -> SDoc -> P a
1602 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1603
1604 getPState :: P PState
1605 getPState = P $ \s -> POk s s
1606
1607 getDynFlags :: P DynFlags
1608 getDynFlags = P $ \s -> POk s (dflags s)
1609
1610 withThisPackage :: (PackageId -> a) -> P a
1611 withThisPackage f
1612  = do   pkg     <- liftM thisPackage getDynFlags
1613         return  $ f pkg
1614
1615 extension :: (Int -> Bool) -> P Bool
1616 extension p = P $ \s -> POk s (p $! extsBitmap s)
1617
1618 getExts :: P Int
1619 getExts = P $ \s -> POk s (extsBitmap s)
1620
1621 setExts :: (Int -> Int) -> P ()
1622 setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
1623
1624 setSrcLoc :: RealSrcLoc -> P ()
1625 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1626
1627 incrBracketDepth :: P ()
1628 incrBracketDepth = P $ \s -> POk (s{code_type_bracket_depth = LambdaFlavor:(code_type_bracket_depth s)}) ()
1629 incrBracketDepth1 :: P ()
1630 incrBracketDepth1 = P $ \s -> POk (s{code_type_bracket_depth = KappaFlavor:(code_type_bracket_depth s)}) ()
1631 decrBracketDepth :: P ()
1632 decrBracketDepth = P $ \s -> POk (s{code_type_bracket_depth = tail (code_type_bracket_depth s)}) ()
1633 pushBracketDepth :: P ()
1634 pushBracketDepth = P $ \s -> POk (s{code_type_bracket_depth       = tail (code_type_bracket_depth s),
1635                                     code_type_bracket_depth_stack = (head (code_type_bracket_depth s)):(code_type_bracket_depth_stack s)
1636                                    }) ()
1637 popBracketDepth :: P ()
1638 popBracketDepth = P $ \s -> POk (s{code_type_bracket_depth       = (head (code_type_bracket_depth_stack s)):(code_type_bracket_depth s),
1639                                    code_type_bracket_depth_stack = tail (code_type_bracket_depth_stack s)
1640                                    }) ()
1641 getParserBrakDepth :: P [CodeFlavor]
1642 getParserBrakDepth = P $ \s -> POk s (code_type_bracket_depth s)
1643
1644 getSrcLoc :: P RealSrcLoc
1645 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1646
1647 setLastToken :: RealSrcSpan -> Int -> P ()
1648 setLastToken loc len = P $ \s -> POk s {
1649   last_loc=loc,
1650   last_len=len
1651   } ()
1652
1653 data AlexInput = AI RealSrcLoc StringBuffer
1654
1655 alexInputPrevChar :: AlexInput -> Char
1656 alexInputPrevChar (AI _ buf) = prevChar buf '\n'
1657
1658 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1659 alexGetChar (AI loc s)
1660   | atEnd s   = Nothing
1661   | otherwise = adj_c `seq` loc' `seq` s' `seq`
1662                 --trace (show (ord c)) $
1663                 Just (adj_c, (AI loc' s'))
1664   where (c,s') = nextChar s
1665         loc'   = advanceSrcLoc loc c
1666
1667         non_graphic     = '\x0'
1668         upper           = '\x1'
1669         lower           = '\x2'
1670         digit           = '\x3'
1671         symbol          = '\x4'
1672         space           = '\x5'
1673         other_graphic   = '\x6'
1674
1675         adj_c
1676           | c <= '\x06' = non_graphic
1677           | c <= '\x7f' = c
1678           -- Alex doesn't handle Unicode, so when Unicode
1679           -- character is encountered we output these values
1680           -- with the actual character value hidden in the state.
1681           | otherwise =
1682                 case generalCategory c of
1683                   UppercaseLetter       -> upper
1684                   LowercaseLetter       -> lower
1685                   TitlecaseLetter       -> upper
1686                   ModifierLetter        -> other_graphic
1687                   OtherLetter           -> lower -- see #1103
1688                   NonSpacingMark        -> other_graphic
1689                   SpacingCombiningMark  -> other_graphic
1690                   EnclosingMark         -> other_graphic
1691                   DecimalNumber         -> digit
1692                   LetterNumber          -> other_graphic
1693                   OtherNumber           -> digit -- see #4373
1694                   ConnectorPunctuation  -> symbol
1695                   DashPunctuation       -> symbol
1696                   OpenPunctuation       -> other_graphic
1697                   ClosePunctuation      -> other_graphic
1698                   InitialQuote          -> other_graphic
1699                   FinalQuote            -> other_graphic
1700                   OtherPunctuation      -> symbol
1701                   MathSymbol            -> symbol
1702                   CurrencySymbol        -> symbol
1703                   ModifierSymbol        -> symbol
1704                   OtherSymbol           -> symbol
1705                   Space                 -> space
1706                   _other                -> non_graphic
1707
1708 -- This version does not squash unicode characters, it is used when
1709 -- lexing strings.
1710 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1711 alexGetChar' (AI loc s)
1712   | atEnd s   = Nothing
1713   | otherwise = c `seq` loc' `seq` s' `seq`
1714                 --trace (show (ord c)) $
1715                 Just (c, (AI loc' s'))
1716   where (c,s') = nextChar s
1717         loc'   = advanceSrcLoc loc c
1718
1719 getInput :: P AlexInput
1720 getInput = P $ \s@PState{ loc=l, buffer=b } -> POk s (AI l b)
1721
1722 setInput :: AlexInput -> P ()
1723 setInput (AI l b) = P $ \s -> POk s{ loc=l, buffer=b } ()
1724
1725 nextIsEOF :: P Bool
1726 nextIsEOF = do
1727   AI _ s <- getInput
1728   return $ atEnd s
1729
1730 pushLexState :: Int -> P ()
1731 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1732
1733 popLexState :: P Int
1734 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1735
1736 getLexState :: P Int
1737 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1738
1739 popNextToken :: P (Maybe (RealLocated Token))
1740 popNextToken
1741     = P $ \s@PState{ alr_next_token = m } ->
1742               POk (s {alr_next_token = Nothing}) m
1743
1744 activeContext :: P Bool
1745 activeContext = do
1746   ctxt <- getALRContext
1747   expc <- getAlrExpectingOCurly
1748   impt <- implicitTokenPending
1749   case (ctxt,expc) of
1750     ([],Nothing) -> return impt
1751     _other       -> return True
1752
1753 setAlrLastLoc :: RealSrcSpan -> P ()
1754 setAlrLastLoc l = P $ \s -> POk (s {alr_last_loc = l}) ()
1755
1756 getAlrLastLoc :: P RealSrcSpan
1757 getAlrLastLoc = P $ \s@(PState {alr_last_loc = l}) -> POk s l
1758
1759 getALRContext :: P [ALRContext]
1760 getALRContext = P $ \s@(PState {alr_context = cs}) -> POk s cs
1761
1762 setALRContext :: [ALRContext] -> P ()
1763 setALRContext cs = P $ \s -> POk (s {alr_context = cs}) ()
1764
1765 getJustClosedExplicitLetBlock :: P Bool
1766 getJustClosedExplicitLetBlock
1767  = P $ \s@(PState {alr_justClosedExplicitLetBlock = b}) -> POk s b
1768
1769 setJustClosedExplicitLetBlock :: Bool -> P ()
1770 setJustClosedExplicitLetBlock b
1771  = P $ \s -> POk (s {alr_justClosedExplicitLetBlock = b}) ()
1772
1773 setNextToken :: RealLocated Token -> P ()
1774 setNextToken t = P $ \s -> POk (s {alr_next_token = Just t}) ()
1775
1776 implicitTokenPending :: P Bool
1777 implicitTokenPending
1778     = P $ \s@PState{ alr_pending_implicit_tokens = ts } ->
1779               case ts of
1780               [] -> POk s False
1781               _  -> POk s True
1782
1783 popPendingImplicitToken :: P (Maybe (RealLocated Token))
1784 popPendingImplicitToken
1785     = P $ \s@PState{ alr_pending_implicit_tokens = ts } ->
1786               case ts of
1787               [] -> POk s Nothing
1788               (t : ts') -> POk (s {alr_pending_implicit_tokens = ts'}) (Just t)
1789
1790 setPendingImplicitTokens :: [RealLocated Token] -> P ()
1791 setPendingImplicitTokens ts = P $ \s -> POk (s {alr_pending_implicit_tokens = ts}) ()
1792
1793 getAlrExpectingOCurly :: P (Maybe ALRLayout)
1794 getAlrExpectingOCurly = P $ \s@(PState {alr_expecting_ocurly = b}) -> POk s b
1795
1796 setAlrExpectingOCurly :: Maybe ALRLayout -> P ()
1797 setAlrExpectingOCurly b = P $ \s -> POk (s {alr_expecting_ocurly = b}) ()
1798
1799 -- for reasons of efficiency, flags indicating language extensions (eg,
1800 -- -fglasgow-exts or -XParallelArrays) are represented by a bitmap stored in an unboxed
1801 -- integer
1802
1803 -- The "genericsBit" is now unused, available for others
1804 -- genericsBit :: Int
1805 -- genericsBit = 0 -- {|, |} and "generic"
1806
1807 ffiBit :: Int
1808 ffiBit     = 1
1809 parrBit :: Int
1810 parrBit    = 2
1811 arrowsBit :: Int
1812 arrowsBit  = 4
1813 thBit :: Int
1814 thBit      = 5
1815 ipBit :: Int
1816 ipBit      = 6
1817 explicitForallBit :: Int
1818 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1819 bangPatBit :: Int
1820 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1821                 -- (doesn't affect the lexer)
1822 tyFamBit :: Int
1823 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1824 haddockBit :: Int
1825 haddockBit = 10 -- Lex and parse Haddock comments
1826 magicHashBit :: Int
1827 magicHashBit = 11 -- "#" in both functions and operators
1828 kindSigsBit :: Int
1829 kindSigsBit = 12 -- Kind signatures on type variables
1830 recursiveDoBit :: Int
1831 recursiveDoBit = 13 -- mdo
1832 unicodeSyntaxBit :: Int
1833 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1834 unboxedTuplesBit :: Int
1835 unboxedTuplesBit = 15 -- (# and #)
1836 datatypeContextsBit :: Int
1837 datatypeContextsBit = 16
1838 transformComprehensionsBit :: Int
1839 transformComprehensionsBit = 17
1840 qqBit :: Int
1841 qqBit      = 18 -- enable quasiquoting
1842 inRulePragBit :: Int
1843 inRulePragBit = 19
1844 rawTokenStreamBit :: Int
1845 rawTokenStreamBit = 20 -- producing a token stream with all comments included
1846 recBit :: Int
1847 recBit = 22 -- rec
1848 alternativeLayoutRuleBit :: Int
1849 alternativeLayoutRuleBit = 23
1850 relaxedLayoutBit :: Int
1851 relaxedLayoutBit = 24
1852 nondecreasingIndentationBit :: Int
1853 nondecreasingIndentationBit = 25
1854 hetMetBit :: Int
1855 hetMetBit = 31
1856
1857 always :: Int -> Bool
1858 always           _     = True
1859 parrEnabled :: Int -> Bool
1860 parrEnabled      flags = testBit flags parrBit
1861 arrowsEnabled :: Int -> Bool
1862 arrowsEnabled    flags = testBit flags arrowsBit
1863 hetMetEnabled :: Int -> Bool
1864 hetMetEnabled    flags = testBit flags hetMetBit
1865 thEnabled :: Int -> Bool
1866 thEnabled        flags = testBit flags thBit
1867 ipEnabled :: Int -> Bool
1868 ipEnabled        flags = testBit flags ipBit
1869 explicitForallEnabled :: Int -> Bool
1870 explicitForallEnabled flags = testBit flags explicitForallBit
1871 bangPatEnabled :: Int -> Bool
1872 bangPatEnabled   flags = testBit flags bangPatBit
1873 -- tyFamEnabled :: Int -> Bool
1874 -- tyFamEnabled     flags = testBit flags tyFamBit
1875 haddockEnabled :: Int -> Bool
1876 haddockEnabled   flags = testBit flags haddockBit
1877 magicHashEnabled :: Int -> Bool
1878 magicHashEnabled flags = testBit flags magicHashBit
1879 -- kindSigsEnabled :: Int -> Bool
1880 -- kindSigsEnabled  flags = testBit flags kindSigsBit
1881 unicodeSyntaxEnabled :: Int -> Bool
1882 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1883 unboxedTuplesEnabled :: Int -> Bool
1884 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1885 datatypeContextsEnabled :: Int -> Bool
1886 datatypeContextsEnabled flags = testBit flags datatypeContextsBit
1887 qqEnabled :: Int -> Bool
1888 qqEnabled        flags = testBit flags qqBit
1889 -- inRulePrag :: Int -> Bool
1890 -- inRulePrag       flags = testBit flags inRulePragBit
1891 rawTokenStreamEnabled :: Int -> Bool
1892 rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
1893 alternativeLayoutRule :: Int -> Bool
1894 alternativeLayoutRule flags = testBit flags alternativeLayoutRuleBit
1895 relaxedLayout :: Int -> Bool
1896 relaxedLayout flags = testBit flags relaxedLayoutBit
1897 nondecreasingIndentation :: Int -> Bool
1898 nondecreasingIndentation flags = testBit flags nondecreasingIndentationBit
1899
1900 -- PState for parsing options pragmas
1901 --
1902 pragState :: DynFlags -> StringBuffer -> RealSrcLoc -> PState
1903 pragState dynflags buf loc = (mkPState dynflags buf loc) {
1904                                  lex_state = [bol, option_prags, 0]
1905                              }
1906
1907 -- create a parse state
1908 --
1909 mkPState :: DynFlags -> StringBuffer -> RealSrcLoc -> PState
1910 mkPState flags buf loc =
1911   PState {
1912       buffer        = buf,
1913       dflags        = flags,
1914       messages      = emptyMessages,
1915       last_loc      = mkRealSrcSpan loc loc,
1916       last_len      = 0,
1917       loc           = loc,
1918       extsBitmap    = fromIntegral bitmap,
1919       context       = [],
1920       lex_state     = [bol, 0],
1921       alr_pending_implicit_tokens = [],
1922       alr_next_token = Nothing,
1923       alr_last_loc = alrInitialLoc (fsLit "<no file>"),
1924       alr_context = [],
1925       alr_expecting_ocurly = Nothing,
1926       alr_justClosedExplicitLetBlock = False,
1927       code_type_bracket_depth = [],
1928       code_type_bracket_depth_stack = []
1929     }
1930     where
1931       bitmap =     ffiBit            `setBitIf` xopt Opt_ForeignFunctionInterface flags
1932                .|. parrBit           `setBitIf` xopt Opt_ParallelArrays  flags
1933                .|. arrowsBit         `setBitIf` xopt Opt_Arrows          flags
1934                .|. hetMetBit         `setBitIf` xopt Opt_ModalTypes      flags
1935                .|. thBit             `setBitIf` xopt Opt_TemplateHaskell flags
1936                .|. qqBit             `setBitIf` xopt Opt_QuasiQuotes     flags
1937                .|. ipBit             `setBitIf` xopt Opt_ImplicitParams  flags
1938                .|. explicitForallBit `setBitIf` xopt Opt_ExplicitForAll  flags
1939                .|. bangPatBit        `setBitIf` xopt Opt_BangPatterns    flags
1940                .|. tyFamBit          `setBitIf` xopt Opt_TypeFamilies    flags
1941                .|. haddockBit        `setBitIf` dopt Opt_Haddock         flags
1942                .|. magicHashBit      `setBitIf` xopt Opt_MagicHash       flags
1943                .|. kindSigsBit       `setBitIf` xopt Opt_KindSignatures  flags
1944                .|. recursiveDoBit    `setBitIf` xopt Opt_RecursiveDo     flags
1945                .|. recBit            `setBitIf` xopt Opt_DoRec           flags
1946                .|. recBit            `setBitIf` xopt Opt_Arrows          flags
1947                .|. unicodeSyntaxBit  `setBitIf` xopt Opt_UnicodeSyntax   flags
1948                .|. unboxedTuplesBit  `setBitIf` xopt Opt_UnboxedTuples   flags
1949                .|. datatypeContextsBit `setBitIf` xopt Opt_DatatypeContexts flags
1950                .|. transformComprehensionsBit `setBitIf` xopt Opt_TransformListComp flags
1951                .|. transformComprehensionsBit `setBitIf` xopt Opt_MonadComprehensions flags
1952                .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
1953                .|. alternativeLayoutRuleBit `setBitIf` xopt Opt_AlternativeLayoutRule flags
1954                .|. relaxedLayoutBit  `setBitIf` xopt Opt_RelaxedLayout flags
1955                .|. nondecreasingIndentationBit `setBitIf` xopt Opt_NondecreasingIndentation flags
1956       --
1957       setBitIf :: Int -> Bool -> Int
1958       b `setBitIf` cond | cond      = bit b
1959                         | otherwise = 0
1960
1961 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1962 addWarning option srcspan warning
1963  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1964        let warning' = mkWarnMsg srcspan alwaysQualify warning
1965            ws' = if dopt option d then ws `snocBag` warning' else ws
1966        in POk s{messages=(ws', es)} ()
1967
1968 getMessages :: PState -> Messages
1969 getMessages PState{messages=ms} = ms
1970
1971 getContext :: P [LayoutContext]
1972 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1973
1974 setContext :: [LayoutContext] -> P ()
1975 setContext ctx = P $ \s -> POk s{context=ctx} ()
1976
1977 popContext :: P ()
1978 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1979                               last_len = len, last_loc = last_loc }) ->
1980   case ctx of
1981         (_:tl) -> POk s{ context = tl } ()
1982         []     -> PFailed (RealSrcSpan last_loc) (srcParseErr buf len)
1983
1984 -- Push a new layout context at the indentation of the last token read.
1985 -- This is only used at the outer level of a module when the 'module'
1986 -- keyword is missing.
1987 pushCurrentContext :: P ()
1988 pushCurrentContext = P $ \ s@PState{ last_loc=loc, context=ctx } ->
1989     POk s{context = Layout (srcSpanStartCol loc) : ctx} ()
1990
1991 getOffside :: P Ordering
1992 getOffside = P $ \s@PState{last_loc=loc, context=stk} ->
1993                 let offs = srcSpanStartCol loc in
1994                 let ord = case stk of
1995                         (Layout n:_) -> --trace ("layout: " ++ show n ++ ", offs: " ++ show offs) $
1996                                         compare offs n
1997                         _            -> GT
1998                 in POk s ord
1999
2000 -- ---------------------------------------------------------------------------
2001 -- Construct a parse error
2002
2003 srcParseErr
2004   :: StringBuffer       -- current buffer (placed just after the last token)
2005   -> Int                -- length of the previous token
2006   -> Message
2007 srcParseErr buf len
2008   = hcat [ if null token
2009              then ptext (sLit "parse error (possibly incorrect indentation)")
2010              else hcat [ptext (sLit "parse error on input "),
2011                         char '`', text token, char '\'']
2012     ]
2013   where token = lexemeToString (offsetBytes (-len) buf) len
2014
2015 -- Report a parse failure, giving the span of the previous token as
2016 -- the location of the error.  This is the entry point for errors
2017 -- detected during parsing.
2018 srcParseFail :: P a
2019 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
2020                             last_loc = last_loc } ->
2021     PFailed (RealSrcSpan last_loc) (srcParseErr buf len)
2022
2023 -- A lexical error is reported at a particular position in the source file,
2024 -- not over a token range.
2025 lexError :: String -> P a
2026 lexError str = do
2027   loc <- getSrcLoc
2028   (AI end buf) <- getInput
2029   reportLexError loc end buf str
2030
2031 -- -----------------------------------------------------------------------------
2032 -- This is the top-level function: called from the parser each time a
2033 -- new token is to be read from the input.
2034
2035 lexer :: (Located Token -> P a) -> P a
2036 lexer cont = do
2037   alr <- extension alternativeLayoutRule
2038   let lexTokenFun = if alr then lexTokenAlr else lexToken
2039   (L span tok) <- lexTokenFun
2040   --trace ("token: " ++ show tok) $ do
2041   cont (L (RealSrcSpan span) tok)
2042
2043 lexTokenAlr :: P (RealLocated Token)
2044 lexTokenAlr = do mPending <- popPendingImplicitToken
2045                  t <- case mPending of
2046                       Nothing ->
2047                           do mNext <- popNextToken
2048                              t <- case mNext of
2049                                   Nothing -> lexToken
2050                                   Just next -> return next
2051                              alternativeLayoutRuleToken t
2052                       Just t ->
2053                           return t
2054                  setAlrLastLoc (getLoc t)
2055                  case unLoc t of
2056                      ITwhere -> setAlrExpectingOCurly (Just ALRLayoutWhere)
2057                      ITlet   -> setAlrExpectingOCurly (Just ALRLayoutLet)
2058                      ITof    -> setAlrExpectingOCurly (Just ALRLayoutOf)
2059                      ITdo    -> setAlrExpectingOCurly (Just ALRLayoutDo)
2060                      ITmdo   -> setAlrExpectingOCurly (Just ALRLayoutDo)
2061                      ITrec   -> setAlrExpectingOCurly (Just ALRLayoutDo)
2062                      _       -> return ()
2063                  return t
2064
2065 alternativeLayoutRuleToken :: RealLocated Token -> P (RealLocated Token)
2066 alternativeLayoutRuleToken t
2067     = do context <- getALRContext
2068          lastLoc <- getAlrLastLoc
2069          mExpectingOCurly <- getAlrExpectingOCurly
2070          justClosedExplicitLetBlock <- getJustClosedExplicitLetBlock
2071          setJustClosedExplicitLetBlock False
2072          dflags <- getDynFlags
2073          let transitional = xopt Opt_AlternativeLayoutRuleTransitional dflags
2074              thisLoc = getLoc t
2075              thisCol = srcSpanStartCol thisLoc
2076              newLine = srcSpanStartLine thisLoc > srcSpanEndLine lastLoc
2077          case (unLoc t, context, mExpectingOCurly) of
2078              -- This case handles a GHC extension to the original H98
2079              -- layout rule...
2080              (ITocurly, _, Just alrLayout) ->
2081                  do setAlrExpectingOCurly Nothing
2082                     let isLet = case alrLayout of
2083                                 ALRLayoutLet -> True
2084                                 _ -> False
2085                     setALRContext (ALRNoLayout (containsCommas ITocurly) isLet : context)
2086                     return t
2087              -- ...and makes this case unnecessary
2088              {-
2089              -- I think our implicit open-curly handling is slightly
2090              -- different to John's, in how it interacts with newlines
2091              -- and "in"
2092              (ITocurly, _, Just _) ->
2093                  do setAlrExpectingOCurly Nothing
2094                     setNextToken t
2095                     lexTokenAlr
2096              -}
2097              (_, ALRLayout _ col : ls, Just expectingOCurly)
2098               | (thisCol > col) ||
2099                 (thisCol == col &&
2100                  isNonDecreasingIntentation expectingOCurly) ->
2101                  do setAlrExpectingOCurly Nothing
2102                     setALRContext (ALRLayout expectingOCurly thisCol : context)
2103                     setNextToken t
2104                     return (L thisLoc ITocurly)
2105               | otherwise ->
2106                  do setAlrExpectingOCurly Nothing
2107                     setPendingImplicitTokens [L lastLoc ITccurly]
2108                     setNextToken t
2109                     return (L lastLoc ITocurly)
2110              (_, _, Just expectingOCurly) ->
2111                  do setAlrExpectingOCurly Nothing
2112                     setALRContext (ALRLayout expectingOCurly thisCol : context)
2113                     setNextToken t
2114                     return (L thisLoc ITocurly)
2115              -- We do the [] cases earlier than in the spec, as we
2116              -- have an actual EOF token
2117              (ITeof, ALRLayout _ _ : ls, _) ->
2118                  do setALRContext ls
2119                     setNextToken t
2120                     return (L thisLoc ITccurly)
2121              (ITeof, _, _) ->
2122                  return t
2123              -- the other ITeof case omitted; general case below covers it
2124              (ITin, _, _)
2125               | justClosedExplicitLetBlock ->
2126                  return t
2127              (ITin, ALRLayout ALRLayoutLet _ : ls, _)
2128               | newLine ->
2129                  do setPendingImplicitTokens [t]
2130                     setALRContext ls
2131                     return (L thisLoc ITccurly)
2132              -- This next case is to handle a transitional issue:
2133              (ITwhere, ALRLayout _ col : ls, _)
2134               | newLine && thisCol == col && transitional ->
2135                  do addWarning Opt_WarnAlternativeLayoutRuleTransitional
2136                                (RealSrcSpan thisLoc)
2137                                (transitionalAlternativeLayoutWarning
2138                                     "`where' clause at the same depth as implicit layout block")
2139                     setALRContext ls
2140                     setNextToken t
2141                     -- Note that we use lastLoc, as we may need to close
2142                     -- more layouts, or give a semicolon
2143                     return (L lastLoc ITccurly)
2144              -- This next case is to handle a transitional issue:
2145              (ITvbar, ALRLayout _ col : ls, _)
2146               | newLine && thisCol == col && transitional ->
2147                  do addWarning Opt_WarnAlternativeLayoutRuleTransitional
2148                                (RealSrcSpan thisLoc)
2149                                (transitionalAlternativeLayoutWarning
2150                                     "`|' at the same depth as implicit layout block")
2151                     setALRContext ls
2152                     setNextToken t
2153                     -- Note that we use lastLoc, as we may need to close
2154                     -- more layouts, or give a semicolon
2155                     return (L lastLoc ITccurly)
2156              (_, ALRLayout _ col : ls, _)
2157               | newLine && thisCol == col ->
2158                  do setNextToken t
2159                     return (L thisLoc ITsemi)
2160               | newLine && thisCol < col ->
2161                  do setALRContext ls
2162                     setNextToken t
2163                     -- Note that we use lastLoc, as we may need to close
2164                     -- more layouts, or give a semicolon
2165                     return (L lastLoc ITccurly)
2166              -- We need to handle close before open, as 'then' is both
2167              -- an open and a close
2168              (u, _, _)
2169               | isALRclose u ->
2170                  case context of
2171                  ALRLayout _ _ : ls ->
2172                      do setALRContext ls
2173                         setNextToken t
2174                         return (L thisLoc ITccurly)
2175                  ALRNoLayout _ isLet : ls ->
2176                      do let ls' = if isALRopen u
2177                                      then ALRNoLayout (containsCommas u) False : ls
2178                                      else ls
2179                         setALRContext ls'
2180                         when isLet $ setJustClosedExplicitLetBlock True
2181                         return t
2182                  [] ->
2183                      do let ls = if isALRopen u
2184                                     then [ALRNoLayout (containsCommas u) False]
2185                                     else ls
2186                         setALRContext ls
2187                         -- XXX This is an error in John's code, but
2188                         -- it looks reachable to me at first glance
2189                         return t
2190              (u, _, _)
2191               | isALRopen u ->
2192                  do setALRContext (ALRNoLayout (containsCommas u) False : context)
2193                     return t
2194              (ITin, ALRLayout ALRLayoutLet _ : ls, _) ->
2195                  do setALRContext ls
2196                     setPendingImplicitTokens [t]
2197                     return (L thisLoc ITccurly)
2198              (ITin, ALRLayout _ _ : ls, _) ->
2199                  do setALRContext ls
2200                     setNextToken t
2201                     return (L thisLoc ITccurly)
2202              -- the other ITin case omitted; general case below covers it
2203              (ITcomma, ALRLayout _ _ : ls, _)
2204               | topNoLayoutContainsCommas ls ->
2205                  do setALRContext ls
2206                     setNextToken t
2207                     return (L thisLoc ITccurly)
2208              (ITwhere, ALRLayout ALRLayoutDo _ : ls, _) ->
2209                  do setALRContext ls
2210                     setPendingImplicitTokens [t]
2211                     return (L thisLoc ITccurly)
2212              -- the other ITwhere case omitted; general case below covers it
2213              (_, _, _) -> return t
2214
2215 transitionalAlternativeLayoutWarning :: String -> SDoc
2216 transitionalAlternativeLayoutWarning msg
2217     = text "transitional layout will not be accepted in the future:"
2218    $$ text msg
2219
2220 isALRopen :: Token -> Bool
2221 isALRopen ITcase        = True
2222 isALRopen ITif          = True
2223 isALRopen ITthen        = True
2224 isALRopen IToparen      = True
2225 isALRopen ITobrack      = True
2226 isALRopen ITocurly      = True
2227 -- GHC Extensions:
2228 isALRopen IToubxparen   = True
2229 isALRopen ITparenEscape = True
2230 isALRopen _             = False
2231
2232 isALRclose :: Token -> Bool
2233 isALRclose ITof     = True
2234 isALRclose ITthen   = True
2235 isALRclose ITelse   = True
2236 isALRclose ITcparen = True
2237 isALRclose ITcbrack = True
2238 isALRclose ITccurly = True
2239 -- GHC Extensions:
2240 isALRclose ITcubxparen = True
2241 isALRclose _        = False
2242
2243 isNonDecreasingIntentation :: ALRLayout -> Bool
2244 isNonDecreasingIntentation ALRLayoutDo = True
2245 isNonDecreasingIntentation _           = False
2246
2247 containsCommas :: Token -> Bool
2248 containsCommas IToparen = True
2249 containsCommas ITobrack = True
2250 -- John doesn't have {} as containing commas, but records contain them,
2251 -- which caused a problem parsing Cabal's Distribution.Simple.InstallDirs
2252 -- (defaultInstallDirs).
2253 containsCommas ITocurly = True
2254 -- GHC Extensions:
2255 containsCommas IToubxparen = True
2256 containsCommas _        = False
2257
2258 topNoLayoutContainsCommas :: [ALRContext] -> Bool
2259 topNoLayoutContainsCommas [] = False
2260 topNoLayoutContainsCommas (ALRLayout _ _ : ls) = topNoLayoutContainsCommas ls
2261 topNoLayoutContainsCommas (ALRNoLayout b _ : _) = b
2262
2263 lexToken :: P (RealLocated Token)
2264 lexToken = do
2265   inp@(AI loc1 buf) <- getInput
2266   sc <- getLexState
2267   exts <- getExts
2268   case alexScanUser exts inp sc of
2269     AlexEOF -> do
2270         let span = mkRealSrcSpan loc1 loc1
2271         setLastToken span 0
2272         return (L span ITeof)
2273     AlexError (AI loc2 buf) ->
2274         reportLexError loc1 loc2 buf "lexical error"
2275     AlexSkip inp2 _ -> do
2276         setInput inp2
2277         lexToken
2278     AlexToken inp2@(AI end buf2) _ t -> do
2279         setInput inp2
2280         let span = mkRealSrcSpan loc1 end
2281         let bytes = byteDiff buf buf2
2282         span `seq` setLastToken span bytes
2283         t span buf bytes
2284
2285 reportLexError :: RealSrcLoc -> RealSrcLoc -> StringBuffer -> [Char] -> P a
2286 reportLexError loc1 loc2 buf str
2287   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
2288   | otherwise =
2289   let
2290         c = fst (nextChar buf)
2291   in
2292   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
2293     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
2294     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
2295
2296 lexTokenStream :: StringBuffer -> RealSrcLoc -> DynFlags -> ParseResult [Located Token]
2297 lexTokenStream buf loc dflags = unP go initState
2298     where dflags' = dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream
2299           initState = mkPState dflags' buf loc
2300           go = do
2301             ltok <- lexer return
2302             case ltok of
2303               L _ ITeof -> return []
2304               _ -> liftM (ltok:) go
2305
2306 linePrags = Map.singleton "line" (begin line_prag2)
2307
2308 fileHeaderPrags = Map.fromList([("options", lex_string_prag IToptions_prag),
2309                                  ("options_ghc", lex_string_prag IToptions_prag),
2310                                  ("options_haddock", lex_string_prag ITdocOptions),
2311                                  ("language", token ITlanguage_prag),
2312                                  ("include", lex_string_prag ITinclude_prag)])
2313
2314 ignoredPrags = Map.fromList (map ignored pragmas)
2315                where ignored opt = (opt, nested_comment lexToken)
2316                      impls = ["hugs", "nhc98", "jhc", "yhc", "catch", "derive"]
2317                      options_pragmas = map ("options_" ++) impls
2318                      -- CFILES is a hugs-only thing.
2319                      pragmas = options_pragmas ++ ["cfiles", "contract"]
2320
2321 oneWordPrags = Map.fromList([("rules", rulePrag),
2322                            ("inline", token (ITinline_prag Inline FunLike)),
2323                            ("inlinable", token (ITinline_prag Inlinable FunLike)),
2324                            ("inlineable", token (ITinline_prag Inlinable FunLike)),
2325                                           -- Spelling variant
2326                            ("notinline", token (ITinline_prag NoInline FunLike)),
2327                            ("specialize", token ITspec_prag),
2328                            ("source", token ITsource_prag),
2329                            ("warning", token ITwarning_prag),
2330                            ("deprecated", token ITdeprecated_prag),
2331                            ("scc", token ITscc_prag),
2332                            ("generated", token ITgenerated_prag),
2333                            ("core", token ITcore_prag),
2334                            ("unpack", token ITunpack_prag),
2335                            ("ann", token ITann_prag),
2336                            ("vectorize", token ITvect_prag),
2337                            ("novectorize", token ITnovect_prag)])
2338
2339 twoWordPrags = Map.fromList([("inline conlike", token (ITinline_prag Inline ConLike)),
2340                              ("notinline conlike", token (ITinline_prag NoInline ConLike)),
2341                              ("specialize inline", token (ITspec_inline_prag True)),
2342                              ("specialize notinline", token (ITspec_inline_prag False)),
2343                              ("vectorize scalar", token ITvect_scalar_prag)])
2344
2345 dispatch_pragmas :: Map String Action -> Action
2346 dispatch_pragmas prags span buf len = case Map.lookup (clean_pragma (lexemeToString buf len)) prags of
2347                                        Just found -> found span buf len
2348                                        Nothing -> lexError "unknown pragma"
2349
2350 known_pragma :: Map String Action -> AlexAccPred Int
2351 known_pragma prags _ _ len (AI _ buf) = (isJust $ Map.lookup (clean_pragma (lexemeToString (offsetBytes (- len) buf) len)) prags)
2352                                           && (nextCharIs buf (\c -> not (isAlphaNum c || c == '_')))
2353
2354 clean_pragma :: String -> String
2355 clean_pragma prag = canon_ws (map toLower (unprefix prag))
2356                     where unprefix prag' = case stripPrefix "{-#" prag' of
2357                                              Just rest -> rest
2358                                              Nothing -> prag'
2359                           canonical prag' = case prag' of
2360                                               "noinline" -> "notinline"
2361                                               "specialise" -> "specialize"
2362                                               "vectorise" -> "vectorize"
2363                                               "novectorise" -> "novectorize"
2364                                               "constructorlike" -> "conlike"
2365                                               _ -> prag'
2366                           canon_ws s = unwords (map canonical (words s))
2367 }