compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end RealSrcLoc
  11 -- attached to it.
  12 --
  13 -----------------------------------------------------------------------------
  14
  15 --   ToDo / known bugs:
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 --   qualified operator NOTES.
  24 --
  25 --   - If M.(+) is a single lexeme, then..
  26 --     - Probably (+) should be a single lexeme too, for consistency.
  27 --       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
  28 --     - But we have to rule out reserved operators, otherwise (..) becomes
  29 --       a different lexeme.
  30 --     - Should we therefore also rule out reserved operators in the qualified
  31 --       form?  This is quite difficult to achieve.  We don't do it for
  32 --       qualified varids.
  33
  34 {
  35 -- XXX The above flags turn off warnings in the generated code:
  36 {-# LANGUAGE BangPatterns #-}
  37 {-# OPTIONS_GHC -fno-warn-unused-matches #-}
  38 {-# OPTIONS_GHC -fno-warn-unused-binds #-}
  39 {-# OPTIONS_GHC -fno-warn-unused-imports #-}
  40 {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
  41 -- But alex still generates some code that causes the "lazy unlifted bindings"
  42 -- warning, and old compilers don't know about it so we can't easily turn
  43 -- it off, so for now we use the sledge hammer:
  44 {-# OPTIONS_GHC -w #-}
  45
  46 {-# OPTIONS_GHC -funbox-strict-fields #-}
  47
  48 module Lexer (
  49    Token(..), lexer, pragState, mkPState, PState(..),
  50    P(..), ParseResult(..), getSrcLoc,
  51    getPState, getDynFlags, withThisPackage,
  52    failLocMsgP, failSpanMsgP, srcParseFail,
  53    getMessages,
  54    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  55    activeContext, nextIsEOF,
  56    getLexState, popLexState, pushLexState,
  57    extension, bangPatEnabled, datatypeContextsEnabled,
  58    addWarning,
  59    incrBracketDepth, decrBracketDepth, getParserBrakDepth,
  60    lexTokenStream
  61   ) where
  62
  63 import Bag
  64 import ErrUtils
  65 import Outputable
  66 import StringBuffer
  67 import FastString
  68 import SrcLoc
  69 import UniqFM
  70 import DynFlags
  71 import Module
  72 import Ctype
  73 import BasicTypes       ( InlineSpec(..), RuleMatchInfo(..), FractionalLit(..) )
  74 import Util             ( readRational )
  75
  76 import Control.Monad
  77 import Data.Bits
  78 import Data.Char
  79 import Data.List
  80 import Data.Maybe
  81 import Data.Map (Map)
  82 import qualified Data.Map as Map
  83 import Data.Ratio
  84 }
  85
  86 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  87 $whitechar   = [\ \n\r\f\v $unispace]
  88 $white_no_nl = $whitechar # \n
  89 $tab         = \t
  90
  91 $ascdigit  = 0-9
  92 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  93 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  94 $digit     = [$ascdigit $unidigit]
  95
  96 $special   = [\(\)\,\;\[\]\`\{\}]
  97 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  98 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  99 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
 100
 101 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
 102 $asclarge  = [A-Z]
 103 $large     = [$asclarge $unilarge]
 104
 105 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
 106 $ascsmall  = [a-z]
 107 $small     = [$ascsmall $unismall \_]
 108
 109 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
 110 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
 111
 112 $octit     = 0-7
 113 $hexit     = [$decdigit A-F a-f]
 114 $symchar   = [$symbol \:]
 115 $nl        = [\n\r]
 116 $idchar    = [$small $large $digit \']
 117
 118 $pragmachar = [$small $large $digit]
 119
 120 $docsym    = [\| \^ \* \$]
 121
 122 @varid     = $small $idchar*
 123 @conid     = $large $idchar*
 124
 125 @varsym    = $symbol $symchar*
 126 @consym    = \: $symchar*
 127
 128 @decimal     = $decdigit+
 129 @octal       = $octit+
 130 @hexadecimal = $hexit+
 131 @exponent    = [eE] [\-\+]? @decimal
 132
 133 -- we support the hierarchical module name extension:
 134 @qual = (@conid \.)+
 135
 136 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 137
 138 -- normal signed numerical literals can only be explicitly negative,
 139 -- not explicitly positive (contrast @exponent)
 140 @negative = \-
 141 @signed = @negative ?
 142
 143 haskell :-
 144
 145 -- everywhere: skip whitespace and comments
 146 $white_no_nl+                           ;
 147 $tab+         { warn Opt_WarnTabs (text "Warning: Tab character") }
 148
 149 -- Everywhere: deal with nested comments.  We explicitly rule out
 150 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 151 -- (this can happen even though pragmas will normally take precedence due to
 152 -- longest-match, because pragmas aren't valid in every state, but comments
 153 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 154 -- set.
 155
 156 "{-" / { isNormalComment } { nested_comment lexToken }
 157
 158 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 159 -- more dashes followed by a symbol should be parsed as a varsym, so we
 160 -- have to exclude those.
 161
 162 -- Since Haddock comments aren't valid in every state, we need to rule them
 163 -- out here.
 164
 165 -- The following two rules match comments that begin with two dashes, but
 166 -- continue with a different character. The rules test that this character
 167 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 168 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 169 -- have a Haddock comment). The rules then munch the rest of the line.
 170
 171 "-- " ~[$docsym \#] .* { lineCommentToken }
 172 "--" [^$symbol : \ ] .* { lineCommentToken }
 173
 174 -- Next, match Haddock comments if no -haddock flag
 175
 176 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
 177
 178 -- Now, when we've matched comments that begin with 2 dashes and continue
 179 -- with a different character, we need to match comments that begin with three
 180 -- or more dashes (which clearly can't be Haddock comments). We only need to
 181 -- make sure that the first non-dash character isn't a symbol, and munch the
 182 -- rest of the line.
 183
 184 "---"\-* [^$symbol :] .* { lineCommentToken }
 185
 186 -- Since the previous rules all match dashes followed by at least one
 187 -- character, we also need to match a whole line filled with just dashes.
 188
 189 "--"\-* / { atEOL } { lineCommentToken }
 190
 191 -- We need this rule since none of the other single line comment rules
 192 -- actually match this case.
 193
 194 "-- " / { atEOL } { lineCommentToken }
 195
 196 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 197 -- blank lines) until we find a non-whitespace character, then do layout
 198 -- processing.
 199 --
 200 -- One slight wibble here: what if the line begins with {-#? In
 201 -- theory, we have to lex the pragma to see if it's one we recognise,
 202 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 203 -- as a nested comment.  We don't bother with this: if the line begins
 204 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 205 <bol> {
 206   \n                                    ;
 207   ^\# (line)?                           { begin line_prag1 }
 208   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 209   ^\# \! .* \n                          ; -- #!, for scripts
 210   ()                                    { do_bol }
 211 }
 212
 213 -- after a layout keyword (let, where, do, of), we begin a new layout
 214 -- context if the curly brace is missing.
 215 -- Careful! This stuff is quite delicate.
 216 <layout, layout_do> {
 217   \{ / { notFollowedBy '-' }            { hopefully_open_brace }
 218         -- we might encounter {-# here, but {- has been handled already
 219   \n                                    ;
 220   ^\# (line)?                           { begin line_prag1 }
 221 }
 222
 223 -- do is treated in a subtly different way, see new_layout_context
 224 <layout>    ()                          { new_layout_context True }
 225 <layout_do> ()                          { new_layout_context False }
 226
 227 -- after a new layout context which was found to be to the left of the
 228 -- previous context, we have generated a '{' token, and we now need to
 229 -- generate a matching '}' token.
 230 <layout_left>  ()                       { do_layout_left }
 231
 232 <0,option_prags> \n                             { begin bol }
 233
 234 "{-#" $whitechar* $pragmachar+ / { known_pragma linePrags }
 235                                 { dispatch_pragmas linePrags }
 236
 237 -- single-line line pragmas, of the form
 238 --    # <line> "<file>" <extra-stuff> \n
 239 <line_prag1> $decdigit+                 { setLine line_prag1a }
 240 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 241 <line_prag1b> .*                        { pop }
 242
 243 -- Haskell-style line pragmas, of the form
 244 --    {-# LINE <line> "<file>" #-}
 245 <line_prag2> $decdigit+                 { setLine line_prag2a }
 246 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 247 <line_prag2b> "#-}"|"-}"                { pop }
 248    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 249    -- with older versions of GHC which generated these.
 250
 251 <0,option_prags> {
 252   "{-#" $whitechar* $pragmachar+
 253         $whitechar+ $pragmachar+ / { known_pragma twoWordPrags }
 254                                  { dispatch_pragmas twoWordPrags }
 255
 256   "{-#" $whitechar* $pragmachar+ / { known_pragma oneWordPrags }
 257                                  { dispatch_pragmas oneWordPrags }
 258
 259   -- We ignore all these pragmas, but don't generate a warning for them
 260   "{-#" $whitechar* $pragmachar+ / { known_pragma ignoredPrags }
 261                                  { dispatch_pragmas ignoredPrags }
 262
 263   -- ToDo: should only be valid inside a pragma:
 264   "#-}"                                 { endPrag }
 265 }
 266
 267 <option_prags> {
 268   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 269                                    { dispatch_pragmas fileHeaderPrags }
 270
 271   "-- #"                                 { multiline_doc_comment }
 272 }
 273
 274 <0> {
 275   -- In the "0" mode we ignore these pragmas
 276   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 277                      { nested_comment lexToken }
 278 }
 279
 280 <0> {
 281   "-- #" .* { lineCommentToken }
 282 }
 283
 284 <0,option_prags> {
 285   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 286                     (nested_comment lexToken) }
 287 }
 288
 289 -- '0' state: ordinary lexemes
 290
 291 -- Haddock comments
 292
 293 <0,option_prags> {
 294   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 295   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 296 }
 297
 298 -- "special" symbols
 299
 300 <0> {
 301   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 302   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 303 }
 304
 305 <0> {
 306   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 307   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 308   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 309   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 310   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 311   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 312   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 313   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 314
 315 -- For backward compatibility, accept the old dollar syntax
 316   "[$" @varid "|"  / { ifExtension qqEnabled }
 317                      { lex_quasiquote_tok }
 318
 319   "[" @varid "|"  / { ifExtension qqEnabled }
 320                      { lex_quasiquote_tok }
 321 }
 322
 323 <0> {
 324   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 325                                         { special IToparenbar }
 326   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 327 }
 328
 329 <0> {
 330   "<[" / { ifExtension hetMetEnabled `alexAndPred` notFollowedBySymbol }
 331                                         { special ITopenBrak }
 332   "]>" / { ifExtension hetMetEnabled }  { special ITcloseBrak }
 333   "~~" / { ifExtension hetMetEnabled }  { special ITescape }
 334   "%%" / { ifExtension hetMetEnabled }  { special ITdoublePercent }
 335   "~~$" / { ifExtension hetMetEnabled }  { special ITescapeDollar }
 336 }
 337
 338 <0> {
 339   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 340 }
 341
 342 <0> {
 343   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 344          { token IToubxparen }
 345   "#)" / { ifExtension unboxedTuplesEnabled }
 346          { token ITcubxparen }
 347 }
 348
 349 <0,option_prags> {
 350   \(                                    { special IToparen }
 351   \)                                    { special ITcparen }
 352   \[                                    { special ITobrack }
 353   \]                                    { special ITcbrack }
 354   \,                                    { special ITcomma }
 355   \;                                    { special ITsemi }
 356   \`                                    { special ITbackquote }
 357
 358   \{                                    { open_brace }
 359   \}                                    { close_brace }
 360 }
 361
 362 <0,option_prags> {
 363   @qual @varid                  { idtoken qvarid }
 364   @qual @conid                  { idtoken qconid }
 365   @varid                        { varid }
 366   @conid                        { idtoken conid }
 367 }
 368
 369 <0> {
 370   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 371   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 372   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 373   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 374 }
 375
 376 -- ToDo: - move `var` and (sym) into lexical syntax?
 377 --       - remove backquote from $special?
 378 <0> {
 379   @qual @varsym                                    { idtoken qvarsym }
 380   @qual @consym                                    { idtoken qconsym }
 381   @varsym                                          { varsym }
 382   @consym                                          { consym }
 383 }
 384
 385 -- For the normal boxed literals we need to be careful
 386 -- when trying to be close to Haskell98
 387 <0> {
 388   -- Normal integral literals (:: Num a => a, from Integer)
 389   @decimal           { tok_num positive 0 0 decimal }
 390   0[oO] @octal       { tok_num positive 2 2 octal }
 391   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 392
 393   -- Normal rational literals (:: Fractional a => a, from Rational)
 394   @floating_point    { strtoken tok_float }
 395 }
 396
 397 <0> {
 398   -- Unboxed ints (:: Int#) and words (:: Word#)
 399   -- It's simpler (and faster?) to give separate cases to the negatives,
 400   -- especially considering octal/hexadecimal prefixes.
 401   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 402   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 403   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 404   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 405   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 406   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 407
 408   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 409   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 410   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 411
 412   -- Unboxed floats and doubles (:: Float#, :: Double#)
 413   -- prim_{float,double} work with signed literals
 414   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 415   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 416 }
 417
 418 -- Strings and chars are lexed by hand-written code.  The reason is
 419 -- that even if we recognise the string or char here in the regex
 420 -- lexer, we would still have to parse the string afterward in order
 421 -- to convert it to a String.
 422 <0> {
 423   \'                            { lex_char_tok }
 424   \"                            { lex_string_tok }
 425 }
 426
 427 {
 428 -- -----------------------------------------------------------------------------
 429 -- The token type
 430
 431 data Token
 432   = ITas                        -- Haskell keywords
 433   | ITcase
 434   | ITclass
 435   | ITdata
 436   | ITdefault
 437   | ITderiving
 438   | ITdo
 439   | ITelse
 440   | IThiding
 441   | ITif
 442   | ITimport
 443   | ITin
 444   | ITinfix
 445   | ITinfixl
 446   | ITinfixr
 447   | ITinstance
 448   | ITlet
 449   | ITmodule
 450   | ITnewtype
 451   | ITof
 452   | ITqualified
 453   | ITthen
 454   | ITtype
 455   | ITwhere
 456   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 457
 458   | ITforall                    -- GHC extension keywords
 459   | ITforeign
 460   | ITexport
 461   | ITlabel
 462   | ITdynamic
 463   | ITsafe
 464   | ITthreadsafe
 465   | ITinterruptible
 466   | ITunsafe
 467   | ITstdcallconv
 468   | ITccallconv
 469   | ITprimcallconv
 470   | ITmdo
 471   | ITfamily
 472   | ITgroup
 473   | ITby
 474   | ITusing
 475
 476         -- Pragmas
 477   | ITinline_prag InlineSpec RuleMatchInfo
 478   | ITspec_prag                 -- SPECIALISE
 479   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 480   | ITsource_prag
 481   | ITrules_prag
 482   | ITwarning_prag
 483   | ITdeprecated_prag
 484   | ITline_prag
 485   | ITscc_prag
 486   | ITgenerated_prag
 487   | ITcore_prag                 -- hdaume: core annotations
 488   | ITunpack_prag
 489   | ITann_prag
 490   | ITclose_prag
 491   | IToptions_prag String
 492   | ITinclude_prag String
 493   | ITlanguage_prag
 494   | ITvect_prag
 495   | ITvect_scalar_prag
 496   | ITnovect_prag
 497
 498   | ITdotdot                    -- reserved symbols
 499   | ITcolon
 500   | ITdcolon
 501   | ITequal
 502   | ITlam
 503   | ITvbar
 504   | ITlarrow
 505   | ITrarrow
 506   | ITat
 507   | ITtilde
 508   | ITdarrow
 509   | ITminus
 510   | ITbang
 511   | ITstar
 512   | ITdot
 513
 514   | ITbiglam                    -- GHC-extension symbols
 515
 516   | ITocurly                    -- special symbols
 517   | ITccurly
 518   | ITocurlybar                 -- {|, for type applications
 519   | ITccurlybar                 -- |}, for type applications
 520   | ITvocurly
 521   | ITvccurly
 522   | ITobrack
 523   | ITopabrack                  -- [:, for parallel arrays with -XParallelArrays
 524   | ITcpabrack                  -- :], for parallel arrays with -XParallelArrays
 525   | ITcbrack
 526   | IToparen
 527   | ITcparen
 528   | IToubxparen
 529   | ITcubxparen
 530   | ITsemi
 531   | ITcomma
 532   | ITunderscore
 533   | ITbackquote
 534
 535   | ITvarid   FastString        -- identifiers
 536   | ITconid   FastString
 537   | ITvarsym  FastString
 538   | ITconsym  FastString
 539   | ITqvarid  (FastString,FastString)
 540   | ITqconid  (FastString,FastString)
 541   | ITqvarsym (FastString,FastString)
 542   | ITqconsym (FastString,FastString)
 543   | ITprefixqvarsym (FastString,FastString)
 544   | ITprefixqconsym (FastString,FastString)
 545
 546   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 547
 548   | ITchar       Char
 549   | ITstring     FastString
 550   | ITinteger    Integer
 551   | ITrational   FractionalLit
 552
 553   | ITprimchar   Char
 554   | ITprimstring FastString
 555   | ITprimint    Integer
 556   | ITprimword   Integer
 557   | ITprimfloat  FractionalLit
 558   | ITprimdouble FractionalLit
 559
 560   -- Template Haskell extension tokens
 561   | ITopenExpQuote              --  [| or [e|
 562   | ITopenPatQuote              --  [p|
 563   | ITopenDecQuote              --  [d|
 564   | ITopenTypQuote              --  [t|
 565   | ITcloseQuote                --  |]
 566   | ITidEscape   FastString     --  $x
 567   | ITparenEscape               --  $(
 568   | ITvarQuote                  --  '
 569   | ITtyQuote                   --  ''
 570   | ITquasiQuote (FastString,FastString,RealSrcSpan) --  [:...|...|]
 571
 572   -- Arrow notation extension
 573   | ITproc
 574   | ITrec
 575   | IToparenbar                 --  (|
 576   | ITcparenbar                 --  |)
 577   | ITlarrowtail                --  -<
 578   | ITrarrowtail                --  >-
 579   | ITLarrowtail                --  -<<
 580   | ITRarrowtail                --  >>-
 581
 582   -- Heterogeneous Metaprogramming extension
 583   | ITopenBrak                  --  <[
 584   | ITcloseBrak                 --  ]>
 585   | ITescape                    --  ~~
 586   | ITescapeDollar              --  ~~$
 587   | ITdoublePercent             --  %%
 588
 589   | ITunknown String            -- Used when the lexer can't make sense of it
 590   | ITeof                       -- end of file token
 591
 592   -- Documentation annotations
 593   | ITdocCommentNext  String     -- something beginning '-- |'
 594   | ITdocCommentPrev  String     -- something beginning '-- ^'
 595   | ITdocCommentNamed String     -- something beginning '-- $'
 596   | ITdocSection      Int String -- a section heading
 597   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 598   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 599   | ITlineComment     String     -- comment starting by "--"
 600   | ITblockComment    String     -- comment in {- -}
 601
 602 #ifdef DEBUG
 603   deriving Show -- debugging
 604 #endif
 605
 606 {-
 607 isSpecial :: Token -> Bool
 608 -- If we see M.x, where x is a keyword, but
 609 -- is special, we treat is as just plain M.x,
 610 -- not as a keyword.
 611 isSpecial ITas          = True
 612 isSpecial IThiding      = True
 613 isSpecial ITqualified   = True
 614 isSpecial ITforall      = True
 615 isSpecial ITexport      = True
 616 isSpecial ITlabel       = True
 617 isSpecial ITdynamic     = True
 618 isSpecial ITsafe        = True
 619 isSpecial ITthreadsafe  = True
 620 isSpecial ITinterruptible = True
 621 isSpecial ITunsafe      = True
 622 isSpecial ITccallconv   = True
 623 isSpecial ITstdcallconv = True
 624 isSpecial ITprimcallconv = True
 625 isSpecial ITmdo         = True
 626 isSpecial ITfamily      = True
 627 isSpecial ITgroup   = True
 628 isSpecial ITby      = True
 629 isSpecial ITusing   = True
 630 isSpecial _             = False
 631 -}
 632
 633 -- the bitmap provided as the third component indicates whether the
 634 -- corresponding extension keyword is valid under the extension options
 635 -- provided to the compiler; if the extension corresponding to *any* of the
 636 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 637 -- facilitates using a keyword in two different extensions that can be
 638 -- activated independently)
 639 --
 640 reservedWordsFM :: UniqFM (Token, Int)
 641 reservedWordsFM = listToUFM $
 642         map (\(x, y, z) -> (mkFastString x, (y, z)))
 643        [( "_",          ITunderscore,   0 ),
 644         ( "as",         ITas,           0 ),
 645         ( "case",       ITcase,         0 ),
 646         ( "class",      ITclass,        0 ),
 647         ( "data",       ITdata,         0 ),
 648         ( "default",    ITdefault,      0 ),
 649         ( "deriving",   ITderiving,     0 ),
 650         ( "do",         ITdo,           0 ),
 651         ( "else",       ITelse,         0 ),
 652         ( "hiding",     IThiding,       0 ),
 653         ( "if",         ITif,           0 ),
 654         ( "import",     ITimport,       0 ),
 655         ( "in",         ITin,           0 ),
 656         ( "infix",      ITinfix,        0 ),
 657         ( "infixl",     ITinfixl,       0 ),
 658         ( "infixr",     ITinfixr,       0 ),
 659         ( "instance",   ITinstance,     0 ),
 660         ( "let",        ITlet,          0 ),
 661         ( "module",     ITmodule,       0 ),
 662         ( "newtype",    ITnewtype,      0 ),
 663         ( "of",         ITof,           0 ),
 664         ( "qualified",  ITqualified,    0 ),
 665         ( "then",       ITthen,         0 ),
 666         ( "type",       ITtype,         0 ),
 667         ( "where",      ITwhere,        0 ),
 668         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 669
 670     ( "forall", ITforall,        bit explicitForallBit .|. bit inRulePragBit),
 671         ( "mdo",        ITmdo,           bit recursiveDoBit),
 672         ( "family",     ITfamily,        bit tyFamBit),
 673     ( "group",  ITgroup,     bit transformComprehensionsBit),
 674     ( "by",     ITby,        bit transformComprehensionsBit),
 675     ( "using",  ITusing,     bit transformComprehensionsBit),
 676
 677         ( "foreign",    ITforeign,       bit ffiBit),
 678         ( "export",     ITexport,        bit ffiBit),
 679         ( "label",      ITlabel,         bit ffiBit),
 680         ( "dynamic",    ITdynamic,       bit ffiBit),
 681         ( "safe",       ITsafe,          bit ffiBit),
 682         ( "threadsafe", ITthreadsafe,    bit ffiBit),  -- ToDo: remove
 683         ( "interruptible", ITinterruptible, bit ffiBit),
 684         ( "unsafe",     ITunsafe,        bit ffiBit),
 685         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 686         ( "ccall",      ITccallconv,     bit ffiBit),
 687         ( "prim",       ITprimcallconv,  bit ffiBit),
 688
 689         ( "rec",        ITrec,           bit recBit),
 690         ( "proc",       ITproc,          bit arrowsBit)
 691      ]
 692
 693 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 694 reservedSymsFM = listToUFM $
 695     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 696       [ ("..",  ITdotdot,   always)
 697         -- (:) is a reserved op, meaning only list cons
 698        ,(":",   ITcolon,    always)
 699        ,("::",  ITdcolon,   always)
 700        ,("=",   ITequal,    always)
 701        ,("\\",  ITlam,      always)
 702        ,("|",   ITvbar,     always)
 703        ,("<-",  ITlarrow,   always)
 704        ,("->",  ITrarrow,   always)
 705        ,("@",   ITat,       always)
 706        ,("~",   ITtilde,    always)
 707        ,("=>",  ITdarrow,   always)
 708        ,("-",   ITminus,    always)
 709        ,("!",   ITbang,     always)
 710
 711         -- For data T (a::*) = MkT
 712        ,("*", ITstar, always) -- \i -> kindSigsEnabled i || tyFamEnabled i)
 713         -- For 'forall a . t'
 714        ,(".", ITdot,  always) -- \i -> explicitForallEnabled i || inRulePrag i)
 715
 716        ,("-<",  ITlarrowtail, arrowsEnabled)
 717        ,(">-",  ITrarrowtail, arrowsEnabled)
 718        ,("-<<", ITLarrowtail, arrowsEnabled)
 719        ,(">>-", ITRarrowtail, arrowsEnabled)
 720
 721        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 722        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 723        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 724                                 explicitForallEnabled i)
 725        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 726        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 727
 728        ,("⤙",   ITlarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 729        ,("⤚",   ITrarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 730        ,("⤛",   ITLarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 731        ,("⤜",   ITRarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 732
 733        ,("★", ITstar, unicodeSyntaxEnabled)
 734
 735         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 736         -- form part of a large operator.  This would let us have a better
 737         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 738        ]
 739
 740 -- -----------------------------------------------------------------------------
 741 -- Lexer actions
 742
 743 type Action = RealSrcSpan -> StringBuffer -> Int -> P (RealLocated Token)
 744
 745 special :: Token -> Action
 746 special tok span _buf _len = return (L span tok)
 747
 748 token, layout_token :: Token -> Action
 749 token t span _buf _len = return (L span t)
 750 layout_token t span _buf _len = pushLexState layout >> return (L span t)
 751
 752 idtoken :: (StringBuffer -> Int -> Token) -> Action
 753 idtoken f span buf len = return (L span $! (f buf len))
 754
 755 skip_one_varid :: (FastString -> Token) -> Action
 756 skip_one_varid f span buf len
 757   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 758
 759 strtoken :: (String -> Token) -> Action
 760 strtoken f span buf len =
 761   return (L span $! (f $! lexemeToString buf len))
 762
 763 init_strtoken :: Int -> (String -> Token) -> Action
 764 -- like strtoken, but drops the last N character(s)
 765 init_strtoken drop f span buf len =
 766   return (L span $! (f $! lexemeToString buf (len-drop)))
 767
 768 begin :: Int -> Action
 769 begin code _span _str _len = do pushLexState code; lexToken
 770
 771 pop :: Action
 772 pop _span _buf _len = do _ <- popLexState
 773                          lexToken
 774
 775 hopefully_open_brace :: Action
 776 hopefully_open_brace span buf len
 777  = do relaxed <- extension relaxedLayout
 778       ctx <- getContext
 779       (AI l _) <- getInput
 780       let offset = srcLocCol l
 781           isOK = relaxed ||
 782                  case ctx of
 783                  Layout prev_off : _ -> prev_off < offset
 784                  _                   -> True
 785       if isOK then pop_and open_brace span buf len
 786               else failSpanMsgP (RealSrcSpan span) (text "Missing block")
 787
 788 pop_and :: Action -> Action
 789 pop_and act span buf len = do _ <- popLexState
 790                               act span buf len
 791
 792 {-# INLINE nextCharIs #-}
 793 nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
 794 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 795
 796 notFollowedBy :: Char -> AlexAccPred Int
 797 notFollowedBy char _ _ _ (AI _ buf)
 798   = nextCharIs buf (/=char)
 799
 800 notFollowedBySymbol :: AlexAccPred Int
 801 notFollowedBySymbol _ _ _ (AI _ buf)
 802   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 803
 804 -- We must reject doc comments as being ordinary comments everywhere.
 805 -- In some cases the doc comment will be selected as the lexeme due to
 806 -- maximal munch, but not always, because the nested comment rule is
 807 -- valid in all states, but the doc-comment rules are only valid in
 808 -- the non-layout states.
 809 isNormalComment :: AlexAccPred Int
 810 isNormalComment bits _ _ (AI _ buf)
 811   | haddockEnabled bits = notFollowedByDocOrPragma
 812   | otherwise           = nextCharIs buf (/='#')
 813   where
 814     notFollowedByDocOrPragma
 815        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 816
 817 spaceAndP :: StringBuffer -> (StringBuffer -> Bool) -> Bool
 818 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 819
 820 {-
 821 haddockDisabledAnd p bits _ _ (AI _ buf)
 822   = if haddockEnabled bits then False else (p buf)
 823 -}
 824
 825 atEOL :: AlexAccPred Int
 826 atEOL _ _ _ (AI _ buf) = atEnd buf || currentChar buf == '\n'
 827
 828 ifExtension :: (Int -> Bool) -> AlexAccPred Int
 829 ifExtension pred bits _ _ _ = pred bits
 830
 831 multiline_doc_comment :: Action
 832 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 833   where
 834     worker commentAcc input docType oneLine = case alexGetChar input of
 835       Just ('\n', input')
 836         | oneLine -> docCommentEnd input commentAcc docType buf span
 837         | otherwise -> case checkIfCommentLine input' of
 838           Just input -> worker ('\n':commentAcc) input docType False
 839           Nothing -> docCommentEnd input commentAcc docType buf span
 840       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 841       Nothing -> docCommentEnd input commentAcc docType buf span
 842
 843     checkIfCommentLine input = check (dropNonNewlineSpace input)
 844       where
 845         check input = case alexGetChar input of
 846           Just ('-', input) -> case alexGetChar input of
 847             Just ('-', input) -> case alexGetChar input of
 848               Just (c, _) | c /= '-' -> Just input
 849               _ -> Nothing
 850             _ -> Nothing
 851           _ -> Nothing
 852
 853         dropNonNewlineSpace input = case alexGetChar input of
 854           Just (c, input')
 855             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 856             | otherwise -> input
 857           Nothing -> input
 858
 859 lineCommentToken :: Action
 860 lineCommentToken span buf len = do
 861   b <- extension rawTokenStreamEnabled
 862   if b then strtoken ITlineComment span buf len else lexToken
 863
 864 {-
 865   nested comments require traversing by hand, they can't be parsed
 866   using regular expressions.
 867 -}
 868 nested_comment :: P (RealLocated Token) -> Action
 869 nested_comment cont span _str _len = do
 870   input <- getInput
 871   go "" (1::Int) input
 872   where
 873     go commentAcc 0 input = do setInput input
 874                                b <- extension rawTokenStreamEnabled
 875                                if b
 876                                  then docCommentEnd input commentAcc ITblockComment _str span
 877                                  else cont
 878     go commentAcc n input = case alexGetChar input of
 879       Nothing -> errBrace input span
 880       Just ('-',input) -> case alexGetChar input of
 881         Nothing  -> errBrace input span
 882         Just ('\125',input) -> go commentAcc (n-1) input
 883         Just (_,_)          -> go ('-':commentAcc) n input
 884       Just ('\123',input) -> case alexGetChar input of
 885         Nothing  -> errBrace input span
 886         Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
 887         Just (_,_)       -> go ('\123':commentAcc) n input
 888       Just (c,input) -> go (c:commentAcc) n input
 889
 890 nested_doc_comment :: Action
 891 nested_doc_comment span buf _len = withLexedDocType (go "")
 892   where
 893     go commentAcc input docType _ = case alexGetChar input of
 894       Nothing -> errBrace input span
 895       Just ('-',input) -> case alexGetChar input of
 896         Nothing -> errBrace input span
 897         Just ('\125',input) ->
 898           docCommentEnd input commentAcc docType buf span
 899         Just (_,_) -> go ('-':commentAcc) input docType False
 900       Just ('\123', input) -> case alexGetChar input of
 901         Nothing  -> errBrace input span
 902         Just ('-',input) -> do
 903           setInput input
 904           let cont = do input <- getInput; go commentAcc input docType False
 905           nested_comment cont span buf _len
 906         Just (_,_) -> go ('\123':commentAcc) input docType False
 907       Just (c,input) -> go (c:commentAcc) input docType False
 908
 909 withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (RealLocated Token))
 910                  -> P (RealLocated Token)
 911 withLexedDocType lexDocComment = do
 912   input@(AI _ buf) <- getInput
 913   case prevChar buf ' ' of
 914     '|' -> lexDocComment input ITdocCommentNext False
 915     '^' -> lexDocComment input ITdocCommentPrev False
 916     '$' -> lexDocComment input ITdocCommentNamed False
 917     '*' -> lexDocSection 1 input
 918     '#' -> lexDocComment input ITdocOptionsOld False
 919     _ -> panic "withLexedDocType: Bad doc type"
 920  where
 921     lexDocSection n input = case alexGetChar input of
 922       Just ('*', input) -> lexDocSection (n+1) input
 923       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 924       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 925
 926 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 927 -- off again at the end of the pragma.
 928 rulePrag :: Action
 929 rulePrag span _buf _len = do
 930   setExts (.|. bit inRulePragBit)
 931   return (L span ITrules_prag)
 932
 933 endPrag :: Action
 934 endPrag span _buf _len = do
 935   setExts (.&. complement (bit inRulePragBit))
 936   return (L span ITclose_prag)
 937
 938 -- docCommentEnd
 939 -------------------------------------------------------------------------------
 940 -- This function is quite tricky. We can't just return a new token, we also
 941 -- need to update the state of the parser. Why? Because the token is longer
 942 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 943 -- it writes the wrong token length to the parser state. This function is
 944 -- called afterwards, so it can just update the state.
 945
 946 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 947                  RealSrcSpan -> P (RealLocated Token)
 948 docCommentEnd input commentAcc docType buf span = do
 949   setInput input
 950   let (AI loc nextBuf) = input
 951       comment = reverse commentAcc
 952       span' = mkRealSrcSpan (realSrcSpanStart span) loc
 953       last_len = byteDiff buf nextBuf
 954
 955   span `seq` setLastToken span' last_len
 956   return (L span' (docType comment))
 957
 958 errBrace :: AlexInput -> RealSrcSpan -> P a
 959 errBrace (AI end _) span = failLocMsgP (realSrcSpanStart span) end "unterminated `{-'"
 960
 961 open_brace, close_brace :: Action
 962 open_brace span _str _len = do
 963   ctx <- getContext
 964   setContext (NoLayout:ctx)
 965   return (L span ITocurly)
 966 close_brace span _str _len = do
 967   popContext
 968   return (L span ITccurly)
 969
 970 qvarid, qconid :: StringBuffer -> Int -> Token
 971 qvarid buf len = ITqvarid $! splitQualName buf len False
 972 qconid buf len = ITqconid $! splitQualName buf len False
 973
 974 splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
 975 -- takes a StringBuffer and a length, and returns the module name
 976 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 977 -- because of hierarchical module names.
 978 splitQualName orig_buf len parens = split orig_buf orig_buf
 979   where
 980     split buf dot_buf
 981         | orig_buf `byteDiff` buf >= len  = done dot_buf
 982         | c == '.'                        = found_dot buf'
 983         | otherwise                       = split buf' dot_buf
 984       where
 985        (c,buf') = nextChar buf
 986
 987     -- careful, we might get names like M....
 988     -- so, if the character after the dot is not upper-case, this is
 989     -- the end of the qualifier part.
 990     found_dot buf -- buf points after the '.'
 991         | isUpper c    = split buf' buf
 992         | otherwise    = done buf
 993       where
 994        (c,buf') = nextChar buf
 995
 996     done dot_buf =
 997         (lexemeToFastString orig_buf (qual_size - 1),
 998          if parens -- Prelude.(+)
 999             then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
1000             else lexemeToFastString dot_buf (len - qual_size))
1001       where
1002         qual_size = orig_buf `byteDiff` dot_buf
1003
1004 varid :: Action
1005 varid span buf len =
1006   fs `seq`
1007   case lookupUFM reservedWordsFM fs of
1008         Just (keyword,0)    -> do
1009                 maybe_layout keyword
1010                 return (L span keyword)
1011         Just (keyword,exts) -> do
1012                 b <- extension (\i -> exts .&. i /= 0)
1013                 if b then do maybe_layout keyword
1014                              return (L span keyword)
1015                      else return (L span (ITvarid fs))
1016         _other -> return (L span (ITvarid fs))
1017   where
1018         fs = lexemeToFastString buf len
1019
1020 conid :: StringBuffer -> Int -> Token
1021 conid buf len = ITconid fs
1022   where fs = lexemeToFastString buf len
1023
1024 qvarsym, qconsym, prefixqvarsym, prefixqconsym :: StringBuffer -> Int -> Token
1025 qvarsym buf len = ITqvarsym $! splitQualName buf len False
1026 qconsym buf len = ITqconsym $! splitQualName buf len False
1027 prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
1028 prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
1029
1030 varsym, consym :: Action
1031 varsym = sym ITvarsym
1032 consym = sym ITconsym
1033
1034 sym :: (FastString -> Token) -> RealSrcSpan -> StringBuffer -> Int
1035     -> P (RealLocated Token)
1036 sym con span buf len =
1037   case lookupUFM reservedSymsFM fs of
1038         Just (keyword,exts) -> do
1039                 b <- extension exts
1040                 if b then return (L span keyword)
1041                      else return (L span $! con fs)
1042         _other -> return (L span $! con fs)
1043   where
1044         fs = lexemeToFastString buf len
1045
1046 -- Variations on the integral numeric literal.
1047 tok_integral :: (Integer -> Token)
1048      -> (Integer -> Integer)
1049  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
1050      -> Int -> Int
1051      -> (Integer, (Char->Int)) -> Action
1052 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
1053   return $ L span $ itint $! transint $ parseUnsignedInteger
1054      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
1055
1056 -- some conveniences for use with tok_integral
1057 tok_num :: (Integer -> Integer)
1058         -> Int -> Int
1059         -> (Integer, (Char->Int)) -> Action
1060 tok_num = tok_integral ITinteger
1061 tok_primint :: (Integer -> Integer)
1062             -> Int -> Int
1063             -> (Integer, (Char->Int)) -> Action
1064 tok_primint = tok_integral ITprimint
1065 tok_primword :: Int -> Int
1066              -> (Integer, (Char->Int)) -> Action
1067 tok_primword = tok_integral ITprimword positive
1068 positive, negative :: (Integer -> Integer)
1069 positive = id
1070 negative = negate
1071 decimal, octal, hexadecimal :: (Integer, Char -> Int)
1072 decimal = (10,octDecDigit)
1073 octal = (8,octDecDigit)
1074 hexadecimal = (16,hexDigit)
1075
1076 -- readRational can understand negative rationals, exponents, everything.
1077 tok_float, tok_primfloat, tok_primdouble :: String -> Token
1078 tok_float        str = ITrational   $! readFractionalLit str
1079 tok_primfloat    str = ITprimfloat  $! readFractionalLit str
1080 tok_primdouble   str = ITprimdouble $! readFractionalLit str
1081
1082 readFractionalLit :: String -> FractionalLit
1083 readFractionalLit str = (FL $! str) $! readRational str
1084
1085 -- -----------------------------------------------------------------------------
1086 -- Layout processing
1087
1088 -- we're at the first token on a line, insert layout tokens if necessary
1089 do_bol :: Action
1090 do_bol span _str _len = do
1091         pos <- getOffside
1092         case pos of
1093             LT -> do
1094                 --trace "layout: inserting '}'" $ do
1095                 popContext
1096                 -- do NOT pop the lex state, we might have a ';' to insert
1097                 return (L span ITvccurly)
1098             EQ -> do
1099                 --trace "layout: inserting ';'" $ do
1100                 _ <- popLexState
1101                 return (L span ITsemi)
1102             GT -> do
1103                 _ <- popLexState
1104                 lexToken
1105
1106 -- certain keywords put us in the "layout" state, where we might
1107 -- add an opening curly brace.
1108 maybe_layout :: Token -> P ()
1109 maybe_layout t = do -- If the alternative layout rule is enabled then
1110                     -- we never create an implicit layout context here.
1111                     -- Layout is handled XXX instead.
1112                     -- The code for closing implicit contexts, or
1113                     -- inserting implicit semi-colons, is therefore
1114                     -- irrelevant as it only applies in an implicit
1115                     -- context.
1116                     alr <- extension alternativeLayoutRule
1117                     unless alr $ f t
1118     where f ITdo    = pushLexState layout_do
1119           f ITmdo   = pushLexState layout_do
1120           f ITof    = pushLexState layout
1121           f ITlet   = pushLexState layout
1122           f ITwhere = pushLexState layout
1123           f ITrec   = pushLexState layout
1124           f _       = return ()
1125
1126 -- Pushing a new implicit layout context.  If the indentation of the
1127 -- next token is not greater than the previous layout context, then
1128 -- Haskell 98 says that the new layout context should be empty; that is
1129 -- the lexer must generate {}.
1130 --
1131 -- We are slightly more lenient than this: when the new context is started
1132 -- by a 'do', then we allow the new context to be at the same indentation as
1133 -- the previous context.  This is what the 'strict' argument is for.
1134 --
1135 new_layout_context :: Bool -> Action
1136 new_layout_context strict span _buf _len = do
1137     _ <- popLexState
1138     (AI l _) <- getInput
1139     let offset = srcLocCol l
1140     ctx <- getContext
1141     nondecreasing <- extension nondecreasingIndentation
1142     let strict' = strict || not nondecreasing
1143     case ctx of
1144         Layout prev_off : _  |
1145            (strict'     && prev_off >= offset  ||
1146             not strict' && prev_off > offset) -> do
1147                 -- token is indented to the left of the previous context.
1148                 -- we must generate a {} sequence now.
1149                 pushLexState layout_left
1150                 return (L span ITvocurly)
1151         _ -> do
1152                 setContext (Layout offset : ctx)
1153                 return (L span ITvocurly)
1154
1155 do_layout_left :: Action
1156 do_layout_left span _buf _len = do
1157     _ <- popLexState
1158     pushLexState bol  -- we must be at the start of a line
1159     return (L span ITvccurly)
1160
1161 -- -----------------------------------------------------------------------------
1162 -- LINE pragmas
1163
1164 setLine :: Int -> Action
1165 setLine code span buf len = do
1166   let line = parseUnsignedInteger buf len 10 octDecDigit
1167   setSrcLoc (mkRealSrcLoc (srcSpanFile span) (fromIntegral line - 1) 1)
1168         -- subtract one: the line number refers to the *following* line
1169   _ <- popLexState
1170   pushLexState code
1171   lexToken
1172
1173 setFile :: Int -> Action
1174 setFile code span buf len = do
1175   let file = lexemeToFastString (stepOn buf) (len-2)
1176   setAlrLastLoc $ alrInitialLoc file
1177   setSrcLoc (mkRealSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1178   _ <- popLexState
1179   pushLexState code
1180   lexToken
1181
1182 alrInitialLoc :: FastString -> RealSrcSpan
1183 alrInitialLoc file = mkRealSrcSpan loc loc
1184     where -- This is a hack to ensure that the first line in a file
1185           -- looks like it is after the initial location:
1186           loc = mkRealSrcLoc file (-1) (-1)
1187
1188 -- -----------------------------------------------------------------------------
1189 -- Options, includes and language pragmas.
1190
1191 lex_string_prag :: (String -> Token) -> Action
1192 lex_string_prag mkTok span _buf _len
1193     = do input <- getInput
1194          start <- getSrcLoc
1195          tok <- go [] input
1196          end <- getSrcLoc
1197          return (L (mkRealSrcSpan start end) tok)
1198     where go acc input
1199               = if isString input "#-}"
1200                    then do setInput input
1201                            return (mkTok (reverse acc))
1202                    else case alexGetChar input of
1203                           Just (c,i) -> go (c:acc) i
1204                           Nothing -> err input
1205           isString _ [] = True
1206           isString i (x:xs)
1207               = case alexGetChar i of
1208                   Just (c,i') | c == x    -> isString i' xs
1209                   _other -> False
1210           err (AI end _) = failLocMsgP (realSrcSpanStart span) end "unterminated options pragma"
1211
1212
1213 -- -----------------------------------------------------------------------------
1214 -- Strings & Chars
1215
1216 -- This stuff is horrible.  I hates it.
1217
1218 lex_string_tok :: Action
1219 lex_string_tok span _buf _len = do
1220   tok <- lex_string ""
1221   end <- getSrcLoc
1222   return (L (mkRealSrcSpan (realSrcSpanStart span) end) tok)
1223
1224 lex_string :: String -> P Token
1225 lex_string s = do
1226   i <- getInput
1227   case alexGetChar' i of
1228     Nothing -> lit_error i
1229
1230     Just ('"',i)  -> do
1231         setInput i
1232         magicHash <- extension magicHashEnabled
1233         if magicHash
1234           then do
1235             i <- getInput
1236             case alexGetChar' i of
1237               Just ('#',i) -> do
1238                    setInput i
1239                    if any (> '\xFF') s
1240                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1241                     else let s' = mkZFastString (reverse s) in
1242                          return (ITprimstring s')
1243                         -- mkZFastString is a hack to avoid encoding the
1244                         -- string in UTF-8.  We just want the exact bytes.
1245               _other ->
1246                 return (ITstring (mkFastString (reverse s)))
1247           else
1248                 return (ITstring (mkFastString (reverse s)))
1249
1250     Just ('\\',i)
1251         | Just ('&',i) <- next -> do
1252                 setInput i; lex_string s
1253         | Just (c,i) <- next, c <= '\x7f' && is_space c -> do
1254                            -- is_space only works for <= '\x7f' (#3751)
1255                 setInput i; lex_stringgap s
1256         where next = alexGetChar' i
1257
1258     Just (c, i1) -> do
1259         case c of
1260           '\\' -> do setInput i1; c' <- lex_escape; lex_string (c':s)
1261           c | isAny c -> do setInput i1; lex_string (c:s)
1262           _other -> lit_error i
1263
1264 lex_stringgap :: String -> P Token
1265 lex_stringgap s = do
1266   i <- getInput
1267   c <- getCharOrFail i
1268   case c of
1269     '\\' -> lex_string s
1270     c | is_space c -> lex_stringgap s
1271     _other -> lit_error i
1272
1273
1274 lex_char_tok :: Action
1275 -- Here we are basically parsing character literals, such as 'x' or '\n'
1276 -- but, when Template Haskell is on, we additionally spot
1277 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1278 -- but WITHOUT CONSUMING the x or T part  (the parser does that).
1279 -- So we have to do two characters of lookahead: when we see 'x we need to
1280 -- see if there's a trailing quote
1281 lex_char_tok span _buf _len = do        -- We've seen '
1282    i1 <- getInput       -- Look ahead to first character
1283    let loc = realSrcSpanStart span
1284    case alexGetChar' i1 of
1285         Nothing -> lit_error  i1
1286
1287         Just ('\'', i2@(AI end2 _)) -> do       -- We've seen ''
1288                   th_exts <- extension thEnabled
1289                   if th_exts then do
1290                         setInput i2
1291                         return (L (mkRealSrcSpan loc end2)  ITtyQuote)
1292                    else lit_error i1
1293
1294         Just ('\\', i2@(AI _end2 _)) -> do      -- We've seen 'backslash
1295                   setInput i2
1296                   lit_ch <- lex_escape
1297                   i3 <- getInput
1298                   mc <- getCharOrFail i3 -- Trailing quote
1299                   if mc == '\'' then finish_char_tok loc lit_ch
1300                                 else lit_error i3
1301
1302         Just (c, i2@(AI _end2 _))
1303                 | not (isAny c) -> lit_error i1
1304                 | otherwise ->
1305
1306                 -- We've seen 'x, where x is a valid character
1307                 --  (i.e. not newline etc) but not a quote or backslash
1308            case alexGetChar' i2 of      -- Look ahead one more character
1309                 Just ('\'', i3) -> do   -- We've seen 'x'
1310                         setInput i3
1311                         finish_char_tok loc c
1312                 _other -> do            -- We've seen 'x not followed by quote
1313                                         -- (including the possibility of EOF)
1314                                         -- If TH is on, just parse the quote only
1315                         th_exts <- extension thEnabled
1316                         let (AI end _) = i1
1317                         if th_exts then return (L (mkRealSrcSpan loc end) ITvarQuote)
1318                                    else lit_error i2
1319
1320 finish_char_tok :: RealSrcLoc -> Char -> P (RealLocated Token)
1321 finish_char_tok loc ch  -- We've already seen the closing quote
1322                         -- Just need to check for trailing #
1323   = do  magicHash <- extension magicHashEnabled
1324         i@(AI end _) <- getInput
1325         if magicHash then do
1326                 case alexGetChar' i of
1327                         Just ('#',i@(AI end _)) -> do
1328                                 setInput i
1329                                 return (L (mkRealSrcSpan loc end) (ITprimchar ch))
1330                         _other ->
1331                                 return (L (mkRealSrcSpan loc end) (ITchar ch))
1332             else do
1333                    return (L (mkRealSrcSpan loc end) (ITchar ch))
1334
1335 isAny :: Char -> Bool
1336 isAny c | c > '\x7f' = isPrint c
1337         | otherwise  = is_any c
1338
1339 lex_escape :: P Char
1340 lex_escape = do
1341   i0 <- getInput
1342   c <- getCharOrFail i0
1343   case c of
1344         'a'   -> return '\a'
1345         'b'   -> return '\b'
1346         'f'   -> return '\f'
1347         'n'   -> return '\n'
1348         'r'   -> return '\r'
1349         't'   -> return '\t'
1350         'v'   -> return '\v'
1351         '\\'  -> return '\\'
1352         '"'   -> return '\"'
1353         '\''  -> return '\''
1354         '^'   -> do i1 <- getInput
1355                     c <- getCharOrFail i1
1356                     if c >= '@' && c <= '_'
1357                         then return (chr (ord c - ord '@'))
1358                         else lit_error i1
1359
1360         'x'   -> readNum is_hexdigit 16 hexDigit
1361         'o'   -> readNum is_octdigit  8 octDecDigit
1362         x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1363
1364         c1 ->  do
1365            i <- getInput
1366            case alexGetChar' i of
1367             Nothing -> lit_error i0
1368             Just (c2,i2) ->
1369               case alexGetChar' i2 of
1370                 Nothing -> do lit_error i0
1371                 Just (c3,i3) ->
1372                    let str = [c1,c2,c3] in
1373                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1374                                      Just rest <- [stripPrefix p str] ] of
1375                           (escape_char,[]):_ -> do
1376                                 setInput i3
1377                                 return escape_char
1378                           (escape_char,_:_):_ -> do
1379                                 setInput i2
1380                                 return escape_char
1381                           [] -> lit_error i0
1382
1383 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1384 readNum is_digit base conv = do
1385   i <- getInput
1386   c <- getCharOrFail i
1387   if is_digit c
1388         then readNum2 is_digit base conv (conv c)
1389         else lit_error i
1390
1391 readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char
1392 readNum2 is_digit base conv i = do
1393   input <- getInput
1394   read i input
1395   where read i input = do
1396           case alexGetChar' input of
1397             Just (c,input') | is_digit c -> do
1398                let i' = i*base + conv c
1399                if i' > 0x10ffff
1400                   then setInput input >> lexError "numeric escape sequence out of range"
1401                   else read i' input'
1402             _other -> do
1403               setInput input; return (chr i)
1404
1405
1406 silly_escape_chars :: [(String, Char)]
1407 silly_escape_chars = [
1408         ("NUL", '\NUL'),
1409         ("SOH", '\SOH'),
1410         ("STX", '\STX'),
1411         ("ETX", '\ETX'),
1412         ("EOT", '\EOT'),
1413         ("ENQ", '\ENQ'),
1414         ("ACK", '\ACK'),
1415         ("BEL", '\BEL'),
1416         ("BS", '\BS'),
1417         ("HT", '\HT'),
1418         ("LF", '\LF'),
1419         ("VT", '\VT'),
1420         ("FF", '\FF'),
1421         ("CR", '\CR'),
1422         ("SO", '\SO'),
1423         ("SI", '\SI'),
1424         ("DLE", '\DLE'),
1425         ("DC1", '\DC1'),
1426         ("DC2", '\DC2'),
1427         ("DC3", '\DC3'),
1428         ("DC4", '\DC4'),
1429         ("NAK", '\NAK'),
1430         ("SYN", '\SYN'),
1431         ("ETB", '\ETB'),
1432         ("CAN", '\CAN'),
1433         ("EM", '\EM'),
1434         ("SUB", '\SUB'),
1435         ("ESC", '\ESC'),
1436         ("FS", '\FS'),
1437         ("GS", '\GS'),
1438         ("RS", '\RS'),
1439         ("US", '\US'),
1440         ("SP", '\SP'),
1441         ("DEL", '\DEL')
1442         ]
1443
1444 -- before calling lit_error, ensure that the current input is pointing to
1445 -- the position of the error in the buffer.  This is so that we can report
1446 -- a correct location to the user, but also so we can detect UTF-8 decoding
1447 -- errors if they occur.
1448 lit_error :: AlexInput -> P a
1449 lit_error i = do setInput i; lexError "lexical error in string/character literal"
1450
1451 getCharOrFail :: AlexInput -> P Char
1452 getCharOrFail i =  do
1453   case alexGetChar' i of
1454         Nothing -> lexError "unexpected end-of-file in string/character literal"
1455         Just (c,i)  -> do setInput i; return c
1456
1457 -- -----------------------------------------------------------------------------
1458 -- QuasiQuote
1459
1460 lex_quasiquote_tok :: Action
1461 lex_quasiquote_tok span buf len = do
1462   let quoter = tail (lexemeToString buf (len - 1))
1463                 -- 'tail' drops the initial '[',
1464                 -- while the -1 drops the trailing '|'
1465   quoteStart <- getSrcLoc
1466   quote <- lex_quasiquote ""
1467   end <- getSrcLoc
1468   return (L (mkRealSrcSpan (realSrcSpanStart span) end)
1469            (ITquasiQuote (mkFastString quoter,
1470                           mkFastString (reverse quote),
1471                           mkRealSrcSpan quoteStart end)))
1472
1473 lex_quasiquote :: String -> P String
1474 lex_quasiquote s = do
1475   i <- getInput
1476   case alexGetChar' i of
1477     Nothing -> lit_error i
1478
1479     Just ('\\',i)
1480         | Just ('|',i) <- next -> do
1481                 setInput i; lex_quasiquote ('|' : s)
1482         | Just (']',i) <- next -> do
1483                 setInput i; lex_quasiquote (']' : s)
1484         where next = alexGetChar' i
1485
1486     Just ('|',i)
1487         | Just (']',i) <- next -> do
1488                 setInput i; return s
1489         where next = alexGetChar' i
1490
1491     Just (c, i) -> do
1492          setInput i; lex_quasiquote (c : s)
1493
1494 -- -----------------------------------------------------------------------------
1495 -- Warnings
1496
1497 warn :: DynFlag -> SDoc -> Action
1498 warn option warning srcspan _buf _len = do
1499     addWarning option (RealSrcSpan srcspan) warning
1500     lexToken
1501
1502 warnThen :: DynFlag -> SDoc -> Action -> Action
1503 warnThen option warning action srcspan buf len = do
1504     addWarning option (RealSrcSpan srcspan) warning
1505     action srcspan buf len
1506
1507 -- -----------------------------------------------------------------------------
1508 -- The Parse Monad
1509
1510 data LayoutContext
1511   = NoLayout
1512   | Layout !Int
1513   deriving Show
1514
1515 data ParseResult a
1516   = POk PState a
1517   | PFailed
1518         SrcSpan         -- The start and end of the text span related to
1519                         -- the error.  Might be used in environments which can
1520                         -- show this span, e.g. by highlighting it.
1521         Message         -- The error message
1522
1523 data PState = PState {
1524         buffer     :: StringBuffer,
1525         dflags     :: DynFlags,
1526         messages   :: Messages,
1527         last_loc   :: RealSrcSpan,      -- pos of previous token
1528         last_len   :: !Int,     -- len of previous token
1529         loc        :: RealSrcLoc,   -- current loc (end of prev token + 1)
1530         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1531         context    :: [LayoutContext],
1532         lex_state  :: [Int],
1533         -- Used in the alternative layout rule:
1534         -- These tokens are the next ones to be sent out. They are
1535         -- just blindly emitted, without the rule looking at them again:
1536         alr_pending_implicit_tokens :: [RealLocated Token],
1537         -- This is the next token to be considered or, if it is Nothing,
1538         -- we need to get the next token from the input stream:
1539         alr_next_token :: Maybe (RealLocated Token),
1540         -- This is what we consider to be the locatino of the last token
1541         -- emitted:
1542         alr_last_loc :: RealSrcSpan,
1543         -- The stack of layout contexts:
1544         alr_context :: [ALRContext],
1545         -- Are we expecting a '{'? If it's Just, then the ALRLayout tells
1546         -- us what sort of layout the '{' will open:
1547         alr_expecting_ocurly :: Maybe ALRLayout,
1548         -- Have we just had the '}' for a let block? If so, than an 'in'
1549         -- token doesn't need to close anything:
1550         alr_justClosedExplicitLetBlock :: Bool,
1551         code_type_bracket_depth :: Int
1552      }
1553         -- last_loc and last_len are used when generating error messages,
1554         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1555         -- current token to happyError, we could at least get rid of last_len.
1556         -- Getting rid of last_loc would require finding another way to
1557         -- implement pushCurrentContext (which is only called from one place).
1558
1559 data ALRContext = ALRNoLayout Bool{- does it contain commas? -}
1560                               Bool{- is it a 'let' block? -}
1561                 | ALRLayout ALRLayout Int
1562 data ALRLayout = ALRLayoutLet
1563                | ALRLayoutWhere
1564                | ALRLayoutOf
1565                | ALRLayoutDo
1566
1567 newtype P a = P { unP :: PState -> ParseResult a }
1568
1569 instance Monad P where
1570   return = returnP
1571   (>>=) = thenP
1572   fail = failP
1573
1574 returnP :: a -> P a
1575 returnP a = a `seq` (P $ \s -> POk s a)
1576
1577 thenP :: P a -> (a -> P b) -> P b
1578 (P m) `thenP` k = P $ \ s ->
1579         case m s of
1580                 POk s1 a         -> (unP (k a)) s1
1581                 PFailed span err -> PFailed span err
1582
1583 failP :: String -> P a
1584 failP msg = P $ \s -> PFailed (RealSrcSpan (last_loc s)) (text msg)
1585
1586 failMsgP :: String -> P a
1587 failMsgP msg = P $ \s -> PFailed (RealSrcSpan (last_loc s)) (text msg)
1588
1589 failLocMsgP :: RealSrcLoc -> RealSrcLoc -> String -> P a
1590 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (RealSrcSpan (mkRealSrcSpan loc1 loc2)) (text str)
1591
1592 failSpanMsgP :: SrcSpan -> SDoc -> P a
1593 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1594
1595 getPState :: P PState
1596 getPState = P $ \s -> POk s s
1597
1598 getDynFlags :: P DynFlags
1599 getDynFlags = P $ \s -> POk s (dflags s)
1600
1601 withThisPackage :: (PackageId -> a) -> P a
1602 withThisPackage f
1603  = do   pkg     <- liftM thisPackage getDynFlags
1604         return  $ f pkg
1605
1606 extension :: (Int -> Bool) -> P Bool
1607 extension p = P $ \s -> POk s (p $! extsBitmap s)
1608
1609 getExts :: P Int
1610 getExts = P $ \s -> POk s (extsBitmap s)
1611
1612 setExts :: (Int -> Int) -> P ()
1613 setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
1614
1615 setSrcLoc :: RealSrcLoc -> P ()
1616 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1617
1618 incrBracketDepth :: P ()
1619 incrBracketDepth = P $ \s -> POk (s{code_type_bracket_depth = (code_type_bracket_depth s)+1}) ()
1620 decrBracketDepth :: P ()
1621 decrBracketDepth = P $ \s -> POk (s{code_type_bracket_depth = (code_type_bracket_depth s)-1}) ()
1622 getParserBrakDepth :: P Int
1623 getParserBrakDepth = P $ \s -> POk s (code_type_bracket_depth s)
1624
1625 getSrcLoc :: P RealSrcLoc
1626 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1627
1628 setLastToken :: RealSrcSpan -> Int -> P ()
1629 setLastToken loc len = P $ \s -> POk s {
1630   last_loc=loc,
1631   last_len=len
1632   } ()
1633
1634 data AlexInput = AI RealSrcLoc StringBuffer
1635
1636 alexInputPrevChar :: AlexInput -> Char
1637 alexInputPrevChar (AI _ buf) = prevChar buf '\n'
1638
1639 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1640 alexGetChar (AI loc s)
1641   | atEnd s   = Nothing
1642   | otherwise = adj_c `seq` loc' `seq` s' `seq`
1643                 --trace (show (ord c)) $
1644                 Just (adj_c, (AI loc' s'))
1645   where (c,s') = nextChar s
1646         loc'   = advanceSrcLoc loc c
1647
1648         non_graphic     = '\x0'
1649         upper           = '\x1'
1650         lower           = '\x2'
1651         digit           = '\x3'
1652         symbol          = '\x4'
1653         space           = '\x5'
1654         other_graphic   = '\x6'
1655
1656         adj_c
1657           | c <= '\x06' = non_graphic
1658           | c <= '\x7f' = c
1659           -- Alex doesn't handle Unicode, so when Unicode
1660           -- character is encountered we output these values
1661           -- with the actual character value hidden in the state.
1662           | otherwise =
1663                 case generalCategory c of
1664                   UppercaseLetter       -> upper
1665                   LowercaseLetter       -> lower
1666                   TitlecaseLetter       -> upper
1667                   ModifierLetter        -> other_graphic
1668                   OtherLetter           -> lower -- see #1103
1669                   NonSpacingMark        -> other_graphic
1670                   SpacingCombiningMark  -> other_graphic
1671                   EnclosingMark         -> other_graphic
1672                   DecimalNumber         -> digit
1673                   LetterNumber          -> other_graphic
1674                   OtherNumber           -> digit -- see #4373
1675                   ConnectorPunctuation  -> symbol
1676                   DashPunctuation       -> symbol
1677                   OpenPunctuation       -> other_graphic
1678                   ClosePunctuation      -> other_graphic
1679                   InitialQuote          -> other_graphic
1680                   FinalQuote            -> other_graphic
1681                   OtherPunctuation      -> symbol
1682                   MathSymbol            -> symbol
1683                   CurrencySymbol        -> symbol
1684                   ModifierSymbol        -> symbol
1685                   OtherSymbol           -> symbol
1686                   Space                 -> space
1687                   _other                -> non_graphic
1688
1689 -- This version does not squash unicode characters, it is used when
1690 -- lexing strings.
1691 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1692 alexGetChar' (AI loc s)
1693   | atEnd s   = Nothing
1694   | otherwise = c `seq` loc' `seq` s' `seq`
1695                 --trace (show (ord c)) $
1696                 Just (c, (AI loc' s'))
1697   where (c,s') = nextChar s
1698         loc'   = advanceSrcLoc loc c
1699
1700 getInput :: P AlexInput
1701 getInput = P $ \s@PState{ loc=l, buffer=b } -> POk s (AI l b)
1702
1703 setInput :: AlexInput -> P ()
1704 setInput (AI l b) = P $ \s -> POk s{ loc=l, buffer=b } ()
1705
1706 nextIsEOF :: P Bool
1707 nextIsEOF = do
1708   AI _ s <- getInput
1709   return $ atEnd s
1710
1711 pushLexState :: Int -> P ()
1712 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1713
1714 popLexState :: P Int
1715 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1716
1717 getLexState :: P Int
1718 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1719
1720 popNextToken :: P (Maybe (RealLocated Token))
1721 popNextToken
1722     = P $ \s@PState{ alr_next_token = m } ->
1723               POk (s {alr_next_token = Nothing}) m
1724
1725 activeContext :: P Bool
1726 activeContext = do
1727   ctxt <- getALRContext
1728   expc <- getAlrExpectingOCurly
1729   impt <- implicitTokenPending
1730   case (ctxt,expc) of
1731     ([],Nothing) -> return impt
1732     _other       -> return True
1733
1734 setAlrLastLoc :: RealSrcSpan -> P ()
1735 setAlrLastLoc l = P $ \s -> POk (s {alr_last_loc = l}) ()
1736
1737 getAlrLastLoc :: P RealSrcSpan
1738 getAlrLastLoc = P $ \s@(PState {alr_last_loc = l}) -> POk s l
1739
1740 getALRContext :: P [ALRContext]
1741 getALRContext = P $ \s@(PState {alr_context = cs}) -> POk s cs
1742
1743 setALRContext :: [ALRContext] -> P ()
1744 setALRContext cs = P $ \s -> POk (s {alr_context = cs}) ()
1745
1746 getJustClosedExplicitLetBlock :: P Bool
1747 getJustClosedExplicitLetBlock
1748  = P $ \s@(PState {alr_justClosedExplicitLetBlock = b}) -> POk s b
1749
1750 setJustClosedExplicitLetBlock :: Bool -> P ()
1751 setJustClosedExplicitLetBlock b
1752  = P $ \s -> POk (s {alr_justClosedExplicitLetBlock = b}) ()
1753
1754 setNextToken :: RealLocated Token -> P ()
1755 setNextToken t = P $ \s -> POk (s {alr_next_token = Just t}) ()
1756
1757 implicitTokenPending :: P Bool
1758 implicitTokenPending
1759     = P $ \s@PState{ alr_pending_implicit_tokens = ts } ->
1760               case ts of
1761               [] -> POk s False
1762               _  -> POk s True
1763
1764 popPendingImplicitToken :: P (Maybe (RealLocated Token))
1765 popPendingImplicitToken
1766     = P $ \s@PState{ alr_pending_implicit_tokens = ts } ->
1767               case ts of
1768               [] -> POk s Nothing
1769               (t : ts') -> POk (s {alr_pending_implicit_tokens = ts'}) (Just t)
1770
1771 setPendingImplicitTokens :: [RealLocated Token] -> P ()
1772 setPendingImplicitTokens ts = P $ \s -> POk (s {alr_pending_implicit_tokens = ts}) ()
1773
1774 getAlrExpectingOCurly :: P (Maybe ALRLayout)
1775 getAlrExpectingOCurly = P $ \s@(PState {alr_expecting_ocurly = b}) -> POk s b
1776
1777 setAlrExpectingOCurly :: Maybe ALRLayout -> P ()
1778 setAlrExpectingOCurly b = P $ \s -> POk (s {alr_expecting_ocurly = b}) ()
1779
1780 -- for reasons of efficiency, flags indicating language extensions (eg,
1781 -- -fglasgow-exts or -XParallelArrays) are represented by a bitmap stored in an unboxed
1782 -- integer
1783
1784 -- The "genericsBit" is now unused, available for others
1785 -- genericsBit :: Int
1786 -- genericsBit = 0 -- {|, |} and "generic"
1787
1788 ffiBit :: Int
1789 ffiBit     = 1
1790 parrBit :: Int
1791 parrBit    = 2
1792 arrowsBit :: Int
1793 arrowsBit  = 4
1794 thBit :: Int
1795 thBit      = 5
1796 ipBit :: Int
1797 ipBit      = 6
1798 explicitForallBit :: Int
1799 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1800 bangPatBit :: Int
1801 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1802                 -- (doesn't affect the lexer)
1803 tyFamBit :: Int
1804 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1805 haddockBit :: Int
1806 haddockBit = 10 -- Lex and parse Haddock comments
1807 magicHashBit :: Int
1808 magicHashBit = 11 -- "#" in both functions and operators
1809 kindSigsBit :: Int
1810 kindSigsBit = 12 -- Kind signatures on type variables
1811 recursiveDoBit :: Int
1812 recursiveDoBit = 13 -- mdo
1813 unicodeSyntaxBit :: Int
1814 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1815 unboxedTuplesBit :: Int
1816 unboxedTuplesBit = 15 -- (# and #)
1817 datatypeContextsBit :: Int
1818 datatypeContextsBit = 16
1819 transformComprehensionsBit :: Int
1820 transformComprehensionsBit = 17
1821 qqBit :: Int
1822 qqBit      = 18 -- enable quasiquoting
1823 inRulePragBit :: Int
1824 inRulePragBit = 19
1825 rawTokenStreamBit :: Int
1826 rawTokenStreamBit = 20 -- producing a token stream with all comments included
1827 recBit :: Int
1828 recBit = 22 -- rec
1829 alternativeLayoutRuleBit :: Int
1830 alternativeLayoutRuleBit = 23
1831 relaxedLayoutBit :: Int
1832 relaxedLayoutBit = 24
1833 nondecreasingIndentationBit :: Int
1834 nondecreasingIndentationBit = 25
1835 hetMetBit :: Int
1836 hetMetBit = 31
1837
1838 always :: Int -> Bool
1839 always           _     = True
1840 parrEnabled :: Int -> Bool
1841 parrEnabled      flags = testBit flags parrBit
1842 arrowsEnabled :: Int -> Bool
1843 arrowsEnabled    flags = testBit flags arrowsBit
1844 hetMetEnabled :: Int -> Bool
1845 hetMetEnabled    flags = testBit flags hetMetBit
1846 thEnabled :: Int -> Bool
1847 thEnabled        flags = testBit flags thBit
1848 ipEnabled :: Int -> Bool
1849 ipEnabled        flags = testBit flags ipBit
1850 explicitForallEnabled :: Int -> Bool
1851 explicitForallEnabled flags = testBit flags explicitForallBit
1852 bangPatEnabled :: Int -> Bool
1853 bangPatEnabled   flags = testBit flags bangPatBit
1854 -- tyFamEnabled :: Int -> Bool
1855 -- tyFamEnabled     flags = testBit flags tyFamBit
1856 haddockEnabled :: Int -> Bool
1857 haddockEnabled   flags = testBit flags haddockBit
1858 magicHashEnabled :: Int -> Bool
1859 magicHashEnabled flags = testBit flags magicHashBit
1860 -- kindSigsEnabled :: Int -> Bool
1861 -- kindSigsEnabled  flags = testBit flags kindSigsBit
1862 unicodeSyntaxEnabled :: Int -> Bool
1863 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1864 unboxedTuplesEnabled :: Int -> Bool
1865 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1866 datatypeContextsEnabled :: Int -> Bool
1867 datatypeContextsEnabled flags = testBit flags datatypeContextsBit
1868 qqEnabled :: Int -> Bool
1869 qqEnabled        flags = testBit flags qqBit
1870 -- inRulePrag :: Int -> Bool
1871 -- inRulePrag       flags = testBit flags inRulePragBit
1872 rawTokenStreamEnabled :: Int -> Bool
1873 rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
1874 alternativeLayoutRule :: Int -> Bool
1875 alternativeLayoutRule flags = testBit flags alternativeLayoutRuleBit
1876 relaxedLayout :: Int -> Bool
1877 relaxedLayout flags = testBit flags relaxedLayoutBit
1878 nondecreasingIndentation :: Int -> Bool
1879 nondecreasingIndentation flags = testBit flags nondecreasingIndentationBit
1880
1881 -- PState for parsing options pragmas
1882 --
1883 pragState :: DynFlags -> StringBuffer -> RealSrcLoc -> PState
1884 pragState dynflags buf loc = (mkPState dynflags buf loc) {
1885                                  lex_state = [bol, option_prags, 0]
1886                              }
1887
1888 -- create a parse state
1889 --
1890 mkPState :: DynFlags -> StringBuffer -> RealSrcLoc -> PState
1891 mkPState flags buf loc =
1892   PState {
1893       buffer        = buf,
1894       dflags        = flags,
1895       messages      = emptyMessages,
1896       last_loc      = mkRealSrcSpan loc loc,
1897       last_len      = 0,
1898       loc           = loc,
1899       extsBitmap    = fromIntegral bitmap,
1900       context       = [],
1901       lex_state     = [bol, 0],
1902       alr_pending_implicit_tokens = [],
1903       alr_next_token = Nothing,
1904       alr_last_loc = alrInitialLoc (fsLit "<no file>"),
1905       alr_context = [],
1906       alr_expecting_ocurly = Nothing,
1907       alr_justClosedExplicitLetBlock = False,
1908       code_type_bracket_depth = 0
1909     }
1910     where
1911       bitmap =     ffiBit            `setBitIf` xopt Opt_ForeignFunctionInterface flags
1912                .|. parrBit           `setBitIf` xopt Opt_ParallelArrays  flags
1913                .|. arrowsBit         `setBitIf` xopt Opt_Arrows          flags
1914                .|. hetMetBit         `setBitIf` xopt Opt_ModalTypes      flags
1915                .|. thBit             `setBitIf` xopt Opt_TemplateHaskell flags
1916                .|. qqBit             `setBitIf` xopt Opt_QuasiQuotes     flags
1917                .|. ipBit             `setBitIf` xopt Opt_ImplicitParams  flags
1918                .|. explicitForallBit `setBitIf` xopt Opt_ExplicitForAll  flags
1919                .|. bangPatBit        `setBitIf` xopt Opt_BangPatterns    flags
1920                .|. tyFamBit          `setBitIf` xopt Opt_TypeFamilies    flags
1921                .|. haddockBit        `setBitIf` dopt Opt_Haddock         flags
1922                .|. magicHashBit      `setBitIf` xopt Opt_MagicHash       flags
1923                .|. kindSigsBit       `setBitIf` xopt Opt_KindSignatures  flags
1924                .|. recursiveDoBit    `setBitIf` xopt Opt_RecursiveDo     flags
1925                .|. recBit            `setBitIf` xopt Opt_DoRec           flags
1926                .|. recBit            `setBitIf` xopt Opt_Arrows          flags
1927                .|. unicodeSyntaxBit  `setBitIf` xopt Opt_UnicodeSyntax   flags
1928                .|. unboxedTuplesBit  `setBitIf` xopt Opt_UnboxedTuples   flags
1929                .|. datatypeContextsBit `setBitIf` xopt Opt_DatatypeContexts flags
1930                .|. transformComprehensionsBit `setBitIf` xopt Opt_TransformListComp flags
1931                .|. transformComprehensionsBit `setBitIf` xopt Opt_MonadComprehensions flags
1932                .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
1933                .|. alternativeLayoutRuleBit `setBitIf` xopt Opt_AlternativeLayoutRule flags
1934                .|. relaxedLayoutBit  `setBitIf` xopt Opt_RelaxedLayout flags
1935                .|. nondecreasingIndentationBit `setBitIf` xopt Opt_NondecreasingIndentation flags
1936       --
1937       setBitIf :: Int -> Bool -> Int
1938       b `setBitIf` cond | cond      = bit b
1939                         | otherwise = 0
1940
1941 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1942 addWarning option srcspan warning
1943  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1944        let warning' = mkWarnMsg srcspan alwaysQualify warning
1945            ws' = if dopt option d then ws `snocBag` warning' else ws
1946        in POk s{messages=(ws', es)} ()
1947
1948 getMessages :: PState -> Messages
1949 getMessages PState{messages=ms} = ms
1950
1951 getContext :: P [LayoutContext]
1952 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1953
1954 setContext :: [LayoutContext] -> P ()
1955 setContext ctx = P $ \s -> POk s{context=ctx} ()
1956
1957 popContext :: P ()
1958 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1959                               last_len = len, last_loc = last_loc }) ->
1960   case ctx of
1961         (_:tl) -> POk s{ context = tl } ()
1962         []     -> PFailed (RealSrcSpan last_loc) (srcParseErr buf len)
1963
1964 -- Push a new layout context at the indentation of the last token read.
1965 -- This is only used at the outer level of a module when the 'module'
1966 -- keyword is missing.
1967 pushCurrentContext :: P ()
1968 pushCurrentContext = P $ \ s@PState{ last_loc=loc, context=ctx } ->
1969     POk s{context = Layout (srcSpanStartCol loc) : ctx} ()
1970
1971 getOffside :: P Ordering
1972 getOffside = P $ \s@PState{last_loc=loc, context=stk} ->
1973                 let offs = srcSpanStartCol loc in
1974                 let ord = case stk of
1975                         (Layout n:_) -> --trace ("layout: " ++ show n ++ ", offs: " ++ show offs) $
1976                                         compare offs n
1977                         _            -> GT
1978                 in POk s ord
1979
1980 -- ---------------------------------------------------------------------------
1981 -- Construct a parse error
1982
1983 srcParseErr
1984   :: StringBuffer       -- current buffer (placed just after the last token)
1985   -> Int                -- length of the previous token
1986   -> Message
1987 srcParseErr buf len
1988   = hcat [ if null token
1989              then ptext (sLit "parse error (possibly incorrect indentation)")
1990              else hcat [ptext (sLit "parse error on input "),
1991                         char '`', text token, char '\'']
1992     ]
1993   where token = lexemeToString (offsetBytes (-len) buf) len
1994
1995 -- Report a parse failure, giving the span of the previous token as
1996 -- the location of the error.  This is the entry point for errors
1997 -- detected during parsing.
1998 srcParseFail :: P a
1999 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
2000                             last_loc = last_loc } ->
2001     PFailed (RealSrcSpan last_loc) (srcParseErr buf len)
2002
2003 -- A lexical error is reported at a particular position in the source file,
2004 -- not over a token range.
2005 lexError :: String -> P a
2006 lexError str = do
2007   loc <- getSrcLoc
2008   (AI end buf) <- getInput
2009   reportLexError loc end buf str
2010
2011 -- -----------------------------------------------------------------------------
2012 -- This is the top-level function: called from the parser each time a
2013 -- new token is to be read from the input.
2014
2015 lexer :: (Located Token -> P a) -> P a
2016 lexer cont = do
2017   alr <- extension alternativeLayoutRule
2018   let lexTokenFun = if alr then lexTokenAlr else lexToken
2019   (L span tok) <- lexTokenFun
2020   --trace ("token: " ++ show tok) $ do
2021   cont (L (RealSrcSpan span) tok)
2022
2023 lexTokenAlr :: P (RealLocated Token)
2024 lexTokenAlr = do mPending <- popPendingImplicitToken
2025                  t <- case mPending of
2026                       Nothing ->
2027                           do mNext <- popNextToken
2028                              t <- case mNext of
2029                                   Nothing -> lexToken
2030                                   Just next -> return next
2031                              alternativeLayoutRuleToken t
2032                       Just t ->
2033                           return t
2034                  setAlrLastLoc (getLoc t)
2035                  case unLoc t of
2036                      ITwhere -> setAlrExpectingOCurly (Just ALRLayoutWhere)
2037                      ITlet   -> setAlrExpectingOCurly (Just ALRLayoutLet)
2038                      ITof    -> setAlrExpectingOCurly (Just ALRLayoutOf)
2039                      ITdo    -> setAlrExpectingOCurly (Just ALRLayoutDo)
2040                      ITmdo   -> setAlrExpectingOCurly (Just ALRLayoutDo)
2041                      ITrec   -> setAlrExpectingOCurly (Just ALRLayoutDo)
2042                      _       -> return ()
2043                  return t
2044
2045 alternativeLayoutRuleToken :: RealLocated Token -> P (RealLocated Token)
2046 alternativeLayoutRuleToken t
2047     = do context <- getALRContext
2048          lastLoc <- getAlrLastLoc
2049          mExpectingOCurly <- getAlrExpectingOCurly
2050          justClosedExplicitLetBlock <- getJustClosedExplicitLetBlock
2051          setJustClosedExplicitLetBlock False
2052          dflags <- getDynFlags
2053          let transitional = xopt Opt_AlternativeLayoutRuleTransitional dflags
2054              thisLoc = getLoc t
2055              thisCol = srcSpanStartCol thisLoc
2056              newLine = srcSpanStartLine thisLoc > srcSpanEndLine lastLoc
2057          case (unLoc t, context, mExpectingOCurly) of
2058              -- This case handles a GHC extension to the original H98
2059              -- layout rule...
2060              (ITocurly, _, Just alrLayout) ->
2061                  do setAlrExpectingOCurly Nothing
2062                     let isLet = case alrLayout of
2063                                 ALRLayoutLet -> True
2064                                 _ -> False
2065                     setALRContext (ALRNoLayout (containsCommas ITocurly) isLet : context)
2066                     return t
2067              -- ...and makes this case unnecessary
2068              {-
2069              -- I think our implicit open-curly handling is slightly
2070              -- different to John's, in how it interacts with newlines
2071              -- and "in"
2072              (ITocurly, _, Just _) ->
2073                  do setAlrExpectingOCurly Nothing
2074                     setNextToken t
2075                     lexTokenAlr
2076              -}
2077              (_, ALRLayout _ col : ls, Just expectingOCurly)
2078               | (thisCol > col) ||
2079                 (thisCol == col &&
2080                  isNonDecreasingIntentation expectingOCurly) ->
2081                  do setAlrExpectingOCurly Nothing
2082                     setALRContext (ALRLayout expectingOCurly thisCol : context)
2083                     setNextToken t
2084                     return (L thisLoc ITocurly)
2085               | otherwise ->
2086                  do setAlrExpectingOCurly Nothing
2087                     setPendingImplicitTokens [L lastLoc ITccurly]
2088                     setNextToken t
2089                     return (L lastLoc ITocurly)
2090              (_, _, Just expectingOCurly) ->
2091                  do setAlrExpectingOCurly Nothing
2092                     setALRContext (ALRLayout expectingOCurly thisCol : context)
2093                     setNextToken t
2094                     return (L thisLoc ITocurly)
2095              -- We do the [] cases earlier than in the spec, as we
2096              -- have an actual EOF token
2097              (ITeof, ALRLayout _ _ : ls, _) ->
2098                  do setALRContext ls
2099                     setNextToken t
2100                     return (L thisLoc ITccurly)
2101              (ITeof, _, _) ->
2102                  return t
2103              -- the other ITeof case omitted; general case below covers it
2104              (ITin, _, _)
2105               | justClosedExplicitLetBlock ->
2106                  return t
2107              (ITin, ALRLayout ALRLayoutLet _ : ls, _)
2108               | newLine ->
2109                  do setPendingImplicitTokens [t]
2110                     setALRContext ls
2111                     return (L thisLoc ITccurly)
2112              -- This next case is to handle a transitional issue:
2113              (ITwhere, ALRLayout _ col : ls, _)
2114               | newLine && thisCol == col && transitional ->
2115                  do addWarning Opt_WarnAlternativeLayoutRuleTransitional
2116                                (RealSrcSpan thisLoc)
2117                                (transitionalAlternativeLayoutWarning
2118                                     "`where' clause at the same depth as implicit layout block")
2119                     setALRContext ls
2120                     setNextToken t
2121                     -- Note that we use lastLoc, as we may need to close
2122                     -- more layouts, or give a semicolon
2123                     return (L lastLoc ITccurly)
2124              -- This next case is to handle a transitional issue:
2125              (ITvbar, ALRLayout _ col : ls, _)
2126               | newLine && thisCol == col && transitional ->
2127                  do addWarning Opt_WarnAlternativeLayoutRuleTransitional
2128                                (RealSrcSpan thisLoc)
2129                                (transitionalAlternativeLayoutWarning
2130                                     "`|' at the same depth as implicit layout block")
2131                     setALRContext ls
2132                     setNextToken t
2133                     -- Note that we use lastLoc, as we may need to close
2134                     -- more layouts, or give a semicolon
2135                     return (L lastLoc ITccurly)
2136              (_, ALRLayout _ col : ls, _)
2137               | newLine && thisCol == col ->
2138                  do setNextToken t
2139                     return (L thisLoc ITsemi)
2140               | newLine && thisCol < col ->
2141                  do setALRContext ls
2142                     setNextToken t
2143                     -- Note that we use lastLoc, as we may need to close
2144                     -- more layouts, or give a semicolon
2145                     return (L lastLoc ITccurly)
2146              -- We need to handle close before open, as 'then' is both
2147              -- an open and a close
2148              (u, _, _)
2149               | isALRclose u ->
2150                  case context of
2151                  ALRLayout _ _ : ls ->
2152                      do setALRContext ls
2153                         setNextToken t
2154                         return (L thisLoc ITccurly)
2155                  ALRNoLayout _ isLet : ls ->
2156                      do let ls' = if isALRopen u
2157                                      then ALRNoLayout (containsCommas u) False : ls
2158                                      else ls
2159                         setALRContext ls'
2160                         when isLet $ setJustClosedExplicitLetBlock True
2161                         return t
2162                  [] ->
2163                      do let ls = if isALRopen u
2164                                     then [ALRNoLayout (containsCommas u) False]
2165                                     else ls
2166                         setALRContext ls
2167                         -- XXX This is an error in John's code, but
2168                         -- it looks reachable to me at first glance
2169                         return t
2170              (u, _, _)
2171               | isALRopen u ->
2172                  do setALRContext (ALRNoLayout (containsCommas u) False : context)
2173                     return t
2174              (ITin, ALRLayout ALRLayoutLet _ : ls, _) ->
2175                  do setALRContext ls
2176                     setPendingImplicitTokens [t]
2177                     return (L thisLoc ITccurly)
2178              (ITin, ALRLayout _ _ : ls, _) ->
2179                  do setALRContext ls
2180                     setNextToken t
2181                     return (L thisLoc ITccurly)
2182              -- the other ITin case omitted; general case below covers it
2183              (ITcomma, ALRLayout _ _ : ls, _)
2184               | topNoLayoutContainsCommas ls ->
2185                  do setALRContext ls
2186                     setNextToken t
2187                     return (L thisLoc ITccurly)
2188              (ITwhere, ALRLayout ALRLayoutDo _ : ls, _) ->
2189                  do setALRContext ls
2190                     setPendingImplicitTokens [t]
2191                     return (L thisLoc ITccurly)
2192              -- the other ITwhere case omitted; general case below covers it
2193              (_, _, _) -> return t
2194
2195 transitionalAlternativeLayoutWarning :: String -> SDoc
2196 transitionalAlternativeLayoutWarning msg
2197     = text "transitional layout will not be accepted in the future:"
2198    $$ text msg
2199
2200 isALRopen :: Token -> Bool
2201 isALRopen ITcase        = True
2202 isALRopen ITif          = True
2203 isALRopen ITthen        = True
2204 isALRopen IToparen      = True
2205 isALRopen ITobrack      = True
2206 isALRopen ITocurly      = True
2207 -- GHC Extensions:
2208 isALRopen IToubxparen   = True
2209 isALRopen ITparenEscape = True
2210 isALRopen _             = False
2211
2212 isALRclose :: Token -> Bool
2213 isALRclose ITof     = True
2214 isALRclose ITthen   = True
2215 isALRclose ITelse   = True
2216 isALRclose ITcparen = True
2217 isALRclose ITcbrack = True
2218 isALRclose ITccurly = True
2219 -- GHC Extensions:
2220 isALRclose ITcubxparen = True
2221 isALRclose _        = False
2222
2223 isNonDecreasingIntentation :: ALRLayout -> Bool
2224 isNonDecreasingIntentation ALRLayoutDo = True
2225 isNonDecreasingIntentation _           = False
2226
2227 containsCommas :: Token -> Bool
2228 containsCommas IToparen = True
2229 containsCommas ITobrack = True
2230 -- John doesn't have {} as containing commas, but records contain them,
2231 -- which caused a problem parsing Cabal's Distribution.Simple.InstallDirs
2232 -- (defaultInstallDirs).
2233 containsCommas ITocurly = True
2234 -- GHC Extensions:
2235 containsCommas IToubxparen = True
2236 containsCommas _        = False
2237
2238 topNoLayoutContainsCommas :: [ALRContext] -> Bool
2239 topNoLayoutContainsCommas [] = False
2240 topNoLayoutContainsCommas (ALRLayout _ _ : ls) = topNoLayoutContainsCommas ls
2241 topNoLayoutContainsCommas (ALRNoLayout b _ : _) = b
2242
2243 lexToken :: P (RealLocated Token)
2244 lexToken = do
2245   inp@(AI loc1 buf) <- getInput
2246   sc <- getLexState
2247   exts <- getExts
2248   case alexScanUser exts inp sc of
2249     AlexEOF -> do
2250         let span = mkRealSrcSpan loc1 loc1
2251         setLastToken span 0
2252         return (L span ITeof)
2253     AlexError (AI loc2 buf) ->
2254         reportLexError loc1 loc2 buf "lexical error"
2255     AlexSkip inp2 _ -> do
2256         setInput inp2
2257         lexToken
2258     AlexToken inp2@(AI end buf2) _ t -> do
2259         setInput inp2
2260         let span = mkRealSrcSpan loc1 end
2261         let bytes = byteDiff buf buf2
2262         span `seq` setLastToken span bytes
2263         t span buf bytes
2264
2265 reportLexError :: RealSrcLoc -> RealSrcLoc -> StringBuffer -> [Char] -> P a
2266 reportLexError loc1 loc2 buf str
2267   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
2268   | otherwise =
2269   let
2270         c = fst (nextChar buf)
2271   in
2272   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
2273     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
2274     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
2275
2276 lexTokenStream :: StringBuffer -> RealSrcLoc -> DynFlags -> ParseResult [Located Token]
2277 lexTokenStream buf loc dflags = unP go initState
2278     where dflags' = dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream
2279           initState = mkPState dflags' buf loc
2280           go = do
2281             ltok <- lexer return
2282             case ltok of
2283               L _ ITeof -> return []
2284               _ -> liftM (ltok:) go
2285
2286 linePrags = Map.singleton "line" (begin line_prag2)
2287
2288 fileHeaderPrags = Map.fromList([("options", lex_string_prag IToptions_prag),
2289                                  ("options_ghc", lex_string_prag IToptions_prag),
2290                                  ("options_haddock", lex_string_prag ITdocOptions),
2291                                  ("language", token ITlanguage_prag),
2292                                  ("include", lex_string_prag ITinclude_prag)])
2293
2294 ignoredPrags = Map.fromList (map ignored pragmas)
2295                where ignored opt = (opt, nested_comment lexToken)
2296                      impls = ["hugs", "nhc98", "jhc", "yhc", "catch", "derive"]
2297                      options_pragmas = map ("options_" ++) impls
2298                      -- CFILES is a hugs-only thing.
2299                      pragmas = options_pragmas ++ ["cfiles", "contract"]
2300
2301 oneWordPrags = Map.fromList([("rules", rulePrag),
2302                            ("inline", token (ITinline_prag Inline FunLike)),
2303                            ("inlinable", token (ITinline_prag Inlinable FunLike)),
2304                            ("inlineable", token (ITinline_prag Inlinable FunLike)),
2305                                           -- Spelling variant
2306                            ("notinline", token (ITinline_prag NoInline FunLike)),
2307                            ("specialize", token ITspec_prag),
2308                            ("source", token ITsource_prag),
2309                            ("warning", token ITwarning_prag),
2310                            ("deprecated", token ITdeprecated_prag),
2311                            ("scc", token ITscc_prag),
2312                            ("generated", token ITgenerated_prag),
2313                            ("core", token ITcore_prag),
2314                            ("unpack", token ITunpack_prag),
2315                            ("ann", token ITann_prag),
2316                            ("vectorize", token ITvect_prag),
2317                            ("novectorize", token ITnovect_prag)])
2318
2319 twoWordPrags = Map.fromList([("inline conlike", token (ITinline_prag Inline ConLike)),
2320                              ("notinline conlike", token (ITinline_prag NoInline ConLike)),
2321                              ("specialize inline", token (ITspec_inline_prag True)),
2322                              ("specialize notinline", token (ITspec_inline_prag False)),
2323                              ("vectorize scalar", token ITvect_scalar_prag)])
2324
2325 dispatch_pragmas :: Map String Action -> Action
2326 dispatch_pragmas prags span buf len = case Map.lookup (clean_pragma (lexemeToString buf len)) prags of
2327                                        Just found -> found span buf len
2328                                        Nothing -> lexError "unknown pragma"
2329
2330 known_pragma :: Map String Action -> AlexAccPred Int
2331 known_pragma prags _ _ len (AI _ buf) = (isJust $ Map.lookup (clean_pragma (lexemeToString (offsetBytes (- len) buf) len)) prags)
2332                                           && (nextCharIs buf (\c -> not (isAlphaNum c || c == '_')))
2333
2334 clean_pragma :: String -> String
2335 clean_pragma prag = canon_ws (map toLower (unprefix prag))
2336                     where unprefix prag' = case stripPrefix "{-#" prag' of
2337                                              Just rest -> rest
2338                                              Nothing -> prag'
2339                           canonical prag' = case prag' of
2340                                               "noinline" -> "notinline"
2341                                               "specialise" -> "specialize"
2342                                               "vectorise" -> "vectorize"
2343                                               "novectorise" -> "novectorize"
2344                                               "constructorlike" -> "conlike"
2345                                               _ -> prag'
2346                           canon_ws s = unwords (map canonical (words s))
2347 }