compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - parsing integers is a bit slow
  16 --    - readRational is a bit slow
  17 --
  18 --   Known bugs, that were also in the previous version:
  19 --    - M... should be 3 tokens, not 1.
  20 --    - pragma-end should be only valid in a pragma
  21
  22 --   qualified operator NOTES.
  23 --
  24 --   - If M.(+) is a single lexeme, then..
  25 --     - Probably (+) should be a single lexeme too, for consistency.
  26 --       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
  27 --     - But we have to rule out reserved operators, otherwise (..) becomes
  28 --       a different lexeme.
  29 --     - Should we therefore also rule out reserved operators in the qualified
  30 --       form?  This is quite difficult to achieve.  We don't do it for
  31 --       qualified varids.
  32
  33 {
  34 -- XXX The above flags turn off warnings in the generated code:
  35 {-# LANGUAGE BangPatterns #-}
  36 {-# OPTIONS_GHC -fno-warn-unused-matches #-}
  37 {-# OPTIONS_GHC -fno-warn-unused-binds #-}
  38 {-# OPTIONS_GHC -fno-warn-unused-imports #-}
  39 {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
  40 -- But alex still generates some code that causes the "lazy unlifted bindings"
  41 -- warning, and old compilers don't know about it so we can't easily turn
  42 -- it off, so for now we use the sledge hammer:
  43 {-# OPTIONS_GHC -w #-}
  44
  45 {-# OPTIONS_GHC -funbox-strict-fields #-}
  46
  47 module Lexer (
  48    Token(..), lexer, pragState, mkPState, PState(..),
  49    P(..), ParseResult(..), getSrcLoc,
  50    getPState, getDynFlags, withThisPackage,
  51    failLocMsgP, failSpanMsgP, srcParseFail,
  52    getMessages,
  53    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  54    activeContext, nextIsEOF,
  55    getLexState, popLexState, pushLexState,
  56    extension, bangPatEnabled, datatypeContextsEnabled,
  57    addWarning,
  58    incrBracketDepth, decrBracketDepth, getParserBrakDepth,
  59    lexTokenStream
  60   ) where
  61
  62 import Bag
  63 import ErrUtils
  64 import Outputable
  65 import StringBuffer
  66 import FastString
  67 import SrcLoc
  68 import UniqFM
  69 import DynFlags
  70 import Module
  71 import Ctype
  72 import BasicTypes       ( InlineSpec(..), RuleMatchInfo(..) )
  73 import Util             ( readRational )
  74
  75 import Control.Monad
  76 import Data.Bits
  77 import Data.Char
  78 import Data.List
  79 import Data.Maybe
  80 import Data.Map (Map)
  81 import qualified Data.Map as Map
  82 import Data.Ratio
  83 }
  84
  85 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  86 $whitechar   = [\ \n\r\f\v $unispace]
  87 $white_no_nl = $whitechar # \n
  88 $tab         = \t
  89
  90 $ascdigit  = 0-9
  91 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  92 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  93 $digit     = [$ascdigit $unidigit]
  94
  95 $special   = [\(\)\,\;\[\]\`\{\}]
  96 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  97 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  98 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  99
 100 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
 101 $asclarge  = [A-Z]
 102 $large     = [$asclarge $unilarge]
 103
 104 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
 105 $ascsmall  = [a-z]
 106 $small     = [$ascsmall $unismall \_]
 107
 108 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
 109 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
 110
 111 $octit     = 0-7
 112 $hexit     = [$decdigit A-F a-f]
 113 $symchar   = [$symbol \:]
 114 $nl        = [\n\r]
 115 $idchar    = [$small $large $digit \']
 116
 117 $pragmachar = [$small $large $digit]
 118
 119 $docsym    = [\| \^ \* \$]
 120
 121 @varid     = $small $idchar*
 122 @conid     = $large $idchar*
 123
 124 @varsym    = $symbol $symchar*
 125 @consym    = \: $symchar*
 126
 127 @decimal     = $decdigit+
 128 @octal       = $octit+
 129 @hexadecimal = $hexit+
 130 @exponent    = [eE] [\-\+]? @decimal
 131
 132 -- we support the hierarchical module name extension:
 133 @qual = (@conid \.)+
 134
 135 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 136
 137 -- normal signed numerical literals can only be explicitly negative,
 138 -- not explicitly positive (contrast @exponent)
 139 @negative = \-
 140 @signed = @negative ?
 141
 142 haskell :-
 143
 144 -- everywhere: skip whitespace and comments
 145 $white_no_nl+                           ;
 146 $tab+         { warn Opt_WarnTabs (text "Warning: Tab character") }
 147
 148 -- Everywhere: deal with nested comments.  We explicitly rule out
 149 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 150 -- (this can happen even though pragmas will normally take precedence due to
 151 -- longest-match, because pragmas aren't valid in every state, but comments
 152 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 153 -- set.
 154
 155 "{-" / { isNormalComment } { nested_comment lexToken }
 156
 157 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 158 -- more dashes followed by a symbol should be parsed as a varsym, so we
 159 -- have to exclude those.
 160
 161 -- Since Haddock comments aren't valid in every state, we need to rule them
 162 -- out here.
 163
 164 -- The following two rules match comments that begin with two dashes, but
 165 -- continue with a different character. The rules test that this character
 166 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 167 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 168 -- have a Haddock comment). The rules then munch the rest of the line.
 169
 170 "-- " ~[$docsym \#] .* { lineCommentToken }
 171 "--" [^$symbol : \ ] .* { lineCommentToken }
 172
 173 -- Next, match Haddock comments if no -haddock flag
 174
 175 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
 176
 177 -- Now, when we've matched comments that begin with 2 dashes and continue
 178 -- with a different character, we need to match comments that begin with three
 179 -- or more dashes (which clearly can't be Haddock comments). We only need to
 180 -- make sure that the first non-dash character isn't a symbol, and munch the
 181 -- rest of the line.
 182
 183 "---"\-* [^$symbol :] .* { lineCommentToken }
 184
 185 -- Since the previous rules all match dashes followed by at least one
 186 -- character, we also need to match a whole line filled with just dashes.
 187
 188 "--"\-* / { atEOL } { lineCommentToken }
 189
 190 -- We need this rule since none of the other single line comment rules
 191 -- actually match this case.
 192
 193 "-- " / { atEOL } { lineCommentToken }
 194
 195 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 196 -- blank lines) until we find a non-whitespace character, then do layout
 197 -- processing.
 198 --
 199 -- One slight wibble here: what if the line begins with {-#? In
 200 -- theory, we have to lex the pragma to see if it's one we recognise,
 201 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 202 -- as a nested comment.  We don't bother with this: if the line begins
 203 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 204 <bol> {
 205   \n                                    ;
 206   ^\# (line)?                           { begin line_prag1 }
 207   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 208   ^\# \! .* \n                          ; -- #!, for scripts
 209   ()                                    { do_bol }
 210 }
 211
 212 -- after a layout keyword (let, where, do, of), we begin a new layout
 213 -- context if the curly brace is missing.
 214 -- Careful! This stuff is quite delicate.
 215 <layout, layout_do> {
 216   \{ / { notFollowedBy '-' }            { hopefully_open_brace }
 217         -- we might encounter {-# here, but {- has been handled already
 218   \n                                    ;
 219   ^\# (line)?                           { begin line_prag1 }
 220 }
 221
 222 -- do is treated in a subtly different way, see new_layout_context
 223 <layout>    ()                          { new_layout_context True }
 224 <layout_do> ()                          { new_layout_context False }
 225
 226 -- after a new layout context which was found to be to the left of the
 227 -- previous context, we have generated a '{' token, and we now need to
 228 -- generate a matching '}' token.
 229 <layout_left>  ()                       { do_layout_left }
 230
 231 <0,option_prags> \n                             { begin bol }
 232
 233 "{-#" $whitechar* $pragmachar+ / { known_pragma linePrags }
 234                                 { dispatch_pragmas linePrags }
 235
 236 -- single-line line pragmas, of the form
 237 --    # <line> "<file>" <extra-stuff> \n
 238 <line_prag1> $decdigit+                 { setLine line_prag1a }
 239 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 240 <line_prag1b> .*                        { pop }
 241
 242 -- Haskell-style line pragmas, of the form
 243 --    {-# LINE <line> "<file>" #-}
 244 <line_prag2> $decdigit+                 { setLine line_prag2a }
 245 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 246 <line_prag2b> "#-}"|"-}"                { pop }
 247    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 248    -- with older versions of GHC which generated these.
 249
 250 <0,option_prags> {
 251   "{-#" $whitechar* $pragmachar+
 252         $whitechar+ $pragmachar+ / { known_pragma twoWordPrags }
 253                                  { dispatch_pragmas twoWordPrags }
 254
 255   "{-#" $whitechar* $pragmachar+ / { known_pragma oneWordPrags }
 256                                  { dispatch_pragmas oneWordPrags }
 257
 258   -- We ignore all these pragmas, but don't generate a warning for them
 259   "{-#" $whitechar* $pragmachar+ / { known_pragma ignoredPrags }
 260                                  { dispatch_pragmas ignoredPrags }
 261
 262   -- ToDo: should only be valid inside a pragma:
 263   "#-}"                                 { endPrag }
 264 }
 265
 266 <option_prags> {
 267   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 268                                    { dispatch_pragmas fileHeaderPrags }
 269
 270   "-- #"                                 { multiline_doc_comment }
 271 }
 272
 273 <0> {
 274   -- In the "0" mode we ignore these pragmas
 275   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 276                      { nested_comment lexToken }
 277 }
 278
 279 <0> {
 280   "-- #" .* { lineCommentToken }
 281 }
 282
 283 <0,option_prags> {
 284   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 285                     (nested_comment lexToken) }
 286 }
 287
 288 -- '0' state: ordinary lexemes
 289
 290 -- Haddock comments
 291
 292 <0,option_prags> {
 293   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 294   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 295 }
 296
 297 -- "special" symbols
 298
 299 <0> {
 300   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 301   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 302 }
 303
 304 <0> {
 305   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 306   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 307   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 308   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 309   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 310   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 311   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 312   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 313
 314 -- For backward compatibility, accept the old dollar syntax
 315   "[$" @varid "|"  / { ifExtension qqEnabled }
 316                      { lex_quasiquote_tok }
 317
 318   "[" @varid "|"  / { ifExtension qqEnabled }
 319                      { lex_quasiquote_tok }
 320 }
 321
 322 <0> {
 323   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 324                                         { special IToparenbar }
 325   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 326 }
 327
 328 <0> {
 329   "<[" / { ifExtension hetMetEnabled `alexAndPred` notFollowedBySymbol }
 330                                         { special ITopenBrak }
 331   "]>" / { ifExtension hetMetEnabled }  { special ITcloseBrak }
 332   "~~" / { ifExtension hetMetEnabled }  { special ITescape }
 333   "%%" / { ifExtension hetMetEnabled }  { special ITdoublePercent }
 334   "~~$" / { ifExtension hetMetEnabled }  { special ITescapeDollar }
 335 }
 336
 337 <0> {
 338   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 339 }
 340
 341 <0> {
 342   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 343          { token IToubxparen }
 344   "#)" / { ifExtension unboxedTuplesEnabled }
 345          { token ITcubxparen }
 346 }
 347
 348 <0> {
 349   "{|" / { ifExtension genericsEnabled } { token ITocurlybar }
 350   "|}" / { ifExtension genericsEnabled } { token ITccurlybar }
 351 }
 352
 353 <0,option_prags> {
 354   \(                                    { special IToparen }
 355   \)                                    { special ITcparen }
 356   \[                                    { special ITobrack }
 357   \]                                    { special ITcbrack }
 358   \,                                    { special ITcomma }
 359   \;                                    { special ITsemi }
 360   \`                                    { special ITbackquote }
 361
 362   \{                                    { open_brace }
 363   \}                                    { close_brace }
 364 }
 365
 366 <0,option_prags> {
 367   @qual @varid                  { idtoken qvarid }
 368   @qual @conid                  { idtoken qconid }
 369   @varid                        { varid }
 370   @conid                        { idtoken conid }
 371 }
 372
 373 <0> {
 374   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 375   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 376   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 377   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 378 }
 379
 380 -- ToDo: - move `var` and (sym) into lexical syntax?
 381 --       - remove backquote from $special?
 382 <0> {
 383   @qual @varsym                                    { idtoken qvarsym }
 384   @qual @consym                                    { idtoken qconsym }
 385   @varsym                                          { varsym }
 386   @consym                                          { consym }
 387 }
 388
 389 -- For the normal boxed literals we need to be careful
 390 -- when trying to be close to Haskell98
 391 <0> {
 392   -- Normal integral literals (:: Num a => a, from Integer)
 393   @decimal           { tok_num positive 0 0 decimal }
 394   0[oO] @octal       { tok_num positive 2 2 octal }
 395   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 396
 397   -- Normal rational literals (:: Fractional a => a, from Rational)
 398   @floating_point    { strtoken tok_float }
 399 }
 400
 401 <0> {
 402   -- Unboxed ints (:: Int#) and words (:: Word#)
 403   -- It's simpler (and faster?) to give separate cases to the negatives,
 404   -- especially considering octal/hexadecimal prefixes.
 405   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 406   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 407   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 408   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 409   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 410   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 411
 412   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 413   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 414   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 415
 416   -- Unboxed floats and doubles (:: Float#, :: Double#)
 417   -- prim_{float,double} work with signed literals
 418   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 419   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 420 }
 421
 422 -- Strings and chars are lexed by hand-written code.  The reason is
 423 -- that even if we recognise the string or char here in the regex
 424 -- lexer, we would still have to parse the string afterward in order
 425 -- to convert it to a String.
 426 <0> {
 427   \'                            { lex_char_tok }
 428   \"                            { lex_string_tok }
 429 }
 430
 431 {
 432 -- -----------------------------------------------------------------------------
 433 -- The token type
 434
 435 data Token
 436   = ITas                        -- Haskell keywords
 437   | ITcase
 438   | ITclass
 439   | ITdata
 440   | ITdefault
 441   | ITderiving
 442   | ITdo
 443   | ITelse
 444   | IThiding
 445   | ITif
 446   | ITimport
 447   | ITin
 448   | ITinfix
 449   | ITinfixl
 450   | ITinfixr
 451   | ITinstance
 452   | ITlet
 453   | ITmodule
 454   | ITnewtype
 455   | ITof
 456   | ITqualified
 457   | ITthen
 458   | ITtype
 459   | ITwhere
 460   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 461
 462   | ITforall                    -- GHC extension keywords
 463   | ITforeign
 464   | ITexport
 465   | ITlabel
 466   | ITdynamic
 467   | ITsafe
 468   | ITthreadsafe
 469   | ITinterruptible
 470   | ITunsafe
 471   | ITstdcallconv
 472   | ITccallconv
 473   | ITprimcallconv
 474   | ITmdo
 475   | ITfamily
 476   | ITgroup
 477   | ITby
 478   | ITusing
 479
 480         -- Pragmas
 481   | ITinline_prag InlineSpec RuleMatchInfo
 482   | ITspec_prag                 -- SPECIALISE
 483   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 484   | ITsource_prag
 485   | ITrules_prag
 486   | ITwarning_prag
 487   | ITdeprecated_prag
 488   | ITline_prag
 489   | ITscc_prag
 490   | ITgenerated_prag
 491   | ITcore_prag                 -- hdaume: core annotations
 492   | ITunpack_prag
 493   | ITann_prag
 494   | ITclose_prag
 495   | IToptions_prag String
 496   | ITinclude_prag String
 497   | ITlanguage_prag
 498   | ITvect_prag
 499   | ITvect_scalar_prag
 500
 501   | ITdotdot                    -- reserved symbols
 502   | ITcolon
 503   | ITdcolon
 504   | ITequal
 505   | ITlam
 506   | ITvbar
 507   | ITlarrow
 508   | ITrarrow
 509   | ITat
 510   | ITtilde
 511   | ITdarrow
 512   | ITminus
 513   | ITbang
 514   | ITstar
 515   | ITdot
 516
 517   | ITbiglam                    -- GHC-extension symbols
 518
 519   | ITocurly                    -- special symbols
 520   | ITccurly
 521   | ITocurlybar                 -- {|, for type applications
 522   | ITccurlybar                 -- |}, for type applications
 523   | ITvocurly
 524   | ITvccurly
 525   | ITobrack
 526   | ITopabrack                  -- [:, for parallel arrays with -XParallelArrays
 527   | ITcpabrack                  -- :], for parallel arrays with -XParallelArrays
 528   | ITcbrack
 529   | IToparen
 530   | ITcparen
 531   | IToubxparen
 532   | ITcubxparen
 533   | ITsemi
 534   | ITcomma
 535   | ITunderscore
 536   | ITbackquote
 537
 538   | ITvarid   FastString        -- identifiers
 539   | ITconid   FastString
 540   | ITvarsym  FastString
 541   | ITconsym  FastString
 542   | ITqvarid  (FastString,FastString)
 543   | ITqconid  (FastString,FastString)
 544   | ITqvarsym (FastString,FastString)
 545   | ITqconsym (FastString,FastString)
 546   | ITprefixqvarsym (FastString,FastString)
 547   | ITprefixqconsym (FastString,FastString)
 548
 549   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 550
 551   | ITchar       Char
 552   | ITstring     FastString
 553   | ITinteger    Integer
 554   | ITrational   Rational
 555
 556   | ITprimchar   Char
 557   | ITprimstring FastString
 558   | ITprimint    Integer
 559   | ITprimword   Integer
 560   | ITprimfloat  Rational
 561   | ITprimdouble Rational
 562
 563   -- Template Haskell extension tokens
 564   | ITopenExpQuote              --  [| or [e|
 565   | ITopenPatQuote              --  [p|
 566   | ITopenDecQuote              --  [d|
 567   | ITopenTypQuote              --  [t|
 568   | ITcloseQuote                --  |]
 569   | ITidEscape   FastString     --  $x
 570   | ITparenEscape               --  $(
 571   | ITvarQuote                  --  '
 572   | ITtyQuote                   --  ''
 573   | ITquasiQuote (FastString,FastString,SrcSpan) --  [:...|...|]
 574
 575   -- Arrow notation extension
 576   | ITproc
 577   | ITrec
 578   | IToparenbar                 --  (|
 579   | ITcparenbar                 --  |)
 580   | ITlarrowtail                --  -<
 581   | ITrarrowtail                --  >-
 582   | ITLarrowtail                --  -<<
 583   | ITRarrowtail                --  >>-
 584
 585   -- Heterogeneous Metaprogramming extension
 586   | ITopenBrak                  --  <[
 587   | ITcloseBrak                 --  ]>
 588   | ITescape                    --  ~~
 589   | ITescapeDollar              --  ~~$
 590   | ITdoublePercent             --  %%
 591
 592   | ITunknown String            -- Used when the lexer can't make sense of it
 593   | ITeof                       -- end of file token
 594
 595   -- Documentation annotations
 596   | ITdocCommentNext  String     -- something beginning '-- |'
 597   | ITdocCommentPrev  String     -- something beginning '-- ^'
 598   | ITdocCommentNamed String     -- something beginning '-- $'
 599   | ITdocSection      Int String -- a section heading
 600   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 601   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 602   | ITlineComment     String     -- comment starting by "--"
 603   | ITblockComment    String     -- comment in {- -}
 604
 605 #ifdef DEBUG
 606   deriving Show -- debugging
 607 #endif
 608
 609 {-
 610 isSpecial :: Token -> Bool
 611 -- If we see M.x, where x is a keyword, but
 612 -- is special, we treat is as just plain M.x,
 613 -- not as a keyword.
 614 isSpecial ITas          = True
 615 isSpecial IThiding      = True
 616 isSpecial ITqualified   = True
 617 isSpecial ITforall      = True
 618 isSpecial ITexport      = True
 619 isSpecial ITlabel       = True
 620 isSpecial ITdynamic     = True
 621 isSpecial ITsafe        = True
 622 isSpecial ITthreadsafe  = True
 623 isSpecial ITinterruptible = True
 624 isSpecial ITunsafe      = True
 625 isSpecial ITccallconv   = True
 626 isSpecial ITstdcallconv = True
 627 isSpecial ITprimcallconv = True
 628 isSpecial ITmdo         = True
 629 isSpecial ITfamily      = True
 630 isSpecial ITgroup   = True
 631 isSpecial ITby      = True
 632 isSpecial ITusing   = True
 633 isSpecial _             = False
 634 -}
 635
 636 -- the bitmap provided as the third component indicates whether the
 637 -- corresponding extension keyword is valid under the extension options
 638 -- provided to the compiler; if the extension corresponding to *any* of the
 639 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 640 -- facilitates using a keyword in two different extensions that can be
 641 -- activated independently)
 642 --
 643 reservedWordsFM :: UniqFM (Token, Int)
 644 reservedWordsFM = listToUFM $
 645         map (\(x, y, z) -> (mkFastString x, (y, z)))
 646        [( "_",          ITunderscore,   0 ),
 647         ( "as",         ITas,           0 ),
 648         ( "case",       ITcase,         0 ),
 649         ( "class",      ITclass,        0 ),
 650         ( "data",       ITdata,         0 ),
 651         ( "default",    ITdefault,      0 ),
 652         ( "deriving",   ITderiving,     0 ),
 653         ( "do",         ITdo,           0 ),
 654         ( "else",       ITelse,         0 ),
 655         ( "hiding",     IThiding,       0 ),
 656         ( "if",         ITif,           0 ),
 657         ( "import",     ITimport,       0 ),
 658         ( "in",         ITin,           0 ),
 659         ( "infix",      ITinfix,        0 ),
 660         ( "infixl",     ITinfixl,       0 ),
 661         ( "infixr",     ITinfixr,       0 ),
 662         ( "instance",   ITinstance,     0 ),
 663         ( "let",        ITlet,          0 ),
 664         ( "module",     ITmodule,       0 ),
 665         ( "newtype",    ITnewtype,      0 ),
 666         ( "of",         ITof,           0 ),
 667         ( "qualified",  ITqualified,    0 ),
 668         ( "then",       ITthen,         0 ),
 669         ( "type",       ITtype,         0 ),
 670         ( "where",      ITwhere,        0 ),
 671         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 672
 673     ( "forall", ITforall,        bit explicitForallBit .|. bit inRulePragBit),
 674         ( "mdo",        ITmdo,           bit recursiveDoBit),
 675         ( "family",     ITfamily,        bit tyFamBit),
 676     ( "group",  ITgroup,     bit transformComprehensionsBit),
 677     ( "by",     ITby,        bit transformComprehensionsBit),
 678     ( "using",  ITusing,     bit transformComprehensionsBit),
 679
 680         ( "foreign",    ITforeign,       bit ffiBit),
 681         ( "export",     ITexport,        bit ffiBit),
 682         ( "label",      ITlabel,         bit ffiBit),
 683         ( "dynamic",    ITdynamic,       bit ffiBit),
 684         ( "safe",       ITsafe,          bit ffiBit),
 685         ( "threadsafe", ITthreadsafe,    bit ffiBit),  -- ToDo: remove
 686         ( "interruptible", ITinterruptible, bit ffiBit),
 687         ( "unsafe",     ITunsafe,        bit ffiBit),
 688         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 689         ( "ccall",      ITccallconv,     bit ffiBit),
 690         ( "prim",       ITprimcallconv,  bit ffiBit),
 691
 692         ( "rec",        ITrec,           bit recBit),
 693         ( "proc",       ITproc,          bit arrowsBit)
 694      ]
 695
 696 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 697 reservedSymsFM = listToUFM $
 698     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 699       [ ("..",  ITdotdot,   always)
 700         -- (:) is a reserved op, meaning only list cons
 701        ,(":",   ITcolon,    always)
 702        ,("::",  ITdcolon,   always)
 703        ,("=",   ITequal,    always)
 704        ,("\\",  ITlam,      always)
 705        ,("|",   ITvbar,     always)
 706        ,("<-",  ITlarrow,   always)
 707        ,("->",  ITrarrow,   always)
 708        ,("@",   ITat,       always)
 709        ,("~",   ITtilde,    always)
 710        ,("=>",  ITdarrow,   always)
 711        ,("-",   ITminus,    always)
 712        ,("!",   ITbang,     always)
 713
 714         -- For data T (a::*) = MkT
 715        ,("*", ITstar, always) -- \i -> kindSigsEnabled i || tyFamEnabled i)
 716         -- For 'forall a . t'
 717        ,(".", ITdot,  always) -- \i -> explicitForallEnabled i || inRulePrag i)
 718
 719        ,("-<",  ITlarrowtail, arrowsEnabled)
 720        ,(">-",  ITrarrowtail, arrowsEnabled)
 721        ,("-<<", ITLarrowtail, arrowsEnabled)
 722        ,(">>-", ITRarrowtail, arrowsEnabled)
 723
 724        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 725        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 726        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 727                                 explicitForallEnabled i)
 728        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 729        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 730
 731        ,("⤙",   ITlarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 732        ,("⤚",   ITrarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 733        ,("⤛",   ITLarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 734        ,("⤜",   ITRarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 735
 736        ,("★", ITstar, unicodeSyntaxEnabled)
 737
 738         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 739         -- form part of a large operator.  This would let us have a better
 740         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 741        ]
 742
 743 -- -----------------------------------------------------------------------------
 744 -- Lexer actions
 745
 746 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 747
 748 special :: Token -> Action
 749 special tok span _buf _len = return (L span tok)
 750
 751 token, layout_token :: Token -> Action
 752 token t span _buf _len = return (L span t)
 753 layout_token t span _buf _len = pushLexState layout >> return (L span t)
 754
 755 idtoken :: (StringBuffer -> Int -> Token) -> Action
 756 idtoken f span buf len = return (L span $! (f buf len))
 757
 758 skip_one_varid :: (FastString -> Token) -> Action
 759 skip_one_varid f span buf len
 760   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 761
 762 strtoken :: (String -> Token) -> Action
 763 strtoken f span buf len =
 764   return (L span $! (f $! lexemeToString buf len))
 765
 766 init_strtoken :: Int -> (String -> Token) -> Action
 767 -- like strtoken, but drops the last N character(s)
 768 init_strtoken drop f span buf len =
 769   return (L span $! (f $! lexemeToString buf (len-drop)))
 770
 771 begin :: Int -> Action
 772 begin code _span _str _len = do pushLexState code; lexToken
 773
 774 pop :: Action
 775 pop _span _buf _len = do _ <- popLexState
 776                          lexToken
 777
 778 hopefully_open_brace :: Action
 779 hopefully_open_brace span buf len
 780  = do relaxed <- extension relaxedLayout
 781       ctx <- getContext
 782       (AI l _) <- getInput
 783       let offset = srcLocCol l
 784           isOK = relaxed ||
 785                  case ctx of
 786                  Layout prev_off : _ -> prev_off < offset
 787                  _                   -> True
 788       if isOK then pop_and open_brace span buf len
 789               else failSpanMsgP span (text "Missing block")
 790
 791 pop_and :: Action -> Action
 792 pop_and act span buf len = do _ <- popLexState
 793                               act span buf len
 794
 795 {-# INLINE nextCharIs #-}
 796 nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
 797 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 798
 799 notFollowedBy :: Char -> AlexAccPred Int
 800 notFollowedBy char _ _ _ (AI _ buf)
 801   = nextCharIs buf (/=char)
 802
 803 notFollowedBySymbol :: AlexAccPred Int
 804 notFollowedBySymbol _ _ _ (AI _ buf)
 805   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 806
 807 -- We must reject doc comments as being ordinary comments everywhere.
 808 -- In some cases the doc comment will be selected as the lexeme due to
 809 -- maximal munch, but not always, because the nested comment rule is
 810 -- valid in all states, but the doc-comment rules are only valid in
 811 -- the non-layout states.
 812 isNormalComment :: AlexAccPred Int
 813 isNormalComment bits _ _ (AI _ buf)
 814   | haddockEnabled bits = notFollowedByDocOrPragma
 815   | otherwise           = nextCharIs buf (/='#')
 816   where
 817     notFollowedByDocOrPragma
 818        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 819
 820 spaceAndP :: StringBuffer -> (StringBuffer -> Bool) -> Bool
 821 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 822
 823 {-
 824 haddockDisabledAnd p bits _ _ (AI _ buf)
 825   = if haddockEnabled bits then False else (p buf)
 826 -}
 827
 828 atEOL :: AlexAccPred Int
 829 atEOL _ _ _ (AI _ buf) = atEnd buf || currentChar buf == '\n'
 830
 831 ifExtension :: (Int -> Bool) -> AlexAccPred Int
 832 ifExtension pred bits _ _ _ = pred bits
 833
 834 multiline_doc_comment :: Action
 835 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 836   where
 837     worker commentAcc input docType oneLine = case alexGetChar input of
 838       Just ('\n', input')
 839         | oneLine -> docCommentEnd input commentAcc docType buf span
 840         | otherwise -> case checkIfCommentLine input' of
 841           Just input -> worker ('\n':commentAcc) input docType False
 842           Nothing -> docCommentEnd input commentAcc docType buf span
 843       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 844       Nothing -> docCommentEnd input commentAcc docType buf span
 845
 846     checkIfCommentLine input = check (dropNonNewlineSpace input)
 847       where
 848         check input = case alexGetChar input of
 849           Just ('-', input) -> case alexGetChar input of
 850             Just ('-', input) -> case alexGetChar input of
 851               Just (c, _) | c /= '-' -> Just input
 852               _ -> Nothing
 853             _ -> Nothing
 854           _ -> Nothing
 855
 856         dropNonNewlineSpace input = case alexGetChar input of
 857           Just (c, input')
 858             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 859             | otherwise -> input
 860           Nothing -> input
 861
 862 lineCommentToken :: Action
 863 lineCommentToken span buf len = do
 864   b <- extension rawTokenStreamEnabled
 865   if b then strtoken ITlineComment span buf len else lexToken
 866
 867 {-
 868   nested comments require traversing by hand, they can't be parsed
 869   using regular expressions.
 870 -}
 871 nested_comment :: P (Located Token) -> Action
 872 nested_comment cont span _str _len = do
 873   input <- getInput
 874   go "" (1::Int) input
 875   where
 876     go commentAcc 0 input = do setInput input
 877                                b <- extension rawTokenStreamEnabled
 878                                if b
 879                                  then docCommentEnd input commentAcc ITblockComment _str span
 880                                  else cont
 881     go commentAcc n input = case alexGetChar input of
 882       Nothing -> errBrace input span
 883       Just ('-',input) -> case alexGetChar input of
 884         Nothing  -> errBrace input span
 885         Just ('\125',input) -> go commentAcc (n-1) input
 886         Just (_,_)          -> go ('-':commentAcc) n input
 887       Just ('\123',input) -> case alexGetChar input of
 888         Nothing  -> errBrace input span
 889         Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
 890         Just (_,_)       -> go ('\123':commentAcc) n input
 891       Just (c,input) -> go (c:commentAcc) n input
 892
 893 nested_doc_comment :: Action
 894 nested_doc_comment span buf _len = withLexedDocType (go "")
 895   where
 896     go commentAcc input docType _ = case alexGetChar input of
 897       Nothing -> errBrace input span
 898       Just ('-',input) -> case alexGetChar input of
 899         Nothing -> errBrace input span
 900         Just ('\125',input) ->
 901           docCommentEnd input commentAcc docType buf span
 902         Just (_,_) -> go ('-':commentAcc) input docType False
 903       Just ('\123', input) -> case alexGetChar input of
 904         Nothing  -> errBrace input span
 905         Just ('-',input) -> do
 906           setInput input
 907           let cont = do input <- getInput; go commentAcc input docType False
 908           nested_comment cont span buf _len
 909         Just (_,_) -> go ('\123':commentAcc) input docType False
 910       Just (c,input) -> go (c:commentAcc) input docType False
 911
 912 withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (Located Token))
 913                  -> P (Located Token)
 914 withLexedDocType lexDocComment = do
 915   input@(AI _ buf) <- getInput
 916   case prevChar buf ' ' of
 917     '|' -> lexDocComment input ITdocCommentNext False
 918     '^' -> lexDocComment input ITdocCommentPrev False
 919     '$' -> lexDocComment input ITdocCommentNamed False
 920     '*' -> lexDocSection 1 input
 921     '#' -> lexDocComment input ITdocOptionsOld False
 922     _ -> panic "withLexedDocType: Bad doc type"
 923  where
 924     lexDocSection n input = case alexGetChar input of
 925       Just ('*', input) -> lexDocSection (n+1) input
 926       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 927       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 928
 929 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 930 -- off again at the end of the pragma.
 931 rulePrag :: Action
 932 rulePrag span _buf _len = do
 933   setExts (.|. bit inRulePragBit)
 934   return (L span ITrules_prag)
 935
 936 endPrag :: Action
 937 endPrag span _buf _len = do
 938   setExts (.&. complement (bit inRulePragBit))
 939   return (L span ITclose_prag)
 940
 941 -- docCommentEnd
 942 -------------------------------------------------------------------------------
 943 -- This function is quite tricky. We can't just return a new token, we also
 944 -- need to update the state of the parser. Why? Because the token is longer
 945 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 946 -- it writes the wrong token length to the parser state. This function is
 947 -- called afterwards, so it can just update the state.
 948
 949 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 950                  SrcSpan -> P (Located Token)
 951 docCommentEnd input commentAcc docType buf span = do
 952   setInput input
 953   let (AI loc nextBuf) = input
 954       comment = reverse commentAcc
 955       span' = mkSrcSpan (srcSpanStart span) loc
 956       last_len = byteDiff buf nextBuf
 957
 958   span `seq` setLastToken span' last_len
 959   return (L span' (docType comment))
 960
 961 errBrace :: AlexInput -> SrcSpan -> P a
 962 errBrace (AI end _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 963
 964 open_brace, close_brace :: Action
 965 open_brace span _str _len = do
 966   ctx <- getContext
 967   setContext (NoLayout:ctx)
 968   return (L span ITocurly)
 969 close_brace span _str _len = do
 970   popContext
 971   return (L span ITccurly)
 972
 973 qvarid, qconid :: StringBuffer -> Int -> Token
 974 qvarid buf len = ITqvarid $! splitQualName buf len False
 975 qconid buf len = ITqconid $! splitQualName buf len False
 976
 977 splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
 978 -- takes a StringBuffer and a length, and returns the module name
 979 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 980 -- because of hierarchical module names.
 981 splitQualName orig_buf len parens = split orig_buf orig_buf
 982   where
 983     split buf dot_buf
 984         | orig_buf `byteDiff` buf >= len  = done dot_buf
 985         | c == '.'                        = found_dot buf'
 986         | otherwise                       = split buf' dot_buf
 987       where
 988        (c,buf') = nextChar buf
 989
 990     -- careful, we might get names like M....
 991     -- so, if the character after the dot is not upper-case, this is
 992     -- the end of the qualifier part.
 993     found_dot buf -- buf points after the '.'
 994         | isUpper c    = split buf' buf
 995         | otherwise    = done buf
 996       where
 997        (c,buf') = nextChar buf
 998
 999     done dot_buf =
1000         (lexemeToFastString orig_buf (qual_size - 1),
1001          if parens -- Prelude.(+)
1002             then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
1003             else lexemeToFastString dot_buf (len - qual_size))
1004       where
1005         qual_size = orig_buf `byteDiff` dot_buf
1006
1007 varid :: Action
1008 varid span buf len =
1009   fs `seq`
1010   case lookupUFM reservedWordsFM fs of
1011         Just (keyword,0)    -> do
1012                 maybe_layout keyword
1013                 return (L span keyword)
1014         Just (keyword,exts) -> do
1015                 b <- extension (\i -> exts .&. i /= 0)
1016                 if b then do maybe_layout keyword
1017                              return (L span keyword)
1018                      else return (L span (ITvarid fs))
1019         _other -> return (L span (ITvarid fs))
1020   where
1021         fs = lexemeToFastString buf len
1022
1023 conid :: StringBuffer -> Int -> Token
1024 conid buf len = ITconid fs
1025   where fs = lexemeToFastString buf len
1026
1027 qvarsym, qconsym, prefixqvarsym, prefixqconsym :: StringBuffer -> Int -> Token
1028 qvarsym buf len = ITqvarsym $! splitQualName buf len False
1029 qconsym buf len = ITqconsym $! splitQualName buf len False
1030 prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
1031 prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
1032
1033 varsym, consym :: Action
1034 varsym = sym ITvarsym
1035 consym = sym ITconsym
1036
1037 sym :: (FastString -> Token) -> SrcSpan -> StringBuffer -> Int
1038     -> P (Located Token)
1039 sym con span buf len =
1040   case lookupUFM reservedSymsFM fs of
1041         Just (keyword,exts) -> do
1042                 b <- extension exts
1043                 if b then return (L span keyword)
1044                      else return (L span $! con fs)
1045         _other -> return (L span $! con fs)
1046   where
1047         fs = lexemeToFastString buf len
1048
1049 -- Variations on the integral numeric literal.
1050 tok_integral :: (Integer -> Token)
1051      -> (Integer -> Integer)
1052  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
1053      -> Int -> Int
1054      -> (Integer, (Char->Int)) -> Action
1055 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
1056   return $ L span $ itint $! transint $ parseUnsignedInteger
1057      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
1058
1059 -- some conveniences for use with tok_integral
1060 tok_num :: (Integer -> Integer)
1061         -> Int -> Int
1062         -> (Integer, (Char->Int)) -> Action
1063 tok_num = tok_integral ITinteger
1064 tok_primint :: (Integer -> Integer)
1065             -> Int -> Int
1066             -> (Integer, (Char->Int)) -> Action
1067 tok_primint = tok_integral ITprimint
1068 tok_primword :: Int -> Int
1069              -> (Integer, (Char->Int)) -> Action
1070 tok_primword = tok_integral ITprimword positive
1071 positive, negative :: (Integer -> Integer)
1072 positive = id
1073 negative = negate
1074 decimal, octal, hexadecimal :: (Integer, Char -> Int)
1075 decimal = (10,octDecDigit)
1076 octal = (8,octDecDigit)
1077 hexadecimal = (16,hexDigit)
1078
1079 -- readRational can understand negative rationals, exponents, everything.
1080 tok_float, tok_primfloat, tok_primdouble :: String -> Token
1081 tok_float        str = ITrational   $! readRational str
1082 tok_primfloat    str = ITprimfloat  $! readRational str
1083 tok_primdouble   str = ITprimdouble $! readRational str
1084
1085 -- -----------------------------------------------------------------------------
1086 -- Layout processing
1087
1088 -- we're at the first token on a line, insert layout tokens if necessary
1089 do_bol :: Action
1090 do_bol span _str _len = do
1091         pos <- getOffside
1092         case pos of
1093             LT -> do
1094                 --trace "layout: inserting '}'" $ do
1095                 popContext
1096                 -- do NOT pop the lex state, we might have a ';' to insert
1097                 return (L span ITvccurly)
1098             EQ -> do
1099                 --trace "layout: inserting ';'" $ do
1100                 _ <- popLexState
1101                 return (L span ITsemi)
1102             GT -> do
1103                 _ <- popLexState
1104                 lexToken
1105
1106 -- certain keywords put us in the "layout" state, where we might
1107 -- add an opening curly brace.
1108 maybe_layout :: Token -> P ()
1109 maybe_layout t = do -- If the alternative layout rule is enabled then
1110                     -- we never create an implicit layout context here.
1111                     -- Layout is handled XXX instead.
1112                     -- The code for closing implicit contexts, or
1113                     -- inserting implicit semi-colons, is therefore
1114                     -- irrelevant as it only applies in an implicit
1115                     -- context.
1116                     alr <- extension alternativeLayoutRule
1117                     unless alr $ f t
1118     where f ITdo    = pushLexState layout_do
1119           f ITmdo   = pushLexState layout_do
1120           f ITof    = pushLexState layout
1121           f ITlet   = pushLexState layout
1122           f ITwhere = pushLexState layout
1123           f ITrec   = pushLexState layout
1124           f _       = return ()
1125
1126 -- Pushing a new implicit layout context.  If the indentation of the
1127 -- next token is not greater than the previous layout context, then
1128 -- Haskell 98 says that the new layout context should be empty; that is
1129 -- the lexer must generate {}.
1130 --
1131 -- We are slightly more lenient than this: when the new context is started
1132 -- by a 'do', then we allow the new context to be at the same indentation as
1133 -- the previous context.  This is what the 'strict' argument is for.
1134 --
1135 new_layout_context :: Bool -> Action
1136 new_layout_context strict span _buf _len = do
1137     _ <- popLexState
1138     (AI l _) <- getInput
1139     let offset = srcLocCol l
1140     ctx <- getContext
1141     nondecreasing <- extension nondecreasingIndentation
1142     let strict' = strict || not nondecreasing
1143     case ctx of
1144         Layout prev_off : _  |
1145            (strict'     && prev_off >= offset  ||
1146             not strict' && prev_off > offset) -> do
1147                 -- token is indented to the left of the previous context.
1148                 -- we must generate a {} sequence now.
1149                 pushLexState layout_left
1150                 return (L span ITvocurly)
1151         _ -> do
1152                 setContext (Layout offset : ctx)
1153                 return (L span ITvocurly)
1154
1155 do_layout_left :: Action
1156 do_layout_left span _buf _len = do
1157     _ <- popLexState
1158     pushLexState bol  -- we must be at the start of a line
1159     return (L span ITvccurly)
1160
1161 -- -----------------------------------------------------------------------------
1162 -- LINE pragmas
1163
1164 setLine :: Int -> Action
1165 setLine code span buf len = do
1166   let line = parseUnsignedInteger buf len 10 octDecDigit
1167   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 1)
1168         -- subtract one: the line number refers to the *following* line
1169   _ <- popLexState
1170   pushLexState code
1171   lexToken
1172
1173 setFile :: Int -> Action
1174 setFile code span buf len = do
1175   let file = lexemeToFastString (stepOn buf) (len-2)
1176   setAlrLastLoc noSrcSpan
1177   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1178   _ <- popLexState
1179   pushLexState code
1180   lexToken
1181
1182
1183 -- -----------------------------------------------------------------------------
1184 -- Options, includes and language pragmas.
1185
1186 lex_string_prag :: (String -> Token) -> Action
1187 lex_string_prag mkTok span _buf _len
1188     = do input <- getInput
1189          start <- getSrcLoc
1190          tok <- go [] input
1191          end <- getSrcLoc
1192          return (L (mkSrcSpan start end) tok)
1193     where go acc input
1194               = if isString input "#-}"
1195                    then do setInput input
1196                            return (mkTok (reverse acc))
1197                    else case alexGetChar input of
1198                           Just (c,i) -> go (c:acc) i
1199                           Nothing -> err input
1200           isString _ [] = True
1201           isString i (x:xs)
1202               = case alexGetChar i of
1203                   Just (c,i') | c == x    -> isString i' xs
1204                   _other -> False
1205           err (AI end _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1206
1207
1208 -- -----------------------------------------------------------------------------
1209 -- Strings & Chars
1210
1211 -- This stuff is horrible.  I hates it.
1212
1213 lex_string_tok :: Action
1214 lex_string_tok span _buf _len = do
1215   tok <- lex_string ""
1216   end <- getSrcLoc
1217   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1218
1219 lex_string :: String -> P Token
1220 lex_string s = do
1221   i <- getInput
1222   case alexGetChar' i of
1223     Nothing -> lit_error i
1224
1225     Just ('"',i)  -> do
1226         setInput i
1227         magicHash <- extension magicHashEnabled
1228         if magicHash
1229           then do
1230             i <- getInput
1231             case alexGetChar' i of
1232               Just ('#',i) -> do
1233                    setInput i
1234                    if any (> '\xFF') s
1235                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1236                     else let s' = mkZFastString (reverse s) in
1237                          return (ITprimstring s')
1238                         -- mkZFastString is a hack to avoid encoding the
1239                         -- string in UTF-8.  We just want the exact bytes.
1240               _other ->
1241                 return (ITstring (mkFastString (reverse s)))
1242           else
1243                 return (ITstring (mkFastString (reverse s)))
1244
1245     Just ('\\',i)
1246         | Just ('&',i) <- next -> do
1247                 setInput i; lex_string s
1248         | Just (c,i) <- next, c <= '\x7f' && is_space c -> do
1249                            -- is_space only works for <= '\x7f' (#3751)
1250                 setInput i; lex_stringgap s
1251         where next = alexGetChar' i
1252
1253     Just (c, i1) -> do
1254         case c of
1255           '\\' -> do setInput i1; c' <- lex_escape; lex_string (c':s)
1256           c | isAny c -> do setInput i1; lex_string (c:s)
1257           _other -> lit_error i
1258
1259 lex_stringgap :: String -> P Token
1260 lex_stringgap s = do
1261   i <- getInput
1262   c <- getCharOrFail i
1263   case c of
1264     '\\' -> lex_string s
1265     c | is_space c -> lex_stringgap s
1266     _other -> lit_error i
1267
1268
1269 lex_char_tok :: Action
1270 -- Here we are basically parsing character literals, such as 'x' or '\n'
1271 -- but, when Template Haskell is on, we additionally spot
1272 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1273 -- but WITHOUT CONSUMING the x or T part  (the parser does that).
1274 -- So we have to do two characters of lookahead: when we see 'x we need to
1275 -- see if there's a trailing quote
1276 lex_char_tok span _buf _len = do        -- We've seen '
1277    i1 <- getInput       -- Look ahead to first character
1278    let loc = srcSpanStart span
1279    case alexGetChar' i1 of
1280         Nothing -> lit_error  i1
1281
1282         Just ('\'', i2@(AI end2 _)) -> do       -- We've seen ''
1283                   th_exts <- extension thEnabled
1284                   if th_exts then do
1285                         setInput i2
1286                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1287                    else lit_error i1
1288
1289         Just ('\\', i2@(AI _end2 _)) -> do      -- We've seen 'backslash
1290                   setInput i2
1291                   lit_ch <- lex_escape
1292                   i3 <- getInput
1293                   mc <- getCharOrFail i3 -- Trailing quote
1294                   if mc == '\'' then finish_char_tok loc lit_ch
1295                                 else lit_error i3
1296
1297         Just (c, i2@(AI _end2 _))
1298                 | not (isAny c) -> lit_error i1
1299                 | otherwise ->
1300
1301                 -- We've seen 'x, where x is a valid character
1302                 --  (i.e. not newline etc) but not a quote or backslash
1303            case alexGetChar' i2 of      -- Look ahead one more character
1304                 Just ('\'', i3) -> do   -- We've seen 'x'
1305                         setInput i3
1306                         finish_char_tok loc c
1307                 _other -> do            -- We've seen 'x not followed by quote
1308                                         -- (including the possibility of EOF)
1309                                         -- If TH is on, just parse the quote only
1310                         th_exts <- extension thEnabled
1311                         let (AI end _) = i1
1312                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1313                                    else lit_error i2
1314
1315 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1316 finish_char_tok loc ch  -- We've already seen the closing quote
1317                         -- Just need to check for trailing #
1318   = do  magicHash <- extension magicHashEnabled
1319         i@(AI end _) <- getInput
1320         if magicHash then do
1321                 case alexGetChar' i of
1322                         Just ('#',i@(AI end _)) -> do
1323                                 setInput i
1324                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1325                         _other ->
1326                                 return (L (mkSrcSpan loc end) (ITchar ch))
1327             else do
1328                    return (L (mkSrcSpan loc end) (ITchar ch))
1329
1330 isAny :: Char -> Bool
1331 isAny c | c > '\x7f' = isPrint c
1332         | otherwise  = is_any c
1333
1334 lex_escape :: P Char
1335 lex_escape = do
1336   i0 <- getInput
1337   c <- getCharOrFail i0
1338   case c of
1339         'a'   -> return '\a'
1340         'b'   -> return '\b'
1341         'f'   -> return '\f'
1342         'n'   -> return '\n'
1343         'r'   -> return '\r'
1344         't'   -> return '\t'
1345         'v'   -> return '\v'
1346         '\\'  -> return '\\'
1347         '"'   -> return '\"'
1348         '\''  -> return '\''
1349         '^'   -> do i1 <- getInput
1350                     c <- getCharOrFail i1
1351                     if c >= '@' && c <= '_'
1352                         then return (chr (ord c - ord '@'))
1353                         else lit_error i1
1354
1355         'x'   -> readNum is_hexdigit 16 hexDigit
1356         'o'   -> readNum is_octdigit  8 octDecDigit
1357         x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1358
1359         c1 ->  do
1360            i <- getInput
1361            case alexGetChar' i of
1362             Nothing -> lit_error i0
1363             Just (c2,i2) ->
1364               case alexGetChar' i2 of
1365                 Nothing -> do lit_error i0
1366                 Just (c3,i3) ->
1367                    let str = [c1,c2,c3] in
1368                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1369                                      Just rest <- [stripPrefix p str] ] of
1370                           (escape_char,[]):_ -> do
1371                                 setInput i3
1372                                 return escape_char
1373                           (escape_char,_:_):_ -> do
1374                                 setInput i2
1375                                 return escape_char
1376                           [] -> lit_error i0
1377
1378 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1379 readNum is_digit base conv = do
1380   i <- getInput
1381   c <- getCharOrFail i
1382   if is_digit c
1383         then readNum2 is_digit base conv (conv c)
1384         else lit_error i
1385
1386 readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char
1387 readNum2 is_digit base conv i = do
1388   input <- getInput
1389   read i input
1390   where read i input = do
1391           case alexGetChar' input of
1392             Just (c,input') | is_digit c -> do
1393                let i' = i*base + conv c
1394                if i' > 0x10ffff
1395                   then setInput input >> lexError "numeric escape sequence out of range"
1396                   else read i' input'
1397             _other -> do
1398               setInput input; return (chr i)
1399
1400
1401 silly_escape_chars :: [(String, Char)]
1402 silly_escape_chars = [
1403         ("NUL", '\NUL'),
1404         ("SOH", '\SOH'),
1405         ("STX", '\STX'),
1406         ("ETX", '\ETX'),
1407         ("EOT", '\EOT'),
1408         ("ENQ", '\ENQ'),
1409         ("ACK", '\ACK'),
1410         ("BEL", '\BEL'),
1411         ("BS", '\BS'),
1412         ("HT", '\HT'),
1413         ("LF", '\LF'),
1414         ("VT", '\VT'),
1415         ("FF", '\FF'),
1416         ("CR", '\CR'),
1417         ("SO", '\SO'),
1418         ("SI", '\SI'),
1419         ("DLE", '\DLE'),
1420         ("DC1", '\DC1'),
1421         ("DC2", '\DC2'),
1422         ("DC3", '\DC3'),
1423         ("DC4", '\DC4'),
1424         ("NAK", '\NAK'),
1425         ("SYN", '\SYN'),
1426         ("ETB", '\ETB'),
1427         ("CAN", '\CAN'),
1428         ("EM", '\EM'),
1429         ("SUB", '\SUB'),
1430         ("ESC", '\ESC'),
1431         ("FS", '\FS'),
1432         ("GS", '\GS'),
1433         ("RS", '\RS'),
1434         ("US", '\US'),
1435         ("SP", '\SP'),
1436         ("DEL", '\DEL')
1437         ]
1438
1439 -- before calling lit_error, ensure that the current input is pointing to
1440 -- the position of the error in the buffer.  This is so that we can report
1441 -- a correct location to the user, but also so we can detect UTF-8 decoding
1442 -- errors if they occur.
1443 lit_error :: AlexInput -> P a
1444 lit_error i = do setInput i; lexError "lexical error in string/character literal"
1445
1446 getCharOrFail :: AlexInput -> P Char
1447 getCharOrFail i =  do
1448   case alexGetChar' i of
1449         Nothing -> lexError "unexpected end-of-file in string/character literal"
1450         Just (c,i)  -> do setInput i; return c
1451
1452 -- -----------------------------------------------------------------------------
1453 -- QuasiQuote
1454
1455 lex_quasiquote_tok :: Action
1456 lex_quasiquote_tok span buf len = do
1457   let quoter = tail (lexemeToString buf (len - 1))
1458                 -- 'tail' drops the initial '[',
1459                 -- while the -1 drops the trailing '|'
1460   quoteStart <- getSrcLoc
1461   quote <- lex_quasiquote ""
1462   end <- getSrcLoc
1463   return (L (mkSrcSpan (srcSpanStart span) end)
1464            (ITquasiQuote (mkFastString quoter,
1465                           mkFastString (reverse quote),
1466                           mkSrcSpan quoteStart end)))
1467
1468 lex_quasiquote :: String -> P String
1469 lex_quasiquote s = do
1470   i <- getInput
1471   case alexGetChar' i of
1472     Nothing -> lit_error i
1473
1474     Just ('\\',i)
1475         | Just ('|',i) <- next -> do
1476                 setInput i; lex_quasiquote ('|' : s)
1477         | Just (']',i) <- next -> do
1478                 setInput i; lex_quasiquote (']' : s)
1479         where next = alexGetChar' i
1480
1481     Just ('|',i)
1482         | Just (']',i) <- next -> do
1483                 setInput i; return s
1484         where next = alexGetChar' i
1485
1486     Just (c, i) -> do
1487          setInput i; lex_quasiquote (c : s)
1488
1489 -- -----------------------------------------------------------------------------
1490 -- Warnings
1491
1492 warn :: DynFlag -> SDoc -> Action
1493 warn option warning srcspan _buf _len = do
1494     addWarning option srcspan warning
1495     lexToken
1496
1497 warnThen :: DynFlag -> SDoc -> Action -> Action
1498 warnThen option warning action srcspan buf len = do
1499     addWarning option srcspan warning
1500     action srcspan buf len
1501
1502 -- -----------------------------------------------------------------------------
1503 -- The Parse Monad
1504
1505 data LayoutContext
1506   = NoLayout
1507   | Layout !Int
1508   deriving Show
1509
1510 data ParseResult a
1511   = POk PState a
1512   | PFailed
1513         SrcSpan         -- The start and end of the text span related to
1514                         -- the error.  Might be used in environments which can
1515                         -- show this span, e.g. by highlighting it.
1516         Message         -- The error message
1517
1518 data PState = PState {
1519         buffer     :: StringBuffer,
1520         dflags     :: DynFlags,
1521         messages   :: Messages,
1522         last_loc   :: SrcSpan,  -- pos of previous token
1523         last_len   :: !Int,     -- len of previous token
1524         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1525         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1526         context    :: [LayoutContext],
1527         lex_state  :: [Int],
1528         -- Used in the alternative layout rule:
1529         -- These tokens are the next ones to be sent out. They are
1530         -- just blindly emitted, without the rule looking at them again:
1531         alr_pending_implicit_tokens :: [Located Token],
1532         -- This is the next token to be considered or, if it is Nothing,
1533         -- we need to get the next token from the input stream:
1534         alr_next_token :: Maybe (Located Token),
1535         -- This is what we consider to be the locatino of the last token
1536         -- emitted:
1537         alr_last_loc :: SrcSpan,
1538         -- The stack of layout contexts:
1539         alr_context :: [ALRContext],
1540         -- Are we expecting a '{'? If it's Just, then the ALRLayout tells
1541         -- us what sort of layout the '{' will open:
1542         alr_expecting_ocurly :: Maybe ALRLayout,
1543         -- Have we just had the '}' for a let block? If so, than an 'in'
1544         -- token doesn't need to close anything:
1545         alr_justClosedExplicitLetBlock :: Bool,
1546         code_type_bracket_depth :: Int
1547      }
1548         -- last_loc and last_len are used when generating error messages,
1549         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1550         -- current token to happyError, we could at least get rid of last_len.
1551         -- Getting rid of last_loc would require finding another way to
1552         -- implement pushCurrentContext (which is only called from one place).
1553
1554 data ALRContext = ALRNoLayout Bool{- does it contain commas? -}
1555                               Bool{- is it a 'let' block? -}
1556                 | ALRLayout ALRLayout Int
1557 data ALRLayout = ALRLayoutLet
1558                | ALRLayoutWhere
1559                | ALRLayoutOf
1560                | ALRLayoutDo
1561
1562 newtype P a = P { unP :: PState -> ParseResult a }
1563
1564 instance Monad P where
1565   return = returnP
1566   (>>=) = thenP
1567   fail = failP
1568
1569 returnP :: a -> P a
1570 returnP a = a `seq` (P $ \s -> POk s a)
1571
1572 thenP :: P a -> (a -> P b) -> P b
1573 (P m) `thenP` k = P $ \ s ->
1574         case m s of
1575                 POk s1 a         -> (unP (k a)) s1
1576                 PFailed span err -> PFailed span err
1577
1578 failP :: String -> P a
1579 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1580
1581 failMsgP :: String -> P a
1582 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1583
1584 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1585 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (mkSrcSpan loc1 loc2) (text str)
1586
1587 failSpanMsgP :: SrcSpan -> SDoc -> P a
1588 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1589
1590 getPState :: P PState
1591 getPState = P $ \s -> POk s s
1592
1593 getDynFlags :: P DynFlags
1594 getDynFlags = P $ \s -> POk s (dflags s)
1595
1596 withThisPackage :: (PackageId -> a) -> P a
1597 withThisPackage f
1598  = do   pkg     <- liftM thisPackage getDynFlags
1599         return  $ f pkg
1600
1601 extension :: (Int -> Bool) -> P Bool
1602 extension p = P $ \s -> POk s (p $! extsBitmap s)
1603
1604 getExts :: P Int
1605 getExts = P $ \s -> POk s (extsBitmap s)
1606
1607 setExts :: (Int -> Int) -> P ()
1608 setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
1609
1610 setSrcLoc :: SrcLoc -> P ()
1611 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1612
1613 incrBracketDepth :: P ()
1614 incrBracketDepth = P $ \s -> POk (s{code_type_bracket_depth = (code_type_bracket_depth s)+1}) ()
1615 decrBracketDepth :: P ()
1616 decrBracketDepth = P $ \s -> POk (s{code_type_bracket_depth = (code_type_bracket_depth s)-1}) ()
1617 getParserBrakDepth :: P Int
1618 getParserBrakDepth = P $ \s -> POk s (code_type_bracket_depth s)
1619
1620 getSrcLoc :: P SrcLoc
1621 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1622
1623 setLastToken :: SrcSpan -> Int -> P ()
1624 setLastToken loc len = P $ \s -> POk s {
1625   last_loc=loc,
1626   last_len=len
1627   } ()
1628
1629 data AlexInput = AI SrcLoc StringBuffer
1630
1631 alexInputPrevChar :: AlexInput -> Char
1632 alexInputPrevChar (AI _ buf) = prevChar buf '\n'
1633
1634 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1635 alexGetChar (AI loc s)
1636   | atEnd s   = Nothing
1637   | otherwise = adj_c `seq` loc' `seq` s' `seq`
1638                 --trace (show (ord c)) $
1639                 Just (adj_c, (AI loc' s'))
1640   where (c,s') = nextChar s
1641         loc'   = advanceSrcLoc loc c
1642
1643         non_graphic     = '\x0'
1644         upper           = '\x1'
1645         lower           = '\x2'
1646         digit           = '\x3'
1647         symbol          = '\x4'
1648         space           = '\x5'
1649         other_graphic   = '\x6'
1650
1651         adj_c
1652           | c <= '\x06' = non_graphic
1653           | c <= '\x7f' = c
1654           -- Alex doesn't handle Unicode, so when Unicode
1655           -- character is encountered we output these values
1656           -- with the actual character value hidden in the state.
1657           | otherwise =
1658                 case generalCategory c of
1659                   UppercaseLetter       -> upper
1660                   LowercaseLetter       -> lower
1661                   TitlecaseLetter       -> upper
1662                   ModifierLetter        -> other_graphic
1663                   OtherLetter           -> lower -- see #1103
1664                   NonSpacingMark        -> other_graphic
1665                   SpacingCombiningMark  -> other_graphic
1666                   EnclosingMark         -> other_graphic
1667                   DecimalNumber         -> digit
1668                   LetterNumber          -> other_graphic
1669                   OtherNumber           -> digit -- see #4373
1670                   ConnectorPunctuation  -> symbol
1671                   DashPunctuation       -> symbol
1672                   OpenPunctuation       -> other_graphic
1673                   ClosePunctuation      -> other_graphic
1674                   InitialQuote          -> other_graphic
1675                   FinalQuote            -> other_graphic
1676                   OtherPunctuation      -> symbol
1677                   MathSymbol            -> symbol
1678                   CurrencySymbol        -> symbol
1679                   ModifierSymbol        -> symbol
1680                   OtherSymbol           -> symbol
1681                   Space                 -> space
1682                   _other                -> non_graphic
1683
1684 -- This version does not squash unicode characters, it is used when
1685 -- lexing strings.
1686 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1687 alexGetChar' (AI loc s)
1688   | atEnd s   = Nothing
1689   | otherwise = c `seq` loc' `seq` s' `seq`
1690                 --trace (show (ord c)) $
1691                 Just (c, (AI loc' s'))
1692   where (c,s') = nextChar s
1693         loc'   = advanceSrcLoc loc c
1694
1695 getInput :: P AlexInput
1696 getInput = P $ \s@PState{ loc=l, buffer=b } -> POk s (AI l b)
1697
1698 setInput :: AlexInput -> P ()
1699 setInput (AI l b) = P $ \s -> POk s{ loc=l, buffer=b } ()
1700
1701 nextIsEOF :: P Bool
1702 nextIsEOF = do
1703   AI _ s <- getInput
1704   return $ atEnd s
1705
1706 pushLexState :: Int -> P ()
1707 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1708
1709 popLexState :: P Int
1710 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1711
1712 getLexState :: P Int
1713 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1714
1715 popNextToken :: P (Maybe (Located Token))
1716 popNextToken
1717     = P $ \s@PState{ alr_next_token = m } ->
1718               POk (s {alr_next_token = Nothing}) m
1719
1720 activeContext :: P Bool
1721 activeContext = do
1722   ctxt <- getALRContext
1723   expc <- getAlrExpectingOCurly
1724   impt <- implicitTokenPending
1725   case (ctxt,expc) of
1726     ([],Nothing) -> return impt
1727     _other       -> return True
1728
1729 setAlrLastLoc :: SrcSpan -> P ()
1730 setAlrLastLoc l = P $ \s -> POk (s {alr_last_loc = l}) ()
1731
1732 getAlrLastLoc :: P SrcSpan
1733 getAlrLastLoc = P $ \s@(PState {alr_last_loc = l}) -> POk s l
1734
1735 getALRContext :: P [ALRContext]
1736 getALRContext = P $ \s@(PState {alr_context = cs}) -> POk s cs
1737
1738 setALRContext :: [ALRContext] -> P ()
1739 setALRContext cs = P $ \s -> POk (s {alr_context = cs}) ()
1740
1741 getJustClosedExplicitLetBlock :: P Bool
1742 getJustClosedExplicitLetBlock
1743  = P $ \s@(PState {alr_justClosedExplicitLetBlock = b}) -> POk s b
1744
1745 setJustClosedExplicitLetBlock :: Bool -> P ()
1746 setJustClosedExplicitLetBlock b
1747  = P $ \s -> POk (s {alr_justClosedExplicitLetBlock = b}) ()
1748
1749 setNextToken :: Located Token -> P ()
1750 setNextToken t = P $ \s -> POk (s {alr_next_token = Just t}) ()
1751
1752 implicitTokenPending :: P Bool
1753 implicitTokenPending
1754     = P $ \s@PState{ alr_pending_implicit_tokens = ts } ->
1755               case ts of
1756               [] -> POk s False
1757               _  -> POk s True
1758
1759 popPendingImplicitToken :: P (Maybe (Located Token))
1760 popPendingImplicitToken
1761     = P $ \s@PState{ alr_pending_implicit_tokens = ts } ->
1762               case ts of
1763               [] -> POk s Nothing
1764               (t : ts') -> POk (s {alr_pending_implicit_tokens = ts'}) (Just t)
1765
1766 setPendingImplicitTokens :: [Located Token] -> P ()
1767 setPendingImplicitTokens ts = P $ \s -> POk (s {alr_pending_implicit_tokens = ts}) ()
1768
1769 getAlrExpectingOCurly :: P (Maybe ALRLayout)
1770 getAlrExpectingOCurly = P $ \s@(PState {alr_expecting_ocurly = b}) -> POk s b
1771
1772 setAlrExpectingOCurly :: Maybe ALRLayout -> P ()
1773 setAlrExpectingOCurly b = P $ \s -> POk (s {alr_expecting_ocurly = b}) ()
1774
1775 -- for reasons of efficiency, flags indicating language extensions (eg,
1776 -- -fglasgow-exts or -XParallelArrays) are represented by a bitmap stored in an unboxed
1777 -- integer
1778
1779 genericsBit :: Int
1780 genericsBit = 0 -- {| and |}
1781 ffiBit :: Int
1782 ffiBit     = 1
1783 parrBit :: Int
1784 parrBit    = 2
1785 arrowsBit :: Int
1786 arrowsBit  = 4
1787 thBit :: Int
1788 thBit      = 5
1789 ipBit :: Int
1790 ipBit      = 6
1791 explicitForallBit :: Int
1792 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1793 bangPatBit :: Int
1794 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1795                 -- (doesn't affect the lexer)
1796 tyFamBit :: Int
1797 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1798 haddockBit :: Int
1799 haddockBit = 10 -- Lex and parse Haddock comments
1800 magicHashBit :: Int
1801 magicHashBit = 11 -- "#" in both functions and operators
1802 kindSigsBit :: Int
1803 kindSigsBit = 12 -- Kind signatures on type variables
1804 recursiveDoBit :: Int
1805 recursiveDoBit = 13 -- mdo
1806 unicodeSyntaxBit :: Int
1807 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1808 unboxedTuplesBit :: Int
1809 unboxedTuplesBit = 15 -- (# and #)
1810 datatypeContextsBit :: Int
1811 datatypeContextsBit = 16
1812 transformComprehensionsBit :: Int
1813 transformComprehensionsBit = 17
1814 qqBit :: Int
1815 qqBit      = 18 -- enable quasiquoting
1816 inRulePragBit :: Int
1817 inRulePragBit = 19
1818 rawTokenStreamBit :: Int
1819 rawTokenStreamBit = 20 -- producing a token stream with all comments included
1820 recBit :: Int
1821 recBit = 22 -- rec
1822 alternativeLayoutRuleBit :: Int
1823 alternativeLayoutRuleBit = 23
1824 relaxedLayoutBit :: Int
1825 relaxedLayoutBit = 24
1826 nondecreasingIndentationBit :: Int
1827 nondecreasingIndentationBit = 25
1828 hetMetBit :: Int
1829 hetMetBit = 31
1830
1831 always :: Int -> Bool
1832 always           _     = True
1833 genericsEnabled :: Int -> Bool
1834 genericsEnabled  flags = testBit flags genericsBit
1835 parrEnabled :: Int -> Bool
1836 parrEnabled      flags = testBit flags parrBit
1837 arrowsEnabled :: Int -> Bool
1838 arrowsEnabled    flags = testBit flags arrowsBit
1839 hetMetEnabled :: Int -> Bool
1840 hetMetEnabled    flags = testBit flags hetMetBit
1841 thEnabled :: Int -> Bool
1842 thEnabled        flags = testBit flags thBit
1843 ipEnabled :: Int -> Bool
1844 ipEnabled        flags = testBit flags ipBit
1845 explicitForallEnabled :: Int -> Bool
1846 explicitForallEnabled flags = testBit flags explicitForallBit
1847 bangPatEnabled :: Int -> Bool
1848 bangPatEnabled   flags = testBit flags bangPatBit
1849 -- tyFamEnabled :: Int -> Bool
1850 -- tyFamEnabled     flags = testBit flags tyFamBit
1851 haddockEnabled :: Int -> Bool
1852 haddockEnabled   flags = testBit flags haddockBit
1853 magicHashEnabled :: Int -> Bool
1854 magicHashEnabled flags = testBit flags magicHashBit
1855 -- kindSigsEnabled :: Int -> Bool
1856 -- kindSigsEnabled  flags = testBit flags kindSigsBit
1857 unicodeSyntaxEnabled :: Int -> Bool
1858 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1859 unboxedTuplesEnabled :: Int -> Bool
1860 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1861 datatypeContextsEnabled :: Int -> Bool
1862 datatypeContextsEnabled flags = testBit flags datatypeContextsBit
1863 qqEnabled :: Int -> Bool
1864 qqEnabled        flags = testBit flags qqBit
1865 -- inRulePrag :: Int -> Bool
1866 -- inRulePrag       flags = testBit flags inRulePragBit
1867 rawTokenStreamEnabled :: Int -> Bool
1868 rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
1869 alternativeLayoutRule :: Int -> Bool
1870 alternativeLayoutRule flags = testBit flags alternativeLayoutRuleBit
1871 relaxedLayout :: Int -> Bool
1872 relaxedLayout flags = testBit flags relaxedLayoutBit
1873 nondecreasingIndentation :: Int -> Bool
1874 nondecreasingIndentation flags = testBit flags nondecreasingIndentationBit
1875
1876 -- PState for parsing options pragmas
1877 --
1878 pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1879 pragState dynflags buf loc = (mkPState dynflags buf loc) {
1880                                  lex_state = [bol, option_prags, 0]
1881                              }
1882
1883 -- create a parse state
1884 --
1885 mkPState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1886 mkPState flags buf loc =
1887   PState {
1888       buffer          = buf,
1889       dflags        = flags,
1890       messages      = emptyMessages,
1891       last_loc      = mkSrcSpan loc loc,
1892       last_len      = 0,
1893       loc           = loc,
1894       extsBitmap    = fromIntegral bitmap,
1895       context       = [],
1896       lex_state     = [bol, 0],
1897       alr_pending_implicit_tokens = [],
1898       alr_next_token = Nothing,
1899       alr_last_loc = noSrcSpan,
1900       alr_context = [],
1901       alr_expecting_ocurly = Nothing,
1902       alr_justClosedExplicitLetBlock = False,
1903       code_type_bracket_depth = 0
1904     }
1905     where
1906       bitmap = genericsBit `setBitIf` xopt Opt_Generics flags
1907                .|. ffiBit            `setBitIf` xopt Opt_ForeignFunctionInterface flags
1908                .|. parrBit           `setBitIf` xopt Opt_ParallelArrays  flags
1909                .|. arrowsBit         `setBitIf` xopt Opt_Arrows          flags
1910                .|. hetMetBit        `setBitIf` xopt Opt_ModalTypes         flags
1911                .|. thBit             `setBitIf` xopt Opt_TemplateHaskell flags
1912                .|. qqBit             `setBitIf` xopt Opt_QuasiQuotes     flags
1913                .|. ipBit             `setBitIf` xopt Opt_ImplicitParams  flags
1914                .|. explicitForallBit `setBitIf` xopt Opt_ExplicitForAll  flags
1915                .|. bangPatBit        `setBitIf` xopt Opt_BangPatterns flags
1916                .|. tyFamBit          `setBitIf` xopt Opt_TypeFamilies flags
1917                .|. haddockBit        `setBitIf` dopt Opt_Haddock      flags
1918                .|. magicHashBit      `setBitIf` xopt Opt_MagicHash    flags
1919                .|. kindSigsBit       `setBitIf` xopt Opt_KindSignatures flags
1920                .|. recursiveDoBit    `setBitIf` xopt Opt_RecursiveDo flags
1921                .|. recBit            `setBitIf` xopt Opt_DoRec  flags
1922                .|. recBit            `setBitIf` xopt Opt_Arrows flags
1923                .|. unicodeSyntaxBit  `setBitIf` xopt Opt_UnicodeSyntax flags
1924                .|. unboxedTuplesBit  `setBitIf` xopt Opt_UnboxedTuples flags
1925                .|. datatypeContextsBit `setBitIf` xopt Opt_DatatypeContexts flags
1926                .|. transformComprehensionsBit `setBitIf` xopt Opt_TransformListComp flags
1927                .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
1928                .|. alternativeLayoutRuleBit `setBitIf` xopt Opt_AlternativeLayoutRule flags
1929                .|. relaxedLayoutBit `setBitIf` xopt Opt_RelaxedLayout flags
1930                .|. nondecreasingIndentationBit `setBitIf` xopt Opt_NondecreasingIndentation flags
1931       --
1932       setBitIf :: Int -> Bool -> Int
1933       b `setBitIf` cond | cond      = bit b
1934                         | otherwise = 0
1935
1936 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1937 addWarning option srcspan warning
1938  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1939        let warning' = mkWarnMsg srcspan alwaysQualify warning
1940            ws' = if dopt option d then ws `snocBag` warning' else ws
1941        in POk s{messages=(ws', es)} ()
1942
1943 getMessages :: PState -> Messages
1944 getMessages PState{messages=ms} = ms
1945
1946 getContext :: P [LayoutContext]
1947 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1948
1949 setContext :: [LayoutContext] -> P ()
1950 setContext ctx = P $ \s -> POk s{context=ctx} ()
1951
1952 popContext :: P ()
1953 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1954                               last_len = len, last_loc = last_loc }) ->
1955   case ctx of
1956         (_:tl) -> POk s{ context = tl } ()
1957         []     -> PFailed last_loc (srcParseErr buf len)
1958
1959 -- Push a new layout context at the indentation of the last token read.
1960 -- This is only used at the outer level of a module when the 'module'
1961 -- keyword is missing.
1962 pushCurrentContext :: P ()
1963 pushCurrentContext = P $ \ s@PState{ last_loc=loc, context=ctx } ->
1964     POk s{context = Layout (srcSpanStartCol loc) : ctx} ()
1965
1966 getOffside :: P Ordering
1967 getOffside = P $ \s@PState{last_loc=loc, context=stk} ->
1968                 let offs = srcSpanStartCol loc in
1969                 let ord = case stk of
1970                         (Layout n:_) -> --trace ("layout: " ++ show n ++ ", offs: " ++ show offs) $
1971                                         compare offs n
1972                         _            -> GT
1973                 in POk s ord
1974
1975 -- ---------------------------------------------------------------------------
1976 -- Construct a parse error
1977
1978 srcParseErr
1979   :: StringBuffer       -- current buffer (placed just after the last token)
1980   -> Int                -- length of the previous token
1981   -> Message
1982 srcParseErr buf len
1983   = hcat [ if null token
1984              then ptext (sLit "parse error (possibly incorrect indentation)")
1985              else hcat [ptext (sLit "parse error on input "),
1986                         char '`', text token, char '\'']
1987     ]
1988   where token = lexemeToString (offsetBytes (-len) buf) len
1989
1990 -- Report a parse failure, giving the span of the previous token as
1991 -- the location of the error.  This is the entry point for errors
1992 -- detected during parsing.
1993 srcParseFail :: P a
1994 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1995                             last_loc = last_loc } ->
1996     PFailed last_loc (srcParseErr buf len)
1997
1998 -- A lexical error is reported at a particular position in the source file,
1999 -- not over a token range.
2000 lexError :: String -> P a
2001 lexError str = do
2002   loc <- getSrcLoc
2003   (AI end buf) <- getInput
2004   reportLexError loc end buf str
2005
2006 -- -----------------------------------------------------------------------------
2007 -- This is the top-level function: called from the parser each time a
2008 -- new token is to be read from the input.
2009
2010 lexer :: (Located Token -> P a) -> P a
2011 lexer cont = do
2012   alr <- extension alternativeLayoutRule
2013   let lexTokenFun = if alr then lexTokenAlr else lexToken
2014   tok@(L _span _tok__) <- lexTokenFun
2015   --trace ("token: " ++ show _tok__) $ do
2016   cont tok
2017
2018 lexTokenAlr :: P (Located Token)
2019 lexTokenAlr = do mPending <- popPendingImplicitToken
2020                  t <- case mPending of
2021                       Nothing ->
2022                           do mNext <- popNextToken
2023                              t <- case mNext of
2024                                   Nothing -> lexToken
2025                                   Just next -> return next
2026                              alternativeLayoutRuleToken t
2027                       Just t ->
2028                           return t
2029                  setAlrLastLoc (getLoc t)
2030                  case unLoc t of
2031                      ITwhere -> setAlrExpectingOCurly (Just ALRLayoutWhere)
2032                      ITlet   -> setAlrExpectingOCurly (Just ALRLayoutLet)
2033                      ITof    -> setAlrExpectingOCurly (Just ALRLayoutOf)
2034                      ITdo    -> setAlrExpectingOCurly (Just ALRLayoutDo)
2035                      ITmdo   -> setAlrExpectingOCurly (Just ALRLayoutDo)
2036                      ITrec   -> setAlrExpectingOCurly (Just ALRLayoutDo)
2037                      _       -> return ()
2038                  return t
2039
2040 alternativeLayoutRuleToken :: Located Token -> P (Located Token)
2041 alternativeLayoutRuleToken t
2042     = do context <- getALRContext
2043          lastLoc <- getAlrLastLoc
2044          mExpectingOCurly <- getAlrExpectingOCurly
2045          justClosedExplicitLetBlock <- getJustClosedExplicitLetBlock
2046          setJustClosedExplicitLetBlock False
2047          dflags <- getDynFlags
2048          let transitional = xopt Opt_AlternativeLayoutRuleTransitional dflags
2049              thisLoc = getLoc t
2050              thisCol = srcSpanStartCol thisLoc
2051              newLine = (lastLoc == noSrcSpan)
2052                     || (srcSpanStartLine thisLoc > srcSpanEndLine lastLoc)
2053          case (unLoc t, context, mExpectingOCurly) of
2054              -- This case handles a GHC extension to the original H98
2055              -- layout rule...
2056              (ITocurly, _, Just alrLayout) ->
2057                  do setAlrExpectingOCurly Nothing
2058                     let isLet = case alrLayout of
2059                                 ALRLayoutLet -> True
2060                                 _ -> False
2061                     setALRContext (ALRNoLayout (containsCommas ITocurly) isLet : context)
2062                     return t
2063              -- ...and makes this case unnecessary
2064              {-
2065              -- I think our implicit open-curly handling is slightly
2066              -- different to John's, in how it interacts with newlines
2067              -- and "in"
2068              (ITocurly, _, Just _) ->
2069                  do setAlrExpectingOCurly Nothing
2070                     setNextToken t
2071                     lexTokenAlr
2072              -}
2073              (_, ALRLayout _ col : ls, Just expectingOCurly)
2074               | (thisCol > col) ||
2075                 (thisCol == col &&
2076                  isNonDecreasingIntentation expectingOCurly) ->
2077                  do setAlrExpectingOCurly Nothing
2078                     setALRContext (ALRLayout expectingOCurly thisCol : context)
2079                     setNextToken t
2080                     return (L thisLoc ITocurly)
2081               | otherwise ->
2082                  do setAlrExpectingOCurly Nothing
2083                     setPendingImplicitTokens [L lastLoc ITccurly]
2084                     setNextToken t
2085                     return (L lastLoc ITocurly)
2086              (_, _, Just expectingOCurly) ->
2087                  do setAlrExpectingOCurly Nothing
2088                     setALRContext (ALRLayout expectingOCurly thisCol : context)
2089                     setNextToken t
2090                     return (L thisLoc ITocurly)
2091              -- We do the [] cases earlier than in the spec, as we
2092              -- have an actual EOF token
2093              (ITeof, ALRLayout _ _ : ls, _) ->
2094                  do setALRContext ls
2095                     setNextToken t
2096                     return (L thisLoc ITccurly)
2097              (ITeof, _, _) ->
2098                  return t
2099              -- the other ITeof case omitted; general case below covers it
2100              (ITin, _, _)
2101               | justClosedExplicitLetBlock ->
2102                  return t
2103              (ITin, ALRLayout ALRLayoutLet _ : ls, _)
2104               | newLine ->
2105                  do setPendingImplicitTokens [t]
2106                     setALRContext ls
2107                     return (L thisLoc ITccurly)
2108              -- This next case is to handle a transitional issue:
2109              (ITwhere, ALRLayout _ col : ls, _)
2110               | newLine && thisCol == col && transitional ->
2111                  do addWarning Opt_WarnAlternativeLayoutRuleTransitional
2112                                thisLoc
2113                                (transitionalAlternativeLayoutWarning
2114                                     "`where' clause at the same depth as implicit layout block")
2115                     setALRContext ls
2116                     setNextToken t
2117                     -- Note that we use lastLoc, as we may need to close
2118                     -- more layouts, or give a semicolon
2119                     return (L lastLoc ITccurly)
2120              -- This next case is to handle a transitional issue:
2121              (ITvbar, ALRLayout _ col : ls, _)
2122               | newLine && thisCol == col && transitional ->
2123                  do addWarning Opt_WarnAlternativeLayoutRuleTransitional
2124                                thisLoc
2125                                (transitionalAlternativeLayoutWarning
2126                                     "`|' at the same depth as implicit layout block")
2127                     setALRContext ls
2128                     setNextToken t
2129                     -- Note that we use lastLoc, as we may need to close
2130                     -- more layouts, or give a semicolon
2131                     return (L lastLoc ITccurly)
2132              (_, ALRLayout _ col : ls, _)
2133               | newLine && thisCol == col ->
2134                  do setNextToken t
2135                     return (L thisLoc ITsemi)
2136               | newLine && thisCol < col ->
2137                  do setALRContext ls
2138                     setNextToken t
2139                     -- Note that we use lastLoc, as we may need to close
2140                     -- more layouts, or give a semicolon
2141                     return (L lastLoc ITccurly)
2142              -- We need to handle close before open, as 'then' is both
2143              -- an open and a close
2144              (u, _, _)
2145               | isALRclose u ->
2146                  case context of
2147                  ALRLayout _ _ : ls ->
2148                      do setALRContext ls
2149                         setNextToken t
2150                         return (L thisLoc ITccurly)
2151                  ALRNoLayout _ isLet : ls ->
2152                      do let ls' = if isALRopen u
2153                                      then ALRNoLayout (containsCommas u) False : ls
2154                                      else ls
2155                         setALRContext ls'
2156                         when isLet $ setJustClosedExplicitLetBlock True
2157                         return t
2158                  [] ->
2159                      do let ls = if isALRopen u
2160                                     then [ALRNoLayout (containsCommas u) False]
2161                                     else ls
2162                         setALRContext ls
2163                         -- XXX This is an error in John's code, but
2164                         -- it looks reachable to me at first glance
2165                         return t
2166              (u, _, _)
2167               | isALRopen u ->
2168                  do setALRContext (ALRNoLayout (containsCommas u) False : context)
2169                     return t
2170              (ITin, ALRLayout ALRLayoutLet _ : ls, _) ->
2171                  do setALRContext ls
2172                     setPendingImplicitTokens [t]
2173                     return (L thisLoc ITccurly)
2174              (ITin, ALRLayout _ _ : ls, _) ->
2175                  do setALRContext ls
2176                     setNextToken t
2177                     return (L thisLoc ITccurly)
2178              -- the other ITin case omitted; general case below covers it
2179              (ITcomma, ALRLayout _ _ : ls, _)
2180               | topNoLayoutContainsCommas ls ->
2181                  do setALRContext ls
2182                     setNextToken t
2183                     return (L thisLoc ITccurly)
2184              (ITwhere, ALRLayout ALRLayoutDo _ : ls, _) ->
2185                  do setALRContext ls
2186                     setPendingImplicitTokens [t]
2187                     return (L thisLoc ITccurly)
2188              -- the other ITwhere case omitted; general case below covers it
2189              (_, _, _) -> return t
2190
2191 transitionalAlternativeLayoutWarning :: String -> SDoc
2192 transitionalAlternativeLayoutWarning msg
2193     = text "transitional layout will not be accepted in the future:"
2194    $$ text msg
2195
2196 isALRopen :: Token -> Bool
2197 isALRopen ITcase        = True
2198 isALRopen ITif          = True
2199 isALRopen ITthen        = True
2200 isALRopen IToparen      = True
2201 isALRopen ITobrack      = True
2202 isALRopen ITocurly      = True
2203 -- GHC Extensions:
2204 isALRopen IToubxparen   = True
2205 isALRopen ITparenEscape = True
2206 isALRopen _             = False
2207
2208 isALRclose :: Token -> Bool
2209 isALRclose ITof     = True
2210 isALRclose ITthen   = True
2211 isALRclose ITelse   = True
2212 isALRclose ITcparen = True
2213 isALRclose ITcbrack = True
2214 isALRclose ITccurly = True
2215 -- GHC Extensions:
2216 isALRclose ITcubxparen = True
2217 isALRclose _        = False
2218
2219 isNonDecreasingIntentation :: ALRLayout -> Bool
2220 isNonDecreasingIntentation ALRLayoutDo = True
2221 isNonDecreasingIntentation _           = False
2222
2223 containsCommas :: Token -> Bool
2224 containsCommas IToparen = True
2225 containsCommas ITobrack = True
2226 -- John doesn't have {} as containing commas, but records contain them,
2227 -- which caused a problem parsing Cabal's Distribution.Simple.InstallDirs
2228 -- (defaultInstallDirs).
2229 containsCommas ITocurly = True
2230 -- GHC Extensions:
2231 containsCommas IToubxparen = True
2232 containsCommas _        = False
2233
2234 topNoLayoutContainsCommas :: [ALRContext] -> Bool
2235 topNoLayoutContainsCommas [] = False
2236 topNoLayoutContainsCommas (ALRLayout _ _ : ls) = topNoLayoutContainsCommas ls
2237 topNoLayoutContainsCommas (ALRNoLayout b _ : _) = b
2238
2239 lexToken :: P (Located Token)
2240 lexToken = do
2241   inp@(AI loc1 buf) <- getInput
2242   sc <- getLexState
2243   exts <- getExts
2244   case alexScanUser exts inp sc of
2245     AlexEOF -> do
2246         let span = mkSrcSpan loc1 loc1
2247         setLastToken span 0
2248         return (L span ITeof)
2249     AlexError (AI loc2 buf) ->
2250         reportLexError loc1 loc2 buf "lexical error"
2251     AlexSkip inp2 _ -> do
2252         setInput inp2
2253         lexToken
2254     AlexToken inp2@(AI end buf2) _ t -> do
2255         setInput inp2
2256         let span = mkSrcSpan loc1 end
2257         let bytes = byteDiff buf buf2
2258         span `seq` setLastToken span bytes
2259         t span buf bytes
2260
2261 reportLexError :: SrcLoc -> SrcLoc -> StringBuffer -> [Char] -> P a
2262 reportLexError loc1 loc2 buf str
2263   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
2264   | otherwise =
2265   let
2266         c = fst (nextChar buf)
2267   in
2268   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
2269     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
2270     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
2271
2272 lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token]
2273 lexTokenStream buf loc dflags = unP go initState
2274     where dflags' = dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream
2275           initState = mkPState dflags' buf loc
2276           go = do
2277             ltok <- lexer return
2278             case ltok of
2279               L _ ITeof -> return []
2280               _ -> liftM (ltok:) go
2281
2282 linePrags = Map.singleton "line" (begin line_prag2)
2283
2284 fileHeaderPrags = Map.fromList([("options", lex_string_prag IToptions_prag),
2285                                  ("options_ghc", lex_string_prag IToptions_prag),
2286                                  ("options_haddock", lex_string_prag ITdocOptions),
2287                                  ("language", token ITlanguage_prag),
2288                                  ("include", lex_string_prag ITinclude_prag)])
2289
2290 ignoredPrags = Map.fromList (map ignored pragmas)
2291                where ignored opt = (opt, nested_comment lexToken)
2292                      impls = ["hugs", "nhc98", "jhc", "yhc", "catch", "derive"]
2293                      options_pragmas = map ("options_" ++) impls
2294                      -- CFILES is a hugs-only thing.
2295                      pragmas = options_pragmas ++ ["cfiles", "contract"]
2296
2297 oneWordPrags = Map.fromList([("rules", rulePrag),
2298                            ("inline", token (ITinline_prag Inline FunLike)),
2299                            ("inlinable", token (ITinline_prag Inlinable FunLike)),
2300                            ("inlineable", token (ITinline_prag Inlinable FunLike)),
2301                                           -- Spelling variant
2302                            ("notinline", token (ITinline_prag NoInline FunLike)),
2303                            ("specialize", token ITspec_prag),
2304                            ("source", token ITsource_prag),
2305                            ("warning", token ITwarning_prag),
2306                            ("deprecated", token ITdeprecated_prag),
2307                            ("scc", token ITscc_prag),
2308                            ("generated", token ITgenerated_prag),
2309                            ("core", token ITcore_prag),
2310                            ("unpack", token ITunpack_prag),
2311                            ("ann", token ITann_prag),
2312                            ("vectorize", token ITvect_prag)])
2313
2314 twoWordPrags = Map.fromList([("inline conlike", token (ITinline_prag Inline ConLike)),
2315                              ("notinline conlike", token (ITinline_prag NoInline ConLike)),
2316                              ("specialize inline", token (ITspec_inline_prag True)),
2317                              ("specialize notinline", token (ITspec_inline_prag False)),
2318                              ("vectorize scalar", token ITvect_scalar_prag)])
2319
2320 dispatch_pragmas :: Map String Action -> Action
2321 dispatch_pragmas prags span buf len = case Map.lookup (clean_pragma (lexemeToString buf len)) prags of
2322                                        Just found -> found span buf len
2323                                        Nothing -> lexError "unknown pragma"
2324
2325 known_pragma :: Map String Action -> AlexAccPred Int
2326 known_pragma prags _ _ len (AI _ buf) = (isJust $ Map.lookup (clean_pragma (lexemeToString (offsetBytes (- len) buf) len)) prags)
2327                                           && (nextCharIs buf (\c -> not (isAlphaNum c || c == '_')))
2328
2329 clean_pragma :: String -> String
2330 clean_pragma prag = canon_ws (map toLower (unprefix prag))
2331                     where unprefix prag' = case stripPrefix "{-#" prag' of
2332                                              Just rest -> rest
2333                                              Nothing -> prag'
2334                           canonical prag' = case prag' of
2335                                               "noinline" -> "notinline"
2336                                               "specialise" -> "specialize"
2337                                               "vectorise" -> "vectorize"
2338                                               "constructorlike" -> "conlike"
2339                                               _ -> prag'
2340                           canon_ws s = unwords (map canonical (words s))
2341 }