compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - parsing integers is a bit slow
  16 --    - readRational is a bit slow
  17 --
  18 --   Known bugs, that were also in the previous version:
  19 --    - M... should be 3 tokens, not 1.
  20 --    - pragma-end should be only valid in a pragma
  21
  22 --   qualified operator NOTES.
  23 --
  24 --   - If M.(+) is a single lexeme, then..
  25 --     - Probably (+) should be a single lexeme too, for consistency.
  26 --       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
  27 --     - But we have to rule out reserved operators, otherwise (..) becomes
  28 --       a different lexeme.
  29 --     - Should we therefore also rule out reserved operators in the qualified
  30 --       form?  This is quite difficult to achieve.  We don't do it for
  31 --       qualified varids.
  32
  33 {
  34 -- XXX The above flags turn off warnings in the generated code:
  35 {-# LANGUAGE BangPatterns #-}
  36 {-# OPTIONS_GHC -fno-warn-unused-matches #-}
  37 {-# OPTIONS_GHC -fno-warn-unused-binds #-}
  38 {-# OPTIONS_GHC -fno-warn-unused-imports #-}
  39 {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
  40 -- But alex still generates some code that causes the "lazy unlifted bindings"
  41 -- warning, and old compilers don't know about it so we can't easily turn
  42 -- it off, so for now we use the sledge hammer:
  43 {-# OPTIONS_GHC -w #-}
  44
  45 {-# OPTIONS_GHC -funbox-strict-fields #-}
  46
  47 module Lexer (
  48    Token(..), lexer, pragState, mkPState, PState(..),
  49    P(..), ParseResult(..), getSrcLoc,
  50    getPState, getDynFlags, withThisPackage,
  51    failLocMsgP, failSpanMsgP, srcParseFail,
  52    getMessages,
  53    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  54    activeContext, nextIsEOF,
  55    getLexState, popLexState, pushLexState,
  56    extension, bangPatEnabled, datatypeContextsEnabled,
  57    addWarning,
  58    incrBracketDepth, decrBracketDepth, getParserBrakDepth,
  59    lexTokenStream
  60   ) where
  61
  62 import Bag
  63 import ErrUtils
  64 import Outputable
  65 import StringBuffer
  66 import FastString
  67 import SrcLoc
  68 import UniqFM
  69 import DynFlags
  70 import Module
  71 import Ctype
  72 import BasicTypes       ( InlineSpec(..), RuleMatchInfo(..), FractionalLit(..) )
  73 import Util             ( readRational )
  74
  75 import Control.Monad
  76 import Data.Bits
  77 import Data.Char
  78 import Data.List
  79 import Data.Maybe
  80 import Data.Map (Map)
  81 import qualified Data.Map as Map
  82 import Data.Ratio
  83 }
  84
  85 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  86 $whitechar   = [\ \n\r\f\v $unispace]
  87 $white_no_nl = $whitechar # \n
  88 $tab         = \t
  89
  90 $ascdigit  = 0-9
  91 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  92 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  93 $digit     = [$ascdigit $unidigit]
  94
  95 $special   = [\(\)\,\;\[\]\`\{\}]
  96 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  97 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  98 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  99
 100 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
 101 $asclarge  = [A-Z]
 102 $large     = [$asclarge $unilarge]
 103
 104 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
 105 $ascsmall  = [a-z]
 106 $small     = [$ascsmall $unismall \_]
 107
 108 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
 109 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
 110
 111 $octit     = 0-7
 112 $hexit     = [$decdigit A-F a-f]
 113 $symchar   = [$symbol \:]
 114 $nl        = [\n\r]
 115 $idchar    = [$small $large $digit \']
 116
 117 $pragmachar = [$small $large $digit]
 118
 119 $docsym    = [\| \^ \* \$]
 120
 121 @varid     = $small $idchar*
 122 @conid     = $large $idchar*
 123
 124 @varsym    = $symbol $symchar*
 125 @consym    = \: $symchar*
 126
 127 @decimal     = $decdigit+
 128 @octal       = $octit+
 129 @hexadecimal = $hexit+
 130 @exponent    = [eE] [\-\+]? @decimal
 131
 132 -- we support the hierarchical module name extension:
 133 @qual = (@conid \.)+
 134
 135 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 136
 137 -- normal signed numerical literals can only be explicitly negative,
 138 -- not explicitly positive (contrast @exponent)
 139 @negative = \-
 140 @signed = @negative ?
 141
 142 haskell :-
 143
 144 -- everywhere: skip whitespace and comments
 145 $white_no_nl+                           ;
 146 $tab+         { warn Opt_WarnTabs (text "Warning: Tab character") }
 147
 148 -- Everywhere: deal with nested comments.  We explicitly rule out
 149 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 150 -- (this can happen even though pragmas will normally take precedence due to
 151 -- longest-match, because pragmas aren't valid in every state, but comments
 152 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 153 -- set.
 154
 155 "{-" / { isNormalComment } { nested_comment lexToken }
 156
 157 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 158 -- more dashes followed by a symbol should be parsed as a varsym, so we
 159 -- have to exclude those.
 160
 161 -- Since Haddock comments aren't valid in every state, we need to rule them
 162 -- out here.
 163
 164 -- The following two rules match comments that begin with two dashes, but
 165 -- continue with a different character. The rules test that this character
 166 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 167 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 168 -- have a Haddock comment). The rules then munch the rest of the line.
 169
 170 "-- " ~[$docsym \#] .* { lineCommentToken }
 171 "--" [^$symbol : \ ] .* { lineCommentToken }
 172
 173 -- Next, match Haddock comments if no -haddock flag
 174
 175 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
 176
 177 -- Now, when we've matched comments that begin with 2 dashes and continue
 178 -- with a different character, we need to match comments that begin with three
 179 -- or more dashes (which clearly can't be Haddock comments). We only need to
 180 -- make sure that the first non-dash character isn't a symbol, and munch the
 181 -- rest of the line.
 182
 183 "---"\-* [^$symbol :] .* { lineCommentToken }
 184
 185 -- Since the previous rules all match dashes followed by at least one
 186 -- character, we also need to match a whole line filled with just dashes.
 187
 188 "--"\-* / { atEOL } { lineCommentToken }
 189
 190 -- We need this rule since none of the other single line comment rules
 191 -- actually match this case.
 192
 193 "-- " / { atEOL } { lineCommentToken }
 194
 195 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 196 -- blank lines) until we find a non-whitespace character, then do layout
 197 -- processing.
 198 --
 199 -- One slight wibble here: what if the line begins with {-#? In
 200 -- theory, we have to lex the pragma to see if it's one we recognise,
 201 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 202 -- as a nested comment.  We don't bother with this: if the line begins
 203 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 204 <bol> {
 205   \n                                    ;
 206   ^\# (line)?                           { begin line_prag1 }
 207   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 208   ^\# \! .* \n                          ; -- #!, for scripts
 209   ()                                    { do_bol }
 210 }
 211
 212 -- after a layout keyword (let, where, do, of), we begin a new layout
 213 -- context if the curly brace is missing.
 214 -- Careful! This stuff is quite delicate.
 215 <layout, layout_do> {
 216   \{ / { notFollowedBy '-' }            { hopefully_open_brace }
 217         -- we might encounter {-# here, but {- has been handled already
 218   \n                                    ;
 219   ^\# (line)?                           { begin line_prag1 }
 220 }
 221
 222 -- do is treated in a subtly different way, see new_layout_context
 223 <layout>    ()                          { new_layout_context True }
 224 <layout_do> ()                          { new_layout_context False }
 225
 226 -- after a new layout context which was found to be to the left of the
 227 -- previous context, we have generated a '{' token, and we now need to
 228 -- generate a matching '}' token.
 229 <layout_left>  ()                       { do_layout_left }
 230
 231 <0,option_prags> \n                             { begin bol }
 232
 233 "{-#" $whitechar* $pragmachar+ / { known_pragma linePrags }
 234                                 { dispatch_pragmas linePrags }
 235
 236 -- single-line line pragmas, of the form
 237 --    # <line> "<file>" <extra-stuff> \n
 238 <line_prag1> $decdigit+                 { setLine line_prag1a }
 239 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 240 <line_prag1b> .*                        { pop }
 241
 242 -- Haskell-style line pragmas, of the form
 243 --    {-# LINE <line> "<file>" #-}
 244 <line_prag2> $decdigit+                 { setLine line_prag2a }
 245 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 246 <line_prag2b> "#-}"|"-}"                { pop }
 247    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 248    -- with older versions of GHC which generated these.
 249
 250 <0,option_prags> {
 251   "{-#" $whitechar* $pragmachar+
 252         $whitechar+ $pragmachar+ / { known_pragma twoWordPrags }
 253                                  { dispatch_pragmas twoWordPrags }
 254
 255   "{-#" $whitechar* $pragmachar+ / { known_pragma oneWordPrags }
 256                                  { dispatch_pragmas oneWordPrags }
 257
 258   -- We ignore all these pragmas, but don't generate a warning for them
 259   "{-#" $whitechar* $pragmachar+ / { known_pragma ignoredPrags }
 260                                  { dispatch_pragmas ignoredPrags }
 261
 262   -- ToDo: should only be valid inside a pragma:
 263   "#-}"                                 { endPrag }
 264 }
 265
 266 <option_prags> {
 267   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 268                                    { dispatch_pragmas fileHeaderPrags }
 269
 270   "-- #"                                 { multiline_doc_comment }
 271 }
 272
 273 <0> {
 274   -- In the "0" mode we ignore these pragmas
 275   "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
 276                      { nested_comment lexToken }
 277 }
 278
 279 <0> {
 280   "-- #" .* { lineCommentToken }
 281 }
 282
 283 <0,option_prags> {
 284   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 285                     (nested_comment lexToken) }
 286 }
 287
 288 -- '0' state: ordinary lexemes
 289
 290 -- Haddock comments
 291
 292 <0,option_prags> {
 293   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 294   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 295 }
 296
 297 -- "special" symbols
 298
 299 <0> {
 300   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 301   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 302 }
 303
 304 <0> {
 305   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 306   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 307   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 308   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 309   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 310   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 311   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 312   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 313
 314 -- For backward compatibility, accept the old dollar syntax
 315   "[$" @varid "|"  / { ifExtension qqEnabled }
 316                      { lex_quasiquote_tok }
 317
 318   "[" @varid "|"  / { ifExtension qqEnabled }
 319                      { lex_quasiquote_tok }
 320 }
 321
 322 <0> {
 323   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 324                                         { special IToparenbar }
 325   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 326 }
 327
 328 <0> {
 329   "<[" / { ifExtension hetMetEnabled `alexAndPred` notFollowedBySymbol }
 330                                         { special ITopenBrak }
 331   "]>" / { ifExtension hetMetEnabled }  { special ITcloseBrak }
 332   "~~" / { ifExtension hetMetEnabled }  { special ITescape }
 333   "%%" / { ifExtension hetMetEnabled }  { special ITdoublePercent }
 334   "~~$" / { ifExtension hetMetEnabled }  { special ITescapeDollar }
 335 }
 336
 337 <0> {
 338   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 339 }
 340
 341 <0> {
 342   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 343          { token IToubxparen }
 344   "#)" / { ifExtension unboxedTuplesEnabled }
 345          { token ITcubxparen }
 346 }
 347
 348 <0,option_prags> {
 349   \(                                    { special IToparen }
 350   \)                                    { special ITcparen }
 351   \[                                    { special ITobrack }
 352   \]                                    { special ITcbrack }
 353   \,                                    { special ITcomma }
 354   \;                                    { special ITsemi }
 355   \`                                    { special ITbackquote }
 356
 357   \{                                    { open_brace }
 358   \}                                    { close_brace }
 359 }
 360
 361 <0,option_prags> {
 362   @qual @varid                  { idtoken qvarid }
 363   @qual @conid                  { idtoken qconid }
 364   @varid                        { varid }
 365   @conid                        { idtoken conid }
 366 }
 367
 368 <0> {
 369   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 370   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 371   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 372   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 373 }
 374
 375 -- ToDo: - move `var` and (sym) into lexical syntax?
 376 --       - remove backquote from $special?
 377 <0> {
 378   @qual @varsym                                    { idtoken qvarsym }
 379   @qual @consym                                    { idtoken qconsym }
 380   @varsym                                          { varsym }
 381   @consym                                          { consym }
 382 }
 383
 384 -- For the normal boxed literals we need to be careful
 385 -- when trying to be close to Haskell98
 386 <0> {
 387   -- Normal integral literals (:: Num a => a, from Integer)
 388   @decimal           { tok_num positive 0 0 decimal }
 389   0[oO] @octal       { tok_num positive 2 2 octal }
 390   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 391
 392   -- Normal rational literals (:: Fractional a => a, from Rational)
 393   @floating_point    { strtoken tok_float }
 394 }
 395
 396 <0> {
 397   -- Unboxed ints (:: Int#) and words (:: Word#)
 398   -- It's simpler (and faster?) to give separate cases to the negatives,
 399   -- especially considering octal/hexadecimal prefixes.
 400   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 401   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 402   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 403   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 404   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 405   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 406
 407   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 408   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 409   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 410
 411   -- Unboxed floats and doubles (:: Float#, :: Double#)
 412   -- prim_{float,double} work with signed literals
 413   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 414   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 415 }
 416
 417 -- Strings and chars are lexed by hand-written code.  The reason is
 418 -- that even if we recognise the string or char here in the regex
 419 -- lexer, we would still have to parse the string afterward in order
 420 -- to convert it to a String.
 421 <0> {
 422   \'                            { lex_char_tok }
 423   \"                            { lex_string_tok }
 424 }
 425
 426 {
 427 -- -----------------------------------------------------------------------------
 428 -- The token type
 429
 430 data Token
 431   = ITas                        -- Haskell keywords
 432   | ITcase
 433   | ITclass
 434   | ITdata
 435   | ITdefault
 436   | ITderiving
 437   | ITdo
 438   | ITelse
 439   | IThiding
 440   | ITif
 441   | ITimport
 442   | ITin
 443   | ITinfix
 444   | ITinfixl
 445   | ITinfixr
 446   | ITinstance
 447   | ITlet
 448   | ITmodule
 449   | ITnewtype
 450   | ITof
 451   | ITqualified
 452   | ITthen
 453   | ITtype
 454   | ITwhere
 455   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 456
 457   | ITforall                    -- GHC extension keywords
 458   | ITforeign
 459   | ITexport
 460   | ITlabel
 461   | ITdynamic
 462   | ITsafe
 463   | ITthreadsafe
 464   | ITinterruptible
 465   | ITunsafe
 466   | ITstdcallconv
 467   | ITccallconv
 468   | ITprimcallconv
 469   | ITmdo
 470   | ITfamily
 471   | ITgroup
 472   | ITby
 473   | ITusing
 474
 475         -- Pragmas
 476   | ITinline_prag InlineSpec RuleMatchInfo
 477   | ITspec_prag                 -- SPECIALISE
 478   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 479   | ITsource_prag
 480   | ITrules_prag
 481   | ITwarning_prag
 482   | ITdeprecated_prag
 483   | ITline_prag
 484   | ITscc_prag
 485   | ITgenerated_prag
 486   | ITcore_prag                 -- hdaume: core annotations
 487   | ITunpack_prag
 488   | ITann_prag
 489   | ITclose_prag
 490   | IToptions_prag String
 491   | ITinclude_prag String
 492   | ITlanguage_prag
 493   | ITvect_prag
 494   | ITvect_scalar_prag
 495
 496   | ITdotdot                    -- reserved symbols
 497   | ITcolon
 498   | ITdcolon
 499   | ITequal
 500   | ITlam
 501   | ITvbar
 502   | ITlarrow
 503   | ITrarrow
 504   | ITat
 505   | ITtilde
 506   | ITdarrow
 507   | ITminus
 508   | ITbang
 509   | ITstar
 510   | ITdot
 511
 512   | ITbiglam                    -- GHC-extension symbols
 513
 514   | ITocurly                    -- special symbols
 515   | ITccurly
 516   | ITocurlybar                 -- {|, for type applications
 517   | ITccurlybar                 -- |}, for type applications
 518   | ITvocurly
 519   | ITvccurly
 520   | ITobrack
 521   | ITopabrack                  -- [:, for parallel arrays with -XParallelArrays
 522   | ITcpabrack                  -- :], for parallel arrays with -XParallelArrays
 523   | ITcbrack
 524   | IToparen
 525   | ITcparen
 526   | IToubxparen
 527   | ITcubxparen
 528   | ITsemi
 529   | ITcomma
 530   | ITunderscore
 531   | ITbackquote
 532
 533   | ITvarid   FastString        -- identifiers
 534   | ITconid   FastString
 535   | ITvarsym  FastString
 536   | ITconsym  FastString
 537   | ITqvarid  (FastString,FastString)
 538   | ITqconid  (FastString,FastString)
 539   | ITqvarsym (FastString,FastString)
 540   | ITqconsym (FastString,FastString)
 541   | ITprefixqvarsym (FastString,FastString)
 542   | ITprefixqconsym (FastString,FastString)
 543
 544   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 545
 546   | ITchar       Char
 547   | ITstring     FastString
 548   | ITinteger    Integer
 549   | ITrational   FractionalLit
 550
 551   | ITprimchar   Char
 552   | ITprimstring FastString
 553   | ITprimint    Integer
 554   | ITprimword   Integer
 555   | ITprimfloat  FractionalLit
 556   | ITprimdouble FractionalLit
 557
 558   -- Template Haskell extension tokens
 559   | ITopenExpQuote              --  [| or [e|
 560   | ITopenPatQuote              --  [p|
 561   | ITopenDecQuote              --  [d|
 562   | ITopenTypQuote              --  [t|
 563   | ITcloseQuote                --  |]
 564   | ITidEscape   FastString     --  $x
 565   | ITparenEscape               --  $(
 566   | ITvarQuote                  --  '
 567   | ITtyQuote                   --  ''
 568   | ITquasiQuote (FastString,FastString,SrcSpan) --  [:...|...|]
 569
 570   -- Arrow notation extension
 571   | ITproc
 572   | ITrec
 573   | IToparenbar                 --  (|
 574   | ITcparenbar                 --  |)
 575   | ITlarrowtail                --  -<
 576   | ITrarrowtail                --  >-
 577   | ITLarrowtail                --  -<<
 578   | ITRarrowtail                --  >>-
 579
 580   -- Heterogeneous Metaprogramming extension
 581   | ITopenBrak                  --  <[
 582   | ITcloseBrak                 --  ]>
 583   | ITescape                    --  ~~
 584   | ITescapeDollar              --  ~~$
 585   | ITdoublePercent             --  %%
 586
 587   | ITunknown String            -- Used when the lexer can't make sense of it
 588   | ITeof                       -- end of file token
 589
 590   -- Documentation annotations
 591   | ITdocCommentNext  String     -- something beginning '-- |'
 592   | ITdocCommentPrev  String     -- something beginning '-- ^'
 593   | ITdocCommentNamed String     -- something beginning '-- $'
 594   | ITdocSection      Int String -- a section heading
 595   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 596   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 597   | ITlineComment     String     -- comment starting by "--"
 598   | ITblockComment    String     -- comment in {- -}
 599
 600 #ifdef DEBUG
 601   deriving Show -- debugging
 602 #endif
 603
 604 {-
 605 isSpecial :: Token -> Bool
 606 -- If we see M.x, where x is a keyword, but
 607 -- is special, we treat is as just plain M.x,
 608 -- not as a keyword.
 609 isSpecial ITas          = True
 610 isSpecial IThiding      = True
 611 isSpecial ITqualified   = True
 612 isSpecial ITforall      = True
 613 isSpecial ITexport      = True
 614 isSpecial ITlabel       = True
 615 isSpecial ITdynamic     = True
 616 isSpecial ITsafe        = True
 617 isSpecial ITthreadsafe  = True
 618 isSpecial ITinterruptible = True
 619 isSpecial ITunsafe      = True
 620 isSpecial ITccallconv   = True
 621 isSpecial ITstdcallconv = True
 622 isSpecial ITprimcallconv = True
 623 isSpecial ITmdo         = True
 624 isSpecial ITfamily      = True
 625 isSpecial ITgroup   = True
 626 isSpecial ITby      = True
 627 isSpecial ITusing   = True
 628 isSpecial _             = False
 629 -}
 630
 631 -- the bitmap provided as the third component indicates whether the
 632 -- corresponding extension keyword is valid under the extension options
 633 -- provided to the compiler; if the extension corresponding to *any* of the
 634 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 635 -- facilitates using a keyword in two different extensions that can be
 636 -- activated independently)
 637 --
 638 reservedWordsFM :: UniqFM (Token, Int)
 639 reservedWordsFM = listToUFM $
 640         map (\(x, y, z) -> (mkFastString x, (y, z)))
 641        [( "_",          ITunderscore,   0 ),
 642         ( "as",         ITas,           0 ),
 643         ( "case",       ITcase,         0 ),
 644         ( "class",      ITclass,        0 ),
 645         ( "data",       ITdata,         0 ),
 646         ( "default",    ITdefault,      0 ),
 647         ( "deriving",   ITderiving,     0 ),
 648         ( "do",         ITdo,           0 ),
 649         ( "else",       ITelse,         0 ),
 650         ( "hiding",     IThiding,       0 ),
 651         ( "if",         ITif,           0 ),
 652         ( "import",     ITimport,       0 ),
 653         ( "in",         ITin,           0 ),
 654         ( "infix",      ITinfix,        0 ),
 655         ( "infixl",     ITinfixl,       0 ),
 656         ( "infixr",     ITinfixr,       0 ),
 657         ( "instance",   ITinstance,     0 ),
 658         ( "let",        ITlet,          0 ),
 659         ( "module",     ITmodule,       0 ),
 660         ( "newtype",    ITnewtype,      0 ),
 661         ( "of",         ITof,           0 ),
 662         ( "qualified",  ITqualified,    0 ),
 663         ( "then",       ITthen,         0 ),
 664         ( "type",       ITtype,         0 ),
 665         ( "where",      ITwhere,        0 ),
 666         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 667
 668     ( "forall", ITforall,        bit explicitForallBit .|. bit inRulePragBit),
 669         ( "mdo",        ITmdo,           bit recursiveDoBit),
 670         ( "family",     ITfamily,        bit tyFamBit),
 671     ( "group",  ITgroup,     bit transformComprehensionsBit),
 672     ( "by",     ITby,        bit transformComprehensionsBit),
 673     ( "using",  ITusing,     bit transformComprehensionsBit),
 674
 675         ( "foreign",    ITforeign,       bit ffiBit),
 676         ( "export",     ITexport,        bit ffiBit),
 677         ( "label",      ITlabel,         bit ffiBit),
 678         ( "dynamic",    ITdynamic,       bit ffiBit),
 679         ( "safe",       ITsafe,          bit ffiBit),
 680         ( "threadsafe", ITthreadsafe,    bit ffiBit),  -- ToDo: remove
 681         ( "interruptible", ITinterruptible, bit ffiBit),
 682         ( "unsafe",     ITunsafe,        bit ffiBit),
 683         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 684         ( "ccall",      ITccallconv,     bit ffiBit),
 685         ( "prim",       ITprimcallconv,  bit ffiBit),
 686
 687         ( "rec",        ITrec,           bit recBit),
 688         ( "proc",       ITproc,          bit arrowsBit)
 689      ]
 690
 691 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 692 reservedSymsFM = listToUFM $
 693     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 694       [ ("..",  ITdotdot,   always)
 695         -- (:) is a reserved op, meaning only list cons
 696        ,(":",   ITcolon,    always)
 697        ,("::",  ITdcolon,   always)
 698        ,("=",   ITequal,    always)
 699        ,("\\",  ITlam,      always)
 700        ,("|",   ITvbar,     always)
 701        ,("<-",  ITlarrow,   always)
 702        ,("->",  ITrarrow,   always)
 703        ,("@",   ITat,       always)
 704        ,("~",   ITtilde,    always)
 705        ,("=>",  ITdarrow,   always)
 706        ,("-",   ITminus,    always)
 707        ,("!",   ITbang,     always)
 708
 709         -- For data T (a::*) = MkT
 710        ,("*", ITstar, always) -- \i -> kindSigsEnabled i || tyFamEnabled i)
 711         -- For 'forall a . t'
 712        ,(".", ITdot,  always) -- \i -> explicitForallEnabled i || inRulePrag i)
 713
 714        ,("-<",  ITlarrowtail, arrowsEnabled)
 715        ,(">-",  ITrarrowtail, arrowsEnabled)
 716        ,("-<<", ITLarrowtail, arrowsEnabled)
 717        ,(">>-", ITRarrowtail, arrowsEnabled)
 718
 719        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 720        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 721        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 722                                 explicitForallEnabled i)
 723        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 724        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 725
 726        ,("⤙",   ITlarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 727        ,("⤚",   ITrarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 728        ,("⤛",   ITLarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 729        ,("⤜",   ITRarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
 730
 731        ,("★", ITstar, unicodeSyntaxEnabled)
 732
 733         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 734         -- form part of a large operator.  This would let us have a better
 735         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 736        ]
 737
 738 -- -----------------------------------------------------------------------------
 739 -- Lexer actions
 740
 741 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 742
 743 special :: Token -> Action
 744 special tok span _buf _len = return (L span tok)
 745
 746 token, layout_token :: Token -> Action
 747 token t span _buf _len = return (L span t)
 748 layout_token t span _buf _len = pushLexState layout >> return (L span t)
 749
 750 idtoken :: (StringBuffer -> Int -> Token) -> Action
 751 idtoken f span buf len = return (L span $! (f buf len))
 752
 753 skip_one_varid :: (FastString -> Token) -> Action
 754 skip_one_varid f span buf len
 755   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 756
 757 strtoken :: (String -> Token) -> Action
 758 strtoken f span buf len =
 759   return (L span $! (f $! lexemeToString buf len))
 760
 761 init_strtoken :: Int -> (String -> Token) -> Action
 762 -- like strtoken, but drops the last N character(s)
 763 init_strtoken drop f span buf len =
 764   return (L span $! (f $! lexemeToString buf (len-drop)))
 765
 766 begin :: Int -> Action
 767 begin code _span _str _len = do pushLexState code; lexToken
 768
 769 pop :: Action
 770 pop _span _buf _len = do _ <- popLexState
 771                          lexToken
 772
 773 hopefully_open_brace :: Action
 774 hopefully_open_brace span buf len
 775  = do relaxed <- extension relaxedLayout
 776       ctx <- getContext
 777       (AI l _) <- getInput
 778       let offset = srcLocCol l
 779           isOK = relaxed ||
 780                  case ctx of
 781                  Layout prev_off : _ -> prev_off < offset
 782                  _                   -> True
 783       if isOK then pop_and open_brace span buf len
 784               else failSpanMsgP span (text "Missing block")
 785
 786 pop_and :: Action -> Action
 787 pop_and act span buf len = do _ <- popLexState
 788                               act span buf len
 789
 790 {-# INLINE nextCharIs #-}
 791 nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
 792 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 793
 794 notFollowedBy :: Char -> AlexAccPred Int
 795 notFollowedBy char _ _ _ (AI _ buf)
 796   = nextCharIs buf (/=char)
 797
 798 notFollowedBySymbol :: AlexAccPred Int
 799 notFollowedBySymbol _ _ _ (AI _ buf)
 800   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 801
 802 -- We must reject doc comments as being ordinary comments everywhere.
 803 -- In some cases the doc comment will be selected as the lexeme due to
 804 -- maximal munch, but not always, because the nested comment rule is
 805 -- valid in all states, but the doc-comment rules are only valid in
 806 -- the non-layout states.
 807 isNormalComment :: AlexAccPred Int
 808 isNormalComment bits _ _ (AI _ buf)
 809   | haddockEnabled bits = notFollowedByDocOrPragma
 810   | otherwise           = nextCharIs buf (/='#')
 811   where
 812     notFollowedByDocOrPragma
 813        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 814
 815 spaceAndP :: StringBuffer -> (StringBuffer -> Bool) -> Bool
 816 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 817
 818 {-
 819 haddockDisabledAnd p bits _ _ (AI _ buf)
 820   = if haddockEnabled bits then False else (p buf)
 821 -}
 822
 823 atEOL :: AlexAccPred Int
 824 atEOL _ _ _ (AI _ buf) = atEnd buf || currentChar buf == '\n'
 825
 826 ifExtension :: (Int -> Bool) -> AlexAccPred Int
 827 ifExtension pred bits _ _ _ = pred bits
 828
 829 multiline_doc_comment :: Action
 830 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 831   where
 832     worker commentAcc input docType oneLine = case alexGetChar input of
 833       Just ('\n', input')
 834         | oneLine -> docCommentEnd input commentAcc docType buf span
 835         | otherwise -> case checkIfCommentLine input' of
 836           Just input -> worker ('\n':commentAcc) input docType False
 837           Nothing -> docCommentEnd input commentAcc docType buf span
 838       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 839       Nothing -> docCommentEnd input commentAcc docType buf span
 840
 841     checkIfCommentLine input = check (dropNonNewlineSpace input)
 842       where
 843         check input = case alexGetChar input of
 844           Just ('-', input) -> case alexGetChar input of
 845             Just ('-', input) -> case alexGetChar input of
 846               Just (c, _) | c /= '-' -> Just input
 847               _ -> Nothing
 848             _ -> Nothing
 849           _ -> Nothing
 850
 851         dropNonNewlineSpace input = case alexGetChar input of
 852           Just (c, input')
 853             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 854             | otherwise -> input
 855           Nothing -> input
 856
 857 lineCommentToken :: Action
 858 lineCommentToken span buf len = do
 859   b <- extension rawTokenStreamEnabled
 860   if b then strtoken ITlineComment span buf len else lexToken
 861
 862 {-
 863   nested comments require traversing by hand, they can't be parsed
 864   using regular expressions.
 865 -}
 866 nested_comment :: P (Located Token) -> Action
 867 nested_comment cont span _str _len = do
 868   input <- getInput
 869   go "" (1::Int) input
 870   where
 871     go commentAcc 0 input = do setInput input
 872                                b <- extension rawTokenStreamEnabled
 873                                if b
 874                                  then docCommentEnd input commentAcc ITblockComment _str span
 875                                  else cont
 876     go commentAcc n input = case alexGetChar input of
 877       Nothing -> errBrace input span
 878       Just ('-',input) -> case alexGetChar input of
 879         Nothing  -> errBrace input span
 880         Just ('\125',input) -> go commentAcc (n-1) input
 881         Just (_,_)          -> go ('-':commentAcc) n input
 882       Just ('\123',input) -> case alexGetChar input of
 883         Nothing  -> errBrace input span
 884         Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
 885         Just (_,_)       -> go ('\123':commentAcc) n input
 886       Just (c,input) -> go (c:commentAcc) n input
 887
 888 nested_doc_comment :: Action
 889 nested_doc_comment span buf _len = withLexedDocType (go "")
 890   where
 891     go commentAcc input docType _ = case alexGetChar input of
 892       Nothing -> errBrace input span
 893       Just ('-',input) -> case alexGetChar input of
 894         Nothing -> errBrace input span
 895         Just ('\125',input) ->
 896           docCommentEnd input commentAcc docType buf span
 897         Just (_,_) -> go ('-':commentAcc) input docType False
 898       Just ('\123', input) -> case alexGetChar input of
 899         Nothing  -> errBrace input span
 900         Just ('-',input) -> do
 901           setInput input
 902           let cont = do input <- getInput; go commentAcc input docType False
 903           nested_comment cont span buf _len
 904         Just (_,_) -> go ('\123':commentAcc) input docType False
 905       Just (c,input) -> go (c:commentAcc) input docType False
 906
 907 withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (Located Token))
 908                  -> P (Located Token)
 909 withLexedDocType lexDocComment = do
 910   input@(AI _ buf) <- getInput
 911   case prevChar buf ' ' of
 912     '|' -> lexDocComment input ITdocCommentNext False
 913     '^' -> lexDocComment input ITdocCommentPrev False
 914     '$' -> lexDocComment input ITdocCommentNamed False
 915     '*' -> lexDocSection 1 input
 916     '#' -> lexDocComment input ITdocOptionsOld False
 917     _ -> panic "withLexedDocType: Bad doc type"
 918  where
 919     lexDocSection n input = case alexGetChar input of
 920       Just ('*', input) -> lexDocSection (n+1) input
 921       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 922       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 923
 924 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 925 -- off again at the end of the pragma.
 926 rulePrag :: Action
 927 rulePrag span _buf _len = do
 928   setExts (.|. bit inRulePragBit)
 929   return (L span ITrules_prag)
 930
 931 endPrag :: Action
 932 endPrag span _buf _len = do
 933   setExts (.&. complement (bit inRulePragBit))
 934   return (L span ITclose_prag)
 935
 936 -- docCommentEnd
 937 -------------------------------------------------------------------------------
 938 -- This function is quite tricky. We can't just return a new token, we also
 939 -- need to update the state of the parser. Why? Because the token is longer
 940 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 941 -- it writes the wrong token length to the parser state. This function is
 942 -- called afterwards, so it can just update the state.
 943
 944 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 945                  SrcSpan -> P (Located Token)
 946 docCommentEnd input commentAcc docType buf span = do
 947   setInput input
 948   let (AI loc nextBuf) = input
 949       comment = reverse commentAcc
 950       span' = mkSrcSpan (srcSpanStart span) loc
 951       last_len = byteDiff buf nextBuf
 952
 953   span `seq` setLastToken span' last_len
 954   return (L span' (docType comment))
 955
 956 errBrace :: AlexInput -> SrcSpan -> P a
 957 errBrace (AI end _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 958
 959 open_brace, close_brace :: Action
 960 open_brace span _str _len = do
 961   ctx <- getContext
 962   setContext (NoLayout:ctx)
 963   return (L span ITocurly)
 964 close_brace span _str _len = do
 965   popContext
 966   return (L span ITccurly)
 967
 968 qvarid, qconid :: StringBuffer -> Int -> Token
 969 qvarid buf len = ITqvarid $! splitQualName buf len False
 970 qconid buf len = ITqconid $! splitQualName buf len False
 971
 972 splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
 973 -- takes a StringBuffer and a length, and returns the module name
 974 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 975 -- because of hierarchical module names.
 976 splitQualName orig_buf len parens = split orig_buf orig_buf
 977   where
 978     split buf dot_buf
 979         | orig_buf `byteDiff` buf >= len  = done dot_buf
 980         | c == '.'                        = found_dot buf'
 981         | otherwise                       = split buf' dot_buf
 982       where
 983        (c,buf') = nextChar buf
 984
 985     -- careful, we might get names like M....
 986     -- so, if the character after the dot is not upper-case, this is
 987     -- the end of the qualifier part.
 988     found_dot buf -- buf points after the '.'
 989         | isUpper c    = split buf' buf
 990         | otherwise    = done buf
 991       where
 992        (c,buf') = nextChar buf
 993
 994     done dot_buf =
 995         (lexemeToFastString orig_buf (qual_size - 1),
 996          if parens -- Prelude.(+)
 997             then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
 998             else lexemeToFastString dot_buf (len - qual_size))
 999       where
1000         qual_size = orig_buf `byteDiff` dot_buf
1001
1002 varid :: Action
1003 varid span buf len =
1004   fs `seq`
1005   case lookupUFM reservedWordsFM fs of
1006         Just (keyword,0)    -> do
1007                 maybe_layout keyword
1008                 return (L span keyword)
1009         Just (keyword,exts) -> do
1010                 b <- extension (\i -> exts .&. i /= 0)
1011                 if b then do maybe_layout keyword
1012                              return (L span keyword)
1013                      else return (L span (ITvarid fs))
1014         _other -> return (L span (ITvarid fs))
1015   where
1016         fs = lexemeToFastString buf len
1017
1018 conid :: StringBuffer -> Int -> Token
1019 conid buf len = ITconid fs
1020   where fs = lexemeToFastString buf len
1021
1022 qvarsym, qconsym, prefixqvarsym, prefixqconsym :: StringBuffer -> Int -> Token
1023 qvarsym buf len = ITqvarsym $! splitQualName buf len False
1024 qconsym buf len = ITqconsym $! splitQualName buf len False
1025 prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
1026 prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
1027
1028 varsym, consym :: Action
1029 varsym = sym ITvarsym
1030 consym = sym ITconsym
1031
1032 sym :: (FastString -> Token) -> SrcSpan -> StringBuffer -> Int
1033     -> P (Located Token)
1034 sym con span buf len =
1035   case lookupUFM reservedSymsFM fs of
1036         Just (keyword,exts) -> do
1037                 b <- extension exts
1038                 if b then return (L span keyword)
1039                      else return (L span $! con fs)
1040         _other -> return (L span $! con fs)
1041   where
1042         fs = lexemeToFastString buf len
1043
1044 -- Variations on the integral numeric literal.
1045 tok_integral :: (Integer -> Token)
1046      -> (Integer -> Integer)
1047  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
1048      -> Int -> Int
1049      -> (Integer, (Char->Int)) -> Action
1050 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
1051   return $ L span $ itint $! transint $ parseUnsignedInteger
1052      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
1053
1054 -- some conveniences for use with tok_integral
1055 tok_num :: (Integer -> Integer)
1056         -> Int -> Int
1057         -> (Integer, (Char->Int)) -> Action
1058 tok_num = tok_integral ITinteger
1059 tok_primint :: (Integer -> Integer)
1060             -> Int -> Int
1061             -> (Integer, (Char->Int)) -> Action
1062 tok_primint = tok_integral ITprimint
1063 tok_primword :: Int -> Int
1064              -> (Integer, (Char->Int)) -> Action
1065 tok_primword = tok_integral ITprimword positive
1066 positive, negative :: (Integer -> Integer)
1067 positive = id
1068 negative = negate
1069 decimal, octal, hexadecimal :: (Integer, Char -> Int)
1070 decimal = (10,octDecDigit)
1071 octal = (8,octDecDigit)
1072 hexadecimal = (16,hexDigit)
1073
1074 -- readRational can understand negative rationals, exponents, everything.
1075 tok_float, tok_primfloat, tok_primdouble :: String -> Token
1076 tok_float        str = ITrational   $! readFractionalLit str
1077 tok_primfloat    str = ITprimfloat  $! readFractionalLit str
1078 tok_primdouble   str = ITprimdouble $! readFractionalLit str
1079
1080 readFractionalLit :: String -> FractionalLit
1081 readFractionalLit str = (FL $! str) $! readRational str
1082
1083 -- -----------------------------------------------------------------------------
1084 -- Layout processing
1085
1086 -- we're at the first token on a line, insert layout tokens if necessary
1087 do_bol :: Action
1088 do_bol span _str _len = do
1089         pos <- getOffside
1090         case pos of
1091             LT -> do
1092                 --trace "layout: inserting '}'" $ do
1093                 popContext
1094                 -- do NOT pop the lex state, we might have a ';' to insert
1095                 return (L span ITvccurly)
1096             EQ -> do
1097                 --trace "layout: inserting ';'" $ do
1098                 _ <- popLexState
1099                 return (L span ITsemi)
1100             GT -> do
1101                 _ <- popLexState
1102                 lexToken
1103
1104 -- certain keywords put us in the "layout" state, where we might
1105 -- add an opening curly brace.
1106 maybe_layout :: Token -> P ()
1107 maybe_layout t = do -- If the alternative layout rule is enabled then
1108                     -- we never create an implicit layout context here.
1109                     -- Layout is handled XXX instead.
1110                     -- The code for closing implicit contexts, or
1111                     -- inserting implicit semi-colons, is therefore
1112                     -- irrelevant as it only applies in an implicit
1113                     -- context.
1114                     alr <- extension alternativeLayoutRule
1115                     unless alr $ f t
1116     where f ITdo    = pushLexState layout_do
1117           f ITmdo   = pushLexState layout_do
1118           f ITof    = pushLexState layout
1119           f ITlet   = pushLexState layout
1120           f ITwhere = pushLexState layout
1121           f ITrec   = pushLexState layout
1122           f _       = return ()
1123
1124 -- Pushing a new implicit layout context.  If the indentation of the
1125 -- next token is not greater than the previous layout context, then
1126 -- Haskell 98 says that the new layout context should be empty; that is
1127 -- the lexer must generate {}.
1128 --
1129 -- We are slightly more lenient than this: when the new context is started
1130 -- by a 'do', then we allow the new context to be at the same indentation as
1131 -- the previous context.  This is what the 'strict' argument is for.
1132 --
1133 new_layout_context :: Bool -> Action
1134 new_layout_context strict span _buf _len = do
1135     _ <- popLexState
1136     (AI l _) <- getInput
1137     let offset = srcLocCol l
1138     ctx <- getContext
1139     nondecreasing <- extension nondecreasingIndentation
1140     let strict' = strict || not nondecreasing
1141     case ctx of
1142         Layout prev_off : _  |
1143            (strict'     && prev_off >= offset  ||
1144             not strict' && prev_off > offset) -> do
1145                 -- token is indented to the left of the previous context.
1146                 -- we must generate a {} sequence now.
1147                 pushLexState layout_left
1148                 return (L span ITvocurly)
1149         _ -> do
1150                 setContext (Layout offset : ctx)
1151                 return (L span ITvocurly)
1152
1153 do_layout_left :: Action
1154 do_layout_left span _buf _len = do
1155     _ <- popLexState
1156     pushLexState bol  -- we must be at the start of a line
1157     return (L span ITvccurly)
1158
1159 -- -----------------------------------------------------------------------------
1160 -- LINE pragmas
1161
1162 setLine :: Int -> Action
1163 setLine code span buf len = do
1164   let line = parseUnsignedInteger buf len 10 octDecDigit
1165   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 1)
1166         -- subtract one: the line number refers to the *following* line
1167   _ <- popLexState
1168   pushLexState code
1169   lexToken
1170
1171 setFile :: Int -> Action
1172 setFile code span buf len = do
1173   let file = lexemeToFastString (stepOn buf) (len-2)
1174   setAlrLastLoc noSrcSpan
1175   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1176   _ <- popLexState
1177   pushLexState code
1178   lexToken
1179
1180
1181 -- -----------------------------------------------------------------------------
1182 -- Options, includes and language pragmas.
1183
1184 lex_string_prag :: (String -> Token) -> Action
1185 lex_string_prag mkTok span _buf _len
1186     = do input <- getInput
1187          start <- getSrcLoc
1188          tok <- go [] input
1189          end <- getSrcLoc
1190          return (L (mkSrcSpan start end) tok)
1191     where go acc input
1192               = if isString input "#-}"
1193                    then do setInput input
1194                            return (mkTok (reverse acc))
1195                    else case alexGetChar input of
1196                           Just (c,i) -> go (c:acc) i
1197                           Nothing -> err input
1198           isString _ [] = True
1199           isString i (x:xs)
1200               = case alexGetChar i of
1201                   Just (c,i') | c == x    -> isString i' xs
1202                   _other -> False
1203           err (AI end _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1204
1205
1206 -- -----------------------------------------------------------------------------
1207 -- Strings & Chars
1208
1209 -- This stuff is horrible.  I hates it.
1210
1211 lex_string_tok :: Action
1212 lex_string_tok span _buf _len = do
1213   tok <- lex_string ""
1214   end <- getSrcLoc
1215   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1216
1217 lex_string :: String -> P Token
1218 lex_string s = do
1219   i <- getInput
1220   case alexGetChar' i of
1221     Nothing -> lit_error i
1222
1223     Just ('"',i)  -> do
1224         setInput i
1225         magicHash <- extension magicHashEnabled
1226         if magicHash
1227           then do
1228             i <- getInput
1229             case alexGetChar' i of
1230               Just ('#',i) -> do
1231                    setInput i
1232                    if any (> '\xFF') s
1233                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1234                     else let s' = mkZFastString (reverse s) in
1235                          return (ITprimstring s')
1236                         -- mkZFastString is a hack to avoid encoding the
1237                         -- string in UTF-8.  We just want the exact bytes.
1238               _other ->
1239                 return (ITstring (mkFastString (reverse s)))
1240           else
1241                 return (ITstring (mkFastString (reverse s)))
1242
1243     Just ('\\',i)
1244         | Just ('&',i) <- next -> do
1245                 setInput i; lex_string s
1246         | Just (c,i) <- next, c <= '\x7f' && is_space c -> do
1247                            -- is_space only works for <= '\x7f' (#3751)
1248                 setInput i; lex_stringgap s
1249         where next = alexGetChar' i
1250
1251     Just (c, i1) -> do
1252         case c of
1253           '\\' -> do setInput i1; c' <- lex_escape; lex_string (c':s)
1254           c | isAny c -> do setInput i1; lex_string (c:s)
1255           _other -> lit_error i
1256
1257 lex_stringgap :: String -> P Token
1258 lex_stringgap s = do
1259   i <- getInput
1260   c <- getCharOrFail i
1261   case c of
1262     '\\' -> lex_string s
1263     c | is_space c -> lex_stringgap s
1264     _other -> lit_error i
1265
1266
1267 lex_char_tok :: Action
1268 -- Here we are basically parsing character literals, such as 'x' or '\n'
1269 -- but, when Template Haskell is on, we additionally spot
1270 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1271 -- but WITHOUT CONSUMING the x or T part  (the parser does that).
1272 -- So we have to do two characters of lookahead: when we see 'x we need to
1273 -- see if there's a trailing quote
1274 lex_char_tok span _buf _len = do        -- We've seen '
1275    i1 <- getInput       -- Look ahead to first character
1276    let loc = srcSpanStart span
1277    case alexGetChar' i1 of
1278         Nothing -> lit_error  i1
1279
1280         Just ('\'', i2@(AI end2 _)) -> do       -- We've seen ''
1281                   th_exts <- extension thEnabled
1282                   if th_exts then do
1283                         setInput i2
1284                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1285                    else lit_error i1
1286
1287         Just ('\\', i2@(AI _end2 _)) -> do      -- We've seen 'backslash
1288                   setInput i2
1289                   lit_ch <- lex_escape
1290                   i3 <- getInput
1291                   mc <- getCharOrFail i3 -- Trailing quote
1292                   if mc == '\'' then finish_char_tok loc lit_ch
1293                                 else lit_error i3
1294
1295         Just (c, i2@(AI _end2 _))
1296                 | not (isAny c) -> lit_error i1
1297                 | otherwise ->
1298
1299                 -- We've seen 'x, where x is a valid character
1300                 --  (i.e. not newline etc) but not a quote or backslash
1301            case alexGetChar' i2 of      -- Look ahead one more character
1302                 Just ('\'', i3) -> do   -- We've seen 'x'
1303                         setInput i3
1304                         finish_char_tok loc c
1305                 _other -> do            -- We've seen 'x not followed by quote
1306                                         -- (including the possibility of EOF)
1307                                         -- If TH is on, just parse the quote only
1308                         th_exts <- extension thEnabled
1309                         let (AI end _) = i1
1310                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1311                                    else lit_error i2
1312
1313 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1314 finish_char_tok loc ch  -- We've already seen the closing quote
1315                         -- Just need to check for trailing #
1316   = do  magicHash <- extension magicHashEnabled
1317         i@(AI end _) <- getInput
1318         if magicHash then do
1319                 case alexGetChar' i of
1320                         Just ('#',i@(AI end _)) -> do
1321                                 setInput i
1322                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1323                         _other ->
1324                                 return (L (mkSrcSpan loc end) (ITchar ch))
1325             else do
1326                    return (L (mkSrcSpan loc end) (ITchar ch))
1327
1328 isAny :: Char -> Bool
1329 isAny c | c > '\x7f' = isPrint c
1330         | otherwise  = is_any c
1331
1332 lex_escape :: P Char
1333 lex_escape = do
1334   i0 <- getInput
1335   c <- getCharOrFail i0
1336   case c of
1337         'a'   -> return '\a'
1338         'b'   -> return '\b'
1339         'f'   -> return '\f'
1340         'n'   -> return '\n'
1341         'r'   -> return '\r'
1342         't'   -> return '\t'
1343         'v'   -> return '\v'
1344         '\\'  -> return '\\'
1345         '"'   -> return '\"'
1346         '\''  -> return '\''
1347         '^'   -> do i1 <- getInput
1348                     c <- getCharOrFail i1
1349                     if c >= '@' && c <= '_'
1350                         then return (chr (ord c - ord '@'))
1351                         else lit_error i1
1352
1353         'x'   -> readNum is_hexdigit 16 hexDigit
1354         'o'   -> readNum is_octdigit  8 octDecDigit
1355         x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1356
1357         c1 ->  do
1358            i <- getInput
1359            case alexGetChar' i of
1360             Nothing -> lit_error i0
1361             Just (c2,i2) ->
1362               case alexGetChar' i2 of
1363                 Nothing -> do lit_error i0
1364                 Just (c3,i3) ->
1365                    let str = [c1,c2,c3] in
1366                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1367                                      Just rest <- [stripPrefix p str] ] of
1368                           (escape_char,[]):_ -> do
1369                                 setInput i3
1370                                 return escape_char
1371                           (escape_char,_:_):_ -> do
1372                                 setInput i2
1373                                 return escape_char
1374                           [] -> lit_error i0
1375
1376 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1377 readNum is_digit base conv = do
1378   i <- getInput
1379   c <- getCharOrFail i
1380   if is_digit c
1381         then readNum2 is_digit base conv (conv c)
1382         else lit_error i
1383
1384 readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char
1385 readNum2 is_digit base conv i = do
1386   input <- getInput
1387   read i input
1388   where read i input = do
1389           case alexGetChar' input of
1390             Just (c,input') | is_digit c -> do
1391                let i' = i*base + conv c
1392                if i' > 0x10ffff
1393                   then setInput input >> lexError "numeric escape sequence out of range"
1394                   else read i' input'
1395             _other -> do
1396               setInput input; return (chr i)
1397
1398
1399 silly_escape_chars :: [(String, Char)]
1400 silly_escape_chars = [
1401         ("NUL", '\NUL'),
1402         ("SOH", '\SOH'),
1403         ("STX", '\STX'),
1404         ("ETX", '\ETX'),
1405         ("EOT", '\EOT'),
1406         ("ENQ", '\ENQ'),
1407         ("ACK", '\ACK'),
1408         ("BEL", '\BEL'),
1409         ("BS", '\BS'),
1410         ("HT", '\HT'),
1411         ("LF", '\LF'),
1412         ("VT", '\VT'),
1413         ("FF", '\FF'),
1414         ("CR", '\CR'),
1415         ("SO", '\SO'),
1416         ("SI", '\SI'),
1417         ("DLE", '\DLE'),
1418         ("DC1", '\DC1'),
1419         ("DC2", '\DC2'),
1420         ("DC3", '\DC3'),
1421         ("DC4", '\DC4'),
1422         ("NAK", '\NAK'),
1423         ("SYN", '\SYN'),
1424         ("ETB", '\ETB'),
1425         ("CAN", '\CAN'),
1426         ("EM", '\EM'),
1427         ("SUB", '\SUB'),
1428         ("ESC", '\ESC'),
1429         ("FS", '\FS'),
1430         ("GS", '\GS'),
1431         ("RS", '\RS'),
1432         ("US", '\US'),
1433         ("SP", '\SP'),
1434         ("DEL", '\DEL')
1435         ]
1436
1437 -- before calling lit_error, ensure that the current input is pointing to
1438 -- the position of the error in the buffer.  This is so that we can report
1439 -- a correct location to the user, but also so we can detect UTF-8 decoding
1440 -- errors if they occur.
1441 lit_error :: AlexInput -> P a
1442 lit_error i = do setInput i; lexError "lexical error in string/character literal"
1443
1444 getCharOrFail :: AlexInput -> P Char
1445 getCharOrFail i =  do
1446   case alexGetChar' i of
1447         Nothing -> lexError "unexpected end-of-file in string/character literal"
1448         Just (c,i)  -> do setInput i; return c
1449
1450 -- -----------------------------------------------------------------------------
1451 -- QuasiQuote
1452
1453 lex_quasiquote_tok :: Action
1454 lex_quasiquote_tok span buf len = do
1455   let quoter = tail (lexemeToString buf (len - 1))
1456                 -- 'tail' drops the initial '[',
1457                 -- while the -1 drops the trailing '|'
1458   quoteStart <- getSrcLoc
1459   quote <- lex_quasiquote ""
1460   end <- getSrcLoc
1461   return (L (mkSrcSpan (srcSpanStart span) end)
1462            (ITquasiQuote (mkFastString quoter,
1463                           mkFastString (reverse quote),
1464                           mkSrcSpan quoteStart end)))
1465
1466 lex_quasiquote :: String -> P String
1467 lex_quasiquote s = do
1468   i <- getInput
1469   case alexGetChar' i of
1470     Nothing -> lit_error i
1471
1472     Just ('\\',i)
1473         | Just ('|',i) <- next -> do
1474                 setInput i; lex_quasiquote ('|' : s)
1475         | Just (']',i) <- next -> do
1476                 setInput i; lex_quasiquote (']' : s)
1477         where next = alexGetChar' i
1478
1479     Just ('|',i)
1480         | Just (']',i) <- next -> do
1481                 setInput i; return s
1482         where next = alexGetChar' i
1483
1484     Just (c, i) -> do
1485          setInput i; lex_quasiquote (c : s)
1486
1487 -- -----------------------------------------------------------------------------
1488 -- Warnings
1489
1490 warn :: DynFlag -> SDoc -> Action
1491 warn option warning srcspan _buf _len = do
1492     addWarning option srcspan warning
1493     lexToken
1494
1495 warnThen :: DynFlag -> SDoc -> Action -> Action
1496 warnThen option warning action srcspan buf len = do
1497     addWarning option srcspan warning
1498     action srcspan buf len
1499
1500 -- -----------------------------------------------------------------------------
1501 -- The Parse Monad
1502
1503 data LayoutContext
1504   = NoLayout
1505   | Layout !Int
1506   deriving Show
1507
1508 data ParseResult a
1509   = POk PState a
1510   | PFailed
1511         SrcSpan         -- The start and end of the text span related to
1512                         -- the error.  Might be used in environments which can
1513                         -- show this span, e.g. by highlighting it.
1514         Message         -- The error message
1515
1516 data PState = PState {
1517         buffer     :: StringBuffer,
1518         dflags     :: DynFlags,
1519         messages   :: Messages,
1520         last_loc   :: SrcSpan,  -- pos of previous token
1521         last_len   :: !Int,     -- len of previous token
1522         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1523         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1524         context    :: [LayoutContext],
1525         lex_state  :: [Int],
1526         -- Used in the alternative layout rule:
1527         -- These tokens are the next ones to be sent out. They are
1528         -- just blindly emitted, without the rule looking at them again:
1529         alr_pending_implicit_tokens :: [Located Token],
1530         -- This is the next token to be considered or, if it is Nothing,
1531         -- we need to get the next token from the input stream:
1532         alr_next_token :: Maybe (Located Token),
1533         -- This is what we consider to be the locatino of the last token
1534         -- emitted:
1535         alr_last_loc :: SrcSpan,
1536         -- The stack of layout contexts:
1537         alr_context :: [ALRContext],
1538         -- Are we expecting a '{'? If it's Just, then the ALRLayout tells
1539         -- us what sort of layout the '{' will open:
1540         alr_expecting_ocurly :: Maybe ALRLayout,
1541         -- Have we just had the '}' for a let block? If so, than an 'in'
1542         -- token doesn't need to close anything:
1543         alr_justClosedExplicitLetBlock :: Bool,
1544         code_type_bracket_depth :: Int
1545      }
1546         -- last_loc and last_len are used when generating error messages,
1547         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1548         -- current token to happyError, we could at least get rid of last_len.
1549         -- Getting rid of last_loc would require finding another way to
1550         -- implement pushCurrentContext (which is only called from one place).
1551
1552 data ALRContext = ALRNoLayout Bool{- does it contain commas? -}
1553                               Bool{- is it a 'let' block? -}
1554                 | ALRLayout ALRLayout Int
1555 data ALRLayout = ALRLayoutLet
1556                | ALRLayoutWhere
1557                | ALRLayoutOf
1558                | ALRLayoutDo
1559
1560 newtype P a = P { unP :: PState -> ParseResult a }
1561
1562 instance Monad P where
1563   return = returnP
1564   (>>=) = thenP
1565   fail = failP
1566
1567 returnP :: a -> P a
1568 returnP a = a `seq` (P $ \s -> POk s a)
1569
1570 thenP :: P a -> (a -> P b) -> P b
1571 (P m) `thenP` k = P $ \ s ->
1572         case m s of
1573                 POk s1 a         -> (unP (k a)) s1
1574                 PFailed span err -> PFailed span err
1575
1576 failP :: String -> P a
1577 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1578
1579 failMsgP :: String -> P a
1580 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1581
1582 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1583 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (mkSrcSpan loc1 loc2) (text str)
1584
1585 failSpanMsgP :: SrcSpan -> SDoc -> P a
1586 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1587
1588 getPState :: P PState
1589 getPState = P $ \s -> POk s s
1590
1591 getDynFlags :: P DynFlags
1592 getDynFlags = P $ \s -> POk s (dflags s)
1593
1594 withThisPackage :: (PackageId -> a) -> P a
1595 withThisPackage f
1596  = do   pkg     <- liftM thisPackage getDynFlags
1597         return  $ f pkg
1598
1599 extension :: (Int -> Bool) -> P Bool
1600 extension p = P $ \s -> POk s (p $! extsBitmap s)
1601
1602 getExts :: P Int
1603 getExts = P $ \s -> POk s (extsBitmap s)
1604
1605 setExts :: (Int -> Int) -> P ()
1606 setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
1607
1608 setSrcLoc :: SrcLoc -> P ()
1609 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1610
1611 incrBracketDepth :: P ()
1612 incrBracketDepth = P $ \s -> POk (s{code_type_bracket_depth = (code_type_bracket_depth s)+1}) ()
1613 decrBracketDepth :: P ()
1614 decrBracketDepth = P $ \s -> POk (s{code_type_bracket_depth = (code_type_bracket_depth s)-1}) ()
1615 getParserBrakDepth :: P Int
1616 getParserBrakDepth = P $ \s -> POk s (code_type_bracket_depth s)
1617
1618 getSrcLoc :: P SrcLoc
1619 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1620
1621 setLastToken :: SrcSpan -> Int -> P ()
1622 setLastToken loc len = P $ \s -> POk s {
1623   last_loc=loc,
1624   last_len=len
1625   } ()
1626
1627 data AlexInput = AI SrcLoc StringBuffer
1628
1629 alexInputPrevChar :: AlexInput -> Char
1630 alexInputPrevChar (AI _ buf) = prevChar buf '\n'
1631
1632 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1633 alexGetChar (AI loc s)
1634   | atEnd s   = Nothing
1635   | otherwise = adj_c `seq` loc' `seq` s' `seq`
1636                 --trace (show (ord c)) $
1637                 Just (adj_c, (AI loc' s'))
1638   where (c,s') = nextChar s
1639         loc'   = advanceSrcLoc loc c
1640
1641         non_graphic     = '\x0'
1642         upper           = '\x1'
1643         lower           = '\x2'
1644         digit           = '\x3'
1645         symbol          = '\x4'
1646         space           = '\x5'
1647         other_graphic   = '\x6'
1648
1649         adj_c
1650           | c <= '\x06' = non_graphic
1651           | c <= '\x7f' = c
1652           -- Alex doesn't handle Unicode, so when Unicode
1653           -- character is encountered we output these values
1654           -- with the actual character value hidden in the state.
1655           | otherwise =
1656                 case generalCategory c of
1657                   UppercaseLetter       -> upper
1658                   LowercaseLetter       -> lower
1659                   TitlecaseLetter       -> upper
1660                   ModifierLetter        -> other_graphic
1661                   OtherLetter           -> lower -- see #1103
1662                   NonSpacingMark        -> other_graphic
1663                   SpacingCombiningMark  -> other_graphic
1664                   EnclosingMark         -> other_graphic
1665                   DecimalNumber         -> digit
1666                   LetterNumber          -> other_graphic
1667                   OtherNumber           -> digit -- see #4373
1668                   ConnectorPunctuation  -> symbol
1669                   DashPunctuation       -> symbol
1670                   OpenPunctuation       -> other_graphic
1671                   ClosePunctuation      -> other_graphic
1672                   InitialQuote          -> other_graphic
1673                   FinalQuote            -> other_graphic
1674                   OtherPunctuation      -> symbol
1675                   MathSymbol            -> symbol
1676                   CurrencySymbol        -> symbol
1677                   ModifierSymbol        -> symbol
1678                   OtherSymbol           -> symbol
1679                   Space                 -> space
1680                   _other                -> non_graphic
1681
1682 -- This version does not squash unicode characters, it is used when
1683 -- lexing strings.
1684 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1685 alexGetChar' (AI loc s)
1686   | atEnd s   = Nothing
1687   | otherwise = c `seq` loc' `seq` s' `seq`
1688                 --trace (show (ord c)) $
1689                 Just (c, (AI loc' s'))
1690   where (c,s') = nextChar s
1691         loc'   = advanceSrcLoc loc c
1692
1693 getInput :: P AlexInput
1694 getInput = P $ \s@PState{ loc=l, buffer=b } -> POk s (AI l b)
1695
1696 setInput :: AlexInput -> P ()
1697 setInput (AI l b) = P $ \s -> POk s{ loc=l, buffer=b } ()
1698
1699 nextIsEOF :: P Bool
1700 nextIsEOF = do
1701   AI _ s <- getInput
1702   return $ atEnd s
1703
1704 pushLexState :: Int -> P ()
1705 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1706
1707 popLexState :: P Int
1708 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1709
1710 getLexState :: P Int
1711 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1712
1713 popNextToken :: P (Maybe (Located Token))
1714 popNextToken
1715     = P $ \s@PState{ alr_next_token = m } ->
1716               POk (s {alr_next_token = Nothing}) m
1717
1718 activeContext :: P Bool
1719 activeContext = do
1720   ctxt <- getALRContext
1721   expc <- getAlrExpectingOCurly
1722   impt <- implicitTokenPending
1723   case (ctxt,expc) of
1724     ([],Nothing) -> return impt
1725     _other       -> return True
1726
1727 setAlrLastLoc :: SrcSpan -> P ()
1728 setAlrLastLoc l = P $ \s -> POk (s {alr_last_loc = l}) ()
1729
1730 getAlrLastLoc :: P SrcSpan
1731 getAlrLastLoc = P $ \s@(PState {alr_last_loc = l}) -> POk s l
1732
1733 getALRContext :: P [ALRContext]
1734 getALRContext = P $ \s@(PState {alr_context = cs}) -> POk s cs
1735
1736 setALRContext :: [ALRContext] -> P ()
1737 setALRContext cs = P $ \s -> POk (s {alr_context = cs}) ()
1738
1739 getJustClosedExplicitLetBlock :: P Bool
1740 getJustClosedExplicitLetBlock
1741  = P $ \s@(PState {alr_justClosedExplicitLetBlock = b}) -> POk s b
1742
1743 setJustClosedExplicitLetBlock :: Bool -> P ()
1744 setJustClosedExplicitLetBlock b
1745  = P $ \s -> POk (s {alr_justClosedExplicitLetBlock = b}) ()
1746
1747 setNextToken :: Located Token -> P ()
1748 setNextToken t = P $ \s -> POk (s {alr_next_token = Just t}) ()
1749
1750 implicitTokenPending :: P Bool
1751 implicitTokenPending
1752     = P $ \s@PState{ alr_pending_implicit_tokens = ts } ->
1753               case ts of
1754               [] -> POk s False
1755               _  -> POk s True
1756
1757 popPendingImplicitToken :: P (Maybe (Located Token))
1758 popPendingImplicitToken
1759     = P $ \s@PState{ alr_pending_implicit_tokens = ts } ->
1760               case ts of
1761               [] -> POk s Nothing
1762               (t : ts') -> POk (s {alr_pending_implicit_tokens = ts'}) (Just t)
1763
1764 setPendingImplicitTokens :: [Located Token] -> P ()
1765 setPendingImplicitTokens ts = P $ \s -> POk (s {alr_pending_implicit_tokens = ts}) ()
1766
1767 getAlrExpectingOCurly :: P (Maybe ALRLayout)
1768 getAlrExpectingOCurly = P $ \s@(PState {alr_expecting_ocurly = b}) -> POk s b
1769
1770 setAlrExpectingOCurly :: Maybe ALRLayout -> P ()
1771 setAlrExpectingOCurly b = P $ \s -> POk (s {alr_expecting_ocurly = b}) ()
1772
1773 -- for reasons of efficiency, flags indicating language extensions (eg,
1774 -- -fglasgow-exts or -XParallelArrays) are represented by a bitmap stored in an unboxed
1775 -- integer
1776
1777 -- The "genericsBit" is now unused, available for others
1778 -- genericsBit :: Int
1779 -- genericsBit = 0 -- {|, |} and "generic"
1780
1781 ffiBit :: Int
1782 ffiBit     = 1
1783 parrBit :: Int
1784 parrBit    = 2
1785 arrowsBit :: Int
1786 arrowsBit  = 4
1787 thBit :: Int
1788 thBit      = 5
1789 ipBit :: Int
1790 ipBit      = 6
1791 explicitForallBit :: Int
1792 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1793 bangPatBit :: Int
1794 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1795                 -- (doesn't affect the lexer)
1796 tyFamBit :: Int
1797 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1798 haddockBit :: Int
1799 haddockBit = 10 -- Lex and parse Haddock comments
1800 magicHashBit :: Int
1801 magicHashBit = 11 -- "#" in both functions and operators
1802 kindSigsBit :: Int
1803 kindSigsBit = 12 -- Kind signatures on type variables
1804 recursiveDoBit :: Int
1805 recursiveDoBit = 13 -- mdo
1806 unicodeSyntaxBit :: Int
1807 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1808 unboxedTuplesBit :: Int
1809 unboxedTuplesBit = 15 -- (# and #)
1810 datatypeContextsBit :: Int
1811 datatypeContextsBit = 16
1812 transformComprehensionsBit :: Int
1813 transformComprehensionsBit = 17
1814 qqBit :: Int
1815 qqBit      = 18 -- enable quasiquoting
1816 inRulePragBit :: Int
1817 inRulePragBit = 19
1818 rawTokenStreamBit :: Int
1819 rawTokenStreamBit = 20 -- producing a token stream with all comments included
1820 recBit :: Int
1821 recBit = 22 -- rec
1822 alternativeLayoutRuleBit :: Int
1823 alternativeLayoutRuleBit = 23
1824 relaxedLayoutBit :: Int
1825 relaxedLayoutBit = 24
1826 nondecreasingIndentationBit :: Int
1827 nondecreasingIndentationBit = 25
1828 hetMetBit :: Int
1829 hetMetBit = 31
1830
1831 always :: Int -> Bool
1832 always           _     = True
1833 parrEnabled :: Int -> Bool
1834 parrEnabled      flags = testBit flags parrBit
1835 arrowsEnabled :: Int -> Bool
1836 arrowsEnabled    flags = testBit flags arrowsBit
1837 hetMetEnabled :: Int -> Bool
1838 hetMetEnabled    flags = testBit flags hetMetBit
1839 thEnabled :: Int -> Bool
1840 thEnabled        flags = testBit flags thBit
1841 ipEnabled :: Int -> Bool
1842 ipEnabled        flags = testBit flags ipBit
1843 explicitForallEnabled :: Int -> Bool
1844 explicitForallEnabled flags = testBit flags explicitForallBit
1845 bangPatEnabled :: Int -> Bool
1846 bangPatEnabled   flags = testBit flags bangPatBit
1847 -- tyFamEnabled :: Int -> Bool
1848 -- tyFamEnabled     flags = testBit flags tyFamBit
1849 haddockEnabled :: Int -> Bool
1850 haddockEnabled   flags = testBit flags haddockBit
1851 magicHashEnabled :: Int -> Bool
1852 magicHashEnabled flags = testBit flags magicHashBit
1853 -- kindSigsEnabled :: Int -> Bool
1854 -- kindSigsEnabled  flags = testBit flags kindSigsBit
1855 unicodeSyntaxEnabled :: Int -> Bool
1856 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1857 unboxedTuplesEnabled :: Int -> Bool
1858 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1859 datatypeContextsEnabled :: Int -> Bool
1860 datatypeContextsEnabled flags = testBit flags datatypeContextsBit
1861 qqEnabled :: Int -> Bool
1862 qqEnabled        flags = testBit flags qqBit
1863 -- inRulePrag :: Int -> Bool
1864 -- inRulePrag       flags = testBit flags inRulePragBit
1865 rawTokenStreamEnabled :: Int -> Bool
1866 rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
1867 alternativeLayoutRule :: Int -> Bool
1868 alternativeLayoutRule flags = testBit flags alternativeLayoutRuleBit
1869 relaxedLayout :: Int -> Bool
1870 relaxedLayout flags = testBit flags relaxedLayoutBit
1871 nondecreasingIndentation :: Int -> Bool
1872 nondecreasingIndentation flags = testBit flags nondecreasingIndentationBit
1873
1874 -- PState for parsing options pragmas
1875 --
1876 pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1877 pragState dynflags buf loc = (mkPState dynflags buf loc) {
1878                                  lex_state = [bol, option_prags, 0]
1879                              }
1880
1881 -- create a parse state
1882 --
1883 mkPState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1884 mkPState flags buf loc =
1885   PState {
1886       buffer        = buf,
1887       dflags        = flags,
1888       messages      = emptyMessages,
1889       last_loc      = mkSrcSpan loc loc,
1890       last_len      = 0,
1891       loc           = loc,
1892       extsBitmap    = fromIntegral bitmap,
1893       context       = [],
1894       lex_state     = [bol, 0],
1895       alr_pending_implicit_tokens = [],
1896       alr_next_token = Nothing,
1897       alr_last_loc = noSrcSpan,
1898       alr_context = [],
1899       alr_expecting_ocurly = Nothing,
1900       alr_justClosedExplicitLetBlock = False,
1901       code_type_bracket_depth = 0
1902     }
1903     where
1904       bitmap =     ffiBit            `setBitIf` xopt Opt_ForeignFunctionInterface flags
1905                .|. parrBit           `setBitIf` xopt Opt_ParallelArrays  flags
1906                .|. arrowsBit         `setBitIf` xopt Opt_Arrows          flags
1907                .|. hetMetBit         `setBitIf` xopt Opt_ModalTypes      flags
1908                .|. thBit             `setBitIf` xopt Opt_TemplateHaskell flags
1909                .|. qqBit             `setBitIf` xopt Opt_QuasiQuotes     flags
1910                .|. ipBit             `setBitIf` xopt Opt_ImplicitParams  flags
1911                .|. explicitForallBit `setBitIf` xopt Opt_ExplicitForAll  flags
1912                .|. bangPatBit        `setBitIf` xopt Opt_BangPatterns    flags
1913                .|. tyFamBit          `setBitIf` xopt Opt_TypeFamilies    flags
1914                .|. haddockBit        `setBitIf` dopt Opt_Haddock         flags
1915                .|. magicHashBit      `setBitIf` xopt Opt_MagicHash       flags
1916                .|. kindSigsBit       `setBitIf` xopt Opt_KindSignatures  flags
1917                .|. recursiveDoBit    `setBitIf` xopt Opt_RecursiveDo     flags
1918                .|. recBit            `setBitIf` xopt Opt_DoRec           flags
1919                .|. recBit            `setBitIf` xopt Opt_Arrows          flags
1920                .|. unicodeSyntaxBit  `setBitIf` xopt Opt_UnicodeSyntax   flags
1921                .|. unboxedTuplesBit  `setBitIf` xopt Opt_UnboxedTuples   flags
1922                .|. datatypeContextsBit `setBitIf` xopt Opt_DatatypeContexts flags
1923                .|. transformComprehensionsBit `setBitIf` xopt Opt_TransformListComp flags
1924                .|. transformComprehensionsBit `setBitIf` xopt Opt_MonadComprehensions flags
1925                .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
1926                .|. alternativeLayoutRuleBit `setBitIf` xopt Opt_AlternativeLayoutRule flags
1927                .|. relaxedLayoutBit  `setBitIf` xopt Opt_RelaxedLayout flags
1928                .|. nondecreasingIndentationBit `setBitIf` xopt Opt_NondecreasingIndentation flags
1929       --
1930       setBitIf :: Int -> Bool -> Int
1931       b `setBitIf` cond | cond      = bit b
1932                         | otherwise = 0
1933
1934 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1935 addWarning option srcspan warning
1936  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1937        let warning' = mkWarnMsg srcspan alwaysQualify warning
1938            ws' = if dopt option d then ws `snocBag` warning' else ws
1939        in POk s{messages=(ws', es)} ()
1940
1941 getMessages :: PState -> Messages
1942 getMessages PState{messages=ms} = ms
1943
1944 getContext :: P [LayoutContext]
1945 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1946
1947 setContext :: [LayoutContext] -> P ()
1948 setContext ctx = P $ \s -> POk s{context=ctx} ()
1949
1950 popContext :: P ()
1951 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1952                               last_len = len, last_loc = last_loc }) ->
1953   case ctx of
1954         (_:tl) -> POk s{ context = tl } ()
1955         []     -> PFailed last_loc (srcParseErr buf len)
1956
1957 -- Push a new layout context at the indentation of the last token read.
1958 -- This is only used at the outer level of a module when the 'module'
1959 -- keyword is missing.
1960 pushCurrentContext :: P ()
1961 pushCurrentContext = P $ \ s@PState{ last_loc=loc, context=ctx } ->
1962     POk s{context = Layout (srcSpanStartCol loc) : ctx} ()
1963
1964 getOffside :: P Ordering
1965 getOffside = P $ \s@PState{last_loc=loc, context=stk} ->
1966                 let offs = srcSpanStartCol loc in
1967                 let ord = case stk of
1968                         (Layout n:_) -> --trace ("layout: " ++ show n ++ ", offs: " ++ show offs) $
1969                                         compare offs n
1970                         _            -> GT
1971                 in POk s ord
1972
1973 -- ---------------------------------------------------------------------------
1974 -- Construct a parse error
1975
1976 srcParseErr
1977   :: StringBuffer       -- current buffer (placed just after the last token)
1978   -> Int                -- length of the previous token
1979   -> Message
1980 srcParseErr buf len
1981   = hcat [ if null token
1982              then ptext (sLit "parse error (possibly incorrect indentation)")
1983              else hcat [ptext (sLit "parse error on input "),
1984                         char '`', text token, char '\'']
1985     ]
1986   where token = lexemeToString (offsetBytes (-len) buf) len
1987
1988 -- Report a parse failure, giving the span of the previous token as
1989 -- the location of the error.  This is the entry point for errors
1990 -- detected during parsing.
1991 srcParseFail :: P a
1992 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1993                             last_loc = last_loc } ->
1994     PFailed last_loc (srcParseErr buf len)
1995
1996 -- A lexical error is reported at a particular position in the source file,
1997 -- not over a token range.
1998 lexError :: String -> P a
1999 lexError str = do
2000   loc <- getSrcLoc
2001   (AI end buf) <- getInput
2002   reportLexError loc end buf str
2003
2004 -- -----------------------------------------------------------------------------
2005 -- This is the top-level function: called from the parser each time a
2006 -- new token is to be read from the input.
2007
2008 lexer :: (Located Token -> P a) -> P a
2009 lexer cont = do
2010   alr <- extension alternativeLayoutRule
2011   let lexTokenFun = if alr then lexTokenAlr else lexToken
2012   tok@(L _span _tok__) <- lexTokenFun
2013   --trace ("token: " ++ show _tok__) $ do
2014   cont tok
2015
2016 lexTokenAlr :: P (Located Token)
2017 lexTokenAlr = do mPending <- popPendingImplicitToken
2018                  t <- case mPending of
2019                       Nothing ->
2020                           do mNext <- popNextToken
2021                              t <- case mNext of
2022                                   Nothing -> lexToken
2023                                   Just next -> return next
2024                              alternativeLayoutRuleToken t
2025                       Just t ->
2026                           return t
2027                  setAlrLastLoc (getLoc t)
2028                  case unLoc t of
2029                      ITwhere -> setAlrExpectingOCurly (Just ALRLayoutWhere)
2030                      ITlet   -> setAlrExpectingOCurly (Just ALRLayoutLet)
2031                      ITof    -> setAlrExpectingOCurly (Just ALRLayoutOf)
2032                      ITdo    -> setAlrExpectingOCurly (Just ALRLayoutDo)
2033                      ITmdo   -> setAlrExpectingOCurly (Just ALRLayoutDo)
2034                      ITrec   -> setAlrExpectingOCurly (Just ALRLayoutDo)
2035                      _       -> return ()
2036                  return t
2037
2038 alternativeLayoutRuleToken :: Located Token -> P (Located Token)
2039 alternativeLayoutRuleToken t
2040     = do context <- getALRContext
2041          lastLoc <- getAlrLastLoc
2042          mExpectingOCurly <- getAlrExpectingOCurly
2043          justClosedExplicitLetBlock <- getJustClosedExplicitLetBlock
2044          setJustClosedExplicitLetBlock False
2045          dflags <- getDynFlags
2046          let transitional = xopt Opt_AlternativeLayoutRuleTransitional dflags
2047              thisLoc = getLoc t
2048              thisCol = srcSpanStartCol thisLoc
2049              newLine = (lastLoc == noSrcSpan)
2050                     || (srcSpanStartLine thisLoc > srcSpanEndLine lastLoc)
2051          case (unLoc t, context, mExpectingOCurly) of
2052              -- This case handles a GHC extension to the original H98
2053              -- layout rule...
2054              (ITocurly, _, Just alrLayout) ->
2055                  do setAlrExpectingOCurly Nothing
2056                     let isLet = case alrLayout of
2057                                 ALRLayoutLet -> True
2058                                 _ -> False
2059                     setALRContext (ALRNoLayout (containsCommas ITocurly) isLet : context)
2060                     return t
2061              -- ...and makes this case unnecessary
2062              {-
2063              -- I think our implicit open-curly handling is slightly
2064              -- different to John's, in how it interacts with newlines
2065              -- and "in"
2066              (ITocurly, _, Just _) ->
2067                  do setAlrExpectingOCurly Nothing
2068                     setNextToken t
2069                     lexTokenAlr
2070              -}
2071              (_, ALRLayout _ col : ls, Just expectingOCurly)
2072               | (thisCol > col) ||
2073                 (thisCol == col &&
2074                  isNonDecreasingIntentation expectingOCurly) ->
2075                  do setAlrExpectingOCurly Nothing
2076                     setALRContext (ALRLayout expectingOCurly thisCol : context)
2077                     setNextToken t
2078                     return (L thisLoc ITocurly)
2079               | otherwise ->
2080                  do setAlrExpectingOCurly Nothing
2081                     setPendingImplicitTokens [L lastLoc ITccurly]
2082                     setNextToken t
2083                     return (L lastLoc ITocurly)
2084              (_, _, Just expectingOCurly) ->
2085                  do setAlrExpectingOCurly Nothing
2086                     setALRContext (ALRLayout expectingOCurly thisCol : context)
2087                     setNextToken t
2088                     return (L thisLoc ITocurly)
2089              -- We do the [] cases earlier than in the spec, as we
2090              -- have an actual EOF token
2091              (ITeof, ALRLayout _ _ : ls, _) ->
2092                  do setALRContext ls
2093                     setNextToken t
2094                     return (L thisLoc ITccurly)
2095              (ITeof, _, _) ->
2096                  return t
2097              -- the other ITeof case omitted; general case below covers it
2098              (ITin, _, _)
2099               | justClosedExplicitLetBlock ->
2100                  return t
2101              (ITin, ALRLayout ALRLayoutLet _ : ls, _)
2102               | newLine ->
2103                  do setPendingImplicitTokens [t]
2104                     setALRContext ls
2105                     return (L thisLoc ITccurly)
2106              -- This next case is to handle a transitional issue:
2107              (ITwhere, ALRLayout _ col : ls, _)
2108               | newLine && thisCol == col && transitional ->
2109                  do addWarning Opt_WarnAlternativeLayoutRuleTransitional
2110                                thisLoc
2111                                (transitionalAlternativeLayoutWarning
2112                                     "`where' clause at the same depth as implicit layout block")
2113                     setALRContext ls
2114                     setNextToken t
2115                     -- Note that we use lastLoc, as we may need to close
2116                     -- more layouts, or give a semicolon
2117                     return (L lastLoc ITccurly)
2118              -- This next case is to handle a transitional issue:
2119              (ITvbar, ALRLayout _ col : ls, _)
2120               | newLine && thisCol == col && transitional ->
2121                  do addWarning Opt_WarnAlternativeLayoutRuleTransitional
2122                                thisLoc
2123                                (transitionalAlternativeLayoutWarning
2124                                     "`|' at the same depth as implicit layout block")
2125                     setALRContext ls
2126                     setNextToken t
2127                     -- Note that we use lastLoc, as we may need to close
2128                     -- more layouts, or give a semicolon
2129                     return (L lastLoc ITccurly)
2130              (_, ALRLayout _ col : ls, _)
2131               | newLine && thisCol == col ->
2132                  do setNextToken t
2133                     return (L thisLoc ITsemi)
2134               | newLine && thisCol < col ->
2135                  do setALRContext ls
2136                     setNextToken t
2137                     -- Note that we use lastLoc, as we may need to close
2138                     -- more layouts, or give a semicolon
2139                     return (L lastLoc ITccurly)
2140              -- We need to handle close before open, as 'then' is both
2141              -- an open and a close
2142              (u, _, _)
2143               | isALRclose u ->
2144                  case context of
2145                  ALRLayout _ _ : ls ->
2146                      do setALRContext ls
2147                         setNextToken t
2148                         return (L thisLoc ITccurly)
2149                  ALRNoLayout _ isLet : ls ->
2150                      do let ls' = if isALRopen u
2151                                      then ALRNoLayout (containsCommas u) False : ls
2152                                      else ls
2153                         setALRContext ls'
2154                         when isLet $ setJustClosedExplicitLetBlock True
2155                         return t
2156                  [] ->
2157                      do let ls = if isALRopen u
2158                                     then [ALRNoLayout (containsCommas u) False]
2159                                     else ls
2160                         setALRContext ls
2161                         -- XXX This is an error in John's code, but
2162                         -- it looks reachable to me at first glance
2163                         return t
2164              (u, _, _)
2165               | isALRopen u ->
2166                  do setALRContext (ALRNoLayout (containsCommas u) False : context)
2167                     return t
2168              (ITin, ALRLayout ALRLayoutLet _ : ls, _) ->
2169                  do setALRContext ls
2170                     setPendingImplicitTokens [t]
2171                     return (L thisLoc ITccurly)
2172              (ITin, ALRLayout _ _ : ls, _) ->
2173                  do setALRContext ls
2174                     setNextToken t
2175                     return (L thisLoc ITccurly)
2176              -- the other ITin case omitted; general case below covers it
2177              (ITcomma, ALRLayout _ _ : ls, _)
2178               | topNoLayoutContainsCommas ls ->
2179                  do setALRContext ls
2180                     setNextToken t
2181                     return (L thisLoc ITccurly)
2182              (ITwhere, ALRLayout ALRLayoutDo _ : ls, _) ->
2183                  do setALRContext ls
2184                     setPendingImplicitTokens [t]
2185                     return (L thisLoc ITccurly)
2186              -- the other ITwhere case omitted; general case below covers it
2187              (_, _, _) -> return t
2188
2189 transitionalAlternativeLayoutWarning :: String -> SDoc
2190 transitionalAlternativeLayoutWarning msg
2191     = text "transitional layout will not be accepted in the future:"
2192    $$ text msg
2193
2194 isALRopen :: Token -> Bool
2195 isALRopen ITcase        = True
2196 isALRopen ITif          = True
2197 isALRopen ITthen        = True
2198 isALRopen IToparen      = True
2199 isALRopen ITobrack      = True
2200 isALRopen ITocurly      = True
2201 -- GHC Extensions:
2202 isALRopen IToubxparen   = True
2203 isALRopen ITparenEscape = True
2204 isALRopen _             = False
2205
2206 isALRclose :: Token -> Bool
2207 isALRclose ITof     = True
2208 isALRclose ITthen   = True
2209 isALRclose ITelse   = True
2210 isALRclose ITcparen = True
2211 isALRclose ITcbrack = True
2212 isALRclose ITccurly = True
2213 -- GHC Extensions:
2214 isALRclose ITcubxparen = True
2215 isALRclose _        = False
2216
2217 isNonDecreasingIntentation :: ALRLayout -> Bool
2218 isNonDecreasingIntentation ALRLayoutDo = True
2219 isNonDecreasingIntentation _           = False
2220
2221 containsCommas :: Token -> Bool
2222 containsCommas IToparen = True
2223 containsCommas ITobrack = True
2224 -- John doesn't have {} as containing commas, but records contain them,
2225 -- which caused a problem parsing Cabal's Distribution.Simple.InstallDirs
2226 -- (defaultInstallDirs).
2227 containsCommas ITocurly = True
2228 -- GHC Extensions:
2229 containsCommas IToubxparen = True
2230 containsCommas _        = False
2231
2232 topNoLayoutContainsCommas :: [ALRContext] -> Bool
2233 topNoLayoutContainsCommas [] = False
2234 topNoLayoutContainsCommas (ALRLayout _ _ : ls) = topNoLayoutContainsCommas ls
2235 topNoLayoutContainsCommas (ALRNoLayout b _ : _) = b
2236
2237 lexToken :: P (Located Token)
2238 lexToken = do
2239   inp@(AI loc1 buf) <- getInput
2240   sc <- getLexState
2241   exts <- getExts
2242   case alexScanUser exts inp sc of
2243     AlexEOF -> do
2244         let span = mkSrcSpan loc1 loc1
2245         setLastToken span 0
2246         return (L span ITeof)
2247     AlexError (AI loc2 buf) ->
2248         reportLexError loc1 loc2 buf "lexical error"
2249     AlexSkip inp2 _ -> do
2250         setInput inp2
2251         lexToken
2252     AlexToken inp2@(AI end buf2) _ t -> do
2253         setInput inp2
2254         let span = mkSrcSpan loc1 end
2255         let bytes = byteDiff buf buf2
2256         span `seq` setLastToken span bytes
2257         t span buf bytes
2258
2259 reportLexError :: SrcLoc -> SrcLoc -> StringBuffer -> [Char] -> P a
2260 reportLexError loc1 loc2 buf str
2261   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
2262   | otherwise =
2263   let
2264         c = fst (nextChar buf)
2265   in
2266   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
2267     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
2268     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
2269
2270 lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token]
2271 lexTokenStream buf loc dflags = unP go initState
2272     where dflags' = dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream
2273           initState = mkPState dflags' buf loc
2274           go = do
2275             ltok <- lexer return
2276             case ltok of
2277               L _ ITeof -> return []
2278               _ -> liftM (ltok:) go
2279
2280 linePrags = Map.singleton "line" (begin line_prag2)
2281
2282 fileHeaderPrags = Map.fromList([("options", lex_string_prag IToptions_prag),
2283                                  ("options_ghc", lex_string_prag IToptions_prag),
2284                                  ("options_haddock", lex_string_prag ITdocOptions),
2285                                  ("language", token ITlanguage_prag),
2286                                  ("include", lex_string_prag ITinclude_prag)])
2287
2288 ignoredPrags = Map.fromList (map ignored pragmas)
2289                where ignored opt = (opt, nested_comment lexToken)
2290                      impls = ["hugs", "nhc98", "jhc", "yhc", "catch", "derive"]
2291                      options_pragmas = map ("options_" ++) impls
2292                      -- CFILES is a hugs-only thing.
2293                      pragmas = options_pragmas ++ ["cfiles", "contract"]
2294
2295 oneWordPrags = Map.fromList([("rules", rulePrag),
2296                            ("inline", token (ITinline_prag Inline FunLike)),
2297                            ("inlinable", token (ITinline_prag Inlinable FunLike)),
2298                            ("inlineable", token (ITinline_prag Inlinable FunLike)),
2299                                           -- Spelling variant
2300                            ("notinline", token (ITinline_prag NoInline FunLike)),
2301                            ("specialize", token ITspec_prag),
2302                            ("source", token ITsource_prag),
2303                            ("warning", token ITwarning_prag),
2304                            ("deprecated", token ITdeprecated_prag),
2305                            ("scc", token ITscc_prag),
2306                            ("generated", token ITgenerated_prag),
2307                            ("core", token ITcore_prag),
2308                            ("unpack", token ITunpack_prag),
2309                            ("ann", token ITann_prag),
2310                            ("vectorize", token ITvect_prag)])
2311
2312 twoWordPrags = Map.fromList([("inline conlike", token (ITinline_prag Inline ConLike)),
2313                              ("notinline conlike", token (ITinline_prag NoInline ConLike)),
2314                              ("specialize inline", token (ITspec_inline_prag True)),
2315                              ("specialize notinline", token (ITspec_inline_prag False)),
2316                              ("vectorize scalar", token ITvect_scalar_prag)])
2317
2318 dispatch_pragmas :: Map String Action -> Action
2319 dispatch_pragmas prags span buf len = case Map.lookup (clean_pragma (lexemeToString buf len)) prags of
2320                                        Just found -> found span buf len
2321                                        Nothing -> lexError "unknown pragma"
2322
2323 known_pragma :: Map String Action -> AlexAccPred Int
2324 known_pragma prags _ _ len (AI _ buf) = (isJust $ Map.lookup (clean_pragma (lexemeToString (offsetBytes (- len) buf) len)) prags)
2325                                           && (nextCharIs buf (\c -> not (isAlphaNum c || c == '_')))
2326
2327 clean_pragma :: String -> String
2328 clean_pragma prag = canon_ws (map toLower (unprefix prag))
2329                     where unprefix prag' = case stripPrefix "{-#" prag' of
2330                                              Just rest -> rest
2331                                              Nothing -> prag'
2332                           canonical prag' = case prag' of
2333                                               "noinline" -> "notinline"
2334                                               "specialise" -> "specialize"
2335                                               "vectorise" -> "vectorize"
2336                                               "constructorlike" -> "conlike"
2337                                               _ -> prag'
2338                           canon_ws s = unwords (map canonical (words s))
2339 }