compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 {-# OPTIONS -w #-}
  25 -- The above warning supression flag is a temporary kludge.
  26 -- While working on this module you are encouraged to remove it and fix
  27 -- any warnings in the module. See
  28 --     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
  29 -- for details
  30 --
  31 -- Note that Alex itself generates code with with some unused bindings and
  32 -- without type signatures, so removing the flag might not be possible.
  33
  34 {-# OPTIONS_GHC -funbox-strict-fields #-}
  35
  36 module Lexer (
  37    Token(..), lexer, pragState, mkPState, PState(..),
  38    P(..), ParseResult(..), getSrcLoc,
  39    failLocMsgP, failSpanMsgP, srcParseFail,
  40    getMessages,
  41    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  42    getLexState, popLexState, pushLexState,
  43    extension, standaloneDerivingEnabled, bangPatEnabled,
  44    addWarning,
  45    lexTokenStream
  46   ) where
  47
  48 import Bag
  49 import ErrUtils
  50 import Outputable
  51 import StringBuffer
  52 import FastString
  53 import FastTypes
  54 import SrcLoc
  55 import UniqFM
  56 import DynFlags
  57 import Ctype
  58 import Util             ( maybePrefixMatch, readRational )
  59
  60 import Control.Monad
  61 import Data.Bits
  62 import Data.Char        ( chr, ord, isSpace )
  63 import Data.Ratio
  64 import Debug.Trace
  65
  66 import Unicode ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  67 }
  68
  69 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  70 $whitechar   = [\ \n\r\f\v $unispace]
  71 $white_no_nl = $whitechar # \n
  72 $tab         = \t
  73
  74 $ascdigit  = 0-9
  75 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  76 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  77 $digit     = [$ascdigit $unidigit]
  78
  79 $special   = [\(\)\,\;\[\]\`\{\}]
  80 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  81 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  82 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  83
  84 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
  85 $asclarge  = [A-Z]
  86 $large     = [$asclarge $unilarge]
  87
  88 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
  89 $ascsmall  = [a-z]
  90 $small     = [$ascsmall $unismall \_]
  91
  92 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
  93 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  94
  95 $octit     = 0-7
  96 $hexit     = [$decdigit A-F a-f]
  97 $symchar   = [$symbol \:]
  98 $nl        = [\n\r]
  99 $idchar    = [$small $large $digit \']
 100
 101 $docsym    = [\| \^ \* \$]
 102
 103 @varid     = $small $idchar*
 104 @conid     = $large $idchar*
 105
 106 @varsym    = $symbol $symchar*
 107 @consym    = \: $symchar*
 108
 109 @decimal     = $decdigit+
 110 @octal       = $octit+
 111 @hexadecimal = $hexit+
 112 @exponent    = [eE] [\-\+]? @decimal
 113
 114 -- we support the hierarchical module name extension:
 115 @qual = (@conid \.)+
 116
 117 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 118
 119 -- normal signed numerical literals can only be explicitly negative,
 120 -- not explicitly positive (contrast @exponent)
 121 @negative = \-
 122 @signed = @negative ?
 123
 124 haskell :-
 125
 126 -- everywhere: skip whitespace and comments
 127 $white_no_nl+                           ;
 128 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 129
 130 -- Everywhere: deal with nested comments.  We explicitly rule out
 131 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 132 -- (this can happen even though pragmas will normally take precedence due to
 133 -- longest-match, because pragmas aren't valid in every state, but comments
 134 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 135 -- set.
 136
 137 "{-" / { isNormalComment } { nested_comment lexToken }
 138
 139 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 140 -- more dashes followed by a symbol should be parsed as a varsym, so we
 141 -- have to exclude those.
 142
 143 -- Since Haddock comments aren't valid in every state, we need to rule them
 144 -- out here.
 145
 146 -- The following two rules match comments that begin with two dashes, but
 147 -- continue with a different character. The rules test that this character
 148 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 149 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 150 -- have a Haddock comment). The rules then munch the rest of the line.
 151
 152 "-- " ~[$docsym \#] .* { lineCommentToken }
 153 "--" [^$symbol : \ ] .* { lineCommentToken }
 154
 155 -- Next, match Haddock comments if no -haddock flag
 156
 157 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
 158
 159 -- Now, when we've matched comments that begin with 2 dashes and continue
 160 -- with a different character, we need to match comments that begin with three
 161 -- or more dashes (which clearly can't be Haddock comments). We only need to
 162 -- make sure that the first non-dash character isn't a symbol, and munch the
 163 -- rest of the line.
 164
 165 "---"\-* [^$symbol :] .* { lineCommentToken }
 166
 167 -- Since the previous rules all match dashes followed by at least one
 168 -- character, we also need to match a whole line filled with just dashes.
 169
 170 "--"\-* / { atEOL } { lineCommentToken }
 171
 172 -- We need this rule since none of the other single line comment rules
 173 -- actually match this case.
 174
 175 "-- " / { atEOL } { lineCommentToken }
 176
 177 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 178 -- blank lines) until we find a non-whitespace character, then do layout
 179 -- processing.
 180 --
 181 -- One slight wibble here: what if the line begins with {-#? In
 182 -- theory, we have to lex the pragma to see if it's one we recognise,
 183 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 184 -- as a nested comment.  We don't bother with this: if the line begins
 185 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 186 <bol> {
 187   \n                                    ;
 188   ^\# (line)?                           { begin line_prag1 }
 189   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 190   ^\# \! .* \n                          ; -- #!, for scripts
 191   ()                                    { do_bol }
 192 }
 193
 194 -- after a layout keyword (let, where, do, of), we begin a new layout
 195 -- context if the curly brace is missing.
 196 -- Careful! This stuff is quite delicate.
 197 <layout, layout_do> {
 198   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 199         -- we might encounter {-# here, but {- has been handled already
 200   \n                                    ;
 201   ^\# (line)?                           { begin line_prag1 }
 202 }
 203
 204 -- do is treated in a subtly different way, see new_layout_context
 205 <layout>    ()                          { new_layout_context True }
 206 <layout_do> ()                          { new_layout_context False }
 207
 208 -- after a new layout context which was found to be to the left of the
 209 -- previous context, we have generated a '{' token, and we now need to
 210 -- generate a matching '}' token.
 211 <layout_left>  ()                       { do_layout_left }
 212
 213 <0,option_prags> \n                             { begin bol }
 214
 215 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 216
 217 -- single-line line pragmas, of the form
 218 --    # <line> "<file>" <extra-stuff> \n
 219 <line_prag1> $decdigit+                 { setLine line_prag1a }
 220 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 221 <line_prag1b> .*                        { pop }
 222
 223 -- Haskell-style line pragmas, of the form
 224 --    {-# LINE <line> "<file>" #-}
 225 <line_prag2> $decdigit+                 { setLine line_prag2a }
 226 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 227 <line_prag2b> "#-}"|"-}"                { pop }
 228    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 229    -- with older versions of GHC which generated these.
 230
 231 <0,option_prags> {
 232   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 233   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 234   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 235                                         { token (ITinline_prag False) }
 236   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 237                                         { token ITspec_prag }
 238   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 239         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 240   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 241         $whitechar* (NO(T?)INLINE|no(t?)inline)
 242                                         { token (ITspec_inline_prag False) }
 243   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 244   "{-#" $whitechar* (WARNING|warning)
 245                                         { token ITwarning_prag }
 246   "{-#" $whitechar* (DEPRECATED|deprecated)
 247                                         { token ITdeprecated_prag }
 248   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 249   "{-#" $whitechar* (GENERATED|generated)
 250                                         { token ITgenerated_prag }
 251   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 252   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 253
 254   -- We ignore all these pragmas, but don't generate a warning for them
 255   -- CFILES is a hugs-only thing.
 256   "{-#" $whitechar* (OPTIONS_HUGS|options_hugs|OPTIONS_NHC98|options_nhc98|OPTIONS_JHC|options_jhc|CFILES|cfiles)
 257                     { nested_comment lexToken }
 258
 259   -- ToDo: should only be valid inside a pragma:
 260   "#-}"                                 { endPrag }
 261 }
 262
 263 <option_prags> {
 264   "{-#"  $whitechar* (OPTIONS|options)   { lex_string_prag IToptions_prag }
 265   "{-#"  $whitechar* (OPTIONS_GHC|options_ghc)
 266                                         { lex_string_prag IToptions_prag }
 267   "{-#"  $whitechar* (OPTIONS_HADDOCK|options_haddock)
 268                                          { lex_string_prag ITdocOptions }
 269   "-- #"                                 { multiline_doc_comment }
 270   "{-#"  $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
 271   "{-#"  $whitechar* (INCLUDE|include)   { lex_string_prag ITinclude_prag }
 272 }
 273
 274 <0> {
 275   -- In the "0" mode we ignore these pragmas
 276   "{-#"  $whitechar* (OPTIONS|options|OPTIONS_GHC|options_ghc|OPTIONS_HADDOCK|options_haddock|LANGUAGE|language|INCLUDE|include)
 277                      { nested_comment lexToken }
 278 }
 279
 280 <0> {
 281   "-- #" .* { lineCommentToken }
 282 }
 283
 284 <0,option_prags> {
 285   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 286                     (nested_comment lexToken) }
 287 }
 288
 289 -- '0' state: ordinary lexemes
 290
 291 -- Haddock comments
 292
 293 <0> {
 294   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 295   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 296 }
 297
 298 -- "special" symbols
 299
 300 <0> {
 301   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 302   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 303 }
 304
 305 <0> {
 306   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 307   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 308   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 309   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 310   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 311   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 312   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 313   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 314
 315   "[$" @varid "|"  / { ifExtension qqEnabled }
 316                      { lex_quasiquote_tok }
 317 }
 318
 319 <0> {
 320   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 321                                         { special IToparenbar }
 322   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 323 }
 324
 325 <0> {
 326   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 327 }
 328
 329 <0> {
 330   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 331          { token IToubxparen }
 332   "#)" / { ifExtension unboxedTuplesEnabled }
 333          { token ITcubxparen }
 334 }
 335
 336 <0> {
 337   "{|" / { ifExtension genericsEnabled } { token ITocurlybar }
 338   "|}" / { ifExtension genericsEnabled } { token ITccurlybar }
 339 }
 340
 341 <0,option_prags> {
 342   \(                                    { special IToparen }
 343   \)                                    { special ITcparen }
 344   \[                                    { special ITobrack }
 345   \]                                    { special ITcbrack }
 346   \,                                    { special ITcomma }
 347   \;                                    { special ITsemi }
 348   \`                                    { special ITbackquote }
 349
 350   \{                                    { open_brace }
 351   \}                                    { close_brace }
 352 }
 353
 354 <0,option_prags> {
 355   @qual @varid                  { idtoken qvarid }
 356   @qual @conid                  { idtoken qconid }
 357   @varid                        { varid }
 358   @conid                        { idtoken conid }
 359 }
 360
 361 <0> {
 362   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 363   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 364   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 365   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 366 }
 367
 368 -- ToDo: M.(,,,)
 369
 370 <0> {
 371   @qual @varsym                 { idtoken qvarsym }
 372   @qual @consym                 { idtoken qconsym }
 373   @varsym                       { varsym }
 374   @consym                       { consym }
 375 }
 376
 377 -- For the normal boxed literals we need to be careful
 378 -- when trying to be close to Haskell98
 379 <0> {
 380   -- Normal integral literals (:: Num a => a, from Integer)
 381   @decimal           { tok_num positive 0 0 decimal }
 382   0[oO] @octal       { tok_num positive 2 2 octal }
 383   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 384
 385   -- Normal rational literals (:: Fractional a => a, from Rational)
 386   @floating_point    { strtoken tok_float }
 387 }
 388
 389 <0> {
 390   -- Unboxed ints (:: Int#) and words (:: Word#)
 391   -- It's simpler (and faster?) to give separate cases to the negatives,
 392   -- especially considering octal/hexadecimal prefixes.
 393   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 394   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 395   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 396   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 397   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 398   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 399
 400   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 401   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 402   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 403
 404   -- Unboxed floats and doubles (:: Float#, :: Double#)
 405   -- prim_{float,double} work with signed literals
 406   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 407   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 408 }
 409
 410 -- Strings and chars are lexed by hand-written code.  The reason is
 411 -- that even if we recognise the string or char here in the regex
 412 -- lexer, we would still have to parse the string afterward in order
 413 -- to convert it to a String.
 414 <0> {
 415   \'                            { lex_char_tok }
 416   \"                            { lex_string_tok }
 417 }
 418
 419 {
 420 -- -----------------------------------------------------------------------------
 421 -- The token type
 422
 423 data Token
 424   = ITas                        -- Haskell keywords
 425   | ITcase
 426   | ITclass
 427   | ITdata
 428   | ITdefault
 429   | ITderiving
 430   | ITdo
 431   | ITelse
 432   | IThiding
 433   | ITif
 434   | ITimport
 435   | ITin
 436   | ITinfix
 437   | ITinfixl
 438   | ITinfixr
 439   | ITinstance
 440   | ITlet
 441   | ITmodule
 442   | ITnewtype
 443   | ITof
 444   | ITqualified
 445   | ITthen
 446   | ITtype
 447   | ITwhere
 448   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 449
 450   | ITforall                    -- GHC extension keywords
 451   | ITforeign
 452   | ITexport
 453   | ITlabel
 454   | ITdynamic
 455   | ITsafe
 456   | ITthreadsafe
 457   | ITunsafe
 458   | ITstdcallconv
 459   | ITccallconv
 460   | ITdotnet
 461   | ITmdo
 462   | ITfamily
 463   | ITgroup
 464   | ITby
 465   | ITusing
 466
 467         -- Pragmas
 468   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 469   | ITspec_prag                 -- SPECIALISE
 470   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 471   | ITsource_prag
 472   | ITrules_prag
 473   | ITwarning_prag
 474   | ITdeprecated_prag
 475   | ITline_prag
 476   | ITscc_prag
 477   | ITgenerated_prag
 478   | ITcore_prag                 -- hdaume: core annotations
 479   | ITunpack_prag
 480   | ITclose_prag
 481   | IToptions_prag String
 482   | ITinclude_prag String
 483   | ITlanguage_prag
 484
 485   | ITdotdot                    -- reserved symbols
 486   | ITcolon
 487   | ITdcolon
 488   | ITequal
 489   | ITlam
 490   | ITvbar
 491   | ITlarrow
 492   | ITrarrow
 493   | ITat
 494   | ITtilde
 495   | ITdarrow
 496   | ITminus
 497   | ITbang
 498   | ITstar
 499   | ITdot
 500
 501   | ITbiglam                    -- GHC-extension symbols
 502
 503   | ITocurly                    -- special symbols
 504   | ITccurly
 505   | ITocurlybar                 -- {|, for type applications
 506   | ITccurlybar                 -- |}, for type applications
 507   | ITvocurly
 508   | ITvccurly
 509   | ITobrack
 510   | ITopabrack                  -- [:, for parallel arrays with -XParr
 511   | ITcpabrack                  -- :], for parallel arrays with -XParr
 512   | ITcbrack
 513   | IToparen
 514   | ITcparen
 515   | IToubxparen
 516   | ITcubxparen
 517   | ITsemi
 518   | ITcomma
 519   | ITunderscore
 520   | ITbackquote
 521
 522   | ITvarid   FastString        -- identifiers
 523   | ITconid   FastString
 524   | ITvarsym  FastString
 525   | ITconsym  FastString
 526   | ITqvarid  (FastString,FastString)
 527   | ITqconid  (FastString,FastString)
 528   | ITqvarsym (FastString,FastString)
 529   | ITqconsym (FastString,FastString)
 530
 531   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 532
 533   | ITpragma StringBuffer
 534
 535   | ITchar       Char
 536   | ITstring     FastString
 537   | ITinteger    Integer
 538   | ITrational   Rational
 539
 540   | ITprimchar   Char
 541   | ITprimstring FastString
 542   | ITprimint    Integer
 543   | ITprimword   Integer
 544   | ITprimfloat  Rational
 545   | ITprimdouble Rational
 546
 547   -- MetaHaskell extension tokens
 548   | ITopenExpQuote              --  [| or [e|
 549   | ITopenPatQuote              --  [p|
 550   | ITopenDecQuote              --  [d|
 551   | ITopenTypQuote              --  [t|
 552   | ITcloseQuote                --  |]
 553   | ITidEscape   FastString     --  $x
 554   | ITparenEscape               --  $(
 555   | ITvarQuote                  --  '
 556   | ITtyQuote                   --  ''
 557   | ITquasiQuote (FastString,FastString,SrcSpan) --  [:...|...|]
 558
 559   -- Arrow notation extension
 560   | ITproc
 561   | ITrec
 562   | IToparenbar                 --  (|
 563   | ITcparenbar                 --  |)
 564   | ITlarrowtail                --  -<
 565   | ITrarrowtail                --  >-
 566   | ITLarrowtail                --  -<<
 567   | ITRarrowtail                --  >>-
 568
 569   | ITunknown String            -- Used when the lexer can't make sense of it
 570   | ITeof                       -- end of file token
 571
 572   -- Documentation annotations
 573   | ITdocCommentNext  String     -- something beginning '-- |'
 574   | ITdocCommentPrev  String     -- something beginning '-- ^'
 575   | ITdocCommentNamed String     -- something beginning '-- $'
 576   | ITdocSection      Int String -- a section heading
 577   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 578   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 579   | ITlineComment     String     -- comment starting by "--"
 580   | ITblockComment    String     -- comment in {- -}
 581
 582 #ifdef DEBUG
 583   deriving Show -- debugging
 584 #endif
 585
 586 {-
 587 isSpecial :: Token -> Bool
 588 -- If we see M.x, where x is a keyword, but
 589 -- is special, we treat is as just plain M.x,
 590 -- not as a keyword.
 591 isSpecial ITas          = True
 592 isSpecial IThiding      = True
 593 isSpecial ITqualified   = True
 594 isSpecial ITforall      = True
 595 isSpecial ITexport      = True
 596 isSpecial ITlabel       = True
 597 isSpecial ITdynamic     = True
 598 isSpecial ITsafe        = True
 599 isSpecial ITthreadsafe  = True
 600 isSpecial ITunsafe      = True
 601 isSpecial ITccallconv   = True
 602 isSpecial ITstdcallconv = True
 603 isSpecial ITmdo         = True
 604 isSpecial ITfamily      = True
 605 isSpecial ITgroup   = True
 606 isSpecial ITby      = True
 607 isSpecial ITusing   = True
 608 isSpecial _             = False
 609 -}
 610
 611 -- the bitmap provided as the third component indicates whether the
 612 -- corresponding extension keyword is valid under the extension options
 613 -- provided to the compiler; if the extension corresponding to *any* of the
 614 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 615 -- facilitates using a keyword in two different extensions that can be
 616 -- activated independently)
 617 --
 618 reservedWordsFM = listToUFM $
 619         map (\(x, y, z) -> (mkFastString x, (y, z)))
 620        [( "_",          ITunderscore,   0 ),
 621         ( "as",         ITas,           0 ),
 622         ( "case",       ITcase,         0 ),
 623         ( "class",      ITclass,        0 ),
 624         ( "data",       ITdata,         0 ),
 625         ( "default",    ITdefault,      0 ),
 626         ( "deriving",   ITderiving,     0 ),
 627         ( "do",         ITdo,           0 ),
 628         ( "else",       ITelse,         0 ),
 629         ( "hiding",     IThiding,       0 ),
 630         ( "if",         ITif,           0 ),
 631         ( "import",     ITimport,       0 ),
 632         ( "in",         ITin,           0 ),
 633         ( "infix",      ITinfix,        0 ),
 634         ( "infixl",     ITinfixl,       0 ),
 635         ( "infixr",     ITinfixr,       0 ),
 636         ( "instance",   ITinstance,     0 ),
 637         ( "let",        ITlet,          0 ),
 638         ( "module",     ITmodule,       0 ),
 639         ( "newtype",    ITnewtype,      0 ),
 640         ( "of",         ITof,           0 ),
 641         ( "qualified",  ITqualified,    0 ),
 642         ( "then",       ITthen,         0 ),
 643         ( "type",       ITtype,         0 ),
 644         ( "where",      ITwhere,        0 ),
 645         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 646
 647     ( "forall", ITforall,        bit explicitForallBit .|. bit inRulePragBit),
 648         ( "mdo",        ITmdo,           bit recursiveDoBit),
 649         ( "family",     ITfamily,        bit tyFamBit),
 650     ( "group",  ITgroup,     bit transformComprehensionsBit),
 651     ( "by",     ITby,        bit transformComprehensionsBit),
 652     ( "using",  ITusing,     bit transformComprehensionsBit),
 653
 654         ( "foreign",    ITforeign,       bit ffiBit),
 655         ( "export",     ITexport,        bit ffiBit),
 656         ( "label",      ITlabel,         bit ffiBit),
 657         ( "dynamic",    ITdynamic,       bit ffiBit),
 658         ( "safe",       ITsafe,          bit ffiBit),
 659         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 660         ( "unsafe",     ITunsafe,        bit ffiBit),
 661         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 662         ( "ccall",      ITccallconv,     bit ffiBit),
 663         ( "dotnet",     ITdotnet,        bit ffiBit),
 664
 665         ( "rec",        ITrec,           bit arrowsBit),
 666         ( "proc",       ITproc,          bit arrowsBit)
 667      ]
 668
 669 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 670 reservedSymsFM = listToUFM $
 671     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 672       [ ("..",  ITdotdot,   always)
 673         -- (:) is a reserved op, meaning only list cons
 674        ,(":",   ITcolon,    always)
 675        ,("::",  ITdcolon,   always)
 676        ,("=",   ITequal,    always)
 677        ,("\\",  ITlam,      always)
 678        ,("|",   ITvbar,     always)
 679        ,("<-",  ITlarrow,   always)
 680        ,("->",  ITrarrow,   always)
 681        ,("@",   ITat,       always)
 682        ,("~",   ITtilde,    always)
 683        ,("=>",  ITdarrow,   always)
 684        ,("-",   ITminus,    always)
 685        ,("!",   ITbang,     always)
 686
 687         -- For data T (a::*) = MkT
 688        ,("*", ITstar, \i -> kindSigsEnabled i || tyFamEnabled i)
 689         -- For 'forall a . t'
 690        ,(".", ITdot, \i -> explicitForallEnabled i || inRulePrag i)
 691
 692        ,("-<",  ITlarrowtail, arrowsEnabled)
 693        ,(">-",  ITrarrowtail, arrowsEnabled)
 694        ,("-<<", ITLarrowtail, arrowsEnabled)
 695        ,(">>-", ITRarrowtail, arrowsEnabled)
 696
 697 #if __GLASGOW_HASKELL__ >= 605
 698        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 699        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 700        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 701                                 explicitForallEnabled i)
 702        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 703        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 704        ,("⋯",   ITdotdot, unicodeSyntaxEnabled)
 705         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 706         -- form part of a large operator.  This would let us have a better
 707         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 708 #endif
 709        ]
 710
 711 -- -----------------------------------------------------------------------------
 712 -- Lexer actions
 713
 714 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 715
 716 special :: Token -> Action
 717 special tok span _buf _len = return (L span tok)
 718
 719 token, layout_token :: Token -> Action
 720 token t span _buf _len = return (L span t)
 721 layout_token t span _buf _len = pushLexState layout >> return (L span t)
 722
 723 idtoken :: (StringBuffer -> Int -> Token) -> Action
 724 idtoken f span buf len = return (L span $! (f buf len))
 725
 726 skip_one_varid :: (FastString -> Token) -> Action
 727 skip_one_varid f span buf len
 728   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 729
 730 strtoken :: (String -> Token) -> Action
 731 strtoken f span buf len =
 732   return (L span $! (f $! lexemeToString buf len))
 733
 734 init_strtoken :: Int -> (String -> Token) -> Action
 735 -- like strtoken, but drops the last N character(s)
 736 init_strtoken drop f span buf len =
 737   return (L span $! (f $! lexemeToString buf (len-drop)))
 738
 739 begin :: Int -> Action
 740 begin code _span _str _len = do pushLexState code; lexToken
 741
 742 pop :: Action
 743 pop _span _buf _len = do popLexState; lexToken
 744
 745 pop_and :: Action -> Action
 746 pop_and act span buf len = do popLexState; act span buf len
 747
 748 {-# INLINE nextCharIs #-}
 749 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 750
 751 notFollowedBy char _ _ _ (AI _ _ buf)
 752   = nextCharIs buf (/=char)
 753
 754 notFollowedBySymbol _ _ _ (AI _ _ buf)
 755   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 756
 757 -- We must reject doc comments as being ordinary comments everywhere.
 758 -- In some cases the doc comment will be selected as the lexeme due to
 759 -- maximal munch, but not always, because the nested comment rule is
 760 -- valid in all states, but the doc-comment rules are only valid in
 761 -- the non-layout states.
 762 isNormalComment bits _ _ (AI _ _ buf)
 763   | haddockEnabled bits = notFollowedByDocOrPragma
 764   | otherwise           = nextCharIs buf (/='#')
 765   where
 766     notFollowedByDocOrPragma
 767        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 768
 769 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 770
 771 {-
 772 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 773   = if haddockEnabled bits then False else (p buf)
 774 -}
 775
 776 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 777
 778 ifExtension pred bits _ _ _ = pred bits
 779
 780 multiline_doc_comment :: Action
 781 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 782   where
 783     worker commentAcc input docType oneLine = case alexGetChar input of
 784       Just ('\n', input')
 785         | oneLine -> docCommentEnd input commentAcc docType buf span
 786         | otherwise -> case checkIfCommentLine input' of
 787           Just input -> worker ('\n':commentAcc) input docType False
 788           Nothing -> docCommentEnd input commentAcc docType buf span
 789       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 790       Nothing -> docCommentEnd input commentAcc docType buf span
 791
 792     checkIfCommentLine input = check (dropNonNewlineSpace input)
 793       where
 794         check input = case alexGetChar input of
 795           Just ('-', input) -> case alexGetChar input of
 796             Just ('-', input) -> case alexGetChar input of
 797               Just (c, _) | c /= '-' -> Just input
 798               _ -> Nothing
 799             _ -> Nothing
 800           _ -> Nothing
 801
 802         dropNonNewlineSpace input = case alexGetChar input of
 803           Just (c, input')
 804             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 805             | otherwise -> input
 806           Nothing -> input
 807
 808 lineCommentToken :: Action
 809 lineCommentToken span buf len = do
 810   b <- extension rawTokenStreamEnabled
 811   if b then strtoken ITlineComment span buf len else lexToken
 812
 813 {-
 814   nested comments require traversing by hand, they can't be parsed
 815   using regular expressions.
 816 -}
 817 nested_comment :: P (Located Token) -> Action
 818 nested_comment cont span _str _len = do
 819   input <- getInput
 820   go "" (1::Int) input
 821   where
 822     go commentAcc 0 input = do setInput input
 823                                b <- extension rawTokenStreamEnabled
 824                                if b
 825                                  then docCommentEnd input commentAcc ITblockComment _str span
 826                                  else cont
 827     go commentAcc n input = case alexGetChar input of
 828       Nothing -> errBrace input span
 829       Just ('-',input) -> case alexGetChar input of
 830         Nothing  -> errBrace input span
 831         Just ('\125',input) -> go commentAcc (n-1) input
 832         Just (_,_)          -> go ('-':commentAcc) n input
 833       Just ('\123',input) -> case alexGetChar input of
 834         Nothing  -> errBrace input span
 835         Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
 836         Just (_,_)       -> go ('\123':commentAcc) n input
 837       Just (c,input) -> go (c:commentAcc) n input
 838
 839 nested_doc_comment :: Action
 840 nested_doc_comment span buf _len = withLexedDocType (go "")
 841   where
 842     go commentAcc input docType _ = case alexGetChar input of
 843       Nothing -> errBrace input span
 844       Just ('-',input) -> case alexGetChar input of
 845         Nothing -> errBrace input span
 846         Just ('\125',input) ->
 847           docCommentEnd input commentAcc docType buf span
 848         Just (_,_) -> go ('-':commentAcc) input docType False
 849       Just ('\123', input) -> case alexGetChar input of
 850         Nothing  -> errBrace input span
 851         Just ('-',input) -> do
 852           setInput input
 853           let cont = do input <- getInput; go commentAcc input docType False
 854           nested_comment cont span buf _len
 855         Just (_,_) -> go ('\123':commentAcc) input docType False
 856       Just (c,input) -> go (c:commentAcc) input docType False
 857
 858 withLexedDocType lexDocComment = do
 859   input@(AI _ _ buf) <- getInput
 860   case prevChar buf ' ' of
 861     '|' -> lexDocComment input ITdocCommentNext False
 862     '^' -> lexDocComment input ITdocCommentPrev False
 863     '$' -> lexDocComment input ITdocCommentNamed False
 864     '*' -> lexDocSection 1 input
 865     '#' -> lexDocComment input ITdocOptionsOld False
 866  where
 867     lexDocSection n input = case alexGetChar input of
 868       Just ('*', input) -> lexDocSection (n+1) input
 869       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 870       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 871
 872 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 873 -- off again at the end of the pragma.
 874 rulePrag :: Action
 875 rulePrag span buf len = do
 876   setExts (.|. inRulePragBit)
 877   return (L span ITrules_prag)
 878
 879 endPrag :: Action
 880 endPrag span buf len = do
 881   setExts (.&. complement (bit inRulePragBit))
 882   return (L span ITclose_prag)
 883
 884 -- docCommentEnd
 885 -------------------------------------------------------------------------------
 886 -- This function is quite tricky. We can't just return a new token, we also
 887 -- need to update the state of the parser. Why? Because the token is longer
 888 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 889 -- it writes the wrong token length to the parser state. This function is
 890 -- called afterwards, so it can just update the state.
 891
 892 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 893 -- which is something that the original lexer didn't account for.
 894 -- I have added last_line_len in the parser state which represents the length
 895 -- of the part of the token that is on the last line. It is now used for layout
 896 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 897 -- was before, the full length of the token, and it is now only used for error
 898 -- messages. /Waern
 899
 900 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 901                  SrcSpan -> P (Located Token)
 902 docCommentEnd input commentAcc docType buf span = do
 903   setInput input
 904   let (AI loc last_offs nextBuf) = input
 905       comment = reverse commentAcc
 906       span' = mkSrcSpan (srcSpanStart span) loc
 907       last_len = byteDiff buf nextBuf
 908
 909       last_line_len = if (last_offs - last_len < 0)
 910         then last_offs
 911         else last_len
 912
 913   span `seq` setLastToken span' last_len last_line_len
 914   return (L span' (docType comment))
 915
 916 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 917
 918 open_brace, close_brace :: Action
 919 open_brace span _str _len = do
 920   ctx <- getContext
 921   setContext (NoLayout:ctx)
 922   return (L span ITocurly)
 923 close_brace span _str _len = do
 924   popContext
 925   return (L span ITccurly)
 926
 927 qvarid buf len = ITqvarid $! splitQualName buf len
 928 qconid buf len = ITqconid $! splitQualName buf len
 929
 930 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 931 -- takes a StringBuffer and a length, and returns the module name
 932 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 933 -- because of hierarchical module names.
 934 splitQualName orig_buf len = split orig_buf orig_buf
 935   where
 936     split buf dot_buf
 937         | orig_buf `byteDiff` buf >= len  = done dot_buf
 938         | c == '.'                        = found_dot buf'
 939         | otherwise                       = split buf' dot_buf
 940       where
 941        (c,buf') = nextChar buf
 942
 943     -- careful, we might get names like M....
 944     -- so, if the character after the dot is not upper-case, this is
 945     -- the end of the qualifier part.
 946     found_dot buf -- buf points after the '.'
 947         | isUpper c    = split buf' buf
 948         | otherwise    = done buf
 949       where
 950        (c,buf') = nextChar buf
 951
 952     done dot_buf =
 953         (lexemeToFastString orig_buf (qual_size - 1),
 954          lexemeToFastString dot_buf (len - qual_size))
 955       where
 956         qual_size = orig_buf `byteDiff` dot_buf
 957
 958 varid span buf len =
 959   fs `seq`
 960   case lookupUFM reservedWordsFM fs of
 961         Just (keyword,0)    -> do
 962                 maybe_layout keyword
 963                 return (L span keyword)
 964         Just (keyword,exts) -> do
 965                 b <- extension (\i -> exts .&. i /= 0)
 966                 if b then do maybe_layout keyword
 967                              return (L span keyword)
 968                      else return (L span (ITvarid fs))
 969         _other -> return (L span (ITvarid fs))
 970   where
 971         fs = lexemeToFastString buf len
 972
 973 conid buf len = ITconid fs
 974   where fs = lexemeToFastString buf len
 975
 976 qvarsym buf len = ITqvarsym $! splitQualName buf len
 977 qconsym buf len = ITqconsym $! splitQualName buf len
 978
 979 varsym = sym ITvarsym
 980 consym = sym ITconsym
 981
 982 sym con span buf len =
 983   case lookupUFM reservedSymsFM fs of
 984         Just (keyword,exts) -> do
 985                 b <- extension exts
 986                 if b then return (L span keyword)
 987                      else return (L span $! con fs)
 988         _other -> return (L span $! con fs)
 989   where
 990         fs = lexemeToFastString buf len
 991
 992 -- Variations on the integral numeric literal.
 993 tok_integral :: (Integer -> Token)
 994      -> (Integer -> Integer)
 995  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
 996      -> Int -> Int
 997      -> (Integer, (Char->Int)) -> Action
 998 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
 999   return $ L span $ itint $! transint $ parseUnsignedInteger
1000      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
1001
1002 -- some conveniences for use with tok_integral
1003 tok_num = tok_integral ITinteger
1004 tok_primint = tok_integral ITprimint
1005 tok_primword = tok_integral ITprimword positive
1006 positive = id
1007 negative = negate
1008 decimal = (10,octDecDigit)
1009 octal = (8,octDecDigit)
1010 hexadecimal = (16,hexDigit)
1011
1012 -- readRational can understand negative rationals, exponents, everything.
1013 tok_float        str = ITrational   $! readRational str
1014 tok_primfloat    str = ITprimfloat  $! readRational str
1015 tok_primdouble   str = ITprimdouble $! readRational str
1016
1017 -- -----------------------------------------------------------------------------
1018 -- Layout processing
1019
1020 -- we're at the first token on a line, insert layout tokens if necessary
1021 do_bol :: Action
1022 do_bol span _str _len = do
1023         pos <- getOffside
1024         case pos of
1025             LT -> do
1026                 --trace "layout: inserting '}'" $ do
1027                 popContext
1028                 -- do NOT pop the lex state, we might have a ';' to insert
1029                 return (L span ITvccurly)
1030             EQ -> do
1031                 --trace "layout: inserting ';'" $ do
1032                 popLexState
1033                 return (L span ITsemi)
1034             GT -> do
1035                 popLexState
1036                 lexToken
1037
1038 -- certain keywords put us in the "layout" state, where we might
1039 -- add an opening curly brace.
1040 maybe_layout ITdo       = pushLexState layout_do
1041 maybe_layout ITmdo      = pushLexState layout_do
1042 maybe_layout ITof       = pushLexState layout
1043 maybe_layout ITlet      = pushLexState layout
1044 maybe_layout ITwhere    = pushLexState layout
1045 maybe_layout ITrec      = pushLexState layout
1046 maybe_layout _          = return ()
1047
1048 -- Pushing a new implicit layout context.  If the indentation of the
1049 -- next token is not greater than the previous layout context, then
1050 -- Haskell 98 says that the new layout context should be empty; that is
1051 -- the lexer must generate {}.
1052 --
1053 -- We are slightly more lenient than this: when the new context is started
1054 -- by a 'do', then we allow the new context to be at the same indentation as
1055 -- the previous context.  This is what the 'strict' argument is for.
1056 --
1057 new_layout_context strict span _buf _len = do
1058     popLexState
1059     (AI _ offset _) <- getInput
1060     ctx <- getContext
1061     case ctx of
1062         Layout prev_off : _  |
1063            (strict     && prev_off >= offset  ||
1064             not strict && prev_off > offset) -> do
1065                 -- token is indented to the left of the previous context.
1066                 -- we must generate a {} sequence now.
1067                 pushLexState layout_left
1068                 return (L span ITvocurly)
1069         _ -> do
1070                 setContext (Layout offset : ctx)
1071                 return (L span ITvocurly)
1072
1073 do_layout_left span _buf _len = do
1074     popLexState
1075     pushLexState bol  -- we must be at the start of a line
1076     return (L span ITvccurly)
1077
1078 -- -----------------------------------------------------------------------------
1079 -- LINE pragmas
1080
1081 setLine :: Int -> Action
1082 setLine code span buf len = do
1083   let line = parseUnsignedInteger buf len 10 octDecDigit
1084   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
1085         -- subtract one: the line number refers to the *following* line
1086   popLexState
1087   pushLexState code
1088   lexToken
1089
1090 setFile :: Int -> Action
1091 setFile code span buf len = do
1092   let file = lexemeToFastString (stepOn buf) (len-2)
1093   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1094   popLexState
1095   pushLexState code
1096   lexToken
1097
1098
1099 -- -----------------------------------------------------------------------------
1100 -- Options, includes and language pragmas.
1101
1102 lex_string_prag :: (String -> Token) -> Action
1103 lex_string_prag mkTok span _buf _len
1104     = do input <- getInput
1105          start <- getSrcLoc
1106          tok <- go [] input
1107          end <- getSrcLoc
1108          return (L (mkSrcSpan start end) tok)
1109     where go acc input
1110               = if isString input "#-}"
1111                    then do setInput input
1112                            return (mkTok (reverse acc))
1113                    else case alexGetChar input of
1114                           Just (c,i) -> go (c:acc) i
1115                           Nothing -> err input
1116           isString _ [] = True
1117           isString i (x:xs)
1118               = case alexGetChar i of
1119                   Just (c,i') | c == x    -> isString i' xs
1120                   _other -> False
1121           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1122
1123
1124 -- -----------------------------------------------------------------------------
1125 -- Strings & Chars
1126
1127 -- This stuff is horrible.  I hates it.
1128
1129 lex_string_tok :: Action
1130 lex_string_tok span _buf _len = do
1131   tok <- lex_string ""
1132   end <- getSrcLoc
1133   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1134
1135 lex_string :: String -> P Token
1136 lex_string s = do
1137   i <- getInput
1138   case alexGetChar' i of
1139     Nothing -> lit_error
1140
1141     Just ('"',i)  -> do
1142         setInput i
1143         magicHash <- extension magicHashEnabled
1144         if magicHash
1145           then do
1146             i <- getInput
1147             case alexGetChar' i of
1148               Just ('#',i) -> do
1149                    setInput i
1150                    if any (> '\xFF') s
1151                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1152                     else let s' = mkZFastString (reverse s) in
1153                          return (ITprimstring s')
1154                         -- mkZFastString is a hack to avoid encoding the
1155                         -- string in UTF-8.  We just want the exact bytes.
1156               _other ->
1157                 return (ITstring (mkFastString (reverse s)))
1158           else
1159                 return (ITstring (mkFastString (reverse s)))
1160
1161     Just ('\\',i)
1162         | Just ('&',i) <- next -> do
1163                 setInput i; lex_string s
1164         | Just (c,i) <- next, is_space c -> do
1165                 setInput i; lex_stringgap s
1166         where next = alexGetChar' i
1167
1168     Just (c, i) -> do
1169         c' <- lex_char c i
1170         lex_string (c':s)
1171
1172 lex_stringgap s = do
1173   c <- getCharOrFail
1174   case c of
1175     '\\' -> lex_string s
1176     c | is_space c -> lex_stringgap s
1177     _other -> lit_error
1178
1179
1180 lex_char_tok :: Action
1181 -- Here we are basically parsing character literals, such as 'x' or '\n'
1182 -- but, when Template Haskell is on, we additionally spot
1183 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1184 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
1185 -- So we have to do two characters of lookahead: when we see 'x we need to
1186 -- see if there's a trailing quote
1187 lex_char_tok span _buf _len = do        -- We've seen '
1188    i1 <- getInput       -- Look ahead to first character
1189    let loc = srcSpanStart span
1190    case alexGetChar' i1 of
1191         Nothing -> lit_error
1192
1193         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
1194                   th_exts <- extension thEnabled
1195                   if th_exts then do
1196                         setInput i2
1197                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1198                    else lit_error
1199
1200         Just ('\\', i2@(AI _end2 _ _)) -> do    -- We've seen 'backslash
1201                   setInput i2
1202                   lit_ch <- lex_escape
1203                   mc <- getCharOrFail   -- Trailing quote
1204                   if mc == '\'' then finish_char_tok loc lit_ch
1205                                 else do setInput i2; lit_error
1206
1207         Just (c, i2@(AI _end2 _ _))
1208                 | not (isAny c) -> lit_error
1209                 | otherwise ->
1210
1211                 -- We've seen 'x, where x is a valid character
1212                 --  (i.e. not newline etc) but not a quote or backslash
1213            case alexGetChar' i2 of      -- Look ahead one more character
1214                 Nothing -> lit_error
1215                 Just ('\'', i3) -> do   -- We've seen 'x'
1216                         setInput i3
1217                         finish_char_tok loc c
1218                 _other -> do            -- We've seen 'x not followed by quote
1219                                         -- If TH is on, just parse the quote only
1220                         th_exts <- extension thEnabled
1221                         let (AI end _ _) = i1
1222                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1223                                    else do setInput i2; lit_error
1224
1225 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1226 finish_char_tok loc ch  -- We've already seen the closing quote
1227                         -- Just need to check for trailing #
1228   = do  magicHash <- extension magicHashEnabled
1229         i@(AI end _ _) <- getInput
1230         if magicHash then do
1231                 case alexGetChar' i of
1232                         Just ('#',i@(AI end _ _)) -> do
1233                                 setInput i
1234                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1235                         _other ->
1236                                 return (L (mkSrcSpan loc end) (ITchar ch))
1237                 else do
1238                    return (L (mkSrcSpan loc end) (ITchar ch))
1239
1240 lex_char :: Char -> AlexInput -> P Char
1241 lex_char c inp = do
1242   case c of
1243       '\\' -> do setInput inp; lex_escape
1244       c | isAny c -> do setInput inp; return c
1245       _other -> lit_error
1246
1247 isAny c | c > '\x7f' = isPrint c
1248         | otherwise  = is_any c
1249
1250 lex_escape :: P Char
1251 lex_escape = do
1252   c <- getCharOrFail
1253   case c of
1254         'a'   -> return '\a'
1255         'b'   -> return '\b'
1256         'f'   -> return '\f'
1257         'n'   -> return '\n'
1258         'r'   -> return '\r'
1259         't'   -> return '\t'
1260         'v'   -> return '\v'
1261         '\\'  -> return '\\'
1262         '"'   -> return '\"'
1263         '\''  -> return '\''
1264         '^'   -> do c <- getCharOrFail
1265                     if c >= '@' && c <= '_'
1266                         then return (chr (ord c - ord '@'))
1267                         else lit_error
1268
1269         'x'   -> readNum is_hexdigit 16 hexDigit
1270         'o'   -> readNum is_octdigit  8 octDecDigit
1271         x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1272
1273         c1 ->  do
1274            i <- getInput
1275            case alexGetChar' i of
1276             Nothing -> lit_error
1277             Just (c2,i2) ->
1278               case alexGetChar' i2 of
1279                 Nothing -> do setInput i2; lit_error
1280                 Just (c3,i3) ->
1281                    let str = [c1,c2,c3] in
1282                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1283                                      Just rest <- [maybePrefixMatch p str] ] of
1284                           (escape_char,[]):_ -> do
1285                                 setInput i3
1286                                 return escape_char
1287                           (escape_char,_:_):_ -> do
1288                                 setInput i2
1289                                 return escape_char
1290                           [] -> lit_error
1291
1292 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1293 readNum is_digit base conv = do
1294   i <- getInput
1295   c <- getCharOrFail
1296   if is_digit c
1297         then readNum2 is_digit base conv (conv c)
1298         else do setInput i; lit_error
1299
1300 readNum2 is_digit base conv i = do
1301   input <- getInput
1302   read i input
1303   where read i input = do
1304           case alexGetChar' input of
1305             Just (c,input') | is_digit c -> do
1306                 read (i*base + conv c) input'
1307             _other -> do
1308                 if i >= 0 && i <= 0x10FFFF
1309                    then do setInput input; return (chr i)
1310                    else lit_error
1311
1312 silly_escape_chars = [
1313         ("NUL", '\NUL'),
1314         ("SOH", '\SOH'),
1315         ("STX", '\STX'),
1316         ("ETX", '\ETX'),
1317         ("EOT", '\EOT'),
1318         ("ENQ", '\ENQ'),
1319         ("ACK", '\ACK'),
1320         ("BEL", '\BEL'),
1321         ("BS", '\BS'),
1322         ("HT", '\HT'),
1323         ("LF", '\LF'),
1324         ("VT", '\VT'),
1325         ("FF", '\FF'),
1326         ("CR", '\CR'),
1327         ("SO", '\SO'),
1328         ("SI", '\SI'),
1329         ("DLE", '\DLE'),
1330         ("DC1", '\DC1'),
1331         ("DC2", '\DC2'),
1332         ("DC3", '\DC3'),
1333         ("DC4", '\DC4'),
1334         ("NAK", '\NAK'),
1335         ("SYN", '\SYN'),
1336         ("ETB", '\ETB'),
1337         ("CAN", '\CAN'),
1338         ("EM", '\EM'),
1339         ("SUB", '\SUB'),
1340         ("ESC", '\ESC'),
1341         ("FS", '\FS'),
1342         ("GS", '\GS'),
1343         ("RS", '\RS'),
1344         ("US", '\US'),
1345         ("SP", '\SP'),
1346         ("DEL", '\DEL')
1347         ]
1348
1349 -- before calling lit_error, ensure that the current input is pointing to
1350 -- the position of the error in the buffer.  This is so that we can report
1351 -- a correct location to the user, but also so we can detect UTF-8 decoding
1352 -- errors if they occur.
1353 lit_error = lexError "lexical error in string/character literal"
1354
1355 getCharOrFail :: P Char
1356 getCharOrFail =  do
1357   i <- getInput
1358   case alexGetChar' i of
1359         Nothing -> lexError "unexpected end-of-file in string/character literal"
1360         Just (c,i)  -> do setInput i; return c
1361
1362 -- -----------------------------------------------------------------------------
1363 -- QuasiQuote
1364
1365 lex_quasiquote_tok :: Action
1366 lex_quasiquote_tok span buf len = do
1367   let quoter = reverse $ takeWhile (/= '$')
1368                $ reverse $ lexemeToString buf (len - 1)
1369   quoteStart <- getSrcLoc
1370   quote <- lex_quasiquote ""
1371   end <- getSrcLoc
1372   return (L (mkSrcSpan (srcSpanStart span) end)
1373            (ITquasiQuote (mkFastString quoter,
1374                           mkFastString (reverse quote),
1375                           mkSrcSpan quoteStart end)))
1376
1377 lex_quasiquote :: String -> P String
1378 lex_quasiquote s = do
1379   i <- getInput
1380   case alexGetChar' i of
1381     Nothing -> lit_error
1382
1383     Just ('\\',i)
1384         | Just ('|',i) <- next -> do
1385                 setInput i; lex_quasiquote ('|' : s)
1386         | Just (']',i) <- next -> do
1387                 setInput i; lex_quasiquote (']' : s)
1388         where next = alexGetChar' i
1389
1390     Just ('|',i)
1391         | Just (']',i) <- next -> do
1392                 setInput i; return s
1393         where next = alexGetChar' i
1394
1395     Just (c, i) -> do
1396          setInput i; lex_quasiquote (c : s)
1397
1398 -- -----------------------------------------------------------------------------
1399 -- Warnings
1400
1401 warn :: DynFlag -> SDoc -> Action
1402 warn option warning srcspan _buf _len = do
1403     addWarning option srcspan warning
1404     lexToken
1405
1406 warnThen :: DynFlag -> SDoc -> Action -> Action
1407 warnThen option warning action srcspan buf len = do
1408     addWarning option srcspan warning
1409     action srcspan buf len
1410
1411 -- -----------------------------------------------------------------------------
1412 -- The Parse Monad
1413
1414 data LayoutContext
1415   = NoLayout
1416   | Layout !Int
1417   deriving Show
1418
1419 data ParseResult a
1420   = POk PState a
1421   | PFailed
1422         SrcSpan         -- The start and end of the text span related to
1423                         -- the error.  Might be used in environments which can
1424                         -- show this span, e.g. by highlighting it.
1425         Message         -- The error message
1426
1427 data PState = PState {
1428         buffer     :: StringBuffer,
1429     dflags     :: DynFlags,
1430     messages   :: Messages,
1431         last_loc   :: SrcSpan,  -- pos of previous token
1432         last_offs  :: !Int,     -- offset of the previous token from the
1433                                 -- beginning of  the current line.
1434                                 -- \t is equal to 8 spaces.
1435         last_len   :: !Int,     -- len of previous token
1436   last_line_len :: !Int,
1437         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1438         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1439         context    :: [LayoutContext],
1440         lex_state  :: [Int]
1441      }
1442         -- last_loc and last_len are used when generating error messages,
1443         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1444         -- current token to happyError, we could at least get rid of last_len.
1445         -- Getting rid of last_loc would require finding another way to
1446         -- implement pushCurrentContext (which is only called from one place).
1447
1448 newtype P a = P { unP :: PState -> ParseResult a }
1449
1450 instance Monad P where
1451   return = returnP
1452   (>>=) = thenP
1453   fail = failP
1454
1455 returnP :: a -> P a
1456 returnP a = a `seq` (P $ \s -> POk s a)
1457
1458 thenP :: P a -> (a -> P b) -> P b
1459 (P m) `thenP` k = P $ \ s ->
1460         case m s of
1461                 POk s1 a         -> (unP (k a)) s1
1462                 PFailed span err -> PFailed span err
1463
1464 failP :: String -> P a
1465 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1466
1467 failMsgP :: String -> P a
1468 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1469
1470 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1471 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (mkSrcSpan loc1 loc2) (text str)
1472
1473 failSpanMsgP :: SrcSpan -> SDoc -> P a
1474 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1475
1476 extension :: (Int -> Bool) -> P Bool
1477 extension p = P $ \s -> POk s (p $! extsBitmap s)
1478
1479 getExts :: P Int
1480 getExts = P $ \s -> POk s (extsBitmap s)
1481
1482 setExts :: (Int -> Int) -> P ()
1483 setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
1484
1485 setSrcLoc :: SrcLoc -> P ()
1486 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1487
1488 getSrcLoc :: P SrcLoc
1489 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1490
1491 setLastToken :: SrcSpan -> Int -> Int -> P ()
1492 setLastToken loc len line_len = P $ \s -> POk s {
1493   last_loc=loc,
1494   last_len=len,
1495   last_line_len=line_len
1496 } ()
1497
1498 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1499
1500 alexInputPrevChar :: AlexInput -> Char
1501 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1502
1503 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1504 alexGetChar (AI loc ofs s)
1505   | atEnd s   = Nothing
1506   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1507                 --trace (show (ord c)) $
1508                 Just (adj_c, (AI loc' ofs' s'))
1509   where (c,s') = nextChar s
1510         loc'   = advanceSrcLoc loc c
1511         ofs'   = advanceOffs c ofs
1512
1513         non_graphic     = '\x0'
1514         upper           = '\x1'
1515         lower           = '\x2'
1516         digit           = '\x3'
1517         symbol          = '\x4'
1518         space           = '\x5'
1519         other_graphic   = '\x6'
1520
1521         adj_c
1522           | c <= '\x06' = non_graphic
1523           | c <= '\x7f' = c
1524           -- Alex doesn't handle Unicode, so when Unicode
1525           -- character is encoutered we output these values
1526           -- with the actual character value hidden in the state.
1527           | otherwise =
1528                 case generalCategory c of
1529                   UppercaseLetter       -> upper
1530                   LowercaseLetter       -> lower
1531                   TitlecaseLetter       -> upper
1532                   ModifierLetter        -> other_graphic
1533                   OtherLetter           -> lower -- see #1103
1534                   NonSpacingMark        -> other_graphic
1535                   SpacingCombiningMark  -> other_graphic
1536                   EnclosingMark         -> other_graphic
1537                   DecimalNumber         -> digit
1538                   LetterNumber          -> other_graphic
1539                   OtherNumber           -> other_graphic
1540                   ConnectorPunctuation  -> other_graphic
1541                   DashPunctuation       -> other_graphic
1542                   OpenPunctuation       -> other_graphic
1543                   ClosePunctuation      -> other_graphic
1544                   InitialQuote          -> other_graphic
1545                   FinalQuote            -> other_graphic
1546                   OtherPunctuation      -> other_graphic
1547                   MathSymbol            -> symbol
1548                   CurrencySymbol        -> symbol
1549                   ModifierSymbol        -> symbol
1550                   OtherSymbol           -> symbol
1551                   Space                 -> space
1552                   _other                -> non_graphic
1553
1554 -- This version does not squash unicode characters, it is used when
1555 -- lexing strings.
1556 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1557 alexGetChar' (AI loc ofs s)
1558   | atEnd s   = Nothing
1559   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1560                 --trace (show (ord c)) $
1561                 Just (c, (AI loc' ofs' s'))
1562   where (c,s') = nextChar s
1563         loc'   = advanceSrcLoc loc c
1564         ofs'   = advanceOffs c ofs
1565
1566 advanceOffs :: Char -> Int -> Int
1567 advanceOffs '\n' _    = 0
1568 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1569 advanceOffs _    offs = offs + 1
1570
1571 getInput :: P AlexInput
1572 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1573
1574 setInput :: AlexInput -> P ()
1575 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1576
1577 pushLexState :: Int -> P ()
1578 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1579
1580 popLexState :: P Int
1581 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1582
1583 getLexState :: P Int
1584 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1585
1586 -- for reasons of efficiency, flags indicating language extensions (eg,
1587 -- -fglasgow-exts or -XParr) are represented by a bitmap stored in an unboxed
1588 -- integer
1589
1590 genericsBit, ffiBit, parrBit :: Int
1591 genericsBit = 0 -- {| and |}
1592 ffiBit     = 1
1593 parrBit    = 2
1594 arrowsBit  = 4
1595 thBit      = 5
1596 ipBit      = 6
1597 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1598 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1599                 -- (doesn't affect the lexer)
1600 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1601 haddockBit = 10 -- Lex and parse Haddock comments
1602 magicHashBit = 11 -- "#" in both functions and operators
1603 kindSigsBit = 12 -- Kind signatures on type variables
1604 recursiveDoBit = 13 -- mdo
1605 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1606 unboxedTuplesBit = 15 -- (# and #)
1607 standaloneDerivingBit = 16 -- standalone instance deriving declarations
1608 transformComprehensionsBit = 17
1609 qqBit      = 18 -- enable quasiquoting
1610 inRulePragBit = 19
1611 rawTokenStreamBit = 20 -- producing a token stream with all comments included
1612
1613 genericsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1614 always           _     = True
1615 genericsEnabled  flags = testBit flags genericsBit
1616 ffiEnabled       flags = testBit flags ffiBit
1617 parrEnabled      flags = testBit flags parrBit
1618 arrowsEnabled    flags = testBit flags arrowsBit
1619 thEnabled        flags = testBit flags thBit
1620 ipEnabled        flags = testBit flags ipBit
1621 explicitForallEnabled flags = testBit flags explicitForallBit
1622 bangPatEnabled   flags = testBit flags bangPatBit
1623 tyFamEnabled     flags = testBit flags tyFamBit
1624 haddockEnabled   flags = testBit flags haddockBit
1625 magicHashEnabled flags = testBit flags magicHashBit
1626 kindSigsEnabled  flags = testBit flags kindSigsBit
1627 recursiveDoEnabled flags = testBit flags recursiveDoBit
1628 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1629 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1630 standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
1631 transformComprehensionsEnabled flags = testBit flags transformComprehensionsBit
1632 qqEnabled        flags = testBit flags qqBit
1633 inRulePrag       flags = testBit flags inRulePragBit
1634 rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
1635
1636 -- PState for parsing options pragmas
1637 --
1638 pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1639 pragState dynflags buf loc =
1640   PState {
1641       buffer        = buf,
1642       messages      = emptyMessages,
1643       dflags        = dynflags,
1644       last_loc      = mkSrcSpan loc loc,
1645       last_offs     = 0,
1646       last_len      = 0,
1647       last_line_len = 0,
1648       loc           = loc,
1649       extsBitmap    = 0,
1650       context       = [],
1651       lex_state     = [bol, option_prags, 0]
1652     }
1653
1654
1655 -- create a parse state
1656 --
1657 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1658 mkPState buf loc flags  =
1659   PState {
1660       buffer          = buf,
1661       dflags        = flags,
1662       messages      = emptyMessages,
1663       last_loc      = mkSrcSpan loc loc,
1664       last_offs     = 0,
1665       last_len      = 0,
1666       last_line_len = 0,
1667       loc           = loc,
1668       extsBitmap    = fromIntegral bitmap,
1669       context       = [],
1670       lex_state     = [bol, 0]
1671         -- we begin in the layout state if toplev_layout is set
1672     }
1673     where
1674       bitmap = genericsBit `setBitIf` dopt Opt_Generics flags
1675                .|. ffiBit       `setBitIf` dopt Opt_ForeignFunctionInterface flags
1676                .|. parrBit      `setBitIf` dopt Opt_PArr         flags
1677                .|. arrowsBit    `setBitIf` dopt Opt_Arrows       flags
1678                .|. thBit        `setBitIf` dopt Opt_TemplateHaskell flags
1679                .|. qqBit        `setBitIf` dopt Opt_QuasiQuotes flags
1680                .|. ipBit        `setBitIf` dopt Opt_ImplicitParams flags
1681                .|. explicitForallBit `setBitIf` dopt Opt_ScopedTypeVariables flags
1682                .|. explicitForallBit `setBitIf` dopt Opt_LiberalTypeSynonyms flags
1683                .|. explicitForallBit `setBitIf` dopt Opt_PolymorphicComponents flags
1684                .|. explicitForallBit `setBitIf` dopt Opt_ExistentialQuantification flags
1685                .|. explicitForallBit `setBitIf` dopt Opt_Rank2Types flags
1686                .|. explicitForallBit `setBitIf` dopt Opt_RankNTypes flags
1687                .|. bangPatBit   `setBitIf` dopt Opt_BangPatterns flags
1688                .|. tyFamBit     `setBitIf` dopt Opt_TypeFamilies flags
1689                .|. haddockBit   `setBitIf` dopt Opt_Haddock      flags
1690                .|. magicHashBit `setBitIf` dopt Opt_MagicHash    flags
1691                .|. kindSigsBit  `setBitIf` dopt Opt_KindSignatures flags
1692                .|. recursiveDoBit `setBitIf` dopt Opt_RecursiveDo flags
1693                .|. unicodeSyntaxBit `setBitIf` dopt Opt_UnicodeSyntax flags
1694                .|. unboxedTuplesBit `setBitIf` dopt Opt_UnboxedTuples flags
1695                .|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags
1696                .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags
1697                .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
1698       --
1699       setBitIf :: Int -> Bool -> Int
1700       b `setBitIf` cond | cond      = bit b
1701                         | otherwise = 0
1702
1703 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1704 addWarning option srcspan warning
1705  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1706        let warning' = mkWarnMsg srcspan alwaysQualify warning
1707            ws' = if dopt option d then ws `snocBag` warning' else ws
1708        in POk s{messages=(ws', es)} ()
1709
1710 getMessages :: PState -> Messages
1711 getMessages PState{messages=ms} = ms
1712
1713 getContext :: P [LayoutContext]
1714 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1715
1716 setContext :: [LayoutContext] -> P ()
1717 setContext ctx = P $ \s -> POk s{context=ctx} ()
1718
1719 popContext :: P ()
1720 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1721                               last_len = len, last_loc = last_loc }) ->
1722   case ctx of
1723         (_:tl) -> POk s{ context = tl } ()
1724         []     -> PFailed last_loc (srcParseErr buf len)
1725
1726 -- Push a new layout context at the indentation of the last token read.
1727 -- This is only used at the outer level of a module when the 'module'
1728 -- keyword is missing.
1729 pushCurrentContext :: P ()
1730 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1731     POk s{context = Layout (offs-len) : ctx} ()
1732 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1733
1734 getOffside :: P Ordering
1735 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1736                 let ord = case stk of
1737                         (Layout n:_) -> compare offs n
1738                         _            -> GT
1739                 in POk s ord
1740
1741 -- ---------------------------------------------------------------------------
1742 -- Construct a parse error
1743
1744 srcParseErr
1745   :: StringBuffer       -- current buffer (placed just after the last token)
1746   -> Int                -- length of the previous token
1747   -> Message
1748 srcParseErr buf len
1749   = hcat [ if null token
1750              then ptext (sLit "parse error (possibly incorrect indentation)")
1751              else hcat [ptext (sLit "parse error on input "),
1752                         char '`', text token, char '\'']
1753     ]
1754   where token = lexemeToString (offsetBytes (-len) buf) len
1755
1756 -- Report a parse failure, giving the span of the previous token as
1757 -- the location of the error.  This is the entry point for errors
1758 -- detected during parsing.
1759 srcParseFail :: P a
1760 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1761                             last_loc = last_loc } ->
1762     PFailed last_loc (srcParseErr buf len)
1763
1764 -- A lexical error is reported at a particular position in the source file,
1765 -- not over a token range.
1766 lexError :: String -> P a
1767 lexError str = do
1768   loc <- getSrcLoc
1769   (AI end _ buf) <- getInput
1770   reportLexError loc end buf str
1771
1772 -- -----------------------------------------------------------------------------
1773 -- This is the top-level function: called from the parser each time a
1774 -- new token is to be read from the input.
1775
1776 lexer :: (Located Token -> P a) -> P a
1777 lexer cont = do
1778   tok@(L _span _tok__) <- lexToken
1779 --  trace ("token: " ++ show tok__) $ do
1780   cont tok
1781
1782 lexToken :: P (Located Token)
1783 lexToken = do
1784   inp@(AI loc1 _ buf) <- getInput
1785   sc <- getLexState
1786   exts <- getExts
1787   case alexScanUser exts inp sc of
1788     AlexEOF -> do
1789         let span = mkSrcSpan loc1 loc1
1790         setLastToken span 0 0
1791         return (L span ITeof)
1792     AlexError (AI loc2 _ buf) ->
1793         reportLexError loc1 loc2 buf "lexical error"
1794     AlexSkip inp2 _ -> do
1795         setInput inp2
1796         lexToken
1797     AlexToken inp2@(AI end _ buf2) _ t -> do
1798         setInput inp2
1799         let span = mkSrcSpan loc1 end
1800         let bytes = byteDiff buf buf2
1801         span `seq` setLastToken span bytes bytes
1802         t span buf bytes
1803
1804 reportLexError loc1 loc2 buf str
1805   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1806   | otherwise =
1807   let
1808         c = fst (nextChar buf)
1809   in
1810   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1811     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1812     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1813
1814 lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token]
1815 lexTokenStream buf loc dflags = unP go initState
1816     where initState = mkPState buf loc (dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream)
1817           go = do
1818             ltok <- lexer return
1819             case ltok of
1820               L _ ITeof -> return []
1821               _ -> liftM (ltok:) go
1822 }