compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 {-# OPTIONS -w #-}
  25 -- The above warning supression flag is a temporary kludge.
  26 -- While working on this module you are encouraged to remove it and fix
  27 -- any warnings in the module. See
  28 --     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
  29 -- for details
  30 --
  31 -- Note that Alex itself generates code with with some unused bindings and
  32 -- without type signatures, so removing the flag might not be possible.
  33
  34 {-# OPTIONS_GHC -funbox-strict-fields #-}
  35
  36 module Lexer (
  37    Token(..), lexer, pragState, mkPState, PState(..),
  38    P(..), ParseResult(..), getSrcLoc,
  39    failLocMsgP, failSpanMsgP, srcParseFail,
  40    getMessages,
  41    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  42    getLexState, popLexState, pushLexState,
  43    extension, standaloneDerivingEnabled, bangPatEnabled,
  44    addWarning
  45   ) where
  46
  47 import Bag
  48 import ErrUtils
  49 import Outputable
  50 import StringBuffer
  51 import FastString
  52 import FastTypes
  53 import SrcLoc
  54 import UniqFM
  55 import DynFlags
  56 import Ctype
  57 import Util             ( maybePrefixMatch, readRational )
  58
  59 import Control.Monad
  60 import Data.Bits
  61 import Data.Char        ( chr, ord, isSpace )
  62 import Data.Ratio
  63 import Debug.Trace
  64
  65 import Unicode ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  66 }
  67
  68 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  69 $whitechar   = [\ \n\r\f\v $unispace]
  70 $white_no_nl = $whitechar # \n
  71 $tab         = \t
  72
  73 $ascdigit  = 0-9
  74 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  75 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  76 $digit     = [$ascdigit $unidigit]
  77
  78 $special   = [\(\)\,\;\[\]\`\{\}]
  79 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
  80 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  81 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  82
  83 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
  84 $asclarge  = [A-Z]
  85 $large     = [$asclarge $unilarge]
  86
  87 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
  88 $ascsmall  = [a-z]
  89 $small     = [$ascsmall $unismall \_]
  90
  91 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
  92 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  93
  94 $octit     = 0-7
  95 $hexit     = [$decdigit A-F a-f]
  96 $symchar   = [$symbol \:]
  97 $nl        = [\n\r]
  98 $idchar    = [$small $large $digit \']
  99
 100 $docsym    = [\| \^ \* \$]
 101
 102 @varid     = $small $idchar*
 103 @conid     = $large $idchar*
 104
 105 @varsym    = $symbol $symchar*
 106 @consym    = \: $symchar*
 107
 108 @decimal     = $decdigit+
 109 @octal       = $octit+
 110 @hexadecimal = $hexit+
 111 @exponent    = [eE] [\-\+]? @decimal
 112
 113 -- we support the hierarchical module name extension:
 114 @qual = (@conid \.)+
 115
 116 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 117
 118 -- normal signed numerical literals can only be explicitly negative,
 119 -- not explicitly positive (contrast @exponent)
 120 @negative = \-
 121 @signed = @negative ?
 122
 123 haskell :-
 124
 125 -- everywhere: skip whitespace and comments
 126 $white_no_nl+                           ;
 127 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 128
 129 -- Everywhere: deal with nested comments.  We explicitly rule out
 130 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 131 -- (this can happen even though pragmas will normally take precedence due to
 132 -- longest-match, because pragmas aren't valid in every state, but comments
 133 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 134 -- set.
 135
 136 "{-" / { isNormalComment } { nested_comment lexToken }
 137
 138 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 139 -- more dashes followed by a symbol should be parsed as a varsym, so we
 140 -- have to exclude those.
 141
 142 -- Since Haddock comments aren't valid in every state, we need to rule them
 143 -- out here.
 144
 145 -- The following two rules match comments that begin with two dashes, but
 146 -- continue with a different character. The rules test that this character
 147 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 148 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 149 -- have a Haddock comment). The rules then munch the rest of the line.
 150
 151 "-- " ~[$docsym \#] .* ;
 152 "--" [^$symbol : \ ] .* ;
 153
 154 -- Next, match Haddock comments if no -haddock flag
 155
 156 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } ;
 157
 158 -- Now, when we've matched comments that begin with 2 dashes and continue
 159 -- with a different character, we need to match comments that begin with three
 160 -- or more dashes (which clearly can't be Haddock comments). We only need to
 161 -- make sure that the first non-dash character isn't a symbol, and munch the
 162 -- rest of the line.
 163
 164 "---"\-* [^$symbol :] .* ;
 165
 166 -- Since the previous rules all match dashes followed by at least one
 167 -- character, we also need to match a whole line filled with just dashes.
 168
 169 "--"\-* / { atEOL } ;
 170
 171 -- We need this rule since none of the other single line comment rules
 172 -- actually match this case.
 173
 174 "-- " / { atEOL } ;
 175
 176 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 177 -- blank lines) until we find a non-whitespace character, then do layout
 178 -- processing.
 179 --
 180 -- One slight wibble here: what if the line begins with {-#? In
 181 -- theory, we have to lex the pragma to see if it's one we recognise,
 182 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 183 -- as a nested comment.  We don't bother with this: if the line begins
 184 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 185 <bol> {
 186   \n                                    ;
 187   ^\# (line)?                           { begin line_prag1 }
 188   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 189   ^\# \! .* \n                          ; -- #!, for scripts
 190   ()                                    { do_bol }
 191 }
 192
 193 -- after a layout keyword (let, where, do, of), we begin a new layout
 194 -- context if the curly brace is missing.
 195 -- Careful! This stuff is quite delicate.
 196 <layout, layout_do> {
 197   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 198         -- we might encounter {-# here, but {- has been handled already
 199   \n                                    ;
 200   ^\# (line)?                           { begin line_prag1 }
 201 }
 202
 203 -- do is treated in a subtly different way, see new_layout_context
 204 <layout>    ()                          { new_layout_context True }
 205 <layout_do> ()                          { new_layout_context False }
 206
 207 -- after a new layout context which was found to be to the left of the
 208 -- previous context, we have generated a '{' token, and we now need to
 209 -- generate a matching '}' token.
 210 <layout_left>  ()                       { do_layout_left }
 211
 212 <0,option_prags> \n                             { begin bol }
 213
 214 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 215
 216 -- single-line line pragmas, of the form
 217 --    # <line> "<file>" <extra-stuff> \n
 218 <line_prag1> $decdigit+                 { setLine line_prag1a }
 219 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 220 <line_prag1b> .*                        { pop }
 221
 222 -- Haskell-style line pragmas, of the form
 223 --    {-# LINE <line> "<file>" #-}
 224 <line_prag2> $decdigit+                 { setLine line_prag2a }
 225 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 226 <line_prag2b> "#-}"|"-}"                { pop }
 227    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 228    -- with older versions of GHC which generated these.
 229
 230 <0,option_prags> {
 231   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 232   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 233   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 234                                         { token (ITinline_prag False) }
 235   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 236                                         { token ITspec_prag }
 237   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 238         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 239   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 240         $whitechar* (NO(T?)INLINE|no(t?)inline)
 241                                         { token (ITspec_inline_prag False) }
 242   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 243   "{-#" $whitechar* (WARNING|warning)
 244                                         { token ITwarning_prag }
 245   "{-#" $whitechar* (DEPRECATED|deprecated)
 246                                         { token ITdeprecated_prag }
 247   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 248   "{-#" $whitechar* (GENERATED|generated)
 249                                         { token ITgenerated_prag }
 250   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 251   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 252
 253   -- We ignore all these pragmas, but don't generate a warning for them
 254   -- CFILES is a hugs-only thing.
 255   "{-#" $whitechar* (OPTIONS_HUGS|options_hugs|OPTIONS_NHC98|options_nhc98|OPTIONS_JHC|options_jhc|CFILES|cfiles)
 256                     { nested_comment lexToken }
 257
 258   -- ToDo: should only be valid inside a pragma:
 259   "#-}"                                 { endPrag }
 260 }
 261
 262 <option_prags> {
 263   "{-#"  $whitechar* (OPTIONS|options)   { lex_string_prag IToptions_prag }
 264   "{-#"  $whitechar* (OPTIONS_GHC|options_ghc)
 265                                         { lex_string_prag IToptions_prag }
 266   "{-#"  $whitechar* (OPTIONS_HADDOCK|options_haddock)
 267                                          { lex_string_prag ITdocOptions }
 268   "-- #"                                 { multiline_doc_comment }
 269   "{-#"  $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
 270   "{-#"  $whitechar* (INCLUDE|include)   { lex_string_prag ITinclude_prag }
 271 }
 272
 273 <0> {
 274   -- In the "0" mode we ignore these pragmas
 275   "{-#"  $whitechar* (OPTIONS|options|OPTIONS_GHC|options_ghc|OPTIONS_HADDOCK|options_haddock|LANGUAGE|language|INCLUDE|include)
 276                      { nested_comment lexToken }
 277 }
 278
 279 <0> {
 280   "-- #" .* ;
 281 }
 282
 283 <0,option_prags> {
 284   "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
 285                     (nested_comment lexToken) }
 286 }
 287
 288 -- '0' state: ordinary lexemes
 289
 290 -- Haddock comments
 291
 292 <0> {
 293   "-- " $docsym      / { ifExtension haddockEnabled } { multiline_doc_comment }
 294   "{-" \ ? $docsym   / { ifExtension haddockEnabled } { nested_doc_comment }
 295 }
 296
 297 -- "special" symbols
 298
 299 <0> {
 300   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 301   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 302 }
 303
 304 <0> {
 305   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 306   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 307   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 308   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 309   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 310   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 311   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 312   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 313
 314   "[$" @varid "|"  / { ifExtension qqEnabled }
 315                      { lex_quasiquote_tok }
 316 }
 317
 318 <0> {
 319   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 320                                         { special IToparenbar }
 321   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 322 }
 323
 324 <0> {
 325   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 326 }
 327
 328 <0> {
 329   "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
 330          { token IToubxparen }
 331   "#)" / { ifExtension unboxedTuplesEnabled }
 332          { token ITcubxparen }
 333 }
 334
 335 <0> {
 336   "{|" / { ifExtension genericsEnabled } { token ITocurlybar }
 337   "|}" / { ifExtension genericsEnabled } { token ITccurlybar }
 338 }
 339
 340 <0,option_prags> {
 341   \(                                    { special IToparen }
 342   \)                                    { special ITcparen }
 343   \[                                    { special ITobrack }
 344   \]                                    { special ITcbrack }
 345   \,                                    { special ITcomma }
 346   \;                                    { special ITsemi }
 347   \`                                    { special ITbackquote }
 348
 349   \{                                    { open_brace }
 350   \}                                    { close_brace }
 351 }
 352
 353 <0,option_prags> {
 354   @qual @varid                  { idtoken qvarid }
 355   @qual @conid                  { idtoken qconid }
 356   @varid                        { varid }
 357   @conid                        { idtoken conid }
 358 }
 359
 360 <0> {
 361   @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
 362   @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
 363   @varid "#"+       / { ifExtension magicHashEnabled } { varid }
 364   @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
 365 }
 366
 367 -- ToDo: M.(,,,)
 368
 369 <0> {
 370   @qual @varsym                 { idtoken qvarsym }
 371   @qual @consym                 { idtoken qconsym }
 372   @varsym                       { varsym }
 373   @consym                       { consym }
 374 }
 375
 376 -- For the normal boxed literals we need to be careful
 377 -- when trying to be close to Haskell98
 378 <0> {
 379   -- Normal integral literals (:: Num a => a, from Integer)
 380   @decimal           { tok_num positive 0 0 decimal }
 381   0[oO] @octal       { tok_num positive 2 2 octal }
 382   0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 383
 384   -- Normal rational literals (:: Fractional a => a, from Rational)
 385   @floating_point    { strtoken tok_float }
 386 }
 387
 388 <0> {
 389   -- Unboxed ints (:: Int#) and words (:: Word#)
 390   -- It's simpler (and faster?) to give separate cases to the negatives,
 391   -- especially considering octal/hexadecimal prefixes.
 392   @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
 393   0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
 394   0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
 395   @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
 396   @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
 397   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 398
 399   @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
 400   0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
 401   0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
 402
 403   -- Unboxed floats and doubles (:: Float#, :: Double#)
 404   -- prim_{float,double} work with signed literals
 405   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
 406   @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
 407 }
 408
 409 -- Strings and chars are lexed by hand-written code.  The reason is
 410 -- that even if we recognise the string or char here in the regex
 411 -- lexer, we would still have to parse the string afterward in order
 412 -- to convert it to a String.
 413 <0> {
 414   \'                            { lex_char_tok }
 415   \"                            { lex_string_tok }
 416 }
 417
 418 {
 419 -- -----------------------------------------------------------------------------
 420 -- The token type
 421
 422 data Token
 423   = ITas                        -- Haskell keywords
 424   | ITcase
 425   | ITclass
 426   | ITdata
 427   | ITdefault
 428   | ITderiving
 429   | ITdo
 430   | ITelse
 431   | IThiding
 432   | ITif
 433   | ITimport
 434   | ITin
 435   | ITinfix
 436   | ITinfixl
 437   | ITinfixr
 438   | ITinstance
 439   | ITlet
 440   | ITmodule
 441   | ITnewtype
 442   | ITof
 443   | ITqualified
 444   | ITthen
 445   | ITtype
 446   | ITwhere
 447   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 448
 449   | ITforall                    -- GHC extension keywords
 450   | ITforeign
 451   | ITexport
 452   | ITlabel
 453   | ITdynamic
 454   | ITsafe
 455   | ITthreadsafe
 456   | ITunsafe
 457   | ITstdcallconv
 458   | ITccallconv
 459   | ITdotnet
 460   | ITmdo
 461   | ITfamily
 462   | ITgroup
 463   | ITby
 464   | ITusing
 465
 466         -- Pragmas
 467   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 468   | ITspec_prag                 -- SPECIALISE
 469   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 470   | ITsource_prag
 471   | ITrules_prag
 472   | ITwarning_prag
 473   | ITdeprecated_prag
 474   | ITline_prag
 475   | ITscc_prag
 476   | ITgenerated_prag
 477   | ITcore_prag                 -- hdaume: core annotations
 478   | ITunpack_prag
 479   | ITclose_prag
 480   | IToptions_prag String
 481   | ITinclude_prag String
 482   | ITlanguage_prag
 483
 484   | ITdotdot                    -- reserved symbols
 485   | ITcolon
 486   | ITdcolon
 487   | ITequal
 488   | ITlam
 489   | ITvbar
 490   | ITlarrow
 491   | ITrarrow
 492   | ITat
 493   | ITtilde
 494   | ITdarrow
 495   | ITminus
 496   | ITbang
 497   | ITstar
 498   | ITdot
 499
 500   | ITbiglam                    -- GHC-extension symbols
 501
 502   | ITocurly                    -- special symbols
 503   | ITccurly
 504   | ITocurlybar                 -- {|, for type applications
 505   | ITccurlybar                 -- |}, for type applications
 506   | ITvocurly
 507   | ITvccurly
 508   | ITobrack
 509   | ITopabrack                  -- [:, for parallel arrays with -XParr
 510   | ITcpabrack                  -- :], for parallel arrays with -XParr
 511   | ITcbrack
 512   | IToparen
 513   | ITcparen
 514   | IToubxparen
 515   | ITcubxparen
 516   | ITsemi
 517   | ITcomma
 518   | ITunderscore
 519   | ITbackquote
 520
 521   | ITvarid   FastString        -- identifiers
 522   | ITconid   FastString
 523   | ITvarsym  FastString
 524   | ITconsym  FastString
 525   | ITqvarid  (FastString,FastString)
 526   | ITqconid  (FastString,FastString)
 527   | ITqvarsym (FastString,FastString)
 528   | ITqconsym (FastString,FastString)
 529
 530   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 531
 532   | ITpragma StringBuffer
 533
 534   | ITchar       Char
 535   | ITstring     FastString
 536   | ITinteger    Integer
 537   | ITrational   Rational
 538
 539   | ITprimchar   Char
 540   | ITprimstring FastString
 541   | ITprimint    Integer
 542   | ITprimword   Integer
 543   | ITprimfloat  Rational
 544   | ITprimdouble Rational
 545
 546   -- MetaHaskell extension tokens
 547   | ITopenExpQuote              --  [| or [e|
 548   | ITopenPatQuote              --  [p|
 549   | ITopenDecQuote              --  [d|
 550   | ITopenTypQuote              --  [t|
 551   | ITcloseQuote                --  |]
 552   | ITidEscape   FastString     --  $x
 553   | ITparenEscape               --  $(
 554   | ITvarQuote                  --  '
 555   | ITtyQuote                   --  ''
 556   | ITquasiQuote (FastString,FastString,SrcSpan) --  [:...|...|]
 557
 558   -- Arrow notation extension
 559   | ITproc
 560   | ITrec
 561   | IToparenbar                 --  (|
 562   | ITcparenbar                 --  |)
 563   | ITlarrowtail                --  -<
 564   | ITrarrowtail                --  >-
 565   | ITLarrowtail                --  -<<
 566   | ITRarrowtail                --  >>-
 567
 568   | ITunknown String            -- Used when the lexer can't make sense of it
 569   | ITeof                       -- end of file token
 570
 571   -- Documentation annotations
 572   | ITdocCommentNext  String     -- something beginning '-- |'
 573   | ITdocCommentPrev  String     -- something beginning '-- ^'
 574   | ITdocCommentNamed String     -- something beginning '-- $'
 575   | ITdocSection      Int String -- a section heading
 576   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 577   | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
 578
 579 #ifdef DEBUG
 580   deriving Show -- debugging
 581 #endif
 582
 583 {-
 584 isSpecial :: Token -> Bool
 585 -- If we see M.x, where x is a keyword, but
 586 -- is special, we treat is as just plain M.x,
 587 -- not as a keyword.
 588 isSpecial ITas          = True
 589 isSpecial IThiding      = True
 590 isSpecial ITqualified   = True
 591 isSpecial ITforall      = True
 592 isSpecial ITexport      = True
 593 isSpecial ITlabel       = True
 594 isSpecial ITdynamic     = True
 595 isSpecial ITsafe        = True
 596 isSpecial ITthreadsafe  = True
 597 isSpecial ITunsafe      = True
 598 isSpecial ITccallconv   = True
 599 isSpecial ITstdcallconv = True
 600 isSpecial ITmdo         = True
 601 isSpecial ITfamily      = True
 602 isSpecial ITgroup   = True
 603 isSpecial ITby      = True
 604 isSpecial ITusing   = True
 605 isSpecial _             = False
 606 -}
 607
 608 -- the bitmap provided as the third component indicates whether the
 609 -- corresponding extension keyword is valid under the extension options
 610 -- provided to the compiler; if the extension corresponding to *any* of the
 611 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 612 -- facilitates using a keyword in two different extensions that can be
 613 -- activated independently)
 614 --
 615 reservedWordsFM = listToUFM $
 616         map (\(x, y, z) -> (mkFastString x, (y, z)))
 617        [( "_",          ITunderscore,   0 ),
 618         ( "as",         ITas,           0 ),
 619         ( "case",       ITcase,         0 ),
 620         ( "class",      ITclass,        0 ),
 621         ( "data",       ITdata,         0 ),
 622         ( "default",    ITdefault,      0 ),
 623         ( "deriving",   ITderiving,     0 ),
 624         ( "do",         ITdo,           0 ),
 625         ( "else",       ITelse,         0 ),
 626         ( "hiding",     IThiding,       0 ),
 627         ( "if",         ITif,           0 ),
 628         ( "import",     ITimport,       0 ),
 629         ( "in",         ITin,           0 ),
 630         ( "infix",      ITinfix,        0 ),
 631         ( "infixl",     ITinfixl,       0 ),
 632         ( "infixr",     ITinfixr,       0 ),
 633         ( "instance",   ITinstance,     0 ),
 634         ( "let",        ITlet,          0 ),
 635         ( "module",     ITmodule,       0 ),
 636         ( "newtype",    ITnewtype,      0 ),
 637         ( "of",         ITof,           0 ),
 638         ( "qualified",  ITqualified,    0 ),
 639         ( "then",       ITthen,         0 ),
 640         ( "type",       ITtype,         0 ),
 641         ( "where",      ITwhere,        0 ),
 642         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 643
 644     ( "forall", ITforall,        bit explicitForallBit .|. bit inRulePragBit),
 645         ( "mdo",        ITmdo,           bit recursiveDoBit),
 646         ( "family",     ITfamily,        bit tyFamBit),
 647     ( "group",  ITgroup,     bit transformComprehensionsBit),
 648     ( "by",     ITby,        bit transformComprehensionsBit),
 649     ( "using",  ITusing,     bit transformComprehensionsBit),
 650
 651         ( "foreign",    ITforeign,       bit ffiBit),
 652         ( "export",     ITexport,        bit ffiBit),
 653         ( "label",      ITlabel,         bit ffiBit),
 654         ( "dynamic",    ITdynamic,       bit ffiBit),
 655         ( "safe",       ITsafe,          bit ffiBit),
 656         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 657         ( "unsafe",     ITunsafe,        bit ffiBit),
 658         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 659         ( "ccall",      ITccallconv,     bit ffiBit),
 660         ( "dotnet",     ITdotnet,        bit ffiBit),
 661
 662         ( "rec",        ITrec,           bit arrowsBit),
 663         ( "proc",       ITproc,          bit arrowsBit)
 664      ]
 665
 666 reservedSymsFM :: UniqFM (Token, Int -> Bool)
 667 reservedSymsFM = listToUFM $
 668     map (\ (x,y,z) -> (mkFastString x,(y,z)))
 669       [ ("..",  ITdotdot,   always)
 670         -- (:) is a reserved op, meaning only list cons
 671        ,(":",   ITcolon,    always)
 672        ,("::",  ITdcolon,   always)
 673        ,("=",   ITequal,    always)
 674        ,("\\",  ITlam,      always)
 675        ,("|",   ITvbar,     always)
 676        ,("<-",  ITlarrow,   always)
 677        ,("->",  ITrarrow,   always)
 678        ,("@",   ITat,       always)
 679        ,("~",   ITtilde,    always)
 680        ,("=>",  ITdarrow,   always)
 681        ,("-",   ITminus,    always)
 682        ,("!",   ITbang,     always)
 683
 684         -- For data T (a::*) = MkT
 685        ,("*", ITstar, \i -> kindSigsEnabled i || tyFamEnabled i)
 686         -- For 'forall a . t'
 687        ,(".", ITdot, \i -> explicitForallEnabled i || inRulePrag i)
 688
 689        ,("-<",  ITlarrowtail, arrowsEnabled)
 690        ,(">-",  ITrarrowtail, arrowsEnabled)
 691        ,("-<<", ITLarrowtail, arrowsEnabled)
 692        ,(">>-", ITRarrowtail, arrowsEnabled)
 693
 694 #if __GLASGOW_HASKELL__ >= 605
 695        ,("∷",   ITdcolon, unicodeSyntaxEnabled)
 696        ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
 697        ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
 698                                 explicitForallEnabled i)
 699        ,("→",   ITrarrow, unicodeSyntaxEnabled)
 700        ,("←",   ITlarrow, unicodeSyntaxEnabled)
 701        ,("⋯",   ITdotdot, unicodeSyntaxEnabled)
 702         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 703         -- form part of a large operator.  This would let us have a better
 704         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 705 #endif
 706        ]
 707
 708 -- -----------------------------------------------------------------------------
 709 -- Lexer actions
 710
 711 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 712
 713 special :: Token -> Action
 714 special tok span _buf _len = return (L span tok)
 715
 716 token, layout_token :: Token -> Action
 717 token t span _buf _len = return (L span t)
 718 layout_token t span _buf _len = pushLexState layout >> return (L span t)
 719
 720 idtoken :: (StringBuffer -> Int -> Token) -> Action
 721 idtoken f span buf len = return (L span $! (f buf len))
 722
 723 skip_one_varid :: (FastString -> Token) -> Action
 724 skip_one_varid f span buf len
 725   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 726
 727 strtoken :: (String -> Token) -> Action
 728 strtoken f span buf len =
 729   return (L span $! (f $! lexemeToString buf len))
 730
 731 init_strtoken :: Int -> (String -> Token) -> Action
 732 -- like strtoken, but drops the last N character(s)
 733 init_strtoken drop f span buf len =
 734   return (L span $! (f $! lexemeToString buf (len-drop)))
 735
 736 begin :: Int -> Action
 737 begin code _span _str _len = do pushLexState code; lexToken
 738
 739 pop :: Action
 740 pop _span _buf _len = do popLexState; lexToken
 741
 742 pop_and :: Action -> Action
 743 pop_and act span buf len = do popLexState; act span buf len
 744
 745 {-# INLINE nextCharIs #-}
 746 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 747
 748 notFollowedBy char _ _ _ (AI _ _ buf)
 749   = nextCharIs buf (/=char)
 750
 751 notFollowedBySymbol _ _ _ (AI _ _ buf)
 752   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 753
 754 -- We must reject doc comments as being ordinary comments everywhere.
 755 -- In some cases the doc comment will be selected as the lexeme due to
 756 -- maximal munch, but not always, because the nested comment rule is
 757 -- valid in all states, but the doc-comment rules are only valid in
 758 -- the non-layout states.
 759 isNormalComment bits _ _ (AI _ _ buf)
 760   | haddockEnabled bits = notFollowedByDocOrPragma
 761   | otherwise           = nextCharIs buf (/='#')
 762   where
 763     notFollowedByDocOrPragma
 764        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 765
 766 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 767
 768 {-
 769 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 770   = if haddockEnabled bits then False else (p buf)
 771 -}
 772
 773 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 774
 775 ifExtension pred bits _ _ _ = pred bits
 776
 777 multiline_doc_comment :: Action
 778 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 779   where
 780     worker commentAcc input docType oneLine = case alexGetChar input of
 781       Just ('\n', input')
 782         | oneLine -> docCommentEnd input commentAcc docType buf span
 783         | otherwise -> case checkIfCommentLine input' of
 784           Just input -> worker ('\n':commentAcc) input docType False
 785           Nothing -> docCommentEnd input commentAcc docType buf span
 786       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 787       Nothing -> docCommentEnd input commentAcc docType buf span
 788
 789     checkIfCommentLine input = check (dropNonNewlineSpace input)
 790       where
 791         check input = case alexGetChar input of
 792           Just ('-', input) -> case alexGetChar input of
 793             Just ('-', input) -> case alexGetChar input of
 794               Just (c, _) | c /= '-' -> Just input
 795               _ -> Nothing
 796             _ -> Nothing
 797           _ -> Nothing
 798
 799         dropNonNewlineSpace input = case alexGetChar input of
 800           Just (c, input')
 801             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 802             | otherwise -> input
 803           Nothing -> input
 804
 805 {-
 806   nested comments require traversing by hand, they can't be parsed
 807   using regular expressions.
 808 -}
 809 nested_comment :: P (Located Token) -> Action
 810 nested_comment cont span _str _len = do
 811   input <- getInput
 812   go (1::Int) input
 813   where
 814     go 0 input = do setInput input; cont
 815     go n input = case alexGetChar input of
 816       Nothing -> errBrace input span
 817       Just ('-',input) -> case alexGetChar input of
 818         Nothing  -> errBrace input span
 819         Just ('\125',input) -> go (n-1) input
 820         Just (_,_)          -> go n input
 821       Just ('\123',input) -> case alexGetChar input of
 822         Nothing  -> errBrace input span
 823         Just ('-',input) -> go (n+1) input
 824         Just (_,_)       -> go n input
 825       Just (_,input) -> go n input
 826
 827 nested_doc_comment :: Action
 828 nested_doc_comment span buf _len = withLexedDocType (go "")
 829   where
 830     go commentAcc input docType _ = case alexGetChar input of
 831       Nothing -> errBrace input span
 832       Just ('-',input) -> case alexGetChar input of
 833         Nothing -> errBrace input span
 834         Just ('\125',input) ->
 835           docCommentEnd input commentAcc docType buf span
 836         Just (_,_) -> go ('-':commentAcc) input docType False
 837       Just ('\123', input) -> case alexGetChar input of
 838         Nothing  -> errBrace input span
 839         Just ('-',input) -> do
 840           setInput input
 841           let cont = do input <- getInput; go commentAcc input docType False
 842           nested_comment cont span buf _len
 843         Just (_,_) -> go ('\123':commentAcc) input docType False
 844       Just (c,input) -> go (c:commentAcc) input docType False
 845
 846 withLexedDocType lexDocComment = do
 847   input@(AI _ _ buf) <- getInput
 848   case prevChar buf ' ' of
 849     '|' -> lexDocComment input ITdocCommentNext False
 850     '^' -> lexDocComment input ITdocCommentPrev False
 851     '$' -> lexDocComment input ITdocCommentNamed False
 852     '*' -> lexDocSection 1 input
 853     '#' -> lexDocComment input ITdocOptionsOld False
 854  where
 855     lexDocSection n input = case alexGetChar input of
 856       Just ('*', input) -> lexDocSection (n+1) input
 857       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
 858       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 859
 860 -- RULES pragmas turn on the forall and '.' keywords, and we turn them
 861 -- off again at the end of the pragma.
 862 rulePrag :: Action
 863 rulePrag span buf len = do
 864   setExts (.|. inRulePragBit)
 865   return (L span ITrules_prag)
 866
 867 endPrag :: Action
 868 endPrag span buf len = do
 869   setExts (.&. complement (bit inRulePragBit))
 870   return (L span ITclose_prag)
 871
 872 -- docCommentEnd
 873 -------------------------------------------------------------------------------
 874 -- This function is quite tricky. We can't just return a new token, we also
 875 -- need to update the state of the parser. Why? Because the token is longer
 876 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 877 -- it writes the wrong token length to the parser state. This function is
 878 -- called afterwards, so it can just update the state.
 879
 880 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 881 -- which is something that the original lexer didn't account for.
 882 -- I have added last_line_len in the parser state which represents the length
 883 -- of the part of the token that is on the last line. It is now used for layout
 884 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 885 -- was before, the full length of the token, and it is now only used for error
 886 -- messages. /Waern
 887
 888 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 889                  SrcSpan -> P (Located Token)
 890 docCommentEnd input commentAcc docType buf span = do
 891   setInput input
 892   let (AI loc last_offs nextBuf) = input
 893       comment = reverse commentAcc
 894       span' = mkSrcSpan (srcSpanStart span) loc
 895       last_len = byteDiff buf nextBuf
 896
 897       last_line_len = if (last_offs - last_len < 0)
 898         then last_offs
 899         else last_len
 900
 901   span `seq` setLastToken span' last_len last_line_len
 902   return (L span' (docType comment))
 903
 904 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 905
 906 open_brace, close_brace :: Action
 907 open_brace span _str _len = do
 908   ctx <- getContext
 909   setContext (NoLayout:ctx)
 910   return (L span ITocurly)
 911 close_brace span _str _len = do
 912   popContext
 913   return (L span ITccurly)
 914
 915 qvarid buf len = ITqvarid $! splitQualName buf len
 916 qconid buf len = ITqconid $! splitQualName buf len
 917
 918 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 919 -- takes a StringBuffer and a length, and returns the module name
 920 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 921 -- because of hierarchical module names.
 922 splitQualName orig_buf len = split orig_buf orig_buf
 923   where
 924     split buf dot_buf
 925         | orig_buf `byteDiff` buf >= len  = done dot_buf
 926         | c == '.'                        = found_dot buf'
 927         | otherwise                       = split buf' dot_buf
 928       where
 929        (c,buf') = nextChar buf
 930
 931     -- careful, we might get names like M....
 932     -- so, if the character after the dot is not upper-case, this is
 933     -- the end of the qualifier part.
 934     found_dot buf -- buf points after the '.'
 935         | isUpper c    = split buf' buf
 936         | otherwise    = done buf
 937       where
 938        (c,buf') = nextChar buf
 939
 940     done dot_buf =
 941         (lexemeToFastString orig_buf (qual_size - 1),
 942          lexemeToFastString dot_buf (len - qual_size))
 943       where
 944         qual_size = orig_buf `byteDiff` dot_buf
 945
 946 varid span buf len =
 947   fs `seq`
 948   case lookupUFM reservedWordsFM fs of
 949         Just (keyword,0)    -> do
 950                 maybe_layout keyword
 951                 return (L span keyword)
 952         Just (keyword,exts) -> do
 953                 b <- extension (\i -> exts .&. i /= 0)
 954                 if b then do maybe_layout keyword
 955                              return (L span keyword)
 956                      else return (L span (ITvarid fs))
 957         _other -> return (L span (ITvarid fs))
 958   where
 959         fs = lexemeToFastString buf len
 960
 961 conid buf len = ITconid fs
 962   where fs = lexemeToFastString buf len
 963
 964 qvarsym buf len = ITqvarsym $! splitQualName buf len
 965 qconsym buf len = ITqconsym $! splitQualName buf len
 966
 967 varsym = sym ITvarsym
 968 consym = sym ITconsym
 969
 970 sym con span buf len =
 971   case lookupUFM reservedSymsFM fs of
 972         Just (keyword,exts) -> do
 973                 b <- extension exts
 974                 if b then return (L span keyword)
 975                      else return (L span $! con fs)
 976         _other -> return (L span $! con fs)
 977   where
 978         fs = lexemeToFastString buf len
 979
 980 -- Variations on the integral numeric literal.
 981 tok_integral :: (Integer -> Token)
 982      -> (Integer -> Integer)
 983  --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
 984      -> Int -> Int
 985      -> (Integer, (Char->Int)) -> Action
 986 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
 987   return $ L span $ itint $! transint $ parseUnsignedInteger
 988      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
 989
 990 -- some conveniences for use with tok_integral
 991 tok_num = tok_integral ITinteger
 992 tok_primint = tok_integral ITprimint
 993 tok_primword = tok_integral ITprimword positive
 994 positive = id
 995 negative = negate
 996 decimal = (10,octDecDigit)
 997 octal = (8,octDecDigit)
 998 hexadecimal = (16,hexDigit)
 999
1000 -- readRational can understand negative rationals, exponents, everything.
1001 tok_float        str = ITrational   $! readRational str
1002 tok_primfloat    str = ITprimfloat  $! readRational str
1003 tok_primdouble   str = ITprimdouble $! readRational str
1004
1005 -- -----------------------------------------------------------------------------
1006 -- Layout processing
1007
1008 -- we're at the first token on a line, insert layout tokens if necessary
1009 do_bol :: Action
1010 do_bol span _str _len = do
1011         pos <- getOffside
1012         case pos of
1013             LT -> do
1014                 --trace "layout: inserting '}'" $ do
1015                 popContext
1016                 -- do NOT pop the lex state, we might have a ';' to insert
1017                 return (L span ITvccurly)
1018             EQ -> do
1019                 --trace "layout: inserting ';'" $ do
1020                 popLexState
1021                 return (L span ITsemi)
1022             GT -> do
1023                 popLexState
1024                 lexToken
1025
1026 -- certain keywords put us in the "layout" state, where we might
1027 -- add an opening curly brace.
1028 maybe_layout ITdo       = pushLexState layout_do
1029 maybe_layout ITmdo      = pushLexState layout_do
1030 maybe_layout ITof       = pushLexState layout
1031 maybe_layout ITlet      = pushLexState layout
1032 maybe_layout ITwhere    = pushLexState layout
1033 maybe_layout ITrec      = pushLexState layout
1034 maybe_layout _          = return ()
1035
1036 -- Pushing a new implicit layout context.  If the indentation of the
1037 -- next token is not greater than the previous layout context, then
1038 -- Haskell 98 says that the new layout context should be empty; that is
1039 -- the lexer must generate {}.
1040 --
1041 -- We are slightly more lenient than this: when the new context is started
1042 -- by a 'do', then we allow the new context to be at the same indentation as
1043 -- the previous context.  This is what the 'strict' argument is for.
1044 --
1045 new_layout_context strict span _buf _len = do
1046     popLexState
1047     (AI _ offset _) <- getInput
1048     ctx <- getContext
1049     case ctx of
1050         Layout prev_off : _  |
1051            (strict     && prev_off >= offset  ||
1052             not strict && prev_off > offset) -> do
1053                 -- token is indented to the left of the previous context.
1054                 -- we must generate a {} sequence now.
1055                 pushLexState layout_left
1056                 return (L span ITvocurly)
1057         _ -> do
1058                 setContext (Layout offset : ctx)
1059                 return (L span ITvocurly)
1060
1061 do_layout_left span _buf _len = do
1062     popLexState
1063     pushLexState bol  -- we must be at the start of a line
1064     return (L span ITvccurly)
1065
1066 -- -----------------------------------------------------------------------------
1067 -- LINE pragmas
1068
1069 setLine :: Int -> Action
1070 setLine code span buf len = do
1071   let line = parseUnsignedInteger buf len 10 octDecDigit
1072   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
1073         -- subtract one: the line number refers to the *following* line
1074   popLexState
1075   pushLexState code
1076   lexToken
1077
1078 setFile :: Int -> Action
1079 setFile code span buf len = do
1080   let file = lexemeToFastString (stepOn buf) (len-2)
1081   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1082   popLexState
1083   pushLexState code
1084   lexToken
1085
1086
1087 -- -----------------------------------------------------------------------------
1088 -- Options, includes and language pragmas.
1089
1090 lex_string_prag :: (String -> Token) -> Action
1091 lex_string_prag mkTok span _buf _len
1092     = do input <- getInput
1093          start <- getSrcLoc
1094          tok <- go [] input
1095          end <- getSrcLoc
1096          return (L (mkSrcSpan start end) tok)
1097     where go acc input
1098               = if isString input "#-}"
1099                    then do setInput input
1100                            return (mkTok (reverse acc))
1101                    else case alexGetChar input of
1102                           Just (c,i) -> go (c:acc) i
1103                           Nothing -> err input
1104           isString _ [] = True
1105           isString i (x:xs)
1106               = case alexGetChar i of
1107                   Just (c,i') | c == x    -> isString i' xs
1108                   _other -> False
1109           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1110
1111
1112 -- -----------------------------------------------------------------------------
1113 -- Strings & Chars
1114
1115 -- This stuff is horrible.  I hates it.
1116
1117 lex_string_tok :: Action
1118 lex_string_tok span _buf _len = do
1119   tok <- lex_string ""
1120   end <- getSrcLoc
1121   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1122
1123 lex_string :: String -> P Token
1124 lex_string s = do
1125   i <- getInput
1126   case alexGetChar' i of
1127     Nothing -> lit_error
1128
1129     Just ('"',i)  -> do
1130         setInput i
1131         magicHash <- extension magicHashEnabled
1132         if magicHash
1133           then do
1134             i <- getInput
1135             case alexGetChar' i of
1136               Just ('#',i) -> do
1137                    setInput i
1138                    if any (> '\xFF') s
1139                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1140                     else let s' = mkZFastString (reverse s) in
1141                          return (ITprimstring s')
1142                         -- mkZFastString is a hack to avoid encoding the
1143                         -- string in UTF-8.  We just want the exact bytes.
1144               _other ->
1145                 return (ITstring (mkFastString (reverse s)))
1146           else
1147                 return (ITstring (mkFastString (reverse s)))
1148
1149     Just ('\\',i)
1150         | Just ('&',i) <- next -> do
1151                 setInput i; lex_string s
1152         | Just (c,i) <- next, is_space c -> do
1153                 setInput i; lex_stringgap s
1154         where next = alexGetChar' i
1155
1156     Just (c, i) -> do
1157         c' <- lex_char c i
1158         lex_string (c':s)
1159
1160 lex_stringgap s = do
1161   c <- getCharOrFail
1162   case c of
1163     '\\' -> lex_string s
1164     c | is_space c -> lex_stringgap s
1165     _other -> lit_error
1166
1167
1168 lex_char_tok :: Action
1169 -- Here we are basically parsing character literals, such as 'x' or '\n'
1170 -- but, when Template Haskell is on, we additionally spot
1171 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1172 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
1173 -- So we have to do two characters of lookahead: when we see 'x we need to
1174 -- see if there's a trailing quote
1175 lex_char_tok span _buf _len = do        -- We've seen '
1176    i1 <- getInput       -- Look ahead to first character
1177    let loc = srcSpanStart span
1178    case alexGetChar' i1 of
1179         Nothing -> lit_error
1180
1181         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
1182                   th_exts <- extension thEnabled
1183                   if th_exts then do
1184                         setInput i2
1185                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1186                    else lit_error
1187
1188         Just ('\\', i2@(AI _end2 _ _)) -> do    -- We've seen 'backslash
1189                   setInput i2
1190                   lit_ch <- lex_escape
1191                   mc <- getCharOrFail   -- Trailing quote
1192                   if mc == '\'' then finish_char_tok loc lit_ch
1193                                 else do setInput i2; lit_error
1194
1195         Just (c, i2@(AI _end2 _ _))
1196                 | not (isAny c) -> lit_error
1197                 | otherwise ->
1198
1199                 -- We've seen 'x, where x is a valid character
1200                 --  (i.e. not newline etc) but not a quote or backslash
1201            case alexGetChar' i2 of      -- Look ahead one more character
1202                 Nothing -> lit_error
1203                 Just ('\'', i3) -> do   -- We've seen 'x'
1204                         setInput i3
1205                         finish_char_tok loc c
1206                 _other -> do            -- We've seen 'x not followed by quote
1207                                         -- If TH is on, just parse the quote only
1208                         th_exts <- extension thEnabled
1209                         let (AI end _ _) = i1
1210                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1211                                    else do setInput i2; lit_error
1212
1213 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1214 finish_char_tok loc ch  -- We've already seen the closing quote
1215                         -- Just need to check for trailing #
1216   = do  magicHash <- extension magicHashEnabled
1217         i@(AI end _ _) <- getInput
1218         if magicHash then do
1219                 case alexGetChar' i of
1220                         Just ('#',i@(AI end _ _)) -> do
1221                                 setInput i
1222                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1223                         _other ->
1224                                 return (L (mkSrcSpan loc end) (ITchar ch))
1225                 else do
1226                    return (L (mkSrcSpan loc end) (ITchar ch))
1227
1228 lex_char :: Char -> AlexInput -> P Char
1229 lex_char c inp = do
1230   case c of
1231       '\\' -> do setInput inp; lex_escape
1232       c | isAny c -> do setInput inp; return c
1233       _other -> lit_error
1234
1235 isAny c | c > '\x7f' = isPrint c
1236         | otherwise  = is_any c
1237
1238 lex_escape :: P Char
1239 lex_escape = do
1240   c <- getCharOrFail
1241   case c of
1242         'a'   -> return '\a'
1243         'b'   -> return '\b'
1244         'f'   -> return '\f'
1245         'n'   -> return '\n'
1246         'r'   -> return '\r'
1247         't'   -> return '\t'
1248         'v'   -> return '\v'
1249         '\\'  -> return '\\'
1250         '"'   -> return '\"'
1251         '\''  -> return '\''
1252         '^'   -> do c <- getCharOrFail
1253                     if c >= '@' && c <= '_'
1254                         then return (chr (ord c - ord '@'))
1255                         else lit_error
1256
1257         'x'   -> readNum is_hexdigit 16 hexDigit
1258         'o'   -> readNum is_octdigit  8 octDecDigit
1259         x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
1260
1261         c1 ->  do
1262            i <- getInput
1263            case alexGetChar' i of
1264             Nothing -> lit_error
1265             Just (c2,i2) ->
1266               case alexGetChar' i2 of
1267                 Nothing -> do setInput i2; lit_error
1268                 Just (c3,i3) ->
1269                    let str = [c1,c2,c3] in
1270                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1271                                      Just rest <- [maybePrefixMatch p str] ] of
1272                           (escape_char,[]):_ -> do
1273                                 setInput i3
1274                                 return escape_char
1275                           (escape_char,_:_):_ -> do
1276                                 setInput i2
1277                                 return escape_char
1278                           [] -> lit_error
1279
1280 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1281 readNum is_digit base conv = do
1282   i <- getInput
1283   c <- getCharOrFail
1284   if is_digit c
1285         then readNum2 is_digit base conv (conv c)
1286         else do setInput i; lit_error
1287
1288 readNum2 is_digit base conv i = do
1289   input <- getInput
1290   read i input
1291   where read i input = do
1292           case alexGetChar' input of
1293             Just (c,input') | is_digit c -> do
1294                 read (i*base + conv c) input'
1295             _other -> do
1296                 if i >= 0 && i <= 0x10FFFF
1297                    then do setInput input; return (chr i)
1298                    else lit_error
1299
1300 silly_escape_chars = [
1301         ("NUL", '\NUL'),
1302         ("SOH", '\SOH'),
1303         ("STX", '\STX'),
1304         ("ETX", '\ETX'),
1305         ("EOT", '\EOT'),
1306         ("ENQ", '\ENQ'),
1307         ("ACK", '\ACK'),
1308         ("BEL", '\BEL'),
1309         ("BS", '\BS'),
1310         ("HT", '\HT'),
1311         ("LF", '\LF'),
1312         ("VT", '\VT'),
1313         ("FF", '\FF'),
1314         ("CR", '\CR'),
1315         ("SO", '\SO'),
1316         ("SI", '\SI'),
1317         ("DLE", '\DLE'),
1318         ("DC1", '\DC1'),
1319         ("DC2", '\DC2'),
1320         ("DC3", '\DC3'),
1321         ("DC4", '\DC4'),
1322         ("NAK", '\NAK'),
1323         ("SYN", '\SYN'),
1324         ("ETB", '\ETB'),
1325         ("CAN", '\CAN'),
1326         ("EM", '\EM'),
1327         ("SUB", '\SUB'),
1328         ("ESC", '\ESC'),
1329         ("FS", '\FS'),
1330         ("GS", '\GS'),
1331         ("RS", '\RS'),
1332         ("US", '\US'),
1333         ("SP", '\SP'),
1334         ("DEL", '\DEL')
1335         ]
1336
1337 -- before calling lit_error, ensure that the current input is pointing to
1338 -- the position of the error in the buffer.  This is so that we can report
1339 -- a correct location to the user, but also so we can detect UTF-8 decoding
1340 -- errors if they occur.
1341 lit_error = lexError "lexical error in string/character literal"
1342
1343 getCharOrFail :: P Char
1344 getCharOrFail =  do
1345   i <- getInput
1346   case alexGetChar' i of
1347         Nothing -> lexError "unexpected end-of-file in string/character literal"
1348         Just (c,i)  -> do setInput i; return c
1349
1350 -- -----------------------------------------------------------------------------
1351 -- QuasiQuote
1352
1353 lex_quasiquote_tok :: Action
1354 lex_quasiquote_tok span buf len = do
1355   let quoter = reverse $ takeWhile (/= '$')
1356                $ reverse $ lexemeToString buf (len - 1)
1357   quoteStart <- getSrcLoc
1358   quote <- lex_quasiquote ""
1359   end <- getSrcLoc
1360   return (L (mkSrcSpan (srcSpanStart span) end)
1361            (ITquasiQuote (mkFastString quoter,
1362                           mkFastString (reverse quote),
1363                           mkSrcSpan quoteStart end)))
1364
1365 lex_quasiquote :: String -> P String
1366 lex_quasiquote s = do
1367   i <- getInput
1368   case alexGetChar' i of
1369     Nothing -> lit_error
1370
1371     Just ('\\',i)
1372         | Just ('|',i) <- next -> do
1373                 setInput i; lex_quasiquote ('|' : s)
1374         | Just (']',i) <- next -> do
1375                 setInput i; lex_quasiquote (']' : s)
1376         where next = alexGetChar' i
1377
1378     Just ('|',i)
1379         | Just (']',i) <- next -> do
1380                 setInput i; return s
1381         where next = alexGetChar' i
1382
1383     Just (c, i) -> do
1384          setInput i; lex_quasiquote (c : s)
1385
1386 -- -----------------------------------------------------------------------------
1387 -- Warnings
1388
1389 warn :: DynFlag -> SDoc -> Action
1390 warn option warning srcspan _buf _len = do
1391     addWarning option srcspan warning
1392     lexToken
1393
1394 warnThen :: DynFlag -> SDoc -> Action -> Action
1395 warnThen option warning action srcspan buf len = do
1396     addWarning option srcspan warning
1397     action srcspan buf len
1398
1399 -- -----------------------------------------------------------------------------
1400 -- The Parse Monad
1401
1402 data LayoutContext
1403   = NoLayout
1404   | Layout !Int
1405   deriving Show
1406
1407 data ParseResult a
1408   = POk PState a
1409   | PFailed
1410         SrcSpan         -- The start and end of the text span related to
1411                         -- the error.  Might be used in environments which can
1412                         -- show this span, e.g. by highlighting it.
1413         Message         -- The error message
1414
1415 data PState = PState {
1416         buffer     :: StringBuffer,
1417     dflags     :: DynFlags,
1418     messages   :: Messages,
1419         last_loc   :: SrcSpan,  -- pos of previous token
1420         last_offs  :: !Int,     -- offset of the previous token from the
1421                                 -- beginning of  the current line.
1422                                 -- \t is equal to 8 spaces.
1423         last_len   :: !Int,     -- len of previous token
1424   last_line_len :: !Int,
1425         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1426         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1427         context    :: [LayoutContext],
1428         lex_state  :: [Int]
1429      }
1430         -- last_loc and last_len are used when generating error messages,
1431         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1432         -- current token to happyError, we could at least get rid of last_len.
1433         -- Getting rid of last_loc would require finding another way to
1434         -- implement pushCurrentContext (which is only called from one place).
1435
1436 newtype P a = P { unP :: PState -> ParseResult a }
1437
1438 instance Monad P where
1439   return = returnP
1440   (>>=) = thenP
1441   fail = failP
1442
1443 returnP :: a -> P a
1444 returnP a = a `seq` (P $ \s -> POk s a)
1445
1446 thenP :: P a -> (a -> P b) -> P b
1447 (P m) `thenP` k = P $ \ s ->
1448         case m s of
1449                 POk s1 a         -> (unP (k a)) s1
1450                 PFailed span err -> PFailed span err
1451
1452 failP :: String -> P a
1453 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1454
1455 failMsgP :: String -> P a
1456 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1457
1458 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1459 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (mkSrcSpan loc1 loc2) (text str)
1460
1461 failSpanMsgP :: SrcSpan -> SDoc -> P a
1462 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1463
1464 extension :: (Int -> Bool) -> P Bool
1465 extension p = P $ \s -> POk s (p $! extsBitmap s)
1466
1467 getExts :: P Int
1468 getExts = P $ \s -> POk s (extsBitmap s)
1469
1470 setExts :: (Int -> Int) -> P ()
1471 setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
1472
1473 setSrcLoc :: SrcLoc -> P ()
1474 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1475
1476 getSrcLoc :: P SrcLoc
1477 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1478
1479 setLastToken :: SrcSpan -> Int -> Int -> P ()
1480 setLastToken loc len line_len = P $ \s -> POk s {
1481   last_loc=loc,
1482   last_len=len,
1483   last_line_len=line_len
1484 } ()
1485
1486 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1487
1488 alexInputPrevChar :: AlexInput -> Char
1489 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1490
1491 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1492 alexGetChar (AI loc ofs s)
1493   | atEnd s   = Nothing
1494   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1495                 --trace (show (ord c)) $
1496                 Just (adj_c, (AI loc' ofs' s'))
1497   where (c,s') = nextChar s
1498         loc'   = advanceSrcLoc loc c
1499         ofs'   = advanceOffs c ofs
1500
1501         non_graphic     = '\x0'
1502         upper           = '\x1'
1503         lower           = '\x2'
1504         digit           = '\x3'
1505         symbol          = '\x4'
1506         space           = '\x5'
1507         other_graphic   = '\x6'
1508
1509         adj_c
1510           | c <= '\x06' = non_graphic
1511           | c <= '\x7f' = c
1512           -- Alex doesn't handle Unicode, so when Unicode
1513           -- character is encoutered we output these values
1514           -- with the actual character value hidden in the state.
1515           | otherwise =
1516                 case generalCategory c of
1517                   UppercaseLetter       -> upper
1518                   LowercaseLetter       -> lower
1519                   TitlecaseLetter       -> upper
1520                   ModifierLetter        -> other_graphic
1521                   OtherLetter           -> lower -- see #1103
1522                   NonSpacingMark        -> other_graphic
1523                   SpacingCombiningMark  -> other_graphic
1524                   EnclosingMark         -> other_graphic
1525                   DecimalNumber         -> digit
1526                   LetterNumber          -> other_graphic
1527                   OtherNumber           -> other_graphic
1528                   ConnectorPunctuation  -> other_graphic
1529                   DashPunctuation       -> other_graphic
1530                   OpenPunctuation       -> other_graphic
1531                   ClosePunctuation      -> other_graphic
1532                   InitialQuote          -> other_graphic
1533                   FinalQuote            -> other_graphic
1534                   OtherPunctuation      -> other_graphic
1535                   MathSymbol            -> symbol
1536                   CurrencySymbol        -> symbol
1537                   ModifierSymbol        -> symbol
1538                   OtherSymbol           -> symbol
1539                   Space                 -> space
1540                   _other                -> non_graphic
1541
1542 -- This version does not squash unicode characters, it is used when
1543 -- lexing strings.
1544 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1545 alexGetChar' (AI loc ofs s)
1546   | atEnd s   = Nothing
1547   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1548                 --trace (show (ord c)) $
1549                 Just (c, (AI loc' ofs' s'))
1550   where (c,s') = nextChar s
1551         loc'   = advanceSrcLoc loc c
1552         ofs'   = advanceOffs c ofs
1553
1554 advanceOffs :: Char -> Int -> Int
1555 advanceOffs '\n' _    = 0
1556 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1557 advanceOffs _    offs = offs + 1
1558
1559 getInput :: P AlexInput
1560 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1561
1562 setInput :: AlexInput -> P ()
1563 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1564
1565 pushLexState :: Int -> P ()
1566 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1567
1568 popLexState :: P Int
1569 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1570
1571 getLexState :: P Int
1572 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1573
1574 -- for reasons of efficiency, flags indicating language extensions (eg,
1575 -- -fglasgow-exts or -XParr) are represented by a bitmap stored in an unboxed
1576 -- integer
1577
1578 genericsBit, ffiBit, parrBit :: Int
1579 genericsBit = 0 -- {| and |}
1580 ffiBit     = 1
1581 parrBit    = 2
1582 arrowsBit  = 4
1583 thBit      = 5
1584 ipBit      = 6
1585 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1586 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1587                 -- (doesn't affect the lexer)
1588 tyFamBit   = 9  -- indexed type families: 'family' keyword and kind sigs
1589 haddockBit = 10 -- Lex and parse Haddock comments
1590 magicHashBit = 11 -- "#" in both functions and operators
1591 kindSigsBit = 12 -- Kind signatures on type variables
1592 recursiveDoBit = 13 -- mdo
1593 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1594 unboxedTuplesBit = 15 -- (# and #)
1595 standaloneDerivingBit = 16 -- standalone instance deriving declarations
1596 transformComprehensionsBit = 17
1597 qqBit      = 18 -- enable quasiquoting
1598 inRulePragBit = 19
1599
1600 genericsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1601 always           _     = True
1602 genericsEnabled  flags = testBit flags genericsBit
1603 ffiEnabled       flags = testBit flags ffiBit
1604 parrEnabled      flags = testBit flags parrBit
1605 arrowsEnabled    flags = testBit flags arrowsBit
1606 thEnabled        flags = testBit flags thBit
1607 ipEnabled        flags = testBit flags ipBit
1608 explicitForallEnabled flags = testBit flags explicitForallBit
1609 bangPatEnabled   flags = testBit flags bangPatBit
1610 tyFamEnabled     flags = testBit flags tyFamBit
1611 haddockEnabled   flags = testBit flags haddockBit
1612 magicHashEnabled flags = testBit flags magicHashBit
1613 kindSigsEnabled  flags = testBit flags kindSigsBit
1614 recursiveDoEnabled flags = testBit flags recursiveDoBit
1615 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1616 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1617 standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
1618 transformComprehensionsEnabled flags = testBit flags transformComprehensionsBit
1619 qqEnabled        flags = testBit flags qqBit
1620 inRulePrag       flags = testBit flags inRulePragBit
1621
1622 -- PState for parsing options pragmas
1623 --
1624 pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
1625 pragState dynflags buf loc =
1626   PState {
1627       buffer        = buf,
1628       messages      = emptyMessages,
1629       dflags        = dynflags,
1630       last_loc      = mkSrcSpan loc loc,
1631       last_offs     = 0,
1632       last_len      = 0,
1633       last_line_len = 0,
1634       loc           = loc,
1635       extsBitmap    = 0,
1636       context       = [],
1637       lex_state     = [bol, option_prags, 0]
1638     }
1639
1640
1641 -- create a parse state
1642 --
1643 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1644 mkPState buf loc flags  =
1645   PState {
1646       buffer          = buf,
1647       dflags        = flags,
1648       messages      = emptyMessages,
1649       last_loc      = mkSrcSpan loc loc,
1650       last_offs     = 0,
1651       last_len      = 0,
1652       last_line_len = 0,
1653       loc           = loc,
1654       extsBitmap    = fromIntegral bitmap,
1655       context       = [],
1656       lex_state     = [bol, 0]
1657         -- we begin in the layout state if toplev_layout is set
1658     }
1659     where
1660       bitmap = genericsBit `setBitIf` dopt Opt_Generics flags
1661                .|. ffiBit       `setBitIf` dopt Opt_ForeignFunctionInterface flags
1662                .|. parrBit      `setBitIf` dopt Opt_PArr         flags
1663                .|. arrowsBit    `setBitIf` dopt Opt_Arrows       flags
1664                .|. thBit        `setBitIf` dopt Opt_TemplateHaskell flags
1665                .|. qqBit        `setBitIf` dopt Opt_QuasiQuotes flags
1666                .|. ipBit        `setBitIf` dopt Opt_ImplicitParams flags
1667                .|. explicitForallBit `setBitIf` dopt Opt_ScopedTypeVariables flags
1668                .|. explicitForallBit `setBitIf` dopt Opt_LiberalTypeSynonyms flags
1669                .|. explicitForallBit `setBitIf` dopt Opt_PolymorphicComponents flags
1670                .|. explicitForallBit `setBitIf` dopt Opt_ExistentialQuantification flags
1671                .|. explicitForallBit `setBitIf` dopt Opt_Rank2Types flags
1672                .|. explicitForallBit `setBitIf` dopt Opt_RankNTypes flags
1673                .|. bangPatBit   `setBitIf` dopt Opt_BangPatterns flags
1674                .|. tyFamBit     `setBitIf` dopt Opt_TypeFamilies flags
1675                .|. haddockBit   `setBitIf` dopt Opt_Haddock      flags
1676                .|. magicHashBit `setBitIf` dopt Opt_MagicHash    flags
1677                .|. kindSigsBit  `setBitIf` dopt Opt_KindSignatures flags
1678                .|. recursiveDoBit `setBitIf` dopt Opt_RecursiveDo flags
1679                .|. unicodeSyntaxBit `setBitIf` dopt Opt_UnicodeSyntax flags
1680                .|. unboxedTuplesBit `setBitIf` dopt Opt_UnboxedTuples flags
1681                .|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags
1682            .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags
1683       --
1684       setBitIf :: Int -> Bool -> Int
1685       b `setBitIf` cond | cond      = bit b
1686                         | otherwise = 0
1687
1688 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1689 addWarning option srcspan warning
1690  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1691        let warning' = mkWarnMsg srcspan alwaysQualify warning
1692            ws' = if dopt option d then ws `snocBag` warning' else ws
1693        in POk s{messages=(ws', es)} ()
1694
1695 getMessages :: PState -> Messages
1696 getMessages PState{messages=ms} = ms
1697
1698 getContext :: P [LayoutContext]
1699 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1700
1701 setContext :: [LayoutContext] -> P ()
1702 setContext ctx = P $ \s -> POk s{context=ctx} ()
1703
1704 popContext :: P ()
1705 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1706                               last_len = len, last_loc = last_loc }) ->
1707   case ctx of
1708         (_:tl) -> POk s{ context = tl } ()
1709         []     -> PFailed last_loc (srcParseErr buf len)
1710
1711 -- Push a new layout context at the indentation of the last token read.
1712 -- This is only used at the outer level of a module when the 'module'
1713 -- keyword is missing.
1714 pushCurrentContext :: P ()
1715 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1716     POk s{context = Layout (offs-len) : ctx} ()
1717 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1718
1719 getOffside :: P Ordering
1720 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1721                 let ord = case stk of
1722                         (Layout n:_) -> compare offs n
1723                         _            -> GT
1724                 in POk s ord
1725
1726 -- ---------------------------------------------------------------------------
1727 -- Construct a parse error
1728
1729 srcParseErr
1730   :: StringBuffer       -- current buffer (placed just after the last token)
1731   -> Int                -- length of the previous token
1732   -> Message
1733 srcParseErr buf len
1734   = hcat [ if null token
1735              then ptext (sLit "parse error (possibly incorrect indentation)")
1736              else hcat [ptext (sLit "parse error on input "),
1737                         char '`', text token, char '\'']
1738     ]
1739   where token = lexemeToString (offsetBytes (-len) buf) len
1740
1741 -- Report a parse failure, giving the span of the previous token as
1742 -- the location of the error.  This is the entry point for errors
1743 -- detected during parsing.
1744 srcParseFail :: P a
1745 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1746                             last_loc = last_loc } ->
1747     PFailed last_loc (srcParseErr buf len)
1748
1749 -- A lexical error is reported at a particular position in the source file,
1750 -- not over a token range.
1751 lexError :: String -> P a
1752 lexError str = do
1753   loc <- getSrcLoc
1754   (AI end _ buf) <- getInput
1755   reportLexError loc end buf str
1756
1757 -- -----------------------------------------------------------------------------
1758 -- This is the top-level function: called from the parser each time a
1759 -- new token is to be read from the input.
1760
1761 lexer :: (Located Token -> P a) -> P a
1762 lexer cont = do
1763   tok@(L _span _tok__) <- lexToken
1764 --  trace ("token: " ++ show tok__) $ do
1765   cont tok
1766
1767 lexToken :: P (Located Token)
1768 lexToken = do
1769   inp@(AI loc1 _ buf) <- getInput
1770   sc <- getLexState
1771   exts <- getExts
1772   case alexScanUser exts inp sc of
1773     AlexEOF -> do
1774         let span = mkSrcSpan loc1 loc1
1775         setLastToken span 0 0
1776         return (L span ITeof)
1777     AlexError (AI loc2 _ buf) ->
1778         reportLexError loc1 loc2 buf "lexical error"
1779     AlexSkip inp2 _ -> do
1780         setInput inp2
1781         lexToken
1782     AlexToken inp2@(AI end _ buf2) _ t -> do
1783         setInput inp2
1784         let span = mkSrcSpan loc1 end
1785         let bytes = byteDiff buf buf2
1786         span `seq` setLastToken span bytes bytes
1787         t span buf bytes
1788
1789 reportLexError loc1 loc2 buf str
1790   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1791   | otherwise =
1792   let
1793         c = fst (nextChar buf)
1794   in
1795   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1796     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1797     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1798 }