compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), lexer, pragState, mkPState, PState(..),
  26    P(..), ParseResult(..), getSrcLoc,
  27    failLocMsgP, failSpanMsgP, srcParseFail,
  28    getMessages,
  29    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  30    getLexState, popLexState, pushLexState,
  31    extension, glaExtsEnabled, bangPatEnabled
  32   ) where
  33
  34 #include "HsVersions.h"
  35
  36 import Bag
  37 import ErrUtils
  38 import Outputable
  39 import StringBuffer
  40 import FastString
  41 import FastTypes
  42 import SrcLoc
  43 import UniqFM
  44 import DynFlags
  45 import Ctype
  46 import Util             ( maybePrefixMatch, readRational )
  47
  48 import Control.Monad
  49 import Data.Bits
  50 import Data.Char        ( chr, isSpace )
  51 import Data.Ratio
  52 import Debug.Trace
  53
  54 #if __GLASGOW_HASKELL__ >= 605
  55 import Data.Char        ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  56 #else
  57 import Compat.Unicode   ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  58 #endif
  59 }
  60
  61 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
  62 $whitechar   = [\ \n\r\f\v\xa0 $unispace]
  63 $white_no_nl = $whitechar # \n
  64 $tab         = \t
  65
  66 $ascdigit  = 0-9
  67 $unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  68 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  69 $digit     = [$ascdigit $unidigit]
  70
  71 $special   = [\(\)\,\;\[\]\`\{\}]
  72 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7]
  73 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  74 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  75
  76 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
  77 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  78 $large     = [$asclarge $unilarge]
  79
  80 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
  81 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  82 $small     = [$ascsmall $unismall \_]
  83
  84 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
  85 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  86
  87 $octit     = 0-7
  88 $hexit     = [$decdigit A-F a-f]
  89 $symchar   = [$symbol \:]
  90 $nl        = [\n\r]
  91 $idchar    = [$small $large $digit \']
  92
  93 $docsym    = [\| \^ \* \$]
  94
  95 @varid     = $small $idchar*
  96 @conid     = $large $idchar*
  97
  98 @varsym    = $symbol $symchar*
  99 @consym    = \: $symchar*
 100
 101 @decimal     = $decdigit+
 102 @octal       = $octit+
 103 @hexadecimal = $hexit+
 104 @exponent    = [eE] [\-\+]? @decimal
 105
 106 -- we support the hierarchical module name extension:
 107 @qual = (@conid \.)+
 108
 109 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 110
 111 haskell :-
 112
 113 -- everywhere: skip whitespace and comments
 114 $white_no_nl+                           ;
 115 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 116
 117 -- Everywhere: deal with nested comments.  We explicitly rule out
 118 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 119 -- (this can happen even though pragmas will normally take precedence due to
 120 -- longest-match, because pragmas aren't valid in every state, but comments
 121 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 122 -- set.
 123
 124 "{-" / { isNormalComment } { nested_comment lexToken }
 125
 126 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 127 -- more dashes followed by a symbol should be parsed as a varsym, so we
 128 -- have to exclude those.
 129
 130 -- Since Haddock comments aren't valid in every state, we need to rule them
 131 -- out here.
 132
 133 -- The following two rules match comments that begin with two dashes, but
 134 -- continue with a different character. The rules test that this character
 135 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 136 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 137 -- have a Haddock comment). The rules then munch the rest of the line.
 138
 139 "-- " ~$docsym .* ;
 140 "--" [^$symbol : \ ] .* ;
 141
 142 -- Next, match Haddock comments if no -haddock flag
 143
 144 "-- " $docsym .* / { ifExtension (not . haddockEnabled) } ;
 145
 146 -- Now, when we've matched comments that begin with 2 dashes and continue
 147 -- with a different character, we need to match comments that begin with three
 148 -- or more dashes (which clearly can't be Haddock comments). We only need to
 149 -- make sure that the first non-dash character isn't a symbol, and munch the
 150 -- rest of the line.
 151
 152 "---"\-* [^$symbol :] .* ;
 153
 154 -- Since the previous rules all match dashes followed by at least one
 155 -- character, we also need to match a whole line filled with just dashes.
 156
 157 "--"\-* / { atEOL } ;
 158
 159 -- We need this rule since none of the other single line comment rules
 160 -- actually match this case.
 161
 162 "-- " / { atEOL } ;
 163
 164 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 165 -- blank lines) until we find a non-whitespace character, then do layout
 166 -- processing.
 167 --
 168 -- One slight wibble here: what if the line begins with {-#? In
 169 -- theory, we have to lex the pragma to see if it's one we recognise,
 170 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 171 -- as a nested comment.  We don't bother with this: if the line begins
 172 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 173 <bol> {
 174   \n                                    ;
 175   ^\# (line)?                           { begin line_prag1 }
 176   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 177   ^\# \! .* \n                          ; -- #!, for scripts
 178   ()                                    { do_bol }
 179 }
 180
 181 -- after a layout keyword (let, where, do, of), we begin a new layout
 182 -- context if the curly brace is missing.
 183 -- Careful! This stuff is quite delicate.
 184 <layout, layout_do> {
 185   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 186         -- we might encounter {-# here, but {- has been handled already
 187   \n                                    ;
 188   ^\# (line)?                           { begin line_prag1 }
 189 }
 190
 191 -- do is treated in a subtly different way, see new_layout_context
 192 <layout>    ()                          { new_layout_context True }
 193 <layout_do> ()                          { new_layout_context False }
 194
 195 -- after a new layout context which was found to be to the left of the
 196 -- previous context, we have generated a '{' token, and we now need to
 197 -- generate a matching '}' token.
 198 <layout_left>  ()                       { do_layout_left }
 199
 200 <0,option_prags,glaexts> \n                             { begin bol }
 201
 202 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 203
 204 -- single-line line pragmas, of the form
 205 --    # <line> "<file>" <extra-stuff> \n
 206 <line_prag1> $decdigit+                 { setLine line_prag1a }
 207 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 208 <line_prag1b> .*                        { pop }
 209
 210 -- Haskell-style line pragmas, of the form
 211 --    {-# LINE <line> "<file>" #-}
 212 <line_prag2> $decdigit+                 { setLine line_prag2a }
 213 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 214 <line_prag2b> "#-}"|"-}"                { pop }
 215    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 216    -- with older versions of GHC which generated these.
 217
 218 -- We only want RULES pragmas to be picked up when -fglasgow-exts
 219 -- is on, because the contents of the pragma is always written using
 220 -- glasgow-exts syntax (using forall etc.), so if glasgow exts are not
 221 -- enabled, we're sure to get a parse error.
 222 -- (ToDo: we should really emit a warning when ignoring pragmas)
 223 <glaexts>
 224   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 225
 226 <0,option_prags,glaexts> {
 227   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 228   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 229                                         { token (ITinline_prag False) }
 230   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 231                                         { token ITspec_prag }
 232   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 233         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 234   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 235         $whitechar* (NO(T?)INLINE|no(t?)inline)
 236                                         { token (ITspec_inline_prag False) }
 237   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 238   "{-#" $whitechar* (DEPRECATED|deprecated)
 239                                         { token ITdeprecated_prag }
 240   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 241   "{-#" $whitechar* (GENERATED|generated)
 242                                         { token ITgenerated_prag }
 243   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 244   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 245
 246   "{-#" $whitechar* (DOCOPTIONS|docoptions)
 247   / { ifExtension haddockEnabled }     { lex_string_prag ITdocOptions }
 248
 249  "{-#"                                 { nested_comment lexToken }
 250
 251   -- ToDo: should only be valid inside a pragma:
 252   "#-}"                                 { token ITclose_prag}
 253 }
 254
 255 <option_prags> {
 256   "{-#" $whitechar* (OPTIONS|options)   { lex_string_prag IToptions_prag }
 257   "{-#" $whitechar* (OPTIONS_GHC|options_ghc)
 258                                         { lex_string_prag IToptions_prag }
 259   "{-#" $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
 260   "{-#" $whitechar* (INCLUDE|include)   { lex_string_prag ITinclude_prag }
 261 }
 262
 263 <0,option_prags,glaexts> {
 264         -- This is to catch things like {-# OPTIONS OPTIONS_HUGS ...
 265   "{-#" $whitechar* $idchar+            { nested_comment lexToken }
 266 }
 267
 268 -- '0' state: ordinary lexemes
 269 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 270
 271 -- Haddock comments
 272
 273 <0,glaexts> {
 274   "-- " $docsym    / { ifExtension haddockEnabled } { multiline_doc_comment }
 275   "{-" \ ? $docsym / { ifExtension haddockEnabled } { nested_doc_comment }
 276 }
 277
 278 -- "special" symbols
 279
 280 <0,glaexts> {
 281   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 282   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 283 }
 284
 285 <0,glaexts> {
 286   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 287   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 288   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 289   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 290   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 291   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 292   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 293   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 294 }
 295
 296 <0,glaexts> {
 297   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 298                                         { special IToparenbar }
 299   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 300 }
 301
 302 <0,glaexts> {
 303   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 304 }
 305
 306 <glaexts> {
 307   "(#" / { notFollowedBySymbol }        { token IToubxparen }
 308   "#)"                                  { token ITcubxparen }
 309   "{|"                                  { token ITocurlybar }
 310   "|}"                                  { token ITccurlybar }
 311 }
 312
 313 <0,option_prags,glaexts> {
 314   \(                                    { special IToparen }
 315   \)                                    { special ITcparen }
 316   \[                                    { special ITobrack }
 317   \]                                    { special ITcbrack }
 318   \,                                    { special ITcomma }
 319   \;                                    { special ITsemi }
 320   \`                                    { special ITbackquote }
 321
 322   \{                                    { open_brace }
 323   \}                                    { close_brace }
 324 }
 325
 326 <0,option_prags,glaexts> {
 327   @qual @varid                  { check_qvarid }
 328   @qual @conid                  { idtoken qconid }
 329   @varid                        { varid }
 330   @conid                        { idtoken conid }
 331 }
 332
 333 -- after an illegal qvarid, such as 'M.let',
 334 -- we back up and try again in the bad_qvarid state:
 335 <bad_qvarid> {
 336   @conid                        { pop_and (idtoken conid) }
 337   @qual @conid                  { pop_and (idtoken qconid) }
 338 }
 339
 340 <glaexts> {
 341   @qual @varid "#"+             { idtoken qvarid }
 342   @qual @conid "#"+             { idtoken qconid }
 343   @varid "#"+                   { varid }
 344   @conid "#"+                   { idtoken conid }
 345 }
 346
 347 -- ToDo: M.(,,,)
 348
 349 <0,glaexts> {
 350   @qual @varsym                 { idtoken qvarsym }
 351   @qual @consym                 { idtoken qconsym }
 352   @varsym                       { varsym }
 353   @consym                       { consym }
 354 }
 355
 356 <0,glaexts> {
 357   @decimal                      { tok_decimal }
 358   0[oO] @octal                  { tok_octal }
 359   0[xX] @hexadecimal            { tok_hexadecimal }
 360 }
 361
 362 <glaexts> {
 363   @decimal \#                   { prim_decimal }
 364   0[oO] @octal \#               { prim_octal }
 365   0[xX] @hexadecimal \#         { prim_hexadecimal }
 366 }
 367
 368 <0,glaexts> @floating_point             { strtoken tok_float }
 369 <glaexts>   @floating_point \#          { init_strtoken 1 prim_float }
 370 <glaexts>   @floating_point \# \#       { init_strtoken 2 prim_double }
 371
 372 -- Strings and chars are lexed by hand-written code.  The reason is
 373 -- that even if we recognise the string or char here in the regex
 374 -- lexer, we would still have to parse the string afterward in order
 375 -- to convert it to a String.
 376 <0,glaexts> {
 377   \'                            { lex_char_tok }
 378   \"                            { lex_string_tok }
 379 }
 380
 381 {
 382 -- -----------------------------------------------------------------------------
 383 -- The token type
 384
 385 data Token
 386   = ITas                        -- Haskell keywords
 387   | ITcase
 388   | ITclass
 389   | ITdata
 390   | ITdefault
 391   | ITderiving
 392   | ITderive
 393   | ITdo
 394   | ITelse
 395   | IThiding
 396   | ITif
 397   | ITimport
 398   | ITin
 399   | ITinfix
 400   | ITinfixl
 401   | ITinfixr
 402   | ITinstance
 403   | ITlet
 404   | ITmodule
 405   | ITnewtype
 406   | ITof
 407   | ITqualified
 408   | ITthen
 409   | ITtype
 410   | ITwhere
 411   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 412
 413   | ITforall                    -- GHC extension keywords
 414   | ITforeign
 415   | ITexport
 416   | ITlabel
 417   | ITdynamic
 418   | ITsafe
 419   | ITthreadsafe
 420   | ITunsafe
 421   | ITstdcallconv
 422   | ITccallconv
 423   | ITdotnet
 424   | ITmdo
 425   | ITfamily
 426
 427         -- Pragmas
 428   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 429   | ITspec_prag                 -- SPECIALISE
 430   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 431   | ITsource_prag
 432   | ITrules_prag
 433   | ITdeprecated_prag
 434   | ITline_prag
 435   | ITscc_prag
 436   | ITgenerated_prag
 437   | ITcore_prag                 -- hdaume: core annotations
 438   | ITunpack_prag
 439   | ITclose_prag
 440   | IToptions_prag String
 441   | ITinclude_prag String
 442   | ITlanguage_prag
 443
 444   | ITdotdot                    -- reserved symbols
 445   | ITcolon
 446   | ITdcolon
 447   | ITequal
 448   | ITlam
 449   | ITvbar
 450   | ITlarrow
 451   | ITrarrow
 452   | ITat
 453   | ITtilde
 454   | ITdarrow
 455   | ITminus
 456   | ITbang
 457   | ITstar
 458   | ITdot
 459
 460   | ITbiglam                    -- GHC-extension symbols
 461
 462   | ITocurly                    -- special symbols
 463   | ITccurly
 464   | ITocurlybar                 -- {|, for type applications
 465   | ITccurlybar                 -- |}, for type applications
 466   | ITvocurly
 467   | ITvccurly
 468   | ITobrack
 469   | ITopabrack                  -- [:, for parallel arrays with -fparr
 470   | ITcpabrack                  -- :], for parallel arrays with -fparr
 471   | ITcbrack
 472   | IToparen
 473   | ITcparen
 474   | IToubxparen
 475   | ITcubxparen
 476   | ITsemi
 477   | ITcomma
 478   | ITunderscore
 479   | ITbackquote
 480
 481   | ITvarid   FastString        -- identifiers
 482   | ITconid   FastString
 483   | ITvarsym  FastString
 484   | ITconsym  FastString
 485   | ITqvarid  (FastString,FastString)
 486   | ITqconid  (FastString,FastString)
 487   | ITqvarsym (FastString,FastString)
 488   | ITqconsym (FastString,FastString)
 489
 490   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 491
 492   | ITpragma StringBuffer
 493
 494   | ITchar       Char
 495   | ITstring     FastString
 496   | ITinteger    Integer
 497   | ITrational   Rational
 498
 499   | ITprimchar   Char
 500   | ITprimstring FastString
 501   | ITprimint    Integer
 502   | ITprimfloat  Rational
 503   | ITprimdouble Rational
 504
 505   -- MetaHaskell extension tokens
 506   | ITopenExpQuote              --  [| or [e|
 507   | ITopenPatQuote              --  [p|
 508   | ITopenDecQuote              --  [d|
 509   | ITopenTypQuote              --  [t|
 510   | ITcloseQuote                --  |]
 511   | ITidEscape   FastString     --  $x
 512   | ITparenEscape               --  $(
 513   | ITvarQuote                  --  '
 514   | ITtyQuote                   --  ''
 515
 516   -- Arrow notation extension
 517   | ITproc
 518   | ITrec
 519   | IToparenbar                 --  (|
 520   | ITcparenbar                 --  |)
 521   | ITlarrowtail                --  -<
 522   | ITrarrowtail                --  >-
 523   | ITLarrowtail                --  -<<
 524   | ITRarrowtail                --  >>-
 525
 526   | ITunknown String            -- Used when the lexer can't make sense of it
 527   | ITeof                       -- end of file token
 528
 529   -- Documentation annotations
 530   | ITdocCommentNext  String     -- something beginning '-- |'
 531   | ITdocCommentPrev  String     -- something beginning '-- ^'
 532   | ITdocCommentNamed String     -- something beginning '-- $'
 533   | ITdocSection      Int String -- a section heading
 534   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 535
 536 #ifdef DEBUG
 537   deriving Show -- debugging
 538 #endif
 539
 540 isSpecial :: Token -> Bool
 541 -- If we see M.x, where x is a keyword, but
 542 -- is special, we treat is as just plain M.x,
 543 -- not as a keyword.
 544 isSpecial ITas          = True
 545 isSpecial IThiding      = True
 546 isSpecial ITderive      = True
 547 isSpecial ITqualified   = True
 548 isSpecial ITforall      = True
 549 isSpecial ITexport      = True
 550 isSpecial ITlabel       = True
 551 isSpecial ITdynamic     = True
 552 isSpecial ITsafe        = True
 553 isSpecial ITthreadsafe  = True
 554 isSpecial ITunsafe      = True
 555 isSpecial ITccallconv   = True
 556 isSpecial ITstdcallconv = True
 557 isSpecial ITmdo         = True
 558 isSpecial ITfamily      = True
 559 isSpecial _             = False
 560
 561 -- the bitmap provided as the third component indicates whether the
 562 -- corresponding extension keyword is valid under the extension options
 563 -- provided to the compiler; if the extension corresponding to *any* of the
 564 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 565 -- facilitates using a keyword in two different extensions that can be
 566 -- activated independently)
 567 --
 568 reservedWordsFM = listToUFM $
 569         map (\(x, y, z) -> (mkFastString x, (y, z)))
 570        [( "_",          ITunderscore,   0 ),
 571         ( "as",         ITas,           0 ),
 572         ( "case",       ITcase,         0 ),
 573         ( "class",      ITclass,        0 ),
 574         ( "data",       ITdata,         0 ),
 575         ( "default",    ITdefault,      0 ),
 576         ( "deriving",   ITderiving,     0 ),
 577         ( "derive",     ITderive,       0 ),
 578         ( "do",         ITdo,           0 ),
 579         ( "else",       ITelse,         0 ),
 580         ( "hiding",     IThiding,       0 ),
 581         ( "if",         ITif,           0 ),
 582         ( "import",     ITimport,       0 ),
 583         ( "in",         ITin,           0 ),
 584         ( "infix",      ITinfix,        0 ),
 585         ( "infixl",     ITinfixl,       0 ),
 586         ( "infixr",     ITinfixr,       0 ),
 587         ( "instance",   ITinstance,     0 ),
 588         ( "let",        ITlet,          0 ),
 589         ( "module",     ITmodule,       0 ),
 590         ( "newtype",    ITnewtype,      0 ),
 591         ( "of",         ITof,           0 ),
 592         ( "qualified",  ITqualified,    0 ),
 593         ( "then",       ITthen,         0 ),
 594         ( "type",       ITtype,         0 ),
 595         ( "where",      ITwhere,        0 ),
 596         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 597
 598         ( "forall",     ITforall,        bit tvBit),
 599         ( "mdo",        ITmdo,           bit glaExtsBit),
 600         ( "family",     ITfamily,        bit idxTysBit),
 601
 602         ( "foreign",    ITforeign,       bit ffiBit),
 603         ( "export",     ITexport,        bit ffiBit),
 604         ( "label",      ITlabel,         bit ffiBit),
 605         ( "dynamic",    ITdynamic,       bit ffiBit),
 606         ( "safe",       ITsafe,          bit ffiBit),
 607         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 608         ( "unsafe",     ITunsafe,        bit ffiBit),
 609         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 610         ( "ccall",      ITccallconv,     bit ffiBit),
 611         ( "dotnet",     ITdotnet,        bit ffiBit),
 612
 613         ( "rec",        ITrec,           bit arrowsBit),
 614         ( "proc",       ITproc,          bit arrowsBit)
 615      ]
 616
 617 reservedSymsFM = listToUFM $
 618         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 619       [ ("..",  ITdotdot,       0)
 620        ,(":",   ITcolon,        0)      -- (:) is a reserved op,
 621                                                 -- meaning only list cons
 622        ,("::",  ITdcolon,       0)
 623        ,("=",   ITequal,        0)
 624        ,("\\",  ITlam,          0)
 625        ,("|",   ITvbar,         0)
 626        ,("<-",  ITlarrow,       0)
 627        ,("->",  ITrarrow,       0)
 628        ,("@",   ITat,           0)
 629        ,("~",   ITtilde,        0)
 630        ,("=>",  ITdarrow,       0)
 631        ,("-",   ITminus,        0)
 632        ,("!",   ITbang,         0)
 633
 634        ,("*",   ITstar,         bit glaExtsBit .|.
 635                                 bit idxTysBit)      -- For data T (a::*) = MkT
 636        ,(".",   ITdot,          bit tvBit)          -- For 'forall a . t'
 637
 638        ,("-<",  ITlarrowtail,   bit arrowsBit)
 639        ,(">-",  ITrarrowtail,   bit arrowsBit)
 640        ,("-<<", ITLarrowtail,   bit arrowsBit)
 641        ,(">>-", ITRarrowtail,   bit arrowsBit)
 642
 643 #if __GLASGOW_HASKELL__ >= 605
 644        ,("∷",   ITdcolon,       bit glaExtsBit)
 645        ,("⇒",   ITdarrow,     bit glaExtsBit)
 646        ,("∀", ITforall,       bit glaExtsBit)
 647        ,("→",   ITrarrow,     bit glaExtsBit)
 648        ,("←",   ITlarrow,     bit glaExtsBit)
 649        ,("⋯",         ITdotdot,       bit glaExtsBit)
 650         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 651         -- form part of a large operator.  This would let us have a better
 652         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 653 #endif
 654        ]
 655
 656 -- -----------------------------------------------------------------------------
 657 -- Lexer actions
 658
 659 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 660
 661 special :: Token -> Action
 662 special tok span _buf len = return (L span tok)
 663
 664 token, layout_token :: Token -> Action
 665 token t span buf len = return (L span t)
 666 layout_token t span buf len = pushLexState layout >> return (L span t)
 667
 668 idtoken :: (StringBuffer -> Int -> Token) -> Action
 669 idtoken f span buf len = return (L span $! (f buf len))
 670
 671 skip_one_varid :: (FastString -> Token) -> Action
 672 skip_one_varid f span buf len
 673   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 674
 675 strtoken :: (String -> Token) -> Action
 676 strtoken f span buf len =
 677   return (L span $! (f $! lexemeToString buf len))
 678
 679 init_strtoken :: Int -> (String -> Token) -> Action
 680 -- like strtoken, but drops the last N character(s)
 681 init_strtoken drop f span buf len =
 682   return (L span $! (f $! lexemeToString buf (len-drop)))
 683
 684 begin :: Int -> Action
 685 begin code _span _str _len = do pushLexState code; lexToken
 686
 687 pop :: Action
 688 pop _span _buf _len = do popLexState; lexToken
 689
 690 pop_and :: Action -> Action
 691 pop_and act span buf len = do popLexState; act span buf len
 692
 693 {-# INLINE nextCharIs #-}
 694 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 695
 696 notFollowedBy char _ _ _ (AI _ _ buf)
 697   = nextCharIs buf (/=char)
 698
 699 notFollowedBySymbol _ _ _ (AI _ _ buf)
 700   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 701
 702 -- We must reject doc comments as being ordinary comments everywhere.
 703 -- In some cases the doc comment will be selected as the lexeme due to
 704 -- maximal munch, but not always, because the nested comment rule is
 705 -- valid in all states, but the doc-comment rules are only valid in
 706 -- the non-layout states.
 707 isNormalComment bits _ _ (AI _ _ buf)
 708   | haddockEnabled bits = notFollowedByDocOrPragma
 709   | otherwise           = nextCharIs buf (/='#')
 710   where
 711     notFollowedByDocOrPragma
 712        = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 713
 714 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 715
 716 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 717   = if haddockEnabled bits then False else (p buf)
 718
 719 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 720
 721 ifExtension pred bits _ _ _ = pred bits
 722
 723 multiline_doc_comment :: Action
 724 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 725   where
 726     worker commentAcc input docType oneLine = case alexGetChar input of
 727       Just ('\n', input')
 728         | oneLine -> docCommentEnd input commentAcc docType buf span
 729         | otherwise -> case checkIfCommentLine input' of
 730           Just input -> worker ('\n':commentAcc) input docType False
 731           Nothing -> docCommentEnd input commentAcc docType buf span
 732       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 733       Nothing -> docCommentEnd input commentAcc docType buf span
 734
 735     checkIfCommentLine input = check (dropNonNewlineSpace input)
 736       where
 737         check input = case alexGetChar input of
 738           Just ('-', input) -> case alexGetChar input of
 739             Just ('-', input) -> case alexGetChar input of
 740               Just (c, _) | c /= '-' -> Just input
 741               _ -> Nothing
 742             _ -> Nothing
 743           _ -> Nothing
 744
 745         dropNonNewlineSpace input = case alexGetChar input of
 746           Just (c, input')
 747             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 748             | otherwise -> input
 749           Nothing -> input
 750
 751 {-
 752   nested comments require traversing by hand, they can't be parsed
 753   using regular expressions.
 754 -}
 755 nested_comment :: P (Located Token) -> Action
 756 nested_comment cont span _str _len = do
 757   input <- getInput
 758   go 1 input
 759   where
 760     go 0 input = do setInput input; cont
 761     go n input = case alexGetChar input of
 762       Nothing -> errBrace input span
 763       Just ('-',input) -> case alexGetChar input of
 764         Nothing  -> errBrace input span
 765         Just ('\125',input) -> go (n-1) input
 766         Just (c,_)          -> go n input
 767       Just ('\123',input) -> case alexGetChar input of
 768         Nothing  -> errBrace input span
 769         Just ('-',input) -> go (n+1) input
 770         Just (c,_)       -> go n input
 771       Just (c,input) -> go n input
 772
 773 nested_doc_comment :: Action
 774 nested_doc_comment span buf _len = withLexedDocType (go "")
 775   where
 776     go commentAcc input docType _ = case alexGetChar input of
 777       Nothing -> errBrace input span
 778       Just ('-',input) -> case alexGetChar input of
 779         Nothing -> errBrace input span
 780         Just ('\125',input@(AI end _ buf2)) ->
 781           docCommentEnd input commentAcc docType buf span
 782         Just (c,_) -> go ('-':commentAcc) input docType False
 783       Just ('\123', input) -> case alexGetChar input of
 784         Nothing  -> errBrace input span
 785         Just ('-',input) -> do
 786           setInput input
 787           let cont = do input <- getInput; go commentAcc input docType False
 788           nested_comment cont span buf _len
 789         Just (c,_) -> go ('\123':commentAcc) input docType False
 790       Just (c,input) -> go (c:commentAcc) input docType False
 791
 792 withLexedDocType lexDocComment = do
 793   input@(AI _ _ buf) <- getInput
 794   case prevChar buf ' ' of
 795     '|' -> lexDocComment input ITdocCommentNext False
 796     '^' -> lexDocComment input ITdocCommentPrev False
 797     '$' -> lexDocComment input ITdocCommentNamed False
 798     '*' -> lexDocSection 1 input
 799  where
 800     lexDocSection n input = case alexGetChar input of
 801       Just ('*', input) -> lexDocSection (n+1) input
 802       Just (c, _) -> lexDocComment input (ITdocSection n) True
 803       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 804
 805 -- docCommentEnd
 806 -------------------------------------------------------------------------------
 807 -- This function is quite tricky. We can't just return a new token, we also
 808 -- need to update the state of the parser. Why? Because the token is longer
 809 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 810 -- it writes the wrong token length to the parser state. This function is
 811 -- called afterwards, so it can just update the state.
 812
 813 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 814 -- which is something that the original lexer didn't account for.
 815 -- I have added last_line_len in the parser state which represents the length
 816 -- of the part of the token that is on the last line. It is now used for layout
 817 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 818 -- was before, the full length of the token, and it is now only used for error
 819 -- messages. /Waern
 820
 821 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 822                  SrcSpan -> P (Located Token)
 823 docCommentEnd input commentAcc docType buf span = do
 824   setInput input
 825   let (AI loc last_offs nextBuf) = input
 826       comment = reverse commentAcc
 827       span' = mkSrcSpan (srcSpanStart span) loc
 828       last_len = byteDiff buf nextBuf
 829
 830       last_line_len = if (last_offs - last_len < 0)
 831         then last_offs
 832         else last_len
 833
 834   span `seq` setLastToken span' last_len last_line_len
 835   return (L span' (docType comment))
 836
 837 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 838
 839 open_brace, close_brace :: Action
 840 open_brace span _str _len = do
 841   ctx <- getContext
 842   setContext (NoLayout:ctx)
 843   return (L span ITocurly)
 844 close_brace span _str _len = do
 845   popContext
 846   return (L span ITccurly)
 847
 848 -- We have to be careful not to count M.<varid> as a qualified name
 849 -- when <varid> is a keyword.  We hack around this by catching
 850 -- the offending tokens afterward, and re-lexing in a different state.
 851 check_qvarid span buf len = do
 852   case lookupUFM reservedWordsFM var of
 853         Just (keyword,exts)
 854           | not (isSpecial keyword) ->
 855           if exts == 0
 856              then try_again
 857              else do
 858                 b <- extension (\i -> exts .&. i /= 0)
 859                 if b then try_again
 860                      else return token
 861         _other -> return token
 862   where
 863         (mod,var) = splitQualName buf len
 864         token     = L span (ITqvarid (mod,var))
 865
 866         try_again = do
 867                 (AI _ offs _) <- getInput
 868                 setInput (AI (srcSpanStart span) (offs-len) buf)
 869                 pushLexState bad_qvarid
 870                 lexToken
 871
 872 qvarid buf len = ITqvarid $! splitQualName buf len
 873 qconid buf len = ITqconid $! splitQualName buf len
 874
 875 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 876 -- takes a StringBuffer and a length, and returns the module name
 877 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 878 -- because of hierarchical module names.
 879 splitQualName orig_buf len = split orig_buf orig_buf
 880   where
 881     split buf dot_buf
 882         | orig_buf `byteDiff` buf >= len  = done dot_buf
 883         | c == '.'                        = found_dot buf'
 884         | otherwise                       = split buf' dot_buf
 885       where
 886        (c,buf') = nextChar buf
 887
 888     -- careful, we might get names like M....
 889     -- so, if the character after the dot is not upper-case, this is
 890     -- the end of the qualifier part.
 891     found_dot buf -- buf points after the '.'
 892         | isUpper c    = split buf' buf
 893         | otherwise    = done buf
 894       where
 895        (c,buf') = nextChar buf
 896
 897     done dot_buf =
 898         (lexemeToFastString orig_buf (qual_size - 1),
 899          lexemeToFastString dot_buf (len - qual_size))
 900       where
 901         qual_size = orig_buf `byteDiff` dot_buf
 902
 903 varid span buf len =
 904   case lookupUFM reservedWordsFM fs of
 905         Just (keyword,0)    -> do
 906                 maybe_layout keyword
 907                 return (L span keyword)
 908         Just (keyword,exts) -> do
 909                 b <- extension (\i -> exts .&. i /= 0)
 910                 if b then do maybe_layout keyword
 911                              return (L span keyword)
 912                      else return (L span (ITvarid fs))
 913         _other -> return (L span (ITvarid fs))
 914   where
 915         fs = lexemeToFastString buf len
 916
 917 conid buf len = ITconid fs
 918   where fs = lexemeToFastString buf len
 919
 920 qvarsym buf len = ITqvarsym $! splitQualName buf len
 921 qconsym buf len = ITqconsym $! splitQualName buf len
 922
 923 varsym = sym ITvarsym
 924 consym = sym ITconsym
 925
 926 sym con span buf len =
 927   case lookupUFM reservedSymsFM fs of
 928         Just (keyword,0)    -> return (L span keyword)
 929         Just (keyword,exts) -> do
 930                 b <- extension (\i -> exts .&. i /= 0)
 931                 if b then return (L span keyword)
 932                      else return (L span $! con fs)
 933         _other -> return (L span $! con fs)
 934   where
 935         fs = lexemeToFastString buf len
 936
 937 tok_decimal span buf len
 938   = return (L span (ITinteger  $! parseInteger buf len 10 octDecDigit))
 939
 940 tok_octal span buf len
 941   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 8 octDecDigit))
 942
 943 tok_hexadecimal span buf len
 944   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 16 hexDigit))
 945
 946 prim_decimal span buf len
 947   = return (L span (ITprimint  $! parseInteger buf (len-1) 10 octDecDigit))
 948
 949 prim_octal span buf len
 950   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 8 octDecDigit))
 951
 952 prim_hexadecimal span buf len
 953   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 16 hexDigit))
 954
 955 tok_float        str = ITrational   $! readRational str
 956 prim_float       str = ITprimfloat  $! readRational str
 957 prim_double      str = ITprimdouble $! readRational str
 958
 959 -- -----------------------------------------------------------------------------
 960 -- Layout processing
 961
 962 -- we're at the first token on a line, insert layout tokens if necessary
 963 do_bol :: Action
 964 do_bol span _str _len = do
 965         pos <- getOffside
 966         case pos of
 967             LT -> do
 968                 --trace "layout: inserting '}'" $ do
 969                 popContext
 970                 -- do NOT pop the lex state, we might have a ';' to insert
 971                 return (L span ITvccurly)
 972             EQ -> do
 973                 --trace "layout: inserting ';'" $ do
 974                 popLexState
 975                 return (L span ITsemi)
 976             GT -> do
 977                 popLexState
 978                 lexToken
 979
 980 -- certain keywords put us in the "layout" state, where we might
 981 -- add an opening curly brace.
 982 maybe_layout ITdo       = pushLexState layout_do
 983 maybe_layout ITmdo      = pushLexState layout_do
 984 maybe_layout ITof       = pushLexState layout
 985 maybe_layout ITlet      = pushLexState layout
 986 maybe_layout ITwhere    = pushLexState layout
 987 maybe_layout ITrec      = pushLexState layout
 988 maybe_layout _          = return ()
 989
 990 -- Pushing a new implicit layout context.  If the indentation of the
 991 -- next token is not greater than the previous layout context, then
 992 -- Haskell 98 says that the new layout context should be empty; that is
 993 -- the lexer must generate {}.
 994 --
 995 -- We are slightly more lenient than this: when the new context is started
 996 -- by a 'do', then we allow the new context to be at the same indentation as
 997 -- the previous context.  This is what the 'strict' argument is for.
 998 --
 999 new_layout_context strict span _buf _len = do
1000     popLexState
1001     (AI _ offset _) <- getInput
1002     ctx <- getContext
1003     case ctx of
1004         Layout prev_off : _  |
1005            (strict     && prev_off >= offset  ||
1006             not strict && prev_off > offset) -> do
1007                 -- token is indented to the left of the previous context.
1008                 -- we must generate a {} sequence now.
1009                 pushLexState layout_left
1010                 return (L span ITvocurly)
1011         other -> do
1012                 setContext (Layout offset : ctx)
1013                 return (L span ITvocurly)
1014
1015 do_layout_left span _buf _len = do
1016     popLexState
1017     pushLexState bol  -- we must be at the start of a line
1018     return (L span ITvccurly)
1019
1020 -- -----------------------------------------------------------------------------
1021 -- LINE pragmas
1022
1023 setLine :: Int -> Action
1024 setLine code span buf len = do
1025   let line = parseInteger buf len 10 octDecDigit
1026   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
1027         -- subtract one: the line number refers to the *following* line
1028   popLexState
1029   pushLexState code
1030   lexToken
1031
1032 setFile :: Int -> Action
1033 setFile code span buf len = do
1034   let file = lexemeToFastString (stepOn buf) (len-2)
1035   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1036   popLexState
1037   pushLexState code
1038   lexToken
1039
1040
1041 -- -----------------------------------------------------------------------------
1042 -- Options, includes and language pragmas.
1043
1044 lex_string_prag :: (String -> Token) -> Action
1045 lex_string_prag mkTok span buf len
1046     = do input <- getInput
1047          start <- getSrcLoc
1048          tok <- go [] input
1049          end <- getSrcLoc
1050          return (L (mkSrcSpan start end) tok)
1051     where go acc input
1052               = if isString input "#-}"
1053                    then do setInput input
1054                            return (mkTok (reverse acc))
1055                    else case alexGetChar input of
1056                           Just (c,i) -> go (c:acc) i
1057                           Nothing -> err input
1058           isString i [] = True
1059           isString i (x:xs)
1060               = case alexGetChar i of
1061                   Just (c,i') | c == x    -> isString i' xs
1062                   _other -> False
1063           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1064
1065
1066 -- -----------------------------------------------------------------------------
1067 -- Strings & Chars
1068
1069 -- This stuff is horrible.  I hates it.
1070
1071 lex_string_tok :: Action
1072 lex_string_tok span buf len = do
1073   tok <- lex_string ""
1074   end <- getSrcLoc
1075   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1076
1077 lex_string :: String -> P Token
1078 lex_string s = do
1079   i <- getInput
1080   case alexGetChar' i of
1081     Nothing -> lit_error
1082
1083     Just ('"',i)  -> do
1084         setInput i
1085         glaexts <- extension glaExtsEnabled
1086         if glaexts
1087           then do
1088             i <- getInput
1089             case alexGetChar' i of
1090               Just ('#',i) -> do
1091                    setInput i
1092                    if any (> '\xFF') s
1093                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1094                     else let s' = mkZFastString (reverse s) in
1095                          return (ITprimstring s')
1096                         -- mkZFastString is a hack to avoid encoding the
1097                         -- string in UTF-8.  We just want the exact bytes.
1098               _other ->
1099                 return (ITstring (mkFastString (reverse s)))
1100           else
1101                 return (ITstring (mkFastString (reverse s)))
1102
1103     Just ('\\',i)
1104         | Just ('&',i) <- next -> do
1105                 setInput i; lex_string s
1106         | Just (c,i) <- next, is_space c -> do
1107                 setInput i; lex_stringgap s
1108         where next = alexGetChar' i
1109
1110     Just (c, i) -> do
1111         c' <- lex_char c i
1112         lex_string (c':s)
1113
1114 lex_stringgap s = do
1115   c <- getCharOrFail
1116   case c of
1117     '\\' -> lex_string s
1118     c | is_space c -> lex_stringgap s
1119     _other -> lit_error
1120
1121
1122 lex_char_tok :: Action
1123 -- Here we are basically parsing character literals, such as 'x' or '\n'
1124 -- but, when Template Haskell is on, we additionally spot
1125 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1126 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
1127 -- So we have to do two characters of lookahead: when we see 'x we need to
1128 -- see if there's a trailing quote
1129 lex_char_tok span buf len = do  -- We've seen '
1130    i1 <- getInput       -- Look ahead to first character
1131    let loc = srcSpanStart span
1132    case alexGetChar' i1 of
1133         Nothing -> lit_error
1134
1135         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
1136                   th_exts <- extension thEnabled
1137                   if th_exts then do
1138                         setInput i2
1139                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1140                    else lit_error
1141
1142         Just ('\\', i2@(AI end2 _ _)) -> do     -- We've seen 'backslash
1143                   setInput i2
1144                   lit_ch <- lex_escape
1145                   mc <- getCharOrFail   -- Trailing quote
1146                   if mc == '\'' then finish_char_tok loc lit_ch
1147                                 else do setInput i2; lit_error
1148
1149         Just (c, i2@(AI end2 _ _))
1150                 | not (isAny c) -> lit_error
1151                 | otherwise ->
1152
1153                 -- We've seen 'x, where x is a valid character
1154                 --  (i.e. not newline etc) but not a quote or backslash
1155            case alexGetChar' i2 of      -- Look ahead one more character
1156                 Nothing -> lit_error
1157                 Just ('\'', i3) -> do   -- We've seen 'x'
1158                         setInput i3
1159                         finish_char_tok loc c
1160                 _other -> do            -- We've seen 'x not followed by quote
1161                                         -- If TH is on, just parse the quote only
1162                         th_exts <- extension thEnabled
1163                         let (AI end _ _) = i1
1164                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1165                                    else do setInput i2; lit_error
1166
1167 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1168 finish_char_tok loc ch  -- We've already seen the closing quote
1169                         -- Just need to check for trailing #
1170   = do  glaexts <- extension glaExtsEnabled
1171         i@(AI end _ _) <- getInput
1172         if glaexts then do
1173                 case alexGetChar' i of
1174                         Just ('#',i@(AI end _ _)) -> do
1175                                 setInput i
1176                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1177                         _other ->
1178                                 return (L (mkSrcSpan loc end) (ITchar ch))
1179                 else do
1180                    return (L (mkSrcSpan loc end) (ITchar ch))
1181
1182 lex_char :: Char -> AlexInput -> P Char
1183 lex_char c inp = do
1184   case c of
1185       '\\' -> do setInput inp; lex_escape
1186       c | isAny c -> do setInput inp; return c
1187       _other -> lit_error
1188
1189 isAny c | c > '\xff' = isPrint c
1190         | otherwise  = is_any c
1191
1192 lex_escape :: P Char
1193 lex_escape = do
1194   c <- getCharOrFail
1195   case c of
1196         'a'   -> return '\a'
1197         'b'   -> return '\b'
1198         'f'   -> return '\f'
1199         'n'   -> return '\n'
1200         'r'   -> return '\r'
1201         't'   -> return '\t'
1202         'v'   -> return '\v'
1203         '\\'  -> return '\\'
1204         '"'   -> return '\"'
1205         '\''  -> return '\''
1206         '^'   -> do c <- getCharOrFail
1207                     if c >= '@' && c <= '_'
1208                         then return (chr (ord c - ord '@'))
1209                         else lit_error
1210
1211         'x'   -> readNum is_hexdigit 16 hexDigit
1212         'o'   -> readNum is_octdigit  8 octDecDigit
1213         x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
1214
1215         c1 ->  do
1216            i <- getInput
1217            case alexGetChar' i of
1218             Nothing -> lit_error
1219             Just (c2,i2) ->
1220               case alexGetChar' i2 of
1221                 Nothing -> do setInput i2; lit_error
1222                 Just (c3,i3) ->
1223                    let str = [c1,c2,c3] in
1224                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1225                                      Just rest <- [maybePrefixMatch p str] ] of
1226                           (escape_char,[]):_ -> do
1227                                 setInput i3
1228                                 return escape_char
1229                           (escape_char,_:_):_ -> do
1230                                 setInput i2
1231                                 return escape_char
1232                           [] -> lit_error
1233
1234 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1235 readNum is_digit base conv = do
1236   i <- getInput
1237   c <- getCharOrFail
1238   if is_digit c
1239         then readNum2 is_digit base conv (conv c)
1240         else do setInput i; lit_error
1241
1242 readNum2 is_digit base conv i = do
1243   input <- getInput
1244   read i input
1245   where read i input = do
1246           case alexGetChar' input of
1247             Just (c,input') | is_digit c -> do
1248                 read (i*base + conv c) input'
1249             _other -> do
1250                 if i >= 0 && i <= 0x10FFFF
1251                    then do setInput input; return (chr i)
1252                    else lit_error
1253
1254 silly_escape_chars = [
1255         ("NUL", '\NUL'),
1256         ("SOH", '\SOH'),
1257         ("STX", '\STX'),
1258         ("ETX", '\ETX'),
1259         ("EOT", '\EOT'),
1260         ("ENQ", '\ENQ'),
1261         ("ACK", '\ACK'),
1262         ("BEL", '\BEL'),
1263         ("BS", '\BS'),
1264         ("HT", '\HT'),
1265         ("LF", '\LF'),
1266         ("VT", '\VT'),
1267         ("FF", '\FF'),
1268         ("CR", '\CR'),
1269         ("SO", '\SO'),
1270         ("SI", '\SI'),
1271         ("DLE", '\DLE'),
1272         ("DC1", '\DC1'),
1273         ("DC2", '\DC2'),
1274         ("DC3", '\DC3'),
1275         ("DC4", '\DC4'),
1276         ("NAK", '\NAK'),
1277         ("SYN", '\SYN'),
1278         ("ETB", '\ETB'),
1279         ("CAN", '\CAN'),
1280         ("EM", '\EM'),
1281         ("SUB", '\SUB'),
1282         ("ESC", '\ESC'),
1283         ("FS", '\FS'),
1284         ("GS", '\GS'),
1285         ("RS", '\RS'),
1286         ("US", '\US'),
1287         ("SP", '\SP'),
1288         ("DEL", '\DEL')
1289         ]
1290
1291 -- before calling lit_error, ensure that the current input is pointing to
1292 -- the position of the error in the buffer.  This is so that we can report
1293 -- a correct location to the user, but also so we can detect UTF-8 decoding
1294 -- errors if they occur.
1295 lit_error = lexError "lexical error in string/character literal"
1296
1297 getCharOrFail :: P Char
1298 getCharOrFail =  do
1299   i <- getInput
1300   case alexGetChar' i of
1301         Nothing -> lexError "unexpected end-of-file in string/character literal"
1302         Just (c,i)  -> do setInput i; return c
1303
1304 -- -----------------------------------------------------------------------------
1305 -- Warnings
1306
1307 warn :: DynFlag -> SDoc -> Action
1308 warn option warning span _buf _len = do
1309     addWarning option (mkWarnMsg span alwaysQualify warning)
1310     lexToken
1311
1312 -- -----------------------------------------------------------------------------
1313 -- The Parse Monad
1314
1315 data LayoutContext
1316   = NoLayout
1317   | Layout !Int
1318   deriving Show
1319
1320 data ParseResult a
1321   = POk PState a
1322   | PFailed
1323         SrcSpan         -- The start and end of the text span related to
1324                         -- the error.  Might be used in environments which can
1325                         -- show this span, e.g. by highlighting it.
1326         Message         -- The error message
1327
1328 data PState = PState {
1329         buffer     :: StringBuffer,
1330     dflags     :: DynFlags,
1331     messages   :: Messages,
1332         last_loc   :: SrcSpan,  -- pos of previous token
1333         last_offs  :: !Int,     -- offset of the previous token from the
1334                                 -- beginning of  the current line.
1335                                 -- \t is equal to 8 spaces.
1336         last_len   :: !Int,     -- len of previous token
1337   last_line_len :: !Int,
1338         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1339         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1340         context    :: [LayoutContext],
1341         lex_state  :: [Int]
1342      }
1343         -- last_loc and last_len are used when generating error messages,
1344         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1345         -- current token to happyError, we could at least get rid of last_len.
1346         -- Getting rid of last_loc would require finding another way to
1347         -- implement pushCurrentContext (which is only called from one place).
1348
1349 newtype P a = P { unP :: PState -> ParseResult a }
1350
1351 instance Monad P where
1352   return = returnP
1353   (>>=) = thenP
1354   fail = failP
1355
1356 returnP :: a -> P a
1357 returnP a = P $ \s -> POk s a
1358
1359 thenP :: P a -> (a -> P b) -> P b
1360 (P m) `thenP` k = P $ \ s ->
1361         case m s of
1362                 POk s1 a         -> (unP (k a)) s1
1363                 PFailed span err -> PFailed span err
1364
1365 failP :: String -> P a
1366 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1367
1368 failMsgP :: String -> P a
1369 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1370
1371 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1372 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1373
1374 failSpanMsgP :: SrcSpan -> String -> P a
1375 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1376
1377 extension :: (Int -> Bool) -> P Bool
1378 extension p = P $ \s -> POk s (p $! extsBitmap s)
1379
1380 getExts :: P Int
1381 getExts = P $ \s -> POk s (extsBitmap s)
1382
1383 setSrcLoc :: SrcLoc -> P ()
1384 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1385
1386 getSrcLoc :: P SrcLoc
1387 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1388
1389 setLastToken :: SrcSpan -> Int -> Int -> P ()
1390 setLastToken loc len line_len = P $ \s -> POk s {
1391   last_loc=loc,
1392   last_len=len,
1393   last_line_len=line_len
1394 } ()
1395
1396 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1397
1398 alexInputPrevChar :: AlexInput -> Char
1399 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1400
1401 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1402 alexGetChar (AI loc ofs s)
1403   | atEnd s   = Nothing
1404   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1405                 --trace (show (ord c)) $
1406                 Just (adj_c, (AI loc' ofs' s'))
1407   where (c,s') = nextChar s
1408         loc'   = advanceSrcLoc loc c
1409         ofs'   = advanceOffs c ofs
1410
1411         non_graphic     = '\x0'
1412         upper           = '\x1'
1413         lower           = '\x2'
1414         digit           = '\x3'
1415         symbol          = '\x4'
1416         space           = '\x5'
1417         other_graphic   = '\x6'
1418
1419         adj_c
1420           | c <= '\x06' = non_graphic
1421           | c <= '\xff' = c
1422           -- Alex doesn't handle Unicode, so when Unicode
1423           -- character is encoutered we output these values
1424           -- with the actual character value hidden in the state.
1425           | otherwise =
1426                 case generalCategory c of
1427                   UppercaseLetter       -> upper
1428                   LowercaseLetter       -> lower
1429                   TitlecaseLetter       -> upper
1430                   ModifierLetter        -> other_graphic
1431                   OtherLetter           -> other_graphic
1432                   NonSpacingMark        -> other_graphic
1433                   SpacingCombiningMark  -> other_graphic
1434                   EnclosingMark         -> other_graphic
1435                   DecimalNumber         -> digit
1436                   LetterNumber          -> other_graphic
1437                   OtherNumber           -> other_graphic
1438                   ConnectorPunctuation  -> other_graphic
1439                   DashPunctuation       -> other_graphic
1440                   OpenPunctuation       -> other_graphic
1441                   ClosePunctuation      -> other_graphic
1442                   InitialQuote          -> other_graphic
1443                   FinalQuote            -> other_graphic
1444                   OtherPunctuation      -> other_graphic
1445                   MathSymbol            -> symbol
1446                   CurrencySymbol        -> symbol
1447                   ModifierSymbol        -> symbol
1448                   OtherSymbol           -> symbol
1449                   Space                 -> space
1450                   _other                -> non_graphic
1451
1452 -- This version does not squash unicode characters, it is used when
1453 -- lexing strings.
1454 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1455 alexGetChar' (AI loc ofs s)
1456   | atEnd s   = Nothing
1457   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1458                 --trace (show (ord c)) $
1459                 Just (c, (AI loc' ofs' s'))
1460   where (c,s') = nextChar s
1461         loc'   = advanceSrcLoc loc c
1462         ofs'   = advanceOffs c ofs
1463
1464 advanceOffs :: Char -> Int -> Int
1465 advanceOffs '\n' offs = 0
1466 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1467 advanceOffs _    offs = offs + 1
1468
1469 getInput :: P AlexInput
1470 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1471
1472 setInput :: AlexInput -> P ()
1473 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1474
1475 pushLexState :: Int -> P ()
1476 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1477
1478 popLexState :: P Int
1479 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1480
1481 getLexState :: P Int
1482 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1483
1484 -- for reasons of efficiency, flags indicating language extensions (eg,
1485 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1486 -- integer
1487
1488 glaExtsBit, ffiBit, parrBit :: Int
1489 glaExtsBit = 0
1490 ffiBit     = 1
1491 parrBit    = 2
1492 arrowsBit  = 4
1493 thBit      = 5
1494 ipBit      = 6
1495 tvBit      = 7  -- Scoped type variables enables 'forall' keyword
1496 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1497                 -- (doesn't affect the lexer)
1498 idxTysBit  = 9  -- indexed type families: 'family' keyword and kind sigs
1499 haddockBit = 10 -- Lex and parse Haddock comments
1500
1501 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1502 glaExtsEnabled flags = testBit flags glaExtsBit
1503 ffiEnabled     flags = testBit flags ffiBit
1504 parrEnabled    flags = testBit flags parrBit
1505 arrowsEnabled  flags = testBit flags arrowsBit
1506 thEnabled      flags = testBit flags thBit
1507 ipEnabled      flags = testBit flags ipBit
1508 tvEnabled      flags = testBit flags tvBit
1509 bangPatEnabled flags = testBit flags bangPatBit
1510 idxTysEnabled  flags = testBit flags idxTysBit
1511 haddockEnabled flags = testBit flags haddockBit
1512
1513 -- PState for parsing options pragmas
1514 --
1515 pragState :: StringBuffer -> SrcLoc -> PState
1516 pragState buf loc  =
1517   PState {
1518       buffer          = buf,
1519       messages      = emptyMessages,
1520       -- XXX defaultDynFlags is not right, but we don't have a real
1521       -- dflags handy
1522       dflags        = defaultDynFlags,
1523       last_loc      = mkSrcSpan loc loc,
1524       last_offs     = 0,
1525       last_len      = 0,
1526       last_line_len = 0,
1527       loc           = loc,
1528       extsBitmap    = 0,
1529       context       = [],
1530       lex_state     = [bol, option_prags, 0]
1531     }
1532
1533
1534 -- create a parse state
1535 --
1536 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1537 mkPState buf loc flags  =
1538   PState {
1539       buffer          = buf,
1540       dflags        = flags,
1541       messages      = emptyMessages,
1542       last_loc      = mkSrcSpan loc loc,
1543       last_offs     = 0,
1544       last_len      = 0,
1545       last_line_len = 0,
1546       loc           = loc,
1547       extsBitmap    = fromIntegral bitmap,
1548       context       = [],
1549       lex_state     = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1550         -- we begin in the layout state if toplev_layout is set
1551     }
1552     where
1553       bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
1554                .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
1555                .|. parrBit    `setBitIf` dopt Opt_PArr        flags
1556                .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
1557                .|. thBit      `setBitIf` dopt Opt_TH          flags
1558                .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
1559                .|. tvBit      `setBitIf` dopt Opt_ScopedTypeVariables flags
1560                .|. bangPatBit `setBitIf` dopt Opt_BangPatterns flags
1561                .|. idxTysBit  `setBitIf` dopt Opt_IndexedTypes flags
1562                .|. haddockBit `setBitIf` dopt Opt_Haddock     flags
1563       --
1564       setBitIf :: Int -> Bool -> Int
1565       b `setBitIf` cond | cond      = bit b
1566                         | otherwise = 0
1567
1568 addWarning :: DynFlag -> WarnMsg -> P ()
1569 addWarning option w
1570  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1571        let ws' = if dopt option d then ws `snocBag` w else ws
1572        in POk s{messages=(ws', es)} ()
1573
1574 getMessages :: PState -> Messages
1575 getMessages PState{messages=ms} = ms
1576
1577 getContext :: P [LayoutContext]
1578 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1579
1580 setContext :: [LayoutContext] -> P ()
1581 setContext ctx = P $ \s -> POk s{context=ctx} ()
1582
1583 popContext :: P ()
1584 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1585                            loc = loc, last_len = len, last_loc = last_loc }) ->
1586   case ctx of
1587         (_:tl) -> POk s{ context = tl } ()
1588         []     -> PFailed last_loc (srcParseErr buf len)
1589
1590 -- Push a new layout context at the indentation of the last token read.
1591 -- This is only used at the outer level of a module when the 'module'
1592 -- keyword is missing.
1593 pushCurrentContext :: P ()
1594 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1595     POk s{context = Layout (offs-len) : ctx} ()
1596 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1597
1598 getOffside :: P Ordering
1599 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1600                 let ord = case stk of
1601                         (Layout n:_) -> compare offs n
1602                         _            -> GT
1603                 in POk s ord
1604
1605 -- ---------------------------------------------------------------------------
1606 -- Construct a parse error
1607
1608 srcParseErr
1609   :: StringBuffer       -- current buffer (placed just after the last token)
1610   -> Int                -- length of the previous token
1611   -> Message
1612 srcParseErr buf len
1613   = hcat [ if null token
1614              then ptext SLIT("parse error (possibly incorrect indentation)")
1615              else hcat [ptext SLIT("parse error on input "),
1616                         char '`', text token, char '\'']
1617     ]
1618   where token = lexemeToString (offsetBytes (-len) buf) len
1619
1620 -- Report a parse failure, giving the span of the previous token as
1621 -- the location of the error.  This is the entry point for errors
1622 -- detected during parsing.
1623 srcParseFail :: P a
1624 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1625                             last_loc = last_loc } ->
1626     PFailed last_loc (srcParseErr buf len)
1627
1628 -- A lexical error is reported at a particular position in the source file,
1629 -- not over a token range.
1630 lexError :: String -> P a
1631 lexError str = do
1632   loc <- getSrcLoc
1633   i@(AI end _ buf) <- getInput
1634   reportLexError loc end buf str
1635
1636 -- -----------------------------------------------------------------------------
1637 -- This is the top-level function: called from the parser each time a
1638 -- new token is to be read from the input.
1639
1640 lexer :: (Located Token -> P a) -> P a
1641 lexer cont = do
1642   tok@(L span tok__) <- lexToken
1643 --  trace ("token: " ++ show tok__) $ do
1644   cont tok
1645
1646 lexToken :: P (Located Token)
1647 lexToken = do
1648   inp@(AI loc1 _ buf) <- getInput
1649   sc <- getLexState
1650   exts <- getExts
1651   case alexScanUser exts inp sc of
1652     AlexEOF -> do let span = mkSrcSpan loc1 loc1
1653                   setLastToken span 0 0
1654                   return (L span ITeof)
1655     AlexError (AI loc2 _ buf) -> do
1656         reportLexError loc1 loc2 buf "lexical error"
1657     AlexSkip inp2 _ -> do
1658         setInput inp2
1659         lexToken
1660     AlexToken inp2@(AI end _ buf2) len t -> do
1661     setInput inp2
1662     let span = mkSrcSpan loc1 end
1663     let bytes = byteDiff buf buf2
1664     span `seq` setLastToken span bytes bytes
1665     t span buf bytes
1666
1667 reportLexError loc1 loc2 buf str
1668   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1669   | otherwise =
1670   let
1671         c = fst (nextChar buf)
1672   in
1673   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1674     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1675     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1676 }