compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), lexer, pragState, mkPState, PState(..),
  26    P(..), ParseResult(..), getSrcLoc,
  27    failLocMsgP, failSpanMsgP, srcParseFail,
  28    getMessages,
  29    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  30    getLexState, popLexState, pushLexState,
  31    extension, glaExtsEnabled, bangPatEnabled
  32   ) where
  33
  34 #include "HsVersions.h"
  35
  36 import Bag
  37 import ErrUtils
  38 import Outputable
  39 import StringBuffer
  40 import FastString
  41 import FastTypes
  42 import SrcLoc
  43 import UniqFM
  44 import DynFlags
  45 import Ctype
  46 import Util             ( maybePrefixMatch, readRational )
  47
  48 import Control.Monad
  49 import Data.Bits
  50 import Data.Char        ( chr, isSpace )
  51 import Data.Ratio
  52 import Debug.Trace
  53
  54 #if __GLASGOW_HASKELL__ >= 605
  55 import Data.Char        ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  56 #else
  57 import Compat.Unicode   ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  58 #endif
  59 }
  60
  61 $unispace    = \x05
  62 $whitechar   = [\ \n\r\f\v\xa0 $unispace]
  63 $white_no_nl = $whitechar # \n
  64 $tab         = \t
  65
  66 $ascdigit  = 0-9
  67 $unidigit  = \x03
  68 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  69 $digit     = [$ascdigit $unidigit]
  70
  71 $special   = [\(\)\,\;\[\]\`\{\}]
  72 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7]
  73 $unisymbol = \x04
  74 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  75
  76 $unilarge  = \x01
  77 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  78 $large     = [$asclarge $unilarge]
  79
  80 $unismall  = \x02
  81 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  82 $small     = [$ascsmall $unismall \_]
  83
  84 $unigraphic = \x06
  85 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  86
  87 $octit     = 0-7
  88 $hexit     = [$decdigit A-F a-f]
  89 $symchar   = [$symbol \:]
  90 $nl        = [\n\r]
  91 $idchar    = [$small $large $digit \']
  92
  93 $docsym    = [\| \^ \* \$]
  94
  95 @varid     = $small $idchar*
  96 @conid     = $large $idchar*
  97
  98 @varsym    = $symbol $symchar*
  99 @consym    = \: $symchar*
 100
 101 @decimal     = $decdigit+
 102 @octal       = $octit+
 103 @hexadecimal = $hexit+
 104 @exponent    = [eE] [\-\+]? @decimal
 105
 106 -- we support the hierarchical module name extension:
 107 @qual = (@conid \.)+
 108
 109 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 110
 111 haskell :-
 112
 113 -- everywhere: skip whitespace and comments
 114 $white_no_nl+                           ;
 115 $tab+         { warn Opt_WarnTabs (text "Tab character") }
 116
 117 -- Everywhere: deal with nested comments.  We explicitly rule out
 118 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 119 -- (this can happen even though pragmas will normally take precedence due to
 120 -- longest-match, because pragmas aren't valid in every state, but comments
 121 -- are). We also rule out nested Haddock comments, if the -haddock flag is
 122 -- set.
 123
 124 "{-" / { isNormalComment } { nested_comment lexToken }
 125
 126 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 127 -- more dashes followed by a symbol should be parsed as a varsym, so we
 128 -- have to exclude those.
 129
 130 -- Since Haddock comments aren't valid in every state, we need to rule them
 131 -- out here.
 132
 133 -- The following two rules match comments that begin with two dashes, but
 134 -- continue with a different character. The rules test that this character
 135 -- is not a symbol (in which case we'd have a varsym), and that it's not a
 136 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
 137 -- have a Haddock comment). The rules then munch the rest of the line.
 138
 139 "-- " ~$docsym .* ;
 140 "--" [^$symbol : \ ] .* ;
 141
 142 -- Next, match Haddock comments if no -haddock flag
 143
 144 "-- " $docsym .* / { ifExtension (not . haddockEnabled) } ;
 145
 146 -- Now, when we've matched comments that begin with 2 dashes and continue
 147 -- with a different character, we need to match comments that begin with three
 148 -- or more dashes (which clearly can't be Haddock comments). We only need to
 149 -- make sure that the first non-dash character isn't a symbol, and munch the
 150 -- rest of the line.
 151
 152 "---"\-* [^$symbol :] .* ;
 153
 154 -- Since the previous rules all match dashes followed by at least one
 155 -- character, we also need to match a whole line filled with just dashes.
 156
 157 "--"\-* / { atEOL } ;
 158
 159 -- We need this rule since none of the other single line comment rules
 160 -- actually match this case.
 161
 162 "-- " / { atEOL } ;
 163
 164 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 165 -- blank lines) until we find a non-whitespace character, then do layout
 166 -- processing.
 167 --
 168 -- One slight wibble here: what if the line begins with {-#? In
 169 -- theory, we have to lex the pragma to see if it's one we recognise,
 170 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 171 -- as a nested comment.  We don't bother with this: if the line begins
 172 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 173 <bol> {
 174   \n                                    ;
 175   ^\# (line)?                           { begin line_prag1 }
 176   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 177   ^\# \! .* \n                          ; -- #!, for scripts
 178   ()                                    { do_bol }
 179 }
 180
 181 -- after a layout keyword (let, where, do, of), we begin a new layout
 182 -- context if the curly brace is missing.
 183 -- Careful! This stuff is quite delicate.
 184 <layout, layout_do> {
 185   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 186         -- we might encounter {-# here, but {- has been handled already
 187   \n                                    ;
 188   ^\# (line)?                           { begin line_prag1 }
 189 }
 190
 191 -- do is treated in a subtly different way, see new_layout_context
 192 <layout>    ()                          { new_layout_context True }
 193 <layout_do> ()                          { new_layout_context False }
 194
 195 -- after a new layout context which was found to be to the left of the
 196 -- previous context, we have generated a '{' token, and we now need to
 197 -- generate a matching '}' token.
 198 <layout_left>  ()                       { do_layout_left }
 199
 200 <0,option_prags,glaexts> \n                             { begin bol }
 201
 202 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 203
 204 -- single-line line pragmas, of the form
 205 --    # <line> "<file>" <extra-stuff> \n
 206 <line_prag1> $decdigit+                 { setLine line_prag1a }
 207 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 208 <line_prag1b> .*                        { pop }
 209
 210 -- Haskell-style line pragmas, of the form
 211 --    {-# LINE <line> "<file>" #-}
 212 <line_prag2> $decdigit+                 { setLine line_prag2a }
 213 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 214 <line_prag2b> "#-}"|"-}"                { pop }
 215    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 216    -- with older versions of GHC which generated these.
 217
 218 -- We only want RULES pragmas to be picked up when -fglasgow-exts
 219 -- is on, because the contents of the pragma is always written using
 220 -- glasgow-exts syntax (using forall etc.), so if glasgow exts are not
 221 -- enabled, we're sure to get a parse error.
 222 -- (ToDo: we should really emit a warning when ignoring pragmas)
 223 <glaexts>
 224   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 225
 226 <0,option_prags,glaexts> {
 227   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 228   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 229                                         { token (ITinline_prag False) }
 230   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 231                                         { token ITspec_prag }
 232   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 233         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 234   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 235         $whitechar* (NO(T?)INLINE|no(t?)inline)
 236                                         { token (ITspec_inline_prag False) }
 237   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 238   "{-#" $whitechar* (DEPRECATED|deprecated)
 239                                         { token ITdeprecated_prag }
 240   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 241   "{-#" $whitechar* (GENERATED|generated)
 242                                         { token ITgenerated_prag }
 243   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 244   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 245
 246   "{-#" $whitechar* (DOCOPTIONS|docoptions)
 247   / { ifExtension haddockEnabled }     { lex_string_prag ITdocOptions }
 248
 249  "{-#"                                 { nested_comment lexToken }
 250
 251   -- ToDo: should only be valid inside a pragma:
 252   "#-}"                                 { token ITclose_prag}
 253 }
 254
 255 <option_prags> {
 256   "{-#" $whitechar* (OPTIONS|options)   { lex_string_prag IToptions_prag }
 257   "{-#" $whitechar* (OPTIONS_GHC|options_ghc)
 258                                         { lex_string_prag IToptions_prag }
 259   "{-#" $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
 260   "{-#" $whitechar* (INCLUDE|include)   { lex_string_prag ITinclude_prag }
 261 }
 262
 263 <0,option_prags,glaexts> {
 264         -- This is to catch things like {-# OPTIONS OPTIONS_HUGS ...
 265   "{-#" $whitechar* $idchar+            { nested_comment lexToken }
 266 }
 267
 268 -- '0' state: ordinary lexemes
 269 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 270
 271 -- Haddock comments
 272
 273 <0,glaexts> {
 274   "-- " $docsym    / { ifExtension haddockEnabled } { multiline_doc_comment }
 275   "{-" \ ? $docsym / { ifExtension haddockEnabled } { nested_doc_comment }
 276 }
 277
 278 -- "special" symbols
 279
 280 <0,glaexts> {
 281   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 282   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 283 }
 284
 285 <0,glaexts> {
 286   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 287   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 288   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 289   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 290   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 291   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 292   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 293   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 294 }
 295
 296 <0,glaexts> {
 297   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 298                                         { special IToparenbar }
 299   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 300 }
 301
 302 <0,glaexts> {
 303   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 304 }
 305
 306 <glaexts> {
 307   "(#" / { notFollowedBySymbol }        { token IToubxparen }
 308   "#)"                                  { token ITcubxparen }
 309   "{|"                                  { token ITocurlybar }
 310   "|}"                                  { token ITccurlybar }
 311 }
 312
 313 <0,option_prags,glaexts> {
 314   \(                                    { special IToparen }
 315   \)                                    { special ITcparen }
 316   \[                                    { special ITobrack }
 317   \]                                    { special ITcbrack }
 318   \,                                    { special ITcomma }
 319   \;                                    { special ITsemi }
 320   \`                                    { special ITbackquote }
 321
 322   \{                                    { open_brace }
 323   \}                                    { close_brace }
 324 }
 325
 326 <0,option_prags,glaexts> {
 327   @qual @varid                  { check_qvarid }
 328   @qual @conid                  { idtoken qconid }
 329   @varid                        { varid }
 330   @conid                        { idtoken conid }
 331 }
 332
 333 -- after an illegal qvarid, such as 'M.let',
 334 -- we back up and try again in the bad_qvarid state:
 335 <bad_qvarid> {
 336   @conid                        { pop_and (idtoken conid) }
 337   @qual @conid                  { pop_and (idtoken qconid) }
 338 }
 339
 340 <glaexts> {
 341   @qual @varid "#"+             { idtoken qvarid }
 342   @qual @conid "#"+             { idtoken qconid }
 343   @varid "#"+                   { varid }
 344   @conid "#"+                   { idtoken conid }
 345 }
 346
 347 -- ToDo: M.(,,,)
 348
 349 <0,glaexts> {
 350   @qual @varsym                 { idtoken qvarsym }
 351   @qual @consym                 { idtoken qconsym }
 352   @varsym                       { varsym }
 353   @consym                       { consym }
 354 }
 355
 356 <0,glaexts> {
 357   @decimal                      { tok_decimal }
 358   0[oO] @octal                  { tok_octal }
 359   0[xX] @hexadecimal            { tok_hexadecimal }
 360 }
 361
 362 <glaexts> {
 363   @decimal \#                   { prim_decimal }
 364   0[oO] @octal \#               { prim_octal }
 365   0[xX] @hexadecimal \#         { prim_hexadecimal }
 366 }
 367
 368 <0,glaexts> @floating_point             { strtoken tok_float }
 369 <glaexts>   @floating_point \#          { init_strtoken 1 prim_float }
 370 <glaexts>   @floating_point \# \#       { init_strtoken 2 prim_double }
 371
 372 -- Strings and chars are lexed by hand-written code.  The reason is
 373 -- that even if we recognise the string or char here in the regex
 374 -- lexer, we would still have to parse the string afterward in order
 375 -- to convert it to a String.
 376 <0,glaexts> {
 377   \'                            { lex_char_tok }
 378   \"                            { lex_string_tok }
 379 }
 380
 381 {
 382 -- -----------------------------------------------------------------------------
 383 -- The token type
 384
 385 data Token
 386   = ITas                        -- Haskell keywords
 387   | ITcase
 388   | ITclass
 389   | ITdata
 390   | ITdefault
 391   | ITderiving
 392   | ITderive
 393   | ITdo
 394   | ITelse
 395   | IThiding
 396   | ITif
 397   | ITimport
 398   | ITin
 399   | ITinfix
 400   | ITinfixl
 401   | ITinfixr
 402   | ITinstance
 403   | ITlet
 404   | ITmodule
 405   | ITnewtype
 406   | ITof
 407   | ITqualified
 408   | ITthen
 409   | ITtype
 410   | ITwhere
 411   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 412
 413   | ITforall                    -- GHC extension keywords
 414   | ITforeign
 415   | ITexport
 416   | ITlabel
 417   | ITdynamic
 418   | ITsafe
 419   | ITthreadsafe
 420   | ITunsafe
 421   | ITstdcallconv
 422   | ITccallconv
 423   | ITdotnet
 424   | ITmdo
 425   | ITfamily
 426
 427         -- Pragmas
 428   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 429   | ITspec_prag                 -- SPECIALISE
 430   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 431   | ITsource_prag
 432   | ITrules_prag
 433   | ITdeprecated_prag
 434   | ITline_prag
 435   | ITscc_prag
 436   | ITgenerated_prag
 437   | ITcore_prag                 -- hdaume: core annotations
 438   | ITunpack_prag
 439   | ITclose_prag
 440   | IToptions_prag String
 441   | ITinclude_prag String
 442   | ITlanguage_prag
 443
 444   | ITdotdot                    -- reserved symbols
 445   | ITcolon
 446   | ITdcolon
 447   | ITequal
 448   | ITlam
 449   | ITvbar
 450   | ITlarrow
 451   | ITrarrow
 452   | ITat
 453   | ITtilde
 454   | ITdarrow
 455   | ITminus
 456   | ITbang
 457   | ITstar
 458   | ITdot
 459
 460   | ITbiglam                    -- GHC-extension symbols
 461
 462   | ITocurly                    -- special symbols
 463   | ITccurly
 464   | ITocurlybar                 -- {|, for type applications
 465   | ITccurlybar                 -- |}, for type applications
 466   | ITvocurly
 467   | ITvccurly
 468   | ITobrack
 469   | ITopabrack                  -- [:, for parallel arrays with -fparr
 470   | ITcpabrack                  -- :], for parallel arrays with -fparr
 471   | ITcbrack
 472   | IToparen
 473   | ITcparen
 474   | IToubxparen
 475   | ITcubxparen
 476   | ITsemi
 477   | ITcomma
 478   | ITunderscore
 479   | ITbackquote
 480
 481   | ITvarid   FastString        -- identifiers
 482   | ITconid   FastString
 483   | ITvarsym  FastString
 484   | ITconsym  FastString
 485   | ITqvarid  (FastString,FastString)
 486   | ITqconid  (FastString,FastString)
 487   | ITqvarsym (FastString,FastString)
 488   | ITqconsym (FastString,FastString)
 489
 490   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 491
 492   | ITpragma StringBuffer
 493
 494   | ITchar       Char
 495   | ITstring     FastString
 496   | ITinteger    Integer
 497   | ITrational   Rational
 498
 499   | ITprimchar   Char
 500   | ITprimstring FastString
 501   | ITprimint    Integer
 502   | ITprimfloat  Rational
 503   | ITprimdouble Rational
 504
 505   -- MetaHaskell extension tokens
 506   | ITopenExpQuote              --  [| or [e|
 507   | ITopenPatQuote              --  [p|
 508   | ITopenDecQuote              --  [d|
 509   | ITopenTypQuote              --  [t|
 510   | ITcloseQuote                --  |]
 511   | ITidEscape   FastString     --  $x
 512   | ITparenEscape               --  $(
 513   | ITvarQuote                  --  '
 514   | ITtyQuote                   --  ''
 515
 516   -- Arrow notation extension
 517   | ITproc
 518   | ITrec
 519   | IToparenbar                 --  (|
 520   | ITcparenbar                 --  |)
 521   | ITlarrowtail                --  -<
 522   | ITrarrowtail                --  >-
 523   | ITLarrowtail                --  -<<
 524   | ITRarrowtail                --  >>-
 525
 526   | ITunknown String            -- Used when the lexer can't make sense of it
 527   | ITeof                       -- end of file token
 528
 529   -- Documentation annotations
 530   | ITdocCommentNext  String     -- something beginning '-- |'
 531   | ITdocCommentPrev  String     -- something beginning '-- ^'
 532   | ITdocCommentNamed String     -- something beginning '-- $'
 533   | ITdocSection      Int String -- a section heading
 534   | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
 535
 536 #ifdef DEBUG
 537   deriving Show -- debugging
 538 #endif
 539
 540 isSpecial :: Token -> Bool
 541 -- If we see M.x, where x is a keyword, but
 542 -- is special, we treat is as just plain M.x,
 543 -- not as a keyword.
 544 isSpecial ITas          = True
 545 isSpecial IThiding      = True
 546 isSpecial ITderive      = True
 547 isSpecial ITqualified   = True
 548 isSpecial ITforall      = True
 549 isSpecial ITexport      = True
 550 isSpecial ITlabel       = True
 551 isSpecial ITdynamic     = True
 552 isSpecial ITsafe        = True
 553 isSpecial ITthreadsafe  = True
 554 isSpecial ITunsafe      = True
 555 isSpecial ITccallconv   = True
 556 isSpecial ITstdcallconv = True
 557 isSpecial ITmdo         = True
 558 isSpecial ITfamily      = True
 559 isSpecial _             = False
 560
 561 -- the bitmap provided as the third component indicates whether the
 562 -- corresponding extension keyword is valid under the extension options
 563 -- provided to the compiler; if the extension corresponding to *any* of the
 564 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 565 -- facilitates using a keyword in two different extensions that can be
 566 -- activated independently)
 567 --
 568 reservedWordsFM = listToUFM $
 569         map (\(x, y, z) -> (mkFastString x, (y, z)))
 570        [( "_",          ITunderscore,   0 ),
 571         ( "as",         ITas,           0 ),
 572         ( "case",       ITcase,         0 ),
 573         ( "class",      ITclass,        0 ),
 574         ( "data",       ITdata,         0 ),
 575         ( "default",    ITdefault,      0 ),
 576         ( "deriving",   ITderiving,     0 ),
 577         ( "derive",     ITderive,       0 ),
 578         ( "do",         ITdo,           0 ),
 579         ( "else",       ITelse,         0 ),
 580         ( "hiding",     IThiding,       0 ),
 581         ( "if",         ITif,           0 ),
 582         ( "import",     ITimport,       0 ),
 583         ( "in",         ITin,           0 ),
 584         ( "infix",      ITinfix,        0 ),
 585         ( "infixl",     ITinfixl,       0 ),
 586         ( "infixr",     ITinfixr,       0 ),
 587         ( "instance",   ITinstance,     0 ),
 588         ( "let",        ITlet,          0 ),
 589         ( "module",     ITmodule,       0 ),
 590         ( "newtype",    ITnewtype,      0 ),
 591         ( "of",         ITof,           0 ),
 592         ( "qualified",  ITqualified,    0 ),
 593         ( "then",       ITthen,         0 ),
 594         ( "type",       ITtype,         0 ),
 595         ( "where",      ITwhere,        0 ),
 596         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 597
 598         ( "forall",     ITforall,        bit tvBit),
 599         ( "mdo",        ITmdo,           bit glaExtsBit),
 600         ( "family",     ITfamily,        bit idxTysBit),
 601
 602         ( "foreign",    ITforeign,       bit ffiBit),
 603         ( "export",     ITexport,        bit ffiBit),
 604         ( "label",      ITlabel,         bit ffiBit),
 605         ( "dynamic",    ITdynamic,       bit ffiBit),
 606         ( "safe",       ITsafe,          bit ffiBit),
 607         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 608         ( "unsafe",     ITunsafe,        bit ffiBit),
 609         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 610         ( "ccall",      ITccallconv,     bit ffiBit),
 611         ( "dotnet",     ITdotnet,        bit ffiBit),
 612
 613         ( "rec",        ITrec,           bit arrowsBit),
 614         ( "proc",       ITproc,          bit arrowsBit)
 615      ]
 616
 617 reservedSymsFM = listToUFM $
 618         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 619       [ ("..",  ITdotdot,       0)
 620        ,(":",   ITcolon,        0)      -- (:) is a reserved op,
 621                                                 -- meaning only list cons
 622        ,("::",  ITdcolon,       0)
 623        ,("=",   ITequal,        0)
 624        ,("\\",  ITlam,          0)
 625        ,("|",   ITvbar,         0)
 626        ,("<-",  ITlarrow,       0)
 627        ,("->",  ITrarrow,       0)
 628        ,("@",   ITat,           0)
 629        ,("~",   ITtilde,        0)
 630        ,("=>",  ITdarrow,       0)
 631        ,("-",   ITminus,        0)
 632        ,("!",   ITbang,         0)
 633
 634        ,("*",   ITstar,         bit glaExtsBit .|.
 635                                 bit idxTysBit)      -- For data T (a::*) = MkT
 636        ,(".",   ITdot,          bit tvBit)          -- For 'forall a . t'
 637
 638        ,("-<",  ITlarrowtail,   bit arrowsBit)
 639        ,(">-",  ITrarrowtail,   bit arrowsBit)
 640        ,("-<<", ITLarrowtail,   bit arrowsBit)
 641        ,(">>-", ITRarrowtail,   bit arrowsBit)
 642
 643 #if __GLASGOW_HASKELL__ >= 605
 644        ,("∷",   ITdcolon,       bit glaExtsBit)
 645        ,("⇒",   ITdarrow,     bit glaExtsBit)
 646        ,("∀", ITforall,       bit glaExtsBit)
 647        ,("→",   ITrarrow,     bit glaExtsBit)
 648        ,("←",   ITlarrow,     bit glaExtsBit)
 649        ,("⋯",         ITdotdot,       bit glaExtsBit)
 650         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 651         -- form part of a large operator.  This would let us have a better
 652         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 653 #endif
 654        ]
 655
 656 -- -----------------------------------------------------------------------------
 657 -- Lexer actions
 658
 659 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 660
 661 special :: Token -> Action
 662 special tok span _buf len = return (L span tok)
 663
 664 token, layout_token :: Token -> Action
 665 token t span buf len = return (L span t)
 666 layout_token t span buf len = pushLexState layout >> return (L span t)
 667
 668 idtoken :: (StringBuffer -> Int -> Token) -> Action
 669 idtoken f span buf len = return (L span $! (f buf len))
 670
 671 skip_one_varid :: (FastString -> Token) -> Action
 672 skip_one_varid f span buf len
 673   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 674
 675 strtoken :: (String -> Token) -> Action
 676 strtoken f span buf len =
 677   return (L span $! (f $! lexemeToString buf len))
 678
 679 init_strtoken :: Int -> (String -> Token) -> Action
 680 -- like strtoken, but drops the last N character(s)
 681 init_strtoken drop f span buf len =
 682   return (L span $! (f $! lexemeToString buf (len-drop)))
 683
 684 begin :: Int -> Action
 685 begin code _span _str _len = do pushLexState code; lexToken
 686
 687 pop :: Action
 688 pop _span _buf _len = do popLexState; lexToken
 689
 690 pop_and :: Action -> Action
 691 pop_and act span buf len = do popLexState; act span buf len
 692
 693 {-# INLINE nextCharIs #-}
 694 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
 695
 696 notFollowedBy char _ _ _ (AI _ _ buf)
 697   = nextCharIs buf (/=char)
 698
 699 notFollowedBySymbol _ _ _ (AI _ _ buf)
 700   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 701
 702 isNormalComment bits _ _ (AI _ _ buf)
 703   = nextCharIs buf (/='#')
 704
 705 haddockDisabledAnd p bits _ _ (AI _ _ buf)
 706   = if haddockEnabled bits then False else (p buf)
 707
 708 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 709
 710 ifExtension pred bits _ _ _ = pred bits
 711
 712 multiline_doc_comment :: Action
 713 multiline_doc_comment span buf _len = withLexedDocType (worker "")
 714   where
 715     worker commentAcc input docType oneLine = case alexGetChar input of
 716       Just ('\n', input')
 717         | oneLine -> docCommentEnd input commentAcc docType buf span
 718         | otherwise -> case checkIfCommentLine input' of
 719           Just input -> worker ('\n':commentAcc) input docType False
 720           Nothing -> docCommentEnd input commentAcc docType buf span
 721       Just (c, input) -> worker (c:commentAcc) input docType oneLine
 722       Nothing -> docCommentEnd input commentAcc docType buf span
 723
 724     checkIfCommentLine input = check (dropNonNewlineSpace input)
 725       where
 726         check input = case alexGetChar input of
 727           Just ('-', input) -> case alexGetChar input of
 728             Just ('-', input) -> case alexGetChar input of
 729               Just (c, _) | c /= '-' -> Just input
 730               _ -> Nothing
 731             _ -> Nothing
 732           _ -> Nothing
 733
 734         dropNonNewlineSpace input = case alexGetChar input of
 735           Just (c, input')
 736             | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
 737             | otherwise -> input
 738           Nothing -> input
 739
 740 {-
 741   nested comments require traversing by hand, they can't be parsed
 742   using regular expressions.
 743 -}
 744 nested_comment :: P (Located Token) -> Action
 745 nested_comment cont span _str _len = do
 746   input <- getInput
 747   go 1 input
 748   where
 749     go 0 input = do setInput input; cont
 750     go n input = case alexGetChar input of
 751       Nothing -> errBrace input span
 752       Just ('-',input) -> case alexGetChar input of
 753         Nothing  -> errBrace input span
 754         Just ('\125',input) -> go (n-1) input
 755         Just (c,_)          -> go n input
 756       Just ('\123',input) -> case alexGetChar input of
 757         Nothing  -> errBrace input span
 758         Just ('-',input) -> go (n+1) input
 759         Just (c,_)       -> go n input
 760       Just (c,input) -> go n input
 761
 762 nested_doc_comment :: Action
 763 nested_doc_comment span buf _len = withLexedDocType (go "")
 764   where
 765     go commentAcc input docType _ = case alexGetChar input of
 766       Nothing -> errBrace input span
 767       Just ('-',input) -> case alexGetChar input of
 768         Nothing -> errBrace input span
 769         Just ('\125',input@(AI end _ buf2)) ->
 770           docCommentEnd input commentAcc docType buf span
 771         Just (c,_) -> go ('-':commentAcc) input docType False
 772       Just ('\123', input) -> case alexGetChar input of
 773         Nothing  -> errBrace input span
 774         Just ('-',input) -> do
 775           setInput input
 776           let cont = do input <- getInput; go commentAcc input docType False
 777           nested_comment cont span buf _len
 778         Just (c,_) -> go ('\123':commentAcc) input docType False
 779       Just (c,input) -> go (c:commentAcc) input docType False
 780
 781 withLexedDocType lexDocComment = do
 782   input@(AI _ _ buf) <- getInput
 783   case prevChar buf ' ' of
 784     '|' -> lexDocComment input ITdocCommentNext False
 785     '^' -> lexDocComment input ITdocCommentPrev False
 786     '$' -> lexDocComment input ITdocCommentNamed False
 787     '*' -> lexDocSection 1 input
 788  where
 789     lexDocSection n input = case alexGetChar input of
 790       Just ('*', input) -> lexDocSection (n+1) input
 791       Just (c, _) -> lexDocComment input (ITdocSection n) True
 792       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 793
 794 -- docCommentEnd
 795 -------------------------------------------------------------------------------
 796 -- This function is quite tricky. We can't just return a new token, we also
 797 -- need to update the state of the parser. Why? Because the token is longer
 798 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
 799 -- it writes the wrong token length to the parser state. This function is
 800 -- called afterwards, so it can just update the state.
 801
 802 -- This is complicated by the fact that Haddock tokens can span multiple lines,
 803 -- which is something that the original lexer didn't account for.
 804 -- I have added last_line_len in the parser state which represents the length
 805 -- of the part of the token that is on the last line. It is now used for layout
 806 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
 807 -- was before, the full length of the token, and it is now only used for error
 808 -- messages. /Waern
 809
 810 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
 811                  SrcSpan -> P (Located Token)
 812 docCommentEnd input commentAcc docType buf span = do
 813   setInput input
 814   let (AI loc last_offs nextBuf) = input
 815       comment = reverse commentAcc
 816       span' = mkSrcSpan (srcSpanStart span) loc
 817       last_len = byteDiff buf nextBuf
 818
 819       last_line_len = if (last_offs - last_len < 0)
 820         then last_offs
 821         else last_len
 822
 823   span `seq` setLastToken span' last_len last_line_len
 824   return (L span' (docType comment))
 825
 826 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 827
 828 open_brace, close_brace :: Action
 829 open_brace span _str _len = do
 830   ctx <- getContext
 831   setContext (NoLayout:ctx)
 832   return (L span ITocurly)
 833 close_brace span _str _len = do
 834   popContext
 835   return (L span ITccurly)
 836
 837 -- We have to be careful not to count M.<varid> as a qualified name
 838 -- when <varid> is a keyword.  We hack around this by catching
 839 -- the offending tokens afterward, and re-lexing in a different state.
 840 check_qvarid span buf len = do
 841   case lookupUFM reservedWordsFM var of
 842         Just (keyword,exts)
 843           | not (isSpecial keyword) ->
 844           if exts == 0
 845              then try_again
 846              else do
 847                 b <- extension (\i -> exts .&. i /= 0)
 848                 if b then try_again
 849                      else return token
 850         _other -> return token
 851   where
 852         (mod,var) = splitQualName buf len
 853         token     = L span (ITqvarid (mod,var))
 854
 855         try_again = do
 856                 (AI _ offs _) <- getInput
 857                 setInput (AI (srcSpanStart span) (offs-len) buf)
 858                 pushLexState bad_qvarid
 859                 lexToken
 860
 861 qvarid buf len = ITqvarid $! splitQualName buf len
 862 qconid buf len = ITqconid $! splitQualName buf len
 863
 864 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 865 -- takes a StringBuffer and a length, and returns the module name
 866 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 867 -- because of hierarchical module names.
 868 splitQualName orig_buf len = split orig_buf orig_buf
 869   where
 870     split buf dot_buf
 871         | orig_buf `byteDiff` buf >= len  = done dot_buf
 872         | c == '.'                        = found_dot buf'
 873         | otherwise                       = split buf' dot_buf
 874       where
 875        (c,buf') = nextChar buf
 876
 877     -- careful, we might get names like M....
 878     -- so, if the character after the dot is not upper-case, this is
 879     -- the end of the qualifier part.
 880     found_dot buf -- buf points after the '.'
 881         | isUpper c    = split buf' buf
 882         | otherwise    = done buf
 883       where
 884        (c,buf') = nextChar buf
 885
 886     done dot_buf =
 887         (lexemeToFastString orig_buf (qual_size - 1),
 888          lexemeToFastString dot_buf (len - qual_size))
 889       where
 890         qual_size = orig_buf `byteDiff` dot_buf
 891
 892 varid span buf len =
 893   case lookupUFM reservedWordsFM fs of
 894         Just (keyword,0)    -> do
 895                 maybe_layout keyword
 896                 return (L span keyword)
 897         Just (keyword,exts) -> do
 898                 b <- extension (\i -> exts .&. i /= 0)
 899                 if b then do maybe_layout keyword
 900                              return (L span keyword)
 901                      else return (L span (ITvarid fs))
 902         _other -> return (L span (ITvarid fs))
 903   where
 904         fs = lexemeToFastString buf len
 905
 906 conid buf len = ITconid fs
 907   where fs = lexemeToFastString buf len
 908
 909 qvarsym buf len = ITqvarsym $! splitQualName buf len
 910 qconsym buf len = ITqconsym $! splitQualName buf len
 911
 912 varsym = sym ITvarsym
 913 consym = sym ITconsym
 914
 915 sym con span buf len =
 916   case lookupUFM reservedSymsFM fs of
 917         Just (keyword,0)    -> return (L span keyword)
 918         Just (keyword,exts) -> do
 919                 b <- extension (\i -> exts .&. i /= 0)
 920                 if b then return (L span keyword)
 921                      else return (L span $! con fs)
 922         _other -> return (L span $! con fs)
 923   where
 924         fs = lexemeToFastString buf len
 925
 926 tok_decimal span buf len
 927   = return (L span (ITinteger  $! parseInteger buf len 10 octDecDigit))
 928
 929 tok_octal span buf len
 930   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 8 octDecDigit))
 931
 932 tok_hexadecimal span buf len
 933   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 16 hexDigit))
 934
 935 prim_decimal span buf len
 936   = return (L span (ITprimint  $! parseInteger buf (len-1) 10 octDecDigit))
 937
 938 prim_octal span buf len
 939   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 8 octDecDigit))
 940
 941 prim_hexadecimal span buf len
 942   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 16 hexDigit))
 943
 944 tok_float        str = ITrational   $! readRational str
 945 prim_float       str = ITprimfloat  $! readRational str
 946 prim_double      str = ITprimdouble $! readRational str
 947
 948 -- -----------------------------------------------------------------------------
 949 -- Layout processing
 950
 951 -- we're at the first token on a line, insert layout tokens if necessary
 952 do_bol :: Action
 953 do_bol span _str _len = do
 954         pos <- getOffside
 955         case pos of
 956             LT -> do
 957                 --trace "layout: inserting '}'" $ do
 958                 popContext
 959                 -- do NOT pop the lex state, we might have a ';' to insert
 960                 return (L span ITvccurly)
 961             EQ -> do
 962                 --trace "layout: inserting ';'" $ do
 963                 popLexState
 964                 return (L span ITsemi)
 965             GT -> do
 966                 popLexState
 967                 lexToken
 968
 969 -- certain keywords put us in the "layout" state, where we might
 970 -- add an opening curly brace.
 971 maybe_layout ITdo       = pushLexState layout_do
 972 maybe_layout ITmdo      = pushLexState layout_do
 973 maybe_layout ITof       = pushLexState layout
 974 maybe_layout ITlet      = pushLexState layout
 975 maybe_layout ITwhere    = pushLexState layout
 976 maybe_layout ITrec      = pushLexState layout
 977 maybe_layout _          = return ()
 978
 979 -- Pushing a new implicit layout context.  If the indentation of the
 980 -- next token is not greater than the previous layout context, then
 981 -- Haskell 98 says that the new layout context should be empty; that is
 982 -- the lexer must generate {}.
 983 --
 984 -- We are slightly more lenient than this: when the new context is started
 985 -- by a 'do', then we allow the new context to be at the same indentation as
 986 -- the previous context.  This is what the 'strict' argument is for.
 987 --
 988 new_layout_context strict span _buf _len = do
 989     popLexState
 990     (AI _ offset _) <- getInput
 991     ctx <- getContext
 992     case ctx of
 993         Layout prev_off : _  |
 994            (strict     && prev_off >= offset  ||
 995             not strict && prev_off > offset) -> do
 996                 -- token is indented to the left of the previous context.
 997                 -- we must generate a {} sequence now.
 998                 pushLexState layout_left
 999                 return (L span ITvocurly)
1000         other -> do
1001                 setContext (Layout offset : ctx)
1002                 return (L span ITvocurly)
1003
1004 do_layout_left span _buf _len = do
1005     popLexState
1006     pushLexState bol  -- we must be at the start of a line
1007     return (L span ITvccurly)
1008
1009 -- -----------------------------------------------------------------------------
1010 -- LINE pragmas
1011
1012 setLine :: Int -> Action
1013 setLine code span buf len = do
1014   let line = parseInteger buf len 10 octDecDigit
1015   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
1016         -- subtract one: the line number refers to the *following* line
1017   popLexState
1018   pushLexState code
1019   lexToken
1020
1021 setFile :: Int -> Action
1022 setFile code span buf len = do
1023   let file = lexemeToFastString (stepOn buf) (len-2)
1024   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1025   popLexState
1026   pushLexState code
1027   lexToken
1028
1029
1030 -- -----------------------------------------------------------------------------
1031 -- Options, includes and language pragmas.
1032
1033 lex_string_prag :: (String -> Token) -> Action
1034 lex_string_prag mkTok span buf len
1035     = do input <- getInput
1036          start <- getSrcLoc
1037          tok <- go [] input
1038          end <- getSrcLoc
1039          return (L (mkSrcSpan start end) tok)
1040     where go acc input
1041               = if isString input "#-}"
1042                    then do setInput input
1043                            return (mkTok (reverse acc))
1044                    else case alexGetChar input of
1045                           Just (c,i) -> go (c:acc) i
1046                           Nothing -> err input
1047           isString i [] = True
1048           isString i (x:xs)
1049               = case alexGetChar i of
1050                   Just (c,i') | c == x    -> isString i' xs
1051                   _other -> False
1052           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1053
1054
1055 -- -----------------------------------------------------------------------------
1056 -- Strings & Chars
1057
1058 -- This stuff is horrible.  I hates it.
1059
1060 lex_string_tok :: Action
1061 lex_string_tok span buf len = do
1062   tok <- lex_string ""
1063   end <- getSrcLoc
1064   return (L (mkSrcSpan (srcSpanStart span) end) tok)
1065
1066 lex_string :: String -> P Token
1067 lex_string s = do
1068   i <- getInput
1069   case alexGetChar' i of
1070     Nothing -> lit_error
1071
1072     Just ('"',i)  -> do
1073         setInput i
1074         glaexts <- extension glaExtsEnabled
1075         if glaexts
1076           then do
1077             i <- getInput
1078             case alexGetChar' i of
1079               Just ('#',i) -> do
1080                    setInput i
1081                    if any (> '\xFF') s
1082                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1083                     else let s' = mkZFastString (reverse s) in
1084                          return (ITprimstring s')
1085                         -- mkZFastString is a hack to avoid encoding the
1086                         -- string in UTF-8.  We just want the exact bytes.
1087               _other ->
1088                 return (ITstring (mkFastString (reverse s)))
1089           else
1090                 return (ITstring (mkFastString (reverse s)))
1091
1092     Just ('\\',i)
1093         | Just ('&',i) <- next -> do
1094                 setInput i; lex_string s
1095         | Just (c,i) <- next, is_space c -> do
1096                 setInput i; lex_stringgap s
1097         where next = alexGetChar' i
1098
1099     Just (c, i) -> do
1100         c' <- lex_char c i
1101         lex_string (c':s)
1102
1103 lex_stringgap s = do
1104   c <- getCharOrFail
1105   case c of
1106     '\\' -> lex_string s
1107     c | is_space c -> lex_stringgap s
1108     _other -> lit_error
1109
1110
1111 lex_char_tok :: Action
1112 -- Here we are basically parsing character literals, such as 'x' or '\n'
1113 -- but, when Template Haskell is on, we additionally spot
1114 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1115 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
1116 -- So we have to do two characters of lookahead: when we see 'x we need to
1117 -- see if there's a trailing quote
1118 lex_char_tok span buf len = do  -- We've seen '
1119    i1 <- getInput       -- Look ahead to first character
1120    let loc = srcSpanStart span
1121    case alexGetChar' i1 of
1122         Nothing -> lit_error
1123
1124         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
1125                   th_exts <- extension thEnabled
1126                   if th_exts then do
1127                         setInput i2
1128                         return (L (mkSrcSpan loc end2)  ITtyQuote)
1129                    else lit_error
1130
1131         Just ('\\', i2@(AI end2 _ _)) -> do     -- We've seen 'backslash
1132                   setInput i2
1133                   lit_ch <- lex_escape
1134                   mc <- getCharOrFail   -- Trailing quote
1135                   if mc == '\'' then finish_char_tok loc lit_ch
1136                                 else do setInput i2; lit_error
1137
1138         Just (c, i2@(AI end2 _ _))
1139                 | not (isAny c) -> lit_error
1140                 | otherwise ->
1141
1142                 -- We've seen 'x, where x is a valid character
1143                 --  (i.e. not newline etc) but not a quote or backslash
1144            case alexGetChar' i2 of      -- Look ahead one more character
1145                 Nothing -> lit_error
1146                 Just ('\'', i3) -> do   -- We've seen 'x'
1147                         setInput i3
1148                         finish_char_tok loc c
1149                 _other -> do            -- We've seen 'x not followed by quote
1150                                         -- If TH is on, just parse the quote only
1151                         th_exts <- extension thEnabled
1152                         let (AI end _ _) = i1
1153                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1154                                    else do setInput i2; lit_error
1155
1156 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1157 finish_char_tok loc ch  -- We've already seen the closing quote
1158                         -- Just need to check for trailing #
1159   = do  glaexts <- extension glaExtsEnabled
1160         i@(AI end _ _) <- getInput
1161         if glaexts then do
1162                 case alexGetChar' i of
1163                         Just ('#',i@(AI end _ _)) -> do
1164                                 setInput i
1165                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1166                         _other ->
1167                                 return (L (mkSrcSpan loc end) (ITchar ch))
1168                 else do
1169                    return (L (mkSrcSpan loc end) (ITchar ch))
1170
1171 lex_char :: Char -> AlexInput -> P Char
1172 lex_char c inp = do
1173   case c of
1174       '\\' -> do setInput inp; lex_escape
1175       c | isAny c -> do setInput inp; return c
1176       _other -> lit_error
1177
1178 isAny c | c > '\xff' = isPrint c
1179         | otherwise  = is_any c
1180
1181 lex_escape :: P Char
1182 lex_escape = do
1183   c <- getCharOrFail
1184   case c of
1185         'a'   -> return '\a'
1186         'b'   -> return '\b'
1187         'f'   -> return '\f'
1188         'n'   -> return '\n'
1189         'r'   -> return '\r'
1190         't'   -> return '\t'
1191         'v'   -> return '\v'
1192         '\\'  -> return '\\'
1193         '"'   -> return '\"'
1194         '\''  -> return '\''
1195         '^'   -> do c <- getCharOrFail
1196                     if c >= '@' && c <= '_'
1197                         then return (chr (ord c - ord '@'))
1198                         else lit_error
1199
1200         'x'   -> readNum is_hexdigit 16 hexDigit
1201         'o'   -> readNum is_octdigit  8 octDecDigit
1202         x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
1203
1204         c1 ->  do
1205            i <- getInput
1206            case alexGetChar' i of
1207             Nothing -> lit_error
1208             Just (c2,i2) ->
1209               case alexGetChar' i2 of
1210                 Nothing -> do setInput i2; lit_error
1211                 Just (c3,i3) ->
1212                    let str = [c1,c2,c3] in
1213                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1214                                      Just rest <- [maybePrefixMatch p str] ] of
1215                           (escape_char,[]):_ -> do
1216                                 setInput i3
1217                                 return escape_char
1218                           (escape_char,_:_):_ -> do
1219                                 setInput i2
1220                                 return escape_char
1221                           [] -> lit_error
1222
1223 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1224 readNum is_digit base conv = do
1225   i <- getInput
1226   c <- getCharOrFail
1227   if is_digit c
1228         then readNum2 is_digit base conv (conv c)
1229         else do setInput i; lit_error
1230
1231 readNum2 is_digit base conv i = do
1232   input <- getInput
1233   read i input
1234   where read i input = do
1235           case alexGetChar' input of
1236             Just (c,input') | is_digit c -> do
1237                 read (i*base + conv c) input'
1238             _other -> do
1239                 if i >= 0 && i <= 0x10FFFF
1240                    then do setInput input; return (chr i)
1241                    else lit_error
1242
1243 silly_escape_chars = [
1244         ("NUL", '\NUL'),
1245         ("SOH", '\SOH'),
1246         ("STX", '\STX'),
1247         ("ETX", '\ETX'),
1248         ("EOT", '\EOT'),
1249         ("ENQ", '\ENQ'),
1250         ("ACK", '\ACK'),
1251         ("BEL", '\BEL'),
1252         ("BS", '\BS'),
1253         ("HT", '\HT'),
1254         ("LF", '\LF'),
1255         ("VT", '\VT'),
1256         ("FF", '\FF'),
1257         ("CR", '\CR'),
1258         ("SO", '\SO'),
1259         ("SI", '\SI'),
1260         ("DLE", '\DLE'),
1261         ("DC1", '\DC1'),
1262         ("DC2", '\DC2'),
1263         ("DC3", '\DC3'),
1264         ("DC4", '\DC4'),
1265         ("NAK", '\NAK'),
1266         ("SYN", '\SYN'),
1267         ("ETB", '\ETB'),
1268         ("CAN", '\CAN'),
1269         ("EM", '\EM'),
1270         ("SUB", '\SUB'),
1271         ("ESC", '\ESC'),
1272         ("FS", '\FS'),
1273         ("GS", '\GS'),
1274         ("RS", '\RS'),
1275         ("US", '\US'),
1276         ("SP", '\SP'),
1277         ("DEL", '\DEL')
1278         ]
1279
1280 -- before calling lit_error, ensure that the current input is pointing to
1281 -- the position of the error in the buffer.  This is so that we can report
1282 -- a correct location to the user, but also so we can detect UTF-8 decoding
1283 -- errors if they occur.
1284 lit_error = lexError "lexical error in string/character literal"
1285
1286 getCharOrFail :: P Char
1287 getCharOrFail =  do
1288   i <- getInput
1289   case alexGetChar' i of
1290         Nothing -> lexError "unexpected end-of-file in string/character literal"
1291         Just (c,i)  -> do setInput i; return c
1292
1293 -- -----------------------------------------------------------------------------
1294 -- Warnings
1295
1296 warn :: DynFlag -> SDoc -> Action
1297 warn option warning span _buf _len = do
1298     addWarning option (mkWarnMsg span alwaysQualify warning)
1299     lexToken
1300
1301 -- -----------------------------------------------------------------------------
1302 -- The Parse Monad
1303
1304 data LayoutContext
1305   = NoLayout
1306   | Layout !Int
1307   deriving Show
1308
1309 data ParseResult a
1310   = POk PState a
1311   | PFailed
1312         SrcSpan         -- The start and end of the text span related to
1313                         -- the error.  Might be used in environments which can
1314                         -- show this span, e.g. by highlighting it.
1315         Message         -- The error message
1316
1317 data PState = PState {
1318         buffer     :: StringBuffer,
1319     dflags     :: DynFlags,
1320     messages   :: Messages,
1321         last_loc   :: SrcSpan,  -- pos of previous token
1322         last_offs  :: !Int,     -- offset of the previous token from the
1323                                 -- beginning of  the current line.
1324                                 -- \t is equal to 8 spaces.
1325         last_len   :: !Int,     -- len of previous token
1326   last_line_len :: !Int,
1327         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1328         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1329         context    :: [LayoutContext],
1330         lex_state  :: [Int]
1331      }
1332         -- last_loc and last_len are used when generating error messages,
1333         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1334         -- current token to happyError, we could at least get rid of last_len.
1335         -- Getting rid of last_loc would require finding another way to
1336         -- implement pushCurrentContext (which is only called from one place).
1337
1338 newtype P a = P { unP :: PState -> ParseResult a }
1339
1340 instance Monad P where
1341   return = returnP
1342   (>>=) = thenP
1343   fail = failP
1344
1345 returnP :: a -> P a
1346 returnP a = P $ \s -> POk s a
1347
1348 thenP :: P a -> (a -> P b) -> P b
1349 (P m) `thenP` k = P $ \ s ->
1350         case m s of
1351                 POk s1 a         -> (unP (k a)) s1
1352                 PFailed span err -> PFailed span err
1353
1354 failP :: String -> P a
1355 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1356
1357 failMsgP :: String -> P a
1358 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1359
1360 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1361 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1362
1363 failSpanMsgP :: SrcSpan -> String -> P a
1364 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1365
1366 extension :: (Int -> Bool) -> P Bool
1367 extension p = P $ \s -> POk s (p $! extsBitmap s)
1368
1369 getExts :: P Int
1370 getExts = P $ \s -> POk s (extsBitmap s)
1371
1372 setSrcLoc :: SrcLoc -> P ()
1373 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1374
1375 getSrcLoc :: P SrcLoc
1376 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1377
1378 setLastToken :: SrcSpan -> Int -> Int -> P ()
1379 setLastToken loc len line_len = P $ \s -> POk s {
1380   last_loc=loc,
1381   last_len=len,
1382   last_line_len=line_len
1383 } ()
1384
1385 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1386
1387 alexInputPrevChar :: AlexInput -> Char
1388 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1389
1390 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1391 alexGetChar (AI loc ofs s)
1392   | atEnd s   = Nothing
1393   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1394                 --trace (show (ord c)) $
1395                 Just (adj_c, (AI loc' ofs' s'))
1396   where (c,s') = nextChar s
1397         loc'   = advanceSrcLoc loc c
1398         ofs'   = advanceOffs c ofs
1399
1400         non_graphic     = '\x0'
1401         upper           = '\x1'
1402         lower           = '\x2'
1403         digit           = '\x3'
1404         symbol          = '\x4'
1405         space           = '\x5'
1406         other_graphic   = '\x6'
1407
1408         adj_c
1409           | c <= '\x06' = non_graphic
1410           | c <= '\xff' = c
1411           | otherwise =
1412                 case generalCategory c of
1413                   UppercaseLetter       -> upper
1414                   LowercaseLetter       -> lower
1415                   TitlecaseLetter       -> upper
1416                   ModifierLetter        -> other_graphic
1417                   OtherLetter           -> other_graphic
1418                   NonSpacingMark        -> other_graphic
1419                   SpacingCombiningMark  -> other_graphic
1420                   EnclosingMark         -> other_graphic
1421                   DecimalNumber         -> digit
1422                   LetterNumber          -> other_graphic
1423                   OtherNumber           -> other_graphic
1424                   ConnectorPunctuation  -> other_graphic
1425                   DashPunctuation       -> other_graphic
1426                   OpenPunctuation       -> other_graphic
1427                   ClosePunctuation      -> other_graphic
1428                   InitialQuote          -> other_graphic
1429                   FinalQuote            -> other_graphic
1430                   OtherPunctuation      -> other_graphic
1431                   MathSymbol            -> symbol
1432                   CurrencySymbol        -> symbol
1433                   ModifierSymbol        -> symbol
1434                   OtherSymbol           -> symbol
1435                   Space                 -> space
1436                   _other                -> non_graphic
1437
1438 -- This version does not squash unicode characters, it is used when
1439 -- lexing strings.
1440 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1441 alexGetChar' (AI loc ofs s)
1442   | atEnd s   = Nothing
1443   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1444                 --trace (show (ord c)) $
1445                 Just (c, (AI loc' ofs' s'))
1446   where (c,s') = nextChar s
1447         loc'   = advanceSrcLoc loc c
1448         ofs'   = advanceOffs c ofs
1449
1450 advanceOffs :: Char -> Int -> Int
1451 advanceOffs '\n' offs = 0
1452 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1453 advanceOffs _    offs = offs + 1
1454
1455 getInput :: P AlexInput
1456 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1457
1458 setInput :: AlexInput -> P ()
1459 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1460
1461 pushLexState :: Int -> P ()
1462 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1463
1464 popLexState :: P Int
1465 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1466
1467 getLexState :: P Int
1468 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1469
1470 -- for reasons of efficiency, flags indicating language extensions (eg,
1471 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1472 -- integer
1473
1474 glaExtsBit, ffiBit, parrBit :: Int
1475 glaExtsBit = 0
1476 ffiBit     = 1
1477 parrBit    = 2
1478 arrowsBit  = 4
1479 thBit      = 5
1480 ipBit      = 6
1481 tvBit      = 7  -- Scoped type variables enables 'forall' keyword
1482 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1483                 -- (doesn't affect the lexer)
1484 idxTysBit  = 9  -- indexed type families: 'family' keyword and kind sigs
1485 haddockBit = 10 -- Lex and parse Haddock comments
1486
1487 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1488 glaExtsEnabled flags = testBit flags glaExtsBit
1489 ffiEnabled     flags = testBit flags ffiBit
1490 parrEnabled    flags = testBit flags parrBit
1491 arrowsEnabled  flags = testBit flags arrowsBit
1492 thEnabled      flags = testBit flags thBit
1493 ipEnabled      flags = testBit flags ipBit
1494 tvEnabled      flags = testBit flags tvBit
1495 bangPatEnabled flags = testBit flags bangPatBit
1496 idxTysEnabled  flags = testBit flags idxTysBit
1497 haddockEnabled flags = testBit flags haddockBit
1498
1499 -- PState for parsing options pragmas
1500 --
1501 pragState :: StringBuffer -> SrcLoc -> PState
1502 pragState buf loc  =
1503   PState {
1504       buffer          = buf,
1505       messages      = emptyMessages,
1506       -- XXX defaultDynFlags is not right, but we don't have a real
1507       -- dflags handy
1508       dflags        = defaultDynFlags,
1509       last_loc      = mkSrcSpan loc loc,
1510       last_offs     = 0,
1511       last_len      = 0,
1512       last_line_len = 0,
1513       loc           = loc,
1514       extsBitmap    = 0,
1515       context       = [],
1516       lex_state     = [bol, option_prags, 0]
1517     }
1518
1519
1520 -- create a parse state
1521 --
1522 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1523 mkPState buf loc flags  =
1524   PState {
1525       buffer          = buf,
1526       dflags        = flags,
1527       messages      = emptyMessages,
1528       last_loc      = mkSrcSpan loc loc,
1529       last_offs     = 0,
1530       last_len      = 0,
1531       last_line_len = 0,
1532       loc           = loc,
1533       extsBitmap    = fromIntegral bitmap,
1534       context       = [],
1535       lex_state     = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1536         -- we begin in the layout state if toplev_layout is set
1537     }
1538     where
1539       bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
1540                .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
1541                .|. parrBit    `setBitIf` dopt Opt_PArr        flags
1542                .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
1543                .|. thBit      `setBitIf` dopt Opt_TH          flags
1544                .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
1545                .|. tvBit      `setBitIf` dopt Opt_ScopedTypeVariables flags
1546                .|. bangPatBit `setBitIf` dopt Opt_BangPatterns flags
1547                .|. idxTysBit  `setBitIf` dopt Opt_IndexedTypes flags
1548                .|. haddockBit `setBitIf` dopt Opt_Haddock     flags
1549       --
1550       setBitIf :: Int -> Bool -> Int
1551       b `setBitIf` cond | cond      = bit b
1552                         | otherwise = 0
1553
1554 addWarning :: DynFlag -> WarnMsg -> P ()
1555 addWarning option w
1556  = P $ \s@PState{messages=(ws,es), dflags=d} ->
1557        let ws' = if dopt option d then ws `snocBag` w else ws
1558        in POk s{messages=(ws', es)} ()
1559
1560 getMessages :: PState -> Messages
1561 getMessages PState{messages=ms} = ms
1562
1563 getContext :: P [LayoutContext]
1564 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1565
1566 setContext :: [LayoutContext] -> P ()
1567 setContext ctx = P $ \s -> POk s{context=ctx} ()
1568
1569 popContext :: P ()
1570 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1571                            loc = loc, last_len = len, last_loc = last_loc }) ->
1572   case ctx of
1573         (_:tl) -> POk s{ context = tl } ()
1574         []     -> PFailed last_loc (srcParseErr buf len)
1575
1576 -- Push a new layout context at the indentation of the last token read.
1577 -- This is only used at the outer level of a module when the 'module'
1578 -- keyword is missing.
1579 pushCurrentContext :: P ()
1580 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1581     POk s{context = Layout (offs-len) : ctx} ()
1582 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1583
1584 getOffside :: P Ordering
1585 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1586                 let ord = case stk of
1587                         (Layout n:_) -> compare offs n
1588                         _            -> GT
1589                 in POk s ord
1590
1591 -- ---------------------------------------------------------------------------
1592 -- Construct a parse error
1593
1594 srcParseErr
1595   :: StringBuffer       -- current buffer (placed just after the last token)
1596   -> Int                -- length of the previous token
1597   -> Message
1598 srcParseErr buf len
1599   = hcat [ if null token
1600              then ptext SLIT("parse error (possibly incorrect indentation)")
1601              else hcat [ptext SLIT("parse error on input "),
1602                         char '`', text token, char '\'']
1603     ]
1604   where token = lexemeToString (offsetBytes (-len) buf) len
1605
1606 -- Report a parse failure, giving the span of the previous token as
1607 -- the location of the error.  This is the entry point for errors
1608 -- detected during parsing.
1609 srcParseFail :: P a
1610 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1611                             last_loc = last_loc } ->
1612     PFailed last_loc (srcParseErr buf len)
1613
1614 -- A lexical error is reported at a particular position in the source file,
1615 -- not over a token range.
1616 lexError :: String -> P a
1617 lexError str = do
1618   loc <- getSrcLoc
1619   i@(AI end _ buf) <- getInput
1620   reportLexError loc end buf str
1621
1622 -- -----------------------------------------------------------------------------
1623 -- This is the top-level function: called from the parser each time a
1624 -- new token is to be read from the input.
1625
1626 lexer :: (Located Token -> P a) -> P a
1627 lexer cont = do
1628   tok@(L span tok__) <- lexToken
1629 --  trace ("token: " ++ show tok__) $ do
1630   cont tok
1631
1632 lexToken :: P (Located Token)
1633 lexToken = do
1634   inp@(AI loc1 _ buf) <- getInput
1635   sc <- getLexState
1636   exts <- getExts
1637   case alexScanUser exts inp sc of
1638     AlexEOF -> do let span = mkSrcSpan loc1 loc1
1639                   setLastToken span 0 0
1640                   return (L span ITeof)
1641     AlexError (AI loc2 _ buf) -> do
1642         reportLexError loc1 loc2 buf "lexical error"
1643     AlexSkip inp2 _ -> do
1644         setInput inp2
1645         lexToken
1646     AlexToken inp2@(AI end _ buf2) len t -> do
1647     setInput inp2
1648     let span = mkSrcSpan loc1 end
1649     let bytes = byteDiff buf buf2
1650     span `seq` setLastToken span bytes bytes
1651     t span buf bytes
1652
1653 reportLexError loc1 loc2 buf str
1654   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1655   | otherwise =
1656   let
1657         c = fst (nextChar buf)
1658   in
1659   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1660     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1661     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1662 }