ghc/compiler/parser/Lex.lhs

   1 %
   2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998
   3 %
   4 \section[Lexical analysis]{Lexical analysis}
   5
   6 --------------------------------------------------------
   7 [Jan 98]
   8 There's a known bug in here:
   9
  10         If an interface file ends prematurely, Lex tries to
  11         do headFS of an empty FastString.
  12
  13 An example that provokes the error is
  14
  15         f _:_ _forall_ [a] <<<END OF FILE>>>
  16 --------------------------------------------------------
  17
  18 \begin{code}
  19 module Lex (
  20
  21         srcParseErr,
  22
  23         -- Monad for parser
  24         Token(..), lexer, ParseResult(..), PState(..),
  25         ExtFlags(..), mkPState,
  26         StringBuffer,
  27
  28         P, thenP, thenP_, returnP, mapP, failP, failMsgP,
  29         getSrcLocP, setSrcLocP, getSrcFile,
  30         layoutOn, layoutOff, pushContext, popContext
  31     ) where
  32
  33 #include "HsVersions.h"
  34
  35 import Char             ( toUpper, isDigit, chr, ord )
  36 import Ratio            ( (%) )
  37
  38 import PrelNames        ( mkTupNameStr )
  39 import ForeignCall      ( Safety(..) )
  40 import UniqFM           ( listToUFM, lookupUFM )
  41 import BasicTypes       ( Boxity(..) )
  42 import SrcLoc           ( SrcLoc, incSrcLine, srcLocFile, srcLocLine,
  43                           replaceSrcLine, mkSrcLoc )
  44
  45 import ErrUtils         ( Message )
  46 import Outputable
  47
  48 import FastString
  49 import StringBuffer
  50 import Ctype
  51
  52 import GLAEXTS
  53 import DATA_BITS        ( Bits(..) )
  54 import DATA_INT         ( Int32 )
  55 \end{code}
  56
  57 %************************************************************************
  58 %*                                                                      *
  59 \subsection{Data types}
  60 %*                                                                      *
  61 %************************************************************************
  62
  63 The token data type, fairly un-interesting except from one
  64 constructor, @ITidinfo@, which is used to lazily lex id info (arity,
  65 strictness, unfolding etc).
  66
  67 The Idea/Observation here is that the renamer needs to scan through
  68 all of an interface file before it can continue. But only a fraction
  69 of the information contained in the file turns out to be useful, so
  70 delaying as much as possible of the scanning and parsing of an
  71 interface file Makes Sense (Heap profiles of the compiler
  72 show a reduction in heap usage by at least a factor of two,
  73 post-renamer).
  74
  75 Hence, the interface file lexer spots when value declarations are
  76 being scanned and return the @ITidinfo@ and @ITtype@ constructors
  77 for the type and any other id info for that binding (unfolding, strictness
  78 etc). These constructors are applied to the result of lexing these sub-chunks.
  79
  80 The lexing of the type and id info is all done lazily, of course, so
  81 the scanning (and subsequent parsing) will be done *only* on the ids the
  82 renamer finds out that it is interested in. The rest will just be junked.
  83 Laziness, you know it makes sense :-)
  84
  85 \begin{code}
  86 data Token
  87   = ITas                        -- Haskell keywords
  88   | ITcase
  89   | ITclass
  90   | ITdata
  91   | ITdefault
  92   | ITderiving
  93   | ITdo
  94   | ITelse
  95   | IThiding
  96   | ITif
  97   | ITimport
  98   | ITin
  99   | ITinfix
 100   | ITinfixl
 101   | ITinfixr
 102   | ITinstance
 103   | ITlet
 104   | ITmodule
 105   | ITnewtype
 106   | ITof
 107   | ITqualified
 108   | ITthen
 109   | ITtype
 110   | ITwhere
 111   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 112
 113   | ITforall                    -- GHC extension keywords
 114   | ITforeign
 115   | ITexport
 116   | ITlabel
 117   | ITdynamic
 118   | ITsafe
 119   | ITthreadsafe
 120   | ITunsafe
 121   | ITwith
 122   | ITstdcallconv
 123   | ITccallconv
 124   | ITdotnet
 125   | ITccall (Bool,Bool,Safety)  -- (is_dyn, is_casm, may_gc)
 126   | ITmdo
 127
 128   | ITspecialise_prag           -- Pragmas
 129   | ITsource_prag
 130   | ITinline_prag
 131   | ITnoinline_prag
 132   | ITrules_prag
 133   | ITdeprecated_prag
 134   | ITline_prag
 135   | ITscc_prag
 136   | ITcore_prag                 -- hdaume: core annotations
 137   | ITclose_prag
 138
 139   | ITdotdot                    -- reserved symbols
 140   | ITcolon
 141   | ITdcolon
 142   | ITequal
 143   | ITlam
 144   | ITvbar
 145   | ITlarrow
 146   | ITrarrow
 147   | ITat
 148   | ITtilde
 149   | ITdarrow
 150   | ITminus
 151   | ITbang
 152   | ITstar
 153   | ITdot
 154
 155   | ITbiglam                    -- GHC-extension symbols
 156
 157   | ITocurly                    -- special symbols
 158   | ITccurly
 159   | ITocurlybar                 -- {|, for type applications
 160   | ITccurlybar                 -- |}, for type applications
 161   | ITvccurly
 162   | ITobrack
 163   | ITopabrack                  -- [:, for parallel arrays with -fparr
 164   | ITcpabrack                  -- :], for parallel arrays with -fparr
 165   | ITcbrack
 166   | IToparen
 167   | ITcparen
 168   | IToubxparen
 169   | ITcubxparen
 170   | ITsemi
 171   | ITcomma
 172   | ITunderscore
 173   | ITbackquote
 174
 175   | ITvarid   FastString        -- identifiers
 176   | ITconid   FastString
 177   | ITvarsym  FastString
 178   | ITconsym  FastString
 179   | ITqvarid  (FastString,FastString)
 180   | ITqconid  (FastString,FastString)
 181   | ITqvarsym (FastString,FastString)
 182   | ITqconsym (FastString,FastString)
 183
 184   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 185   | ITsplitipvarid FastString   -- GHC extension: implicit param: %x
 186
 187   | ITpragma StringBuffer
 188
 189   | ITchar       Int
 190   | ITstring     FastString
 191   | ITinteger    Integer
 192   | ITrational   Rational
 193
 194   | ITprimchar   Int
 195   | ITprimstring FastString
 196   | ITprimint    Integer
 197   | ITprimfloat  Rational
 198   | ITprimdouble Rational
 199   | ITlitlit     FastString
 200
 201   -- MetaHaskell extension tokens
 202   | ITopenExpQuote              -- [| or [e|
 203   | ITopenPatQuote              -- [p|
 204   | ITopenDecQuote              -- [d|
 205   | ITopenTypQuote              -- [t|
 206   | ITcloseQuote                -- |]
 207   | ITidEscape   FastString     -- $x
 208   | ITparenEscape               -- $(
 209   | ITreifyType
 210   | ITreifyDecl
 211   | ITreifyFixity
 212
 213   -- Arrow notation extension
 214   | ITproc
 215   | ITrec
 216   | IToparenbar                 -- (|
 217   | ITcparenbar                 -- |)
 218   | ITlarrowtail                -- -<
 219   | ITrarrowtail                -- >-
 220   | ITLarrowtail                -- -<<
 221   | ITRarrowtail                -- >>-
 222
 223   | ITunknown String            -- Used when the lexer can't make sense of it
 224   | ITeof                       -- end of file token
 225   deriving Show -- debugging
 226 \end{code}
 227
 228 -----------------------------------------------------------------------------
 229 Keyword Lists
 230
 231 \begin{code}
 232 pragmaKeywordsFM = listToUFM $
 233       map (\ (x,y) -> (mkFastString x,y))
 234        [( "SPECIALISE", ITspecialise_prag ),
 235         ( "SPECIALIZE", ITspecialise_prag ),
 236         ( "SOURCE",     ITsource_prag ),
 237         ( "INLINE",     ITinline_prag ),
 238         ( "NOINLINE",   ITnoinline_prag ),
 239         ( "NOTINLINE",  ITnoinline_prag ),
 240         ( "LINE",       ITline_prag ),
 241         ( "RULES",      ITrules_prag ),
 242         ( "RULEZ",      ITrules_prag ), -- american spelling :-)
 243         ( "SCC",        ITscc_prag ),
 244         ( "CORE",       ITcore_prag ),  -- hdaume: core annotation
 245         ( "DEPRECATED", ITdeprecated_prag )
 246         ]
 247
 248 haskellKeywordsFM = listToUFM $
 249       map (\ (x,y) -> (mkFastString x,y))
 250        [( "_",          ITunderscore ),
 251         ( "as",         ITas ),
 252         ( "case",       ITcase ),
 253         ( "class",      ITclass ),
 254         ( "data",       ITdata ),
 255         ( "default",    ITdefault ),
 256         ( "deriving",   ITderiving ),
 257         ( "do",         ITdo ),
 258         ( "else",       ITelse ),
 259         ( "hiding",     IThiding ),
 260         ( "if",         ITif ),
 261         ( "import",     ITimport ),
 262         ( "in",         ITin ),
 263         ( "infix",      ITinfix ),
 264         ( "infixl",     ITinfixl ),
 265         ( "infixr",     ITinfixr ),
 266         ( "instance",   ITinstance ),
 267         ( "let",        ITlet ),
 268         ( "module",     ITmodule ),
 269         ( "newtype",    ITnewtype ),
 270         ( "of",         ITof ),
 271         ( "qualified",  ITqualified ),
 272         ( "then",       ITthen ),
 273         ( "type",       ITtype ),
 274         ( "where",      ITwhere ),
 275         ( "_scc_",      ITscc )         -- ToDo: remove
 276      ]
 277
 278 isSpecial :: Token -> Bool
 279 -- If we see M.x, where x is a keyword, but
 280 -- is special, we treat is as just plain M.x,
 281 -- not as a keyword.
 282 isSpecial ITas          = True
 283 isSpecial IThiding      = True
 284 isSpecial ITqualified   = True
 285 isSpecial ITforall      = True
 286 isSpecial ITexport      = True
 287 isSpecial ITlabel       = True
 288 isSpecial ITdynamic     = True
 289 isSpecial ITsafe        = True
 290 isSpecial ITthreadsafe  = True
 291 isSpecial ITunsafe      = True
 292 isSpecial ITwith        = True
 293 isSpecial ITccallconv   = True
 294 isSpecial ITstdcallconv = True
 295 isSpecial ITmdo         = True
 296 isSpecial _             = False
 297
 298 -- the bitmap provided as the third component indicates whether the
 299 -- corresponding extension keyword is valid under the extension options
 300 -- provided to the compiler; if the extension corresponding to *any* of the
 301 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 302 -- facilitates using a keyword in two different extensions that can be
 303 -- activated independently)
 304 --
 305 ghcExtensionKeywordsFM = listToUFM $
 306         map (\(x, y, z) -> (mkFastString x, (y, z)))
 307      [  ( "forall",     ITforall,        bit glaExtsBit),
 308         ( "mdo",        ITmdo,           bit glaExtsBit),
 309         ( "reifyDecl",  ITreifyDecl,     bit glaExtsBit),
 310         ( "reifyType",  ITreifyType,     bit glaExtsBit),
 311         ( "reifyFixity",ITreifyFixity,   bit glaExtsBit),
 312
 313         ( "foreign",    ITforeign,       bit ffiBit),
 314         ( "export",     ITexport,        bit ffiBit),
 315         ( "label",      ITlabel,         bit ffiBit),
 316         ( "dynamic",    ITdynamic,       bit ffiBit),
 317         ( "safe",       ITsafe,          bit ffiBit),
 318         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 319         ( "unsafe",     ITunsafe,        bit ffiBit),
 320         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 321         ( "ccall",      ITccallconv,     bit ffiBit),
 322         ( "dotnet",     ITdotnet,        bit ffiBit),
 323
 324         ( "with",       ITwith,          bit withBit),
 325
 326         ( "rec",        ITrec,           bit arrowsBit),
 327         ( "proc",       ITproc,          bit arrowsBit),
 328
 329         -- On death row
 330         ("_ccall_",     ITccall (False, False, PlayRisky),
 331                                          bit glaExtsBit),
 332         ("_ccall_GC_",  ITccall (False, False, PlaySafe False),
 333                                          bit glaExtsBit),
 334         ("_casm_",      ITccall (False, True,  PlayRisky),
 335                                          bit glaExtsBit),
 336         ("_casm_GC_",   ITccall (False, True,  PlaySafe False),
 337                                          bit glaExtsBit)
 338      ]
 339
 340 haskellKeySymsFM = listToUFM $
 341         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 342       [ ("..",  ITdotdot,       Nothing)
 343        ,(":",   ITcolon,        Nothing)        -- (:) is a reserved op,
 344                                         -- meaning only list cons
 345        ,("::",  ITdcolon,       Nothing)
 346        ,("=",   ITequal,        Nothing)
 347        ,("\\",  ITlam,          Nothing)
 348        ,("|",   ITvbar,         Nothing)
 349        ,("<-",  ITlarrow,       Nothing)
 350        ,("->",  ITrarrow,       Nothing)
 351        ,("@",   ITat,           Nothing)
 352        ,("~",   ITtilde,        Nothing)
 353        ,("=>",  ITdarrow,       Nothing)
 354        ,("-",   ITminus,        Nothing)
 355        ,("!",   ITbang,         Nothing)
 356
 357        ,("*",   ITstar,         Just (bit glaExtsBit))  -- For data T (a::*) = MkT
 358        ,(".",   ITdot,          Just (bit glaExtsBit))  -- For 'forall a . t'
 359
 360        ,("-<",  ITlarrowtail,   Just (bit arrowsBit))
 361        ,(">-",  ITrarrowtail,   Just (bit arrowsBit))
 362        ,("-<<", ITLarrowtail,   Just (bit arrowsBit))
 363        ,(">>-", ITRarrowtail,   Just (bit arrowsBit))
 364        ]
 365
 366 \end{code}
 367
 368 -----------------------------------------------------------------------------
 369 The lexical analyser
 370
 371 Lexer state:
 372
 373         - (exts)  lexing a source with extensions, eg, an interface file or
 374                   with -fglasgow-exts
 375         - (bol)   pointer to beginning of line (for column calculations)
 376         - (buf)   pointer to beginning of token
 377         - (buf)   pointer to current char
 378         - (atbol) flag indicating whether we're at the beginning of a line
 379
 380 \begin{code}
 381 lexer :: (Token -> P a) -> P a
 382 lexer cont buf s@(PState{
 383                     loc = loc,
 384                     extsBitmap = exts,
 385                     bol = bol,
 386                     atbol = atbol,
 387                     context = ctx
 388                 })
 389
 390         -- first, start a new lexeme and lose all the whitespace
 391   =  _scc_ "Lexer"
 392   tab line bol atbol (stepOverLexeme buf)
 393   where
 394         line = srcLocLine loc
 395
 396         tab y bol atbol buf = --trace ("tab: " ++ show (I# y) ++ " : " ++ show (currentChar buf)) $
 397           case currentChar# buf of
 398
 399             '\NUL'# ->
 400                    if bufferExhausted (stepOn buf)
 401                         then cont ITeof buf s'
 402                         else trace "lexer: misplaced NUL?" $
 403                              tab y bol atbol (stepOn buf)
 404
 405             '\n'# -> let buf' = stepOn buf
 406                      in tab (y +# 1#) (currentIndex# buf') 1# buf'
 407
 408                 -- find comments.  This got harder in Haskell 98.
 409             '-'# ->  let trundle n =
 410                           let next = lookAhead# buf n in
 411                           if next `eqChar#` '-'# then trundle (n +# 1#)
 412                           else if is_symbol next || n <# 2#
 413                                 then is_a_token
 414                                 else tab y bol atbol
 415                                          (stepOnUntilChar# (stepOnBy# buf n) '\n'#)
 416                     in trundle 1#
 417
 418                 -- comments and pragmas.  We deal with LINE pragmas here,
 419                 -- and throw out any unrecognised pragmas as comments.  Any
 420                 -- pragmas we know about are dealt with later (after any layout
 421                 -- processing if necessary).
 422             '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
 423                 if lookAhead# buf 2# `eqChar#` '#'# then
 424                   case expandWhile# is_space (addToCurrentPos buf 3#) of { buf1->
 425                   case expandWhile# is_ident (stepOverLexeme buf1)   of { buf2->
 426                   let lexeme = mkFastString -- ToDo: too slow
 427                                   (map toUpper (lexemeToString buf2)) in
 428                   case lookupUFM pragmaKeywordsFM lexeme of
 429                         -- ignore RULES pragmas when -fglasgow-exts is off
 430                         Just ITrules_prag | not (glaExtsEnabled exts) ->
 431                            skip_to_end (stepOnBy# buf 2#) s'
 432                         Just ITline_prag ->
 433                            line_prag skip_to_end buf2 s'
 434                         Just other -> is_a_token
 435                         Nothing -> skip_to_end (stepOnBy# buf 2#) s'
 436                   }}
 437
 438                 else skip_to_end (stepOnBy# buf 2#) s'
 439                 where
 440                     skip_to_end = skipNestedComment (lexer cont)
 441
 442                 -- special GHC extension: we grok cpp-style #line pragmas
 443             '#'# | lexemeIndex buf ==# bol ->   -- the '#' must be in column 0
 444                 -- SPECIAL CASE: if we see "#!" at the beginning of the line,
 445                 -- we ignore the rest of the line.  This is for script-files
 446                 -- on Unix which begin with the special syntax "#! /bin/sh",
 447                 -- for example.
 448                 if lookAhead# buf 1# `eqChar#` '!'#
 449                    then next_line (stepOnBy# buf 2#) s'
 450                    else
 451                 let buf1 | lookAhead# buf 1# `eqChar#` 'l'# &&
 452                            lookAhead# buf 2# `eqChar#` 'i'# &&
 453                            lookAhead# buf 3# `eqChar#` 'n'# &&
 454                            lookAhead# buf 4# `eqChar#` 'e'#  = stepOnBy# buf 5#
 455                          | otherwise = stepOn buf
 456                 in
 457                 case expandWhile# is_space buf1 of { buf2 ->
 458                 if is_digit (currentChar# buf2)
 459                         then line_prag next_line buf2 s'
 460                         else is_a_token
 461                 }
 462                 where
 463                 next_line buf = lexer cont (stepOnUntilChar# buf '\n'#)
 464
 465                 -- tabs have been expanded beforehand
 466             c | is_space c -> tab y bol atbol (stepOn buf)
 467               | otherwise  -> is_a_token
 468
 469            where s' = s{loc = replaceSrcLine loc y,
 470                         bol = bol,
 471                        atbol = atbol}
 472
 473                  is_a_token | atbol /=# 0# = lexBOL cont buf s'
 474                             | otherwise    = lexToken cont exts buf s'
 475
 476 -- {-# LINE .. #-} pragmas.  yeuch.
 477 line_prag cont buf s@PState{loc=loc} =
 478   case expandWhile# is_space buf                of { buf1 ->
 479   case scanNumLit 0 (stepOverLexeme buf1)       of { (line,buf2) ->
 480   -- subtract one: the line number refers to the *following* line.
 481   let real_line = line - 1 in
 482   case fromInteger real_line                    of { i@(I# l) ->
 483         -- ToDo, if no filename then we skip the newline.... d'oh
 484   case expandWhile# is_space buf2               of { buf3 ->
 485   case currentChar# buf3                        of
 486      '\"'#{-"-} ->
 487         case untilEndOfString# (stepOn (stepOverLexeme buf3)) of { buf4 ->
 488         let
 489             file = lexemeToFastString buf4
 490             new_buf = stepOn (stepOverLexeme buf4)
 491         in
 492         if nullFastString file
 493                 then cont new_buf s{loc = replaceSrcLine loc l}
 494                 else cont new_buf s{loc = mkSrcLoc file i}
 495         }
 496      _other -> cont (stepOverLexeme buf3) s{loc = replaceSrcLine loc l}
 497   }}}}
 498
 499 skipNestedComment :: P a -> P a
 500 skipNestedComment cont buf state = skipNestedComment' (loc state) cont buf state
 501
 502 skipNestedComment' :: SrcLoc -> P a -> P a
 503 skipNestedComment' orig_loc cont buf = loop buf
 504  where
 505    loop buf =
 506      case currentChar# buf of
 507         '-'# | lookAhead# buf 1# `eqChar#` '}'# -> cont (stepOnBy# buf 2#)
 508
 509         '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
 510               skipNestedComment
 511                 (skipNestedComment' orig_loc cont)
 512                 (stepOnBy# buf 2#)
 513
 514         '\n'# -> \ s@PState{loc=loc} ->
 515                  let buf' = stepOn buf in
 516                  loop buf' s{loc = incSrcLine loc,
 517                              bol = currentIndex# buf',
 518                              atbol = 1#}
 519
 520         -- pass the original SrcLoc to lexError so that the error is
 521         -- reported at the line it was originally on, not the line at
 522         -- the end of the file.
 523         '\NUL'# | bufferExhausted (stepOn buf) ->
 524                 \s -> lexError "unterminated `{-'" buf s{loc=orig_loc} -- -}
 525
 526         _   -> loop (stepOn buf)
 527
 528 -- When we are lexing the first token of a line, check whether we need to
 529 -- insert virtual semicolons or close braces due to layout.
 530
 531 lexBOL :: (Token -> P a) -> P a
 532 lexBOL cont buf s@(PState{
 533                     loc = loc,
 534                     extsBitmap = exts,
 535                     bol = bol,
 536                     atbol = atbol,
 537                     context = ctx
 538                   }) =
 539         if need_close_curly then
 540                 --trace ("col = " ++ show (I# col) ++ ", layout: inserting '}'") $
 541                 cont ITvccurly buf s{atbol = 1#, context = tail ctx}
 542         else if need_semi_colon then
 543                 --trace ("col = " ++ show (I# col) ++ ", layout: inserting ';'") $
 544                 cont ITsemi buf s{atbol = 0#}
 545         else
 546                 lexToken cont exts buf s{atbol = 0#}
 547   where
 548         col = currentIndex# buf -# bol
 549
 550         need_close_curly =
 551                 case ctx of
 552                         [] -> False
 553                         (i:_) -> case i of
 554                                     NoLayout -> False
 555                                     Layout n -> col <# n
 556         need_semi_colon =
 557                 case ctx of
 558                         [] -> False
 559                         (i:_) -> case i of
 560                                     NoLayout -> False
 561                                     Layout n -> col ==# n
 562
 563
 564 lexToken :: (Token -> P a) -> Int# -> P a
 565 lexToken cont exts buf =
 566 -- trace "lexToken" $
 567   case currentChar# buf of
 568
 569     -- special symbols ----------------------------------------------------
 570     '('# | glaExtsEnabled exts && lookAhead# buf 1# `eqChar#` '#'# &&
 571          -- Unboxed tules: '(#' but not '(##'
 572            not (lookAhead# buf 2# `eqChar#` '#'#)
 573                 -> cont IToubxparen (addToCurrentPos buf 2#)
 574          -- Arrow notation extension: '(|' but not '(||'
 575          | arrowsEnabled exts && lookAhead# buf 1# `eqChar#` '|'# &&
 576            not (lookAhead# buf 2# `eqChar#` '|'#)
 577                 -> cont IToparenbar (addToCurrentPos buf 2#)
 578          | otherwise
 579                 -> cont IToparen (incCurrentPos buf)
 580
 581     ')'# -> cont ITcparen    (incCurrentPos buf)
 582     '['# | parrEnabled exts && lookAhead# buf 1# `eqChar#` ':'# ->
 583             cont ITopabrack  (addToCurrentPos buf 2#)
 584          ------- MetaHaskell Extensions, looking for [| [e|  [t|  [p| and [d|
 585          | glaExtsEnabled exts &&
 586            ((lookAhead# buf 1# ) `eqChar#` '|'# ) ->
 587                 cont ITopenExpQuote (addToCurrentPos buf 2# )
 588          | glaExtsEnabled exts &&
 589            (let c = (lookAhead# buf 1# )
 590             in eqChar# c 'e'# || eqChar# c 't'# || eqChar# c 'd'#  || eqChar# c 'p'#) &&
 591            ((lookAhead# buf 2#) `eqChar#` '|'#) ->
 592                 let quote 'e'# = ITopenExpQuote
 593                     quote 'p'# = ITopenPatQuote
 594                     quote 'd'# = ITopenDecQuote
 595                     quote 't'# = ITopenTypQuote
 596                 in cont (quote (lookAhead# buf 1#)) (addToCurrentPos buf 3# )
 597          | otherwise ->
 598             cont ITobrack    (incCurrentPos buf)
 599
 600     ']'# -> cont ITcbrack    (incCurrentPos buf)
 601     ','# -> cont ITcomma     (incCurrentPos buf)
 602     ';'# -> cont ITsemi      (incCurrentPos buf)
 603     '}'# -> \ s@PState{context = ctx} ->
 604             case ctx of
 605                 (_:ctx') -> cont ITccurly (incCurrentPos buf) s{context=ctx'}
 606                 _        -> lexError "too many '}'s" buf s
 607     '|'# -> case lookAhead# buf 1# of
 608                  '}'#  | glaExtsEnabled exts -> cont ITccurlybar
 609                                                      (addToCurrentPos buf 2#)
 610                  -- MetaHaskell extension
 611                  ']'#  | glaExtsEnabled exts -> cont ITcloseQuote (addToCurrentPos buf 2#)
 612                  -- arrow notation extension
 613                  ')'#  | arrowsEnabled exts -> cont ITcparenbar
 614                                                      (addToCurrentPos buf 2#)
 615                  other -> lex_sym cont exts (incCurrentPos buf)
 616     ':'# -> case lookAhead# buf 1# of
 617                  ']'#  | parrEnabled exts    -> cont ITcpabrack
 618                                                      (addToCurrentPos buf 2#)
 619                  _                           -> lex_sym cont exts (incCurrentPos buf)
 620
 621
 622     '#'# -> case lookAhead# buf 1# of
 623                 ')'#  | glaExtsEnabled exts
 624                      -> cont ITcubxparen (addToCurrentPos buf 2#)
 625                 '-'# -> case lookAhead# buf 2# of
 626                            '}'# -> cont ITclose_prag (addToCurrentPos buf 3#)
 627                            _    -> lex_sym cont exts (incCurrentPos buf)
 628                 _    -> lex_sym cont exts (incCurrentPos buf)
 629
 630     '`'# | glaExtsEnabled exts && lookAhead# buf 1# `eqChar#` '`'#
 631                 -> lex_cstring cont (addToCurrentPos buf 2#)
 632          | otherwise
 633                 -> cont ITbackquote (incCurrentPos buf)
 634
 635     '{'# ->   -- for Emacs: -}
 636             case lookAhead# buf 1# of
 637            '|'# | glaExtsEnabled exts
 638                 -> cont ITocurlybar (addToCurrentPos buf 2#)
 639            '-'# -> case lookAhead# buf 2# of
 640                     '#'# -> lex_prag cont (addToCurrentPos buf 3#)
 641                     _    -> cont ITocurly (incCurrentPos buf)
 642            _ -> (layoutOff `thenP_` cont ITocurly)  (incCurrentPos buf)
 643
 644
 645
 646
 647     -- strings/characters -------------------------------------------------
 648     '\"'#{-"-} -> lex_string cont exts [] (incCurrentPos buf)
 649     '\''#      -> lex_char (char_end cont) exts (incCurrentPos buf)
 650
 651         -- Hexadecimal and octal constants
 652     '0'# | (ch `eqChar#` 'x'# || ch `eqChar#` 'X'#) && is_hexdigit ch2
 653                 -> readNum (after_lexnum cont exts) buf' is_hexdigit 16 hex
 654          | (ch `eqChar#` 'o'# || ch `eqChar#` 'O'#) && is_octdigit ch2
 655                 -> readNum (after_lexnum cont exts) buf' is_octdigit  8 oct_or_dec
 656         where ch   = lookAhead# buf 1#
 657               ch2  = lookAhead# buf 2#
 658               buf' = addToCurrentPos buf 2#
 659
 660     '\NUL'# ->
 661             if bufferExhausted (stepOn buf) then
 662                cont ITeof buf
 663             else
 664                trace "lexIface: misplaced NUL?" $
 665                cont (ITunknown "\NUL") (stepOn buf)
 666
 667     '?'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) ->  -- ?x implicit parameter
 668             specialPrefixId ITdupipvarid cont exts (incCurrentPos buf)
 669     '%'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) ->
 670             specialPrefixId ITsplitipvarid cont exts (incCurrentPos buf)
 671
 672     ---------------- MetaHaskell Extensions for quotation escape
 673     '$'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) ->  -- $x  variable escape
 674             specialPrefixId ITidEscape cont exts (addToCurrentPos buf 1#)
 675     '$'# | glaExtsEnabled exts &&  -- $( f x )  expression escape
 676            ((lookAhead# buf 1#) `eqChar#` '('#) -> cont ITparenEscape (addToCurrentPos buf 2#)
 677
 678     c | is_digit  c -> lex_num cont exts 0 buf
 679       | is_symbol c -> lex_sym cont exts buf
 680       | is_upper  c -> lex_con cont exts buf
 681       | is_lower  c -> lex_id  cont exts buf
 682       | otherwise   -> lexError "illegal character" buf
 683
 684 -- Int# is unlifted, and therefore faster than Bool for flags.
 685 {-# INLINE flag #-}
 686 flag :: Int# -> Bool
 687 flag 0# = False
 688 flag _  = True
 689
 690 -------------------------------------------------------------------------------
 691 -- Pragmas
 692
 693 lex_prag cont buf
 694   = case expandWhile# is_space buf of { buf1 ->
 695     case expandWhile# is_ident (stepOverLexeme buf1) of { buf2 ->
 696     let lexeme = mkFastString (map toUpper (lexemeToString buf2)) in
 697     case lookupUFM pragmaKeywordsFM lexeme of
 698         Just kw -> cont kw (mergeLexemes buf buf2)
 699         Nothing -> panic "lex_prag"
 700   }}
 701
 702 -------------------------------------------------------------------------------
 703 -- Strings & Chars
 704
 705 lex_string cont exts s buf
 706   = case currentChar# buf of
 707         '"'#{-"-} ->
 708            let buf' = incCurrentPos buf
 709            in case currentChar# buf' of
 710                 '#'# | glaExtsEnabled exts ->
 711                    if any (> 0xFF) s
 712                     then lexError "primitive string literal must contain only characters <= \'\\xFF\'" buf'
 713                     else let s' = mkFastStringNarrow (map chr (reverse s)) in
 714                          -- always a narrow string/byte array
 715                          cont (ITprimstring s') (incCurrentPos buf')
 716
 717                 _other -> let s' = mkFastString (map chr (reverse s))
 718                           in cont (ITstring s') buf'
 719
 720         -- ignore \& in a string, deal with string gaps
 721         '\\'# | next_ch `eqChar#` '&'#
 722                 -> lex_string cont exts s buf'
 723               | is_space next_ch
 724                 -> lex_stringgap cont exts s (incCurrentPos buf)
 725
 726             where next_ch = lookAhead# buf 1#
 727                   buf' = addToCurrentPos buf 2#
 728
 729         _ -> lex_char (lex_next_string cont s) exts buf
 730
 731 lex_stringgap cont exts s buf
 732   = let buf' = incCurrentPos buf in
 733     case currentChar# buf of
 734         '\n'# -> \st@PState{loc = loc} -> lex_stringgap cont exts s buf'
 735                   st{loc = incSrcLine loc}
 736         '\\'# -> lex_string cont exts s buf'
 737         c | is_space c -> lex_stringgap cont exts s buf'
 738         other -> charError buf'
 739
 740 lex_next_string cont s exts c buf = lex_string cont exts (c:s) buf
 741
 742 lex_char :: (Int# -> Int -> P a) -> Int# -> P a
 743 lex_char cont exts buf
 744   = case currentChar# buf of
 745         '\\'# -> lex_escape (cont exts) (incCurrentPos buf)
 746         c | is_any c -> cont exts (I# (ord# c)) (incCurrentPos buf)
 747         other -> charError buf
 748
 749 char_end cont exts c buf
 750   = case currentChar# buf of
 751         '\''# -> let buf' = incCurrentPos buf in
 752                  case currentChar# buf' of
 753                         '#'# | glaExtsEnabled exts
 754                                 -> cont (ITprimchar c) (incCurrentPos buf')
 755                         _       -> cont (ITchar c) buf'
 756         _     -> charError buf
 757
 758 lex_escape cont buf
 759   = let buf' = incCurrentPos buf in
 760     case currentChar# buf of
 761         'a'#       -> cont (ord '\a') buf'
 762         'b'#       -> cont (ord '\b') buf'
 763         'f'#       -> cont (ord '\f') buf'
 764         'n'#       -> cont (ord '\n') buf'
 765         'r'#       -> cont (ord '\r') buf'
 766         't'#       -> cont (ord '\t') buf'
 767         'v'#       -> cont (ord '\v') buf'
 768         '\\'#      -> cont (ord '\\') buf'
 769         '"'#       -> cont (ord '\"') buf'
 770         '\''#      -> cont (ord '\'') buf'
 771         '^'#       -> let c = currentChar# buf' in
 772                       if c `geChar#` '@'# && c `leChar#` '_'#
 773                         then cont (I# (ord# c -# ord# '@'#)) (incCurrentPos buf')
 774                         else charError buf'
 775
 776         'x'#      -> readNum (after_charnum cont) buf' is_hexdigit 16 hex
 777         'o'#      -> readNum (after_charnum cont) buf' is_octdigit  8 oct_or_dec
 778         x | is_digit x
 779                   -> readNum (after_charnum cont) buf is_digit    10 oct_or_dec
 780
 781         _          -> case [ (c,buf2) | (p,c) <- silly_escape_chars,
 782                                        Just buf2 <- [prefixMatch buf p] ] of
 783                             (c,buf2):_ -> cont (ord c) buf2
 784                             [] -> charError buf'
 785
 786 after_charnum cont i buf
 787   = if i >= 0 && i <= 0x10FFFF
 788         then cont (fromInteger i) buf
 789         else charError buf
 790
 791 readNum cont buf is_digit base conv = read buf 0
 792   where read buf i
 793           = case currentChar# buf of { c ->
 794             if is_digit c
 795                 then read (incCurrentPos buf) (i*base + (toInteger (I# (conv c))))
 796                 else cont i buf
 797             }
 798
 799 is_hexdigit c
 800         =  is_digit c
 801         || (c `geChar#` 'a'# && c `leChar#` 'f'#)
 802         || (c `geChar#` 'A'# && c `leChar#` 'F'#)
 803
 804 hex c | is_digit c = ord# c -# ord# '0'#
 805       | otherwise  = ord# (to_lower c) -# ord# 'a'# +# 10#
 806 oct_or_dec c = ord# c -# ord# '0'#
 807
 808 is_octdigit c = c `geChar#` '0'# && c `leChar#` '7'#
 809
 810 to_lower c
 811   | c `geChar#` 'A'# && c `leChar#` 'Z'#
 812         = chr# (ord# c -# (ord# 'A'# -# ord# 'a'#))
 813   | otherwise = c
 814
 815 charError buf = lexError "error in character literal" buf
 816
 817 silly_escape_chars = [
 818         ("NUL", '\NUL'),
 819         ("SOH", '\SOH'),
 820         ("STX", '\STX'),
 821         ("ETX", '\ETX'),
 822         ("EOT", '\EOT'),
 823         ("ENQ", '\ENQ'),
 824         ("ACK", '\ACK'),
 825         ("BEL", '\BEL'),
 826         ("BS", '\BS'),
 827         ("HT", '\HT'),
 828         ("LF", '\LF'),
 829         ("VT", '\VT'),
 830         ("FF", '\FF'),
 831         ("CR", '\CR'),
 832         ("SO", '\SO'),
 833         ("SI", '\SI'),
 834         ("DLE", '\DLE'),
 835         ("DC1", '\DC1'),
 836         ("DC2", '\DC2'),
 837         ("DC3", '\DC3'),
 838         ("DC4", '\DC4'),
 839         ("NAK", '\NAK'),
 840         ("SYN", '\SYN'),
 841         ("ETB", '\ETB'),
 842         ("CAN", '\CAN'),
 843         ("EM", '\EM'),
 844         ("SUB", '\SUB'),
 845         ("ESC", '\ESC'),
 846         ("FS", '\FS'),
 847         ("GS", '\GS'),
 848         ("RS", '\RS'),
 849         ("US", '\US'),
 850         ("SP", '\SP'),
 851         ("DEL", '\DEL')
 852         ]
 853
 854 -----------------------------------------------------------------------------
 855 -- Numbers
 856
 857 lex_num :: (Token -> P a) -> Int# -> Integer -> P a
 858 lex_num cont exts acc buf =
 859  case scanNumLit acc buf of
 860      (acc',buf') ->
 861        case currentChar# buf' of
 862          '.'# | is_digit (lookAhead# buf' 1#) ->
 863              -- this case is not optimised at all, as the
 864              -- presence of floating point numbers in interface
 865              -- files is not that common. (ToDo)
 866             case expandWhile# is_digit (incCurrentPos buf') of
 867               buf2 -> -- points to first non digit char
 868                   case currentChar# buf2 of
 869                         'E'# -> float_exponent cont exts buf2
 870                         'e'# -> float_exponent cont exts buf2
 871                         _    -> float_done cont exts buf2
 872
 873          -- numbers like '9e4' are floats
 874          'E'# -> float_exponent cont exts buf'
 875          'e'# -> float_exponent cont exts buf'
 876          _    -> after_lexnum cont exts acc' buf' -- it's an integer
 877
 878 float_exponent cont exts buf2 =
 879   let buf3 = incCurrentPos buf2
 880       buf4 = case currentChar# buf3 of
 881                 '-'# | is_digit (lookAhead# buf3 1#)
 882                         -> expandWhile# is_digit (incCurrentPos buf3)
 883                 '+'# | is_digit (lookAhead# buf3 1#)
 884                         -> expandWhile# is_digit (incCurrentPos buf3)
 885                 x | is_digit x -> expandWhile# is_digit buf3
 886                 _ -> buf2
 887   in
 888      float_done cont exts buf4
 889
 890 float_done cont exts buf =
 891    case currentChar# buf of -- glasgow exts only
 892         '#'# | glaExtsEnabled exts ->
 893               let buf' = incCurrentPos buf in
 894               case currentChar# buf' of
 895                 '#'# -> cont (ITprimdouble v) (incCurrentPos buf')
 896                 _    -> cont (ITprimfloat  v) buf'
 897         _ -> cont (ITrational v) buf
 898  where
 899    v = readRational__ (lexemeToString buf)
 900
 901 after_lexnum cont exts i buf
 902   = case currentChar# buf of
 903         '#'# | glaExtsEnabled exts -> cont (ITprimint i) (incCurrentPos buf)
 904         _                          -> cont (ITinteger i) buf
 905
 906 readRational :: ReadS Rational -- NB: doesn't handle leading "-"
 907 readRational r = do
 908      (n,d,s) <- readFix r
 909      (k,t)   <- readExp s
 910      return ((n%1)*10^^(k-d), t)
 911  where
 912      readFix r = do
 913         (ds,s)  <- lexDecDigits r
 914         (ds',t) <- lexDotDigits s
 915         return (read (ds++ds'), length ds', t)
 916
 917      readExp (e:s) | e `elem` "eE" = readExp' s
 918      readExp s                     = return (0,s)
 919
 920      readExp' ('+':s) = readDec s
 921      readExp' ('-':s) = do
 922                         (k,t) <- readDec s
 923                         return (-k,t)
 924      readExp' s       = readDec s
 925
 926      readDec s = do
 927         (ds,r) <- nonnull isDigit s
 928         return (foldl1 (\n d -> n * 10 + d) [ ord d - ord '0' | d <- ds ],
 929                 r)
 930
 931      lexDecDigits = nonnull isDigit
 932
 933      lexDotDigits ('.':s) = return (span isDigit s)
 934      lexDotDigits s       = return ("",s)
 935
 936      nonnull p s = do (cs@(_:_),t) <- return (span p s)
 937                       return (cs,t)
 938
 939 readRational__ :: String -> Rational -- NB: *does* handle a leading "-"
 940 readRational__ top_s
 941   = case top_s of
 942       '-' : xs -> - (read_me xs)
 943       xs       -> read_me xs
 944   where
 945     read_me s
 946       = case (do { (x,"") <- readRational s ; return x }) of
 947           [x] -> x
 948           []  -> error ("readRational__: no parse:"        ++ top_s)
 949           _   -> error ("readRational__: ambiguous parse:" ++ top_s)
 950
 951 -----------------------------------------------------------------------------
 952 -- C "literal literal"s  (i.e. things like ``NULL'', ``stdout'' etc.)
 953
 954 -- we lexemeToFastString on the bit between the ``''s, but include the
 955 -- quotes in the full lexeme.
 956
 957 lex_cstring cont buf =
 958  case expandUntilMatch (stepOverLexeme buf) "\'\'" of
 959    Just buf' -> cont (ITlitlit (lexemeToFastString
 960                                 (addToCurrentPos buf' (negateInt# 2#))))
 961                    (mergeLexemes buf buf')
 962    Nothing   -> lexError "unterminated ``" buf
 963
 964 -----------------------------------------------------------------------------
 965 -- identifiers, symbols etc.
 966
 967 -- used for identifiers with special prefixes like
 968 -- ?x (implicit parameters), $x (MetaHaskell escapes) and #x
 969 -- we've already seen the prefix char, so look for an id, and wrap
 970 -- the new "ip_constr" around the lexeme returned
 971
 972 specialPrefixId ip_constr cont exts buf = lex_id newcont exts buf
 973  where newcont (ITvarid lexeme) buf2 = cont (ip_constr (tailFS lexeme)) buf2
 974        newcont token buf2 = cont token buf2
 975 {-
 976  case expandWhile# is_ident buf of
 977    buf' -> cont (ip_constr (tailFS lexeme)) buf'
 978         where lexeme = lexemeToFastString buf'
 979 -}
 980
 981 lex_id cont exts buf =
 982  let buf1 = expandWhile# is_ident buf in
 983  seq buf1 $
 984
 985  case (if glaExtsEnabled exts
 986         then expandWhile# (eqChar# '#'#) buf1 -- slurp trailing hashes
 987         else buf1)                              of { buf' ->
 988  seq buf' $
 989
 990  let lexeme  = lexemeToFastString buf' in
 991
 992  case _scc_ "haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
 993         Just kwd_token -> --trace ("hkeywd: "++unpackFS(lexeme)) $
 994                           cont kwd_token buf';
 995         Nothing        ->
 996
 997  let var_token = cont (ITvarid lexeme) buf' in
 998
 999  case lookupUFM ghcExtensionKeywordsFM lexeme of {
1000     Just (kwd_token, validExts)
1001       | validExts .&. (toInt32 exts) /= 0 -> cont kwd_token buf';
1002     _                                     -> var_token
1003
1004  }}}
1005
1006 lex_sym cont exts buf =
1007  -- trace "lex_sym" $
1008  case expandWhile# is_symbol buf of
1009    buf' -> case lookupUFM haskellKeySymsFM lexeme of {
1010                 Just (kwd_token, Nothing)
1011                         -> cont kwd_token buf' ;
1012                 Just (kwd_token, Just validExts)
1013                         | validExts .&. toInt32 exts /= 0
1014                         -> cont kwd_token buf' ;
1015                 other   -> cont (mk_var_token lexeme) buf'
1016            }
1017         where lexeme = lexemeToFastString buf'
1018
1019
1020 -- lex_con recursively collects components of a qualified identifer.
1021 -- The argument buf is the StringBuffer representing the lexeme
1022 -- identified so far, where the next character is upper-case.
1023
1024 lex_con cont exts buf =
1025  -- trace ("con: "{-++unpackFS lexeme-}) $
1026  let empty_buf = stepOverLexeme buf in
1027  case expandWhile# is_ident empty_buf of { buf1 ->
1028  case slurp_trailing_hashes buf1 exts of { con_buf ->
1029
1030  let all_buf = mergeLexemes buf con_buf
1031
1032      con_lexeme = lexemeToFastString con_buf
1033      mod_lexeme = lexemeToFastString (decCurrentPos buf)
1034      all_lexeme = lexemeToFastString all_buf
1035
1036      just_a_conid
1037         | emptyLexeme buf = cont (ITconid con_lexeme)               all_buf
1038         | otherwise       = cont (ITqconid (mod_lexeme,con_lexeme)) all_buf
1039  in
1040
1041  case currentChar# all_buf of
1042      '.'# -> maybe_qualified cont exts all_lexeme
1043                 (incCurrentPos all_buf) just_a_conid
1044      _    -> just_a_conid
1045   }}
1046
1047
1048 maybe_qualified cont exts mod buf just_a_conid =
1049  -- trace ("qid: "{-++unpackFS lexeme-}) $
1050  case currentChar# buf of
1051   '['# ->       -- Special case for []
1052     case lookAhead# buf 1# of
1053      ']'# -> cont (ITqconid  (mod,FSLIT("[]"))) (addToCurrentPos buf 2#)
1054      _    -> just_a_conid
1055
1056   '('# ->  -- Special case for (,,,)
1057            -- This *is* necessary to deal with e.g. "instance C PrelBase.(,,)"
1058     case lookAhead# buf 1# of
1059      '#'# | glaExtsEnabled exts -> case lookAhead# buf 2# of
1060                 ','# -> lex_ubx_tuple cont mod (addToCurrentPos buf 3#)
1061                                 just_a_conid
1062                 _    -> just_a_conid
1063      ')'# -> cont (ITqconid (mod,FSLIT("()"))) (addToCurrentPos buf 2#)
1064      ','# -> lex_tuple cont mod (addToCurrentPos buf 2#) just_a_conid
1065      _    -> just_a_conid
1066
1067   '-'# -> case lookAhead# buf 1# of
1068             '>'# -> cont (ITqconid (mod,FSLIT("(->)"))) (addToCurrentPos buf 2#)
1069             _    -> lex_id3 cont exts mod buf just_a_conid
1070
1071   _    -> lex_id3 cont exts mod buf just_a_conid
1072
1073
1074 lex_id3 cont exts mod buf just_a_conid
1075   | is_upper (currentChar# buf) =
1076      lex_con cont exts buf
1077
1078   | is_symbol (currentChar# buf) =
1079      let
1080         start_new_lexeme = stepOverLexeme buf
1081      in
1082      -- trace ("lex_id31 "{-++unpackFS lexeme-}) $
1083      case expandWhile# is_symbol start_new_lexeme of { buf' ->
1084      let
1085        lexeme  = lexemeToFastString buf'
1086         -- real lexeme is M.<sym>
1087        new_buf = mergeLexemes buf buf'
1088      in
1089      cont (mk_qvar_token mod lexeme) new_buf
1090         -- wrong, but arguably morally right: M... is now a qvarsym
1091      }
1092
1093   | otherwise   =
1094      let
1095         start_new_lexeme = stepOverLexeme buf
1096      in
1097      -- trace ("lex_id32 "{-++unpackFS lexeme-}) $
1098      case expandWhile# is_ident start_new_lexeme of { buf1 ->
1099      if emptyLexeme buf1
1100             then just_a_conid
1101             else
1102
1103      case slurp_trailing_hashes buf1 exts of { buf' ->
1104
1105      let
1106       lexeme      = lexemeToFastString buf'
1107       new_buf     = mergeLexemes buf buf'
1108       is_a_qvarid = cont (mk_qvar_token mod lexeme) new_buf
1109      in
1110      case _scc_ "haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
1111             Nothing          -> is_a_qvarid ;
1112
1113             Just kwd_token | isSpecial kwd_token   -- special ids (as, qualified, hiding) shouldn't be
1114                            -> is_a_qvarid          --  recognised as keywords here.
1115                            | otherwise
1116                            -> just_a_conid         -- avoid M.where etc.
1117      }}}
1118
1119 slurp_trailing_hashes buf exts
1120   | glaExtsEnabled exts = expandWhile# (`eqChar#` '#'#) buf
1121   | otherwise           = buf
1122
1123
1124 mk_var_token pk_str
1125   | is_upper f          = ITconid pk_str
1126   | is_ident f          = ITvarid pk_str
1127   | f `eqChar#` ':'#    = ITconsym pk_str
1128   | otherwise           = ITvarsym pk_str
1129   where
1130       (C# f) = headFS pk_str
1131       -- tl     = _TAIL_ pk_str
1132
1133 mk_qvar_token m token =
1134 -- trace ("mk_qvar ") $
1135  case mk_var_token token of
1136    ITconid n  -> ITqconid  (m,n)
1137    ITvarid n  -> ITqvarid  (m,n)
1138    ITconsym n -> ITqconsym (m,n)
1139    ITvarsym n -> ITqvarsym (m,n)
1140    _          -> ITunknown (show token)
1141 \end{code}
1142
1143 ----------------------------------------------------------------------------
1144 Horrible stuff for dealing with M.(,,,)
1145
1146 \begin{code}
1147 lex_tuple cont mod buf back_off =
1148   go 2 buf
1149   where
1150    go n buf =
1151     case currentChar# buf of
1152       ','# -> go (n+1) (stepOn buf)
1153       ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Boxed n))) (stepOn buf)
1154       _    -> back_off
1155
1156 lex_ubx_tuple cont mod buf back_off =
1157   go 2 buf
1158   where
1159    go n buf =
1160     case currentChar# buf of
1161       ','# -> go (n+1) (stepOn buf)
1162       '#'# -> case lookAhead# buf 1# of
1163                 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Unboxed n)))
1164                                  (stepOnBy# buf 2#)
1165                 _    -> back_off
1166       _    -> back_off
1167 \end{code}
1168
1169 -----------------------------------------------------------------------------
1170
1171 \begin{code}
1172 data LayoutContext
1173   = NoLayout
1174   | Layout Int#
1175
1176 data ParseResult a
1177   = POk PState a
1178   | PFailed Message
1179
1180 data PState = PState {
1181         loc        :: SrcLoc,
1182         extsBitmap :: Int#,     -- bitmap that determines permitted extensions
1183         bol        :: Int#,
1184         atbol      :: Int#,
1185         context    :: [LayoutContext]
1186      }
1187
1188 type P a = StringBuffer         -- Input string
1189         -> PState
1190         -> ParseResult a
1191
1192 returnP   :: a -> P a
1193 returnP a buf s = POk s a
1194
1195 thenP      :: P a -> (a -> P b) -> P b
1196 m `thenP` k = \ buf s ->
1197         case m buf s of
1198                 POk s1 a -> k a buf s1
1199                 PFailed err  -> PFailed err
1200
1201 thenP_     :: P a -> P b -> P b
1202 m `thenP_` k = m `thenP` \_ -> k
1203
1204 mapP :: (a -> P b) -> [a] -> P [b]
1205 mapP f [] = returnP []
1206 mapP f (a:as) =
1207      f a `thenP` \b ->
1208      mapP f as `thenP` \bs ->
1209      returnP (b:bs)
1210
1211 failP :: String -> P a
1212 failP msg buf s = PFailed (text msg)
1213
1214 failMsgP :: Message -> P a
1215 failMsgP msg buf s = PFailed msg
1216
1217 lexError :: String -> P a
1218 lexError str buf s@PState{ loc = loc }
1219   = failMsgP (hcat [ppr loc, text ": ", text str]) buf s
1220
1221 getSrcLocP :: P SrcLoc
1222 getSrcLocP buf s@(PState{ loc = loc }) = POk s loc
1223
1224 -- use a temporary SrcLoc for the duration of the argument
1225 setSrcLocP :: SrcLoc -> P a -> P a
1226 setSrcLocP new_loc p buf s =
1227   case p buf s{ loc=new_loc } of
1228       POk _ a   -> POk s a
1229       PFailed e -> PFailed e
1230
1231 getSrcFile :: P FastString
1232 getSrcFile buf s@(PState{ loc = loc }) = POk s (srcLocFile loc)
1233
1234 pushContext :: LayoutContext -> P ()
1235 pushContext ctxt buf s@(PState{ context = ctx }) = POk s{context = ctxt:ctx} ()
1236
1237 {-
1238
1239 This special case in layoutOn is to handle layout contexts with are
1240 indented the same or less than the current context.  This is illegal
1241 according to the Haskell spec, so we have to arrange to close the
1242 current context.  eg.
1243
1244 class Foo a where
1245 class Bar a
1246
1247 after the first 'where', the sequence of events is:
1248
1249         - layout system inserts a ';' (column 0)
1250         - parser begins a new context at column 0
1251         - parser shifts ';' (legal empty declaration)
1252         - parser sees 'class': parse error (we're still in the inner context)
1253
1254 trouble is, by the time we know we need a new context, the lexer has
1255 already generated the ';'.  Hacky solution is as follows: since we
1256 know the column of the next token (it's the column number of the new
1257 context), we set the ACTUAL column number of the new context to this
1258 numer plus one.  Hence the next time the lexer is called, a '}' will
1259 be generated to close the new context straight away.  Furthermore, we
1260 have to set the atbol flag so that the ';' that the parser shifted as
1261 part of the new context is re-generated.
1262
1263 when the new context is *less* indented than the current one:
1264
1265 f = f where g = g where
1266 h = h
1267
1268         - current context: column 12.
1269         - on seeing 'h' (column 0), the layout system inserts '}'
1270         - parser starts a new context, column 0
1271         - parser sees '}', uses it to close new context
1272         - we still need to insert another '}' followed by a ';',
1273           hence the atbol trick.
1274
1275 There's also a special hack in here to deal with
1276
1277         do
1278            ....
1279            e $ do
1280            blah
1281
1282 i.e. the inner context is at the same indentation level as the outer
1283 context.  This is strictly illegal according to Haskell 98, but
1284 there's a lot of existing code using this style and it doesn't make
1285 any sense to disallow it, since empty 'do' lists don't make sense.
1286 -}
1287
1288 layoutOn :: Bool -> P ()
1289 layoutOn strict buf s@(PState{ bol = bol, context = ctx }) =
1290     let offset = lexemeIndex buf -# bol in
1291     case ctx of
1292         Layout prev_off : _
1293            | if strict then prev_off >=# offset else prev_off ># offset ->
1294                 --trace ("layout on, column: " ++  show (I# offset)) $
1295                 POk s{ context = Layout (offset +# 1#) : ctx, atbol = 1# } ()
1296         other ->
1297                 --trace ("layout on, column: " ++  show (I# offset)) $
1298                 POk s{ context = Layout offset : ctx } ()
1299
1300 layoutOff :: P ()
1301 layoutOff buf s@(PState{ context = ctx }) =
1302     POk s{ context = NoLayout:ctx } ()
1303
1304 popContext :: P ()
1305 popContext = \ buf s@(PState{ context = ctx, loc = loc }) ->
1306   case ctx of
1307         (_:tl) -> POk s{ context = tl } ()
1308         []     -> PFailed (srcParseErr buf loc)
1309
1310 -- for reasons of efficiency, flags indicating language extensions (eg,
1311 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1312 -- integer
1313
1314 glaExtsBit, ffiBit, parrBit :: Int
1315 glaExtsBit = 0
1316 ffiBit     = 1
1317 parrBit    = 2
1318 withBit    = 3
1319 arrowsBit  = 4
1320
1321 glaExtsEnabled, ffiEnabled, parrEnabled :: Int# -> Bool
1322 glaExtsEnabled flags = testBit (toInt32 flags) glaExtsBit
1323 ffiEnabled     flags = testBit (toInt32 flags) ffiBit
1324 withEnabled    flags = testBit (toInt32 flags) withBit
1325 parrEnabled    flags = testBit (toInt32 flags) parrBit
1326 arrowsEnabled  flags = testBit (toInt32 flags) arrowsBit
1327
1328 toInt32 :: Int# -> Int32
1329 toInt32 x# = fromIntegral (I# x#)
1330
1331 -- convenient record-based bitmap for the interface to the rest of the world
1332 --
1333 -- NB: `glasgowExtsEF' implies `ffiEF' (see `mkPState' below)
1334 --
1335 data ExtFlags = ExtFlags {
1336                   glasgowExtsEF :: Bool,
1337                   ffiEF         :: Bool,
1338                   withEF        :: Bool,
1339                   parrEF        :: Bool,
1340                   arrowsEF      :: Bool
1341                 }
1342
1343 -- create a parse state
1344 --
1345 mkPState          :: SrcLoc -> ExtFlags -> PState
1346 mkPState loc exts  =
1347   PState {
1348     loc        = loc,
1349       extsBitmap = case (fromIntegral bitmap) of {I# bits -> bits},
1350       bol         = 0#,
1351       atbol      = 1#,
1352       context    = []
1353     }
1354     where
1355       bitmap =     glaExtsBit `setBitIf` glasgowExtsEF     exts
1356                .|. ffiBit     `setBitIf` (ffiEF            exts
1357                                           || glasgowExtsEF exts)
1358                .|. withBit    `setBitIf` withEF            exts
1359                .|. parrBit    `setBitIf` parrEF            exts
1360                .|. arrowsBit  `setBitIf` arrowsEF          exts
1361       --
1362       setBitIf :: Int -> Bool -> Int32
1363       b `setBitIf` cond | cond      = bit b
1364                         | otherwise = 0
1365
1366 -----------------------------------------------------------------------------
1367
1368 srcParseErr :: StringBuffer -> SrcLoc -> Message
1369 srcParseErr s l
1370   = hcat [ppr l,
1371           if null token
1372              then ptext SLIT(": parse error (possibly incorrect indentation)")
1373              else hcat [ptext SLIT(": parse error on input "),
1374                         char '`', text token, char '\'']
1375     ]
1376   where
1377         token = lexemeToString s
1378
1379 \end{code}