ghc/compiler/parser/Lex.lhs

   1 %
   2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998
   3 %
   4 \section[Lexical analysis]{Lexical analysis}
   5
   6 --------------------------------------------------------
   7 [Jan 98]
   8 There's a known bug in here:
   9
  10         If an interface file ends prematurely, Lex tries to
  11         do headFS of an empty FastString.
  12
  13 An example that provokes the error is
  14
  15         f _:_ _forall_ [a] <<<END OF FILE>>>
  16 --------------------------------------------------------
  17
  18 \begin{code}
  19
  20 module Lex (
  21
  22         ifaceParseErr, srcParseErr,
  23
  24         -- Monad for parser
  25         Token(..), lexer, ParseResult(..), PState(..),
  26         checkVersion, ExtFlags(..), mkPState,
  27         StringBuffer,
  28
  29         P, thenP, thenP_, returnP, mapP, failP, failMsgP,
  30         getSrcLocP, setSrcLocP, getSrcFile,
  31         layoutOn, layoutOff, pushContext, popContext
  32     ) where
  33
  34 #include "HsVersions.h"
  35
  36 import Char             ( isSpace, toUpper )
  37 import List             ( isSuffixOf )
  38
  39 import PrelNames        ( mkTupNameStr )
  40 import CmdLineOpts      ( opt_HiVersion, opt_NoHiCheck )
  41 import ForeignCall      ( Safety(..) )
  42 import NewDemand        ( StrictSig(..), Demand(..), Demands(..),
  43                           DmdResult(..), mkTopDmdType, evalDmd, lazyDmd )
  44 import UniqFM           ( listToUFM, lookupUFM )
  45 import BasicTypes       ( Boxity(..) )
  46 import SrcLoc           ( SrcLoc, incSrcLine, srcLocFile, srcLocLine,
  47                           replaceSrcLine, mkSrcLoc )
  48
  49 import ErrUtils         ( Message )
  50 import Outputable
  51
  52 import FastString
  53 import StringBuffer
  54 import GlaExts
  55 import Ctype
  56 import Char             ( chr, ord )
  57 import PrelRead         ( readRational__ ) -- Glasgow non-std
  58 import PrelBits         ( Bits(..) )       -- non-std
  59 \end{code}
  60
  61 %************************************************************************
  62 %*                                                                      *
  63 \subsection{Data types}
  64 %*                                                                      *
  65 %************************************************************************
  66
  67 The token data type, fairly un-interesting except from one
  68 constructor, @ITidinfo@, which is used to lazily lex id info (arity,
  69 strictness, unfolding etc).
  70
  71 The Idea/Observation here is that the renamer needs to scan through
  72 all of an interface file before it can continue. But only a fraction
  73 of the information contained in the file turns out to be useful, so
  74 delaying as much as possible of the scanning and parsing of an
  75 interface file Makes Sense (Heap profiles of the compiler
  76 show a reduction in heap usage by at least a factor of two,
  77 post-renamer).
  78
  79 Hence, the interface file lexer spots when value declarations are
  80 being scanned and return the @ITidinfo@ and @ITtype@ constructors
  81 for the type and any other id info for that binding (unfolding, strictness
  82 etc). These constructors are applied to the result of lexing these sub-chunks.
  83
  84 The lexing of the type and id info is all done lazily, of course, so
  85 the scanning (and subsequent parsing) will be done *only* on the ids the
  86 renamer finds out that it is interested in. The rest will just be junked.
  87 Laziness, you know it makes sense :-)
  88
  89 \begin{code}
  90 data Token
  91   = ITas                        -- Haskell keywords
  92   | ITcase
  93   | ITclass
  94   | ITdata
  95   | ITdefault
  96   | ITderiving
  97   | ITdo
  98   | ITelse
  99   | IThiding
 100   | ITif
 101   | ITimport
 102   | ITin
 103   | ITinfix
 104   | ITinfixl
 105   | ITinfixr
 106   | ITinstance
 107   | ITlet
 108   | ITmodule
 109   | ITnewtype
 110   | ITof
 111   | ITqualified
 112   | ITthen
 113   | ITtype
 114   | ITwhere
 115   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 116
 117   | ITforall                    -- GHC extension keywords
 118   | ITforeign
 119   | ITexport
 120   | ITlabel
 121   | ITdynamic
 122   | ITsafe
 123   | ITunsafe
 124   | ITwith
 125   | ITstdcallconv
 126   | ITccallconv
 127   | ITdotnet
 128
 129   | ITinterface                 -- interface keywords
 130   | IT__export
 131   | ITdepends
 132   | IT__forall
 133   | ITletrec
 134   | ITcoerce
 135   | ITinlineMe
 136   | ITinlineCall
 137   | ITccall (Bool,Bool,Safety)  -- (is_dyn, is_casm, may_gc)
 138   | ITdefaultbranch
 139   | ITbottom
 140   | ITinteger_lit
 141   | ITfloat_lit
 142   | ITword_lit
 143   | ITword64_lit
 144   | ITint64_lit
 145   | ITrational_lit
 146   | ITaddr_lit
 147   | ITlabel_lit
 148   | ITlit_lit
 149   | ITstring_lit
 150   | ITtypeapp
 151   | ITusage
 152   | ITfuall
 153   | ITarity
 154   | ITspecialise
 155   | ITnocaf
 156   | ITunfold
 157   | ITstrict StrictSig
 158   | ITrules
 159   | ITcprinfo
 160   | ITdeprecated
 161   | IT__scc
 162   | ITsccAllCafs
 163
 164   | ITspecialise_prag           -- Pragmas
 165   | ITsource_prag
 166   | ITinline_prag
 167   | ITnoinline_prag
 168   | ITrules_prag
 169   | ITdeprecated_prag
 170   | ITline_prag
 171   | ITscc_prag
 172   | ITclose_prag
 173
 174   | ITdotdot                    -- reserved symbols
 175   | ITdcolon
 176   | ITequal
 177   | ITlam
 178   | ITvbar
 179   | ITlarrow
 180   | ITrarrow
 181   | ITat
 182   | ITtilde
 183   | ITdarrow
 184   | ITminus
 185   | ITbang
 186   | ITstar
 187   | ITdot
 188
 189   | ITbiglam                    -- GHC-extension symbols
 190
 191   | ITocurly                    -- special symbols
 192   | ITccurly
 193   | ITocurlybar                 -- {|, for type applications
 194   | ITccurlybar                 -- |}, for type applications
 195   | ITvccurly
 196   | ITobrack
 197   | ITopabrack                  -- [:, for parallel arrays with -fparr
 198   | ITcpabrack                  -- :], for parallel arrays with -fparr
 199   | ITcbrack
 200   | IToparen
 201   | ITcparen
 202   | IToubxparen
 203   | ITcubxparen
 204   | ITsemi
 205   | ITcomma
 206   | ITunderscore
 207   | ITbackquote
 208
 209   | ITvarid   FAST_STRING       -- identifiers
 210   | ITconid   FAST_STRING
 211   | ITvarsym  FAST_STRING
 212   | ITconsym  FAST_STRING
 213   | ITqvarid  (FAST_STRING,FAST_STRING)
 214   | ITqconid  (FAST_STRING,FAST_STRING)
 215   | ITqvarsym (FAST_STRING,FAST_STRING)
 216   | ITqconsym (FAST_STRING,FAST_STRING)
 217
 218   | ITdupipvarid   FAST_STRING  -- GHC extension: implicit param: ?x
 219   | ITsplitipvarid FAST_STRING  -- GHC extension: implicit param: %x
 220
 221   | ITpragma StringBuffer
 222
 223   | ITchar       Int
 224   | ITstring     FAST_STRING
 225   | ITinteger    Integer
 226   | ITrational   Rational
 227
 228   | ITprimchar   Int
 229   | ITprimstring FAST_STRING
 230   | ITprimint    Integer
 231   | ITprimfloat  Rational
 232   | ITprimdouble Rational
 233   | ITlitlit     FAST_STRING
 234
 235   | ITunknown String            -- Used when the lexer can't make sense of it
 236   | ITeof                       -- end of file token
 237   deriving Show -- debugging
 238 \end{code}
 239
 240 -----------------------------------------------------------------------------
 241 Keyword Lists
 242
 243 \begin{code}
 244 pragmaKeywordsFM = listToUFM $
 245       map (\ (x,y) -> (_PK_ x,y))
 246        [( "SPECIALISE", ITspecialise_prag ),
 247         ( "SPECIALIZE", ITspecialise_prag ),
 248         ( "SOURCE",     ITsource_prag ),
 249         ( "INLINE",     ITinline_prag ),
 250         ( "NOINLINE",   ITnoinline_prag ),
 251         ( "NOTINLINE",  ITnoinline_prag ),
 252         ( "LINE",       ITline_prag ),
 253         ( "RULES",      ITrules_prag ),
 254         ( "RULEZ",      ITrules_prag ), -- american spelling :-)
 255         ( "SCC",        ITscc_prag ),
 256         ( "DEPRECATED", ITdeprecated_prag )
 257         ]
 258
 259 haskellKeywordsFM = listToUFM $
 260       map (\ (x,y) -> (_PK_ x,y))
 261        [( "_",          ITunderscore ),
 262         ( "as",         ITas ),
 263         ( "case",       ITcase ),
 264         ( "class",      ITclass ),
 265         ( "data",       ITdata ),
 266         ( "default",    ITdefault ),
 267         ( "deriving",   ITderiving ),
 268         ( "do",         ITdo ),
 269         ( "else",       ITelse ),
 270         ( "hiding",     IThiding ),
 271         ( "if",         ITif ),
 272         ( "import",     ITimport ),
 273         ( "in",         ITin ),
 274         ( "infix",      ITinfix ),
 275         ( "infixl",     ITinfixl ),
 276         ( "infixr",     ITinfixr ),
 277         ( "instance",   ITinstance ),
 278         ( "let",        ITlet ),
 279         ( "module",     ITmodule ),
 280         ( "newtype",    ITnewtype ),
 281         ( "of",         ITof ),
 282         ( "qualified",  ITqualified ),
 283         ( "then",       ITthen ),
 284         ( "type",       ITtype ),
 285         ( "where",      ITwhere ),
 286         ( "_scc_",      ITscc )         -- ToDo: remove
 287      ]
 288
 289 isSpecial :: Token -> Bool
 290 -- If we see M.x, where x is a keyword, but
 291 -- is special, we treat is as just plain M.x,
 292 -- not as a keyword.
 293 isSpecial ITas          = True
 294 isSpecial IThiding      = True
 295 isSpecial ITqualified   = True
 296 isSpecial ITforall      = True
 297 isSpecial ITexport      = True
 298 isSpecial ITlabel       = True
 299 isSpecial ITdynamic     = True
 300 isSpecial ITsafe        = True
 301 isSpecial ITunsafe      = True
 302 isSpecial ITwith        = True
 303 isSpecial ITccallconv   = True
 304 isSpecial ITstdcallconv = True
 305 isSpecial _             = False
 306
 307 -- IMPORTANT: Keep this in synch with ParseIface.y's var_fs production! (SUP)
 308 ghcExtensionKeywordsFM = listToUFM $
 309         map (\ (x,y) -> (_PK_ x,y))
 310      [  ( "forall",     ITforall ),
 311         ( "foreign",    ITforeign ),
 312         ( "export",     ITexport ),
 313         ( "label",      ITlabel ),
 314         ( "dynamic",    ITdynamic ),
 315         ( "safe",       ITunsafe ),
 316         ( "unsafe",     ITunsafe ),
 317         ( "with",       ITwith ),
 318         ( "stdcall",    ITstdcallconv),
 319         ( "ccall",      ITccallconv),
 320         ( "dotnet",     ITdotnet),
 321         ("_ccall_",     ITccall (False, False, PlayRisky)),
 322         ("_ccall_GC_",  ITccall (False, False, PlaySafe)),
 323         ("_casm_",      ITccall (False, True,  PlayRisky)),
 324         ("_casm_GC_",   ITccall (False, True,  PlaySafe)),
 325
 326         -- interface keywords
 327         ("__interface",         ITinterface),
 328         ("__export",            IT__export),
 329         ("__depends",           ITdepends),
 330         ("__forall",            IT__forall),
 331         ("__letrec",            ITletrec),
 332         ("__coerce",            ITcoerce),
 333         ("__inline_me",         ITinlineMe),
 334         ("__inline_call",       ITinlineCall),
 335         ("__depends",           ITdepends),
 336         ("__DEFAULT",           ITdefaultbranch),
 337         ("__bot",               ITbottom),
 338         ("__integer",           ITinteger_lit),
 339         ("__float",             ITfloat_lit),
 340         ("__int64",             ITint64_lit),
 341         ("__word",              ITword_lit),
 342         ("__word64",            ITword64_lit),
 343         ("__rational",          ITrational_lit),
 344         ("__addr",              ITaddr_lit),
 345         ("__label",             ITlabel_lit),
 346         ("__litlit",            ITlit_lit),
 347         ("__string",            ITstring_lit),
 348         ("__a",                 ITtypeapp),
 349         ("__u",                 ITusage),
 350         ("__fuall",             ITfuall),
 351         ("__A",                 ITarity),
 352         ("__P",                 ITspecialise),
 353         ("__C",                 ITnocaf),
 354         ("__R",                 ITrules),
 355         ("__D",                 ITdeprecated),
 356         ("__U",                 ITunfold),
 357
 358         ("__ccall",             ITccall (False, False, PlayRisky)),
 359         ("__ccall_GC",          ITccall (False, False, PlaySafe)),
 360         ("__dyn_ccall",         ITccall (True,  False, PlayRisky)),
 361         ("__dyn_ccall_GC",      ITccall (True,  False, PlaySafe)),
 362         ("__casm",              ITccall (False, True,  PlayRisky)),
 363         ("__dyn_casm",          ITccall (True,  True,  PlayRisky)),
 364         ("__casm_GC",           ITccall (False, True,  PlaySafe)),
 365         ("__dyn_casm_GC",       ITccall (True,  True,  PlaySafe)),
 366
 367         ("/\\",                 ITbiglam)
 368      ]
 369
 370
 371 haskellKeySymsFM = listToUFM $
 372         map (\ (x,y) -> (_PK_ x,y))
 373       [ ("..",          ITdotdot)
 374        ,("::",          ITdcolon)
 375        ,("=",           ITequal)
 376        ,("\\",          ITlam)
 377        ,("|",           ITvbar)
 378        ,("<-",          ITlarrow)
 379        ,("->",          ITrarrow)
 380        ,("@",           ITat)
 381        ,("~",           ITtilde)
 382        ,("=>",          ITdarrow)
 383        ,("-",           ITminus)
 384        ,("!",           ITbang)
 385        ,("*",           ITstar)
 386        ,(".",           ITdot)          -- sadly, for 'forall a . t'
 387        ]
 388 \end{code}
 389
 390 -----------------------------------------------------------------------------
 391 The lexical analyser
 392
 393 Lexer state:
 394
 395         - (exts)  lexing a source with extensions, eg, an interface file or
 396                   with -fglasgow-exts
 397         - (bol)   pointer to beginning of line (for column calculations)
 398         - (buf)   pointer to beginning of token
 399         - (buf)   pointer to current char
 400         - (atbol) flag indicating whether we're at the beginning of a line
 401
 402 \begin{code}
 403 lexer :: (Token -> P a) -> P a
 404 lexer cont buf s@(PState{
 405                     loc = loc,
 406                     extsBitmap = exts,
 407                     bol = bol,
 408                     atbol = atbol,
 409                     context = ctx
 410                 })
 411
 412         -- first, start a new lexeme and lose all the whitespace
 413   =  _scc_ "Lexer"
 414   tab line bol atbol (stepOverLexeme buf)
 415   where
 416         line = srcLocLine loc
 417
 418         tab y bol atbol buf = --trace ("tab: " ++ show (I# y) ++ " : " ++ show (currentChar buf)) $
 419           case currentChar# buf of
 420
 421             '\NUL'# ->
 422                    if bufferExhausted (stepOn buf)
 423                         then cont ITeof buf s'
 424                         else trace "lexer: misplaced NUL?" $
 425                              tab y bol atbol (stepOn buf)
 426
 427             '\n'# -> let buf' = stepOn buf
 428                      in tab (y +# 1#) (currentIndex# buf') 1# buf'
 429
 430                 -- find comments.  This got harder in Haskell 98.
 431             '-'# ->  let trundle n =
 432                           let next = lookAhead# buf n in
 433                           if next `eqChar#` '-'# then trundle (n +# 1#)
 434                           else if is_symbol next || n <# 2#
 435                                 then is_a_token
 436                                 else tab y bol atbol
 437                                          (stepOnUntilChar# (stepOnBy# buf n) '\n'#)
 438                     in trundle 1#
 439
 440                 -- comments and pragmas.  We deal with LINE pragmas here,
 441                 -- and throw out any unrecognised pragmas as comments.  Any
 442                 -- pragmas we know about are dealt with later (after any layout
 443                 -- processing if necessary).
 444             '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
 445                 if lookAhead# buf 2# `eqChar#` '#'# then
 446                   if lookAhead# buf 3# `eqChar#` '#'# then is_a_token else
 447                   case expandWhile# is_space (setCurrentPos# buf 3#) of { buf1->
 448                   case expandWhile# is_ident (stepOverLexeme buf1)   of { buf2->
 449                   let lexeme = mkFastString -- ToDo: too slow
 450                                   (map toUpper (lexemeToString buf2)) in
 451                   case lookupUFM pragmaKeywordsFM lexeme of
 452                         -- ignore RULES pragmas when -fglasgow-exts is off
 453                         Just ITrules_prag | not (glaExtsEnabled exts) ->
 454                            skip_to_end (stepOnBy# buf 2#) s'
 455                         Just ITline_prag ->
 456                            line_prag skip_to_end buf2 s'
 457                         Just other -> is_a_token
 458                         Nothing -> skip_to_end (stepOnBy# buf 2#) s'
 459                   }}
 460
 461                 else skip_to_end (stepOnBy# buf 2#) s'
 462                 where
 463                     skip_to_end = skipNestedComment (lexer cont)
 464
 465                 -- special GHC extension: we grok cpp-style #line pragmas
 466             '#'# | lexemeIndex buf ==# bol ->   -- the '#' must be in column 0
 467                 let buf1 | lookAhead# buf 1# `eqChar#` 'l'# &&
 468                            lookAhead# buf 2# `eqChar#` 'i'# &&
 469                            lookAhead# buf 3# `eqChar#` 'n'# &&
 470                            lookAhead# buf 4# `eqChar#` 'e'#  = stepOnBy# buf 5#
 471                          | otherwise = stepOn buf
 472                 in
 473                 case expandWhile# is_space buf1 of { buf2 ->
 474                 if is_digit (currentChar# buf2)
 475                         then line_prag next_line buf2 s'
 476                         else is_a_token
 477                 }
 478                 where
 479                 next_line buf = lexer cont (stepOnUntilChar# buf '\n'#)
 480
 481                 -- tabs have been expanded beforehand
 482             c | is_space c -> tab y bol atbol (stepOn buf)
 483               | otherwise  -> is_a_token
 484
 485            where s' = s{loc = replaceSrcLine loc y,
 486                         bol = bol,
 487                        atbol = atbol}
 488
 489                  is_a_token | atbol /=# 0# = lexBOL cont buf s'
 490                             | otherwise    = lexToken cont exts buf s'
 491
 492 -- {-# LINE .. #-} pragmas.  yeuch.
 493 line_prag cont buf s@PState{loc=loc} =
 494   case expandWhile# is_space buf                of { buf1 ->
 495   case scanNumLit 0 (stepOverLexeme buf1)       of { (line,buf2) ->
 496   -- subtract one: the line number refers to the *following* line.
 497   let real_line = line - 1 in
 498   case fromInteger real_line                    of { i@(I# l) ->
 499         -- ToDo, if no filename then we skip the newline.... d'oh
 500   case expandWhile# is_space buf2               of { buf3 ->
 501   case currentChar# buf3                        of
 502      '\"'#{-"-} ->
 503         case untilEndOfString# (stepOn (stepOverLexeme buf3)) of { buf4 ->
 504         let
 505             file = lexemeToFastString buf4
 506             new_buf = stepOn (stepOverLexeme buf4)
 507         in
 508         if nullFastString file
 509                 then cont new_buf s{loc = replaceSrcLine loc l}
 510                 else cont new_buf s{loc = mkSrcLoc file i}
 511         }
 512      _other -> cont (stepOverLexeme buf3) s{loc = replaceSrcLine loc l}
 513   }}}}
 514
 515 skipNestedComment :: P a -> P a
 516 skipNestedComment cont buf state = skipNestedComment' (loc state) cont buf state
 517
 518 skipNestedComment' :: SrcLoc -> P a -> P a
 519 skipNestedComment' orig_loc cont buf = loop buf
 520  where
 521    loop buf =
 522      case currentChar# buf of
 523         '-'# | lookAhead# buf 1# `eqChar#` '}'# -> cont (stepOnBy# buf 2#)
 524
 525         '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
 526               skipNestedComment
 527                 (skipNestedComment' orig_loc cont)
 528                 (stepOnBy# buf 2#)
 529
 530         '\n'# -> \ s@PState{loc=loc} ->
 531                  let buf' = stepOn buf in
 532                  loop buf' s{loc = incSrcLine loc,
 533                              bol = currentIndex# buf',
 534                              atbol = 1#}
 535
 536         -- pass the original SrcLoc to lexError so that the error is
 537         -- reported at the line it was originally on, not the line at
 538         -- the end of the file.
 539         '\NUL'# | bufferExhausted (stepOn buf) ->
 540                 \s -> lexError "unterminated `{-'" buf s{loc=orig_loc} -- -}
 541
 542         _   -> loop (stepOn buf)
 543
 544 -- When we are lexing the first token of a line, check whether we need to
 545 -- insert virtual semicolons or close braces due to layout.
 546
 547 lexBOL :: (Token -> P a) -> P a
 548 lexBOL cont buf s@(PState{
 549                     loc = loc,
 550                     extsBitmap = exts,
 551                     bol = bol,
 552                     atbol = atbol,
 553                     context = ctx
 554                   }) =
 555         if need_close_curly then
 556                 --trace ("col = " ++ show (I# col) ++ ", layout: inserting '}'") $
 557                 cont ITvccurly buf s{atbol = 1#, context = tail ctx}
 558         else if need_semi_colon then
 559                 --trace ("col = " ++ show (I# col) ++ ", layout: inserting ';'") $
 560                 cont ITsemi buf s{atbol = 0#}
 561         else
 562                 lexToken cont exts buf s{atbol = 0#}
 563   where
 564         col = currentIndex# buf -# bol
 565
 566         need_close_curly =
 567                 case ctx of
 568                         [] -> False
 569                         (i:_) -> case i of
 570                                     NoLayout -> False
 571                                     Layout n -> col <# n
 572         need_semi_colon =
 573                 case ctx of
 574                         [] -> False
 575                         (i:_) -> case i of
 576                                     NoLayout -> False
 577                                     Layout n -> col ==# n
 578
 579
 580 lexToken :: (Token -> P a) -> Int# -> P a
 581 lexToken cont exts buf =
 582 -- trace "lexToken" $
 583   case currentChar# buf of
 584
 585     -- special symbols ----------------------------------------------------
 586     '('# | glaExtsEnabled exts && lookAhead# buf 1# `eqChar#` '#'#
 587                 -> cont IToubxparen (setCurrentPos# buf 2#)
 588          | otherwise
 589                 -> cont IToparen (incLexeme buf)
 590
 591     ')'# -> cont ITcparen    (incLexeme buf)
 592     '['# | parrEnabled exts && lookAhead# buf 1# `eqChar#` ':'# ->
 593             cont ITopabrack  (setCurrentPos# buf 2#)
 594          | otherwise ->
 595             cont ITobrack    (incLexeme buf)
 596     ']'# -> cont ITcbrack    (incLexeme buf)
 597     ','# -> cont ITcomma     (incLexeme buf)
 598     ';'# -> cont ITsemi      (incLexeme buf)
 599     '}'# -> \ s@PState{context = ctx} ->
 600             case ctx of
 601                 (_:ctx') -> cont ITccurly (incLexeme buf) s{context=ctx'}
 602                 _        -> lexError "too many '}'s" buf s
 603     '|'# -> case lookAhead# buf 1# of
 604                  '}'#  | glaExtsEnabled exts -> cont ITccurlybar
 605                                                      (setCurrentPos# buf 2#)
 606                  _                           -> lex_sym cont (incLexeme buf)
 607     ':'# -> case lookAhead# buf 1# of
 608                  ']'#  | parrEnabled exts    -> cont ITcpabrack
 609                                                      (setCurrentPos# buf 2#)
 610                  _                           -> lex_sym cont (incLexeme buf)
 611
 612
 613     '#'# -> case lookAhead# buf 1# of
 614                 ')'#  | glaExtsEnabled exts
 615                      -> cont ITcubxparen (setCurrentPos# buf 2#)
 616                 '-'# -> case lookAhead# buf 2# of
 617                            '}'# -> cont ITclose_prag (setCurrentPos# buf 3#)
 618                            _    -> lex_sym cont (incLexeme buf)
 619                 _    -> lex_sym cont (incLexeme buf)
 620
 621     '`'# | glaExtsEnabled exts && lookAhead# buf 1# `eqChar#` '`'#
 622                 -> lex_cstring cont (setCurrentPos# buf 2#)
 623          | otherwise
 624                 -> cont ITbackquote (incLexeme buf)
 625
 626     '{'# ->     -- look for "{-##" special iface pragma   -- for Emacs: -}
 627             case lookAhead# buf 1# of
 628            '|'# | glaExtsEnabled exts
 629                 -> cont ITocurlybar (setCurrentPos# buf 2#)
 630            '-'# -> case lookAhead# buf 2# of
 631                     '#'# -> case lookAhead# buf 3# of
 632                                 '#'# ->
 633                                    lexPragma
 634                                       cont
 635                                       (\ cont lexeme buf' -> cont (ITpragma lexeme) buf')
 636                                       0#
 637                                       (stepOnBy# (stepOverLexeme buf) 4#)
 638                                 _ -> lex_prag cont (setCurrentPos# buf 3#)
 639                     _    -> cont ITocurly (incLexeme buf)
 640            _ -> (layoutOff `thenP_` cont ITocurly)  (incLexeme buf)
 641
 642     -- strings/characters -------------------------------------------------
 643     '\"'#{-"-} -> lex_string cont exts [] (incLexeme buf)
 644     '\''#      -> lex_char (char_end cont) exts (incLexeme buf)
 645
 646     -- strictness and cpr pragmas and __scc treated specially.
 647     '_'# | glaExtsEnabled exts ->
 648          case lookAhead# buf 1# of
 649            '_'# -> case lookAhead# buf 2# of
 650                     'S'# ->
 651                         lex_demand cont (stepOnUntil (not . isSpace)
 652                                         (stepOnBy# buf 3#)) -- past __S
 653                     'M'# ->
 654                         cont ITcprinfo (stepOnBy# buf 3#)       -- past __M
 655
 656                     's'# ->
 657                         case prefixMatch (stepOnBy# buf 3#) "cc" of
 658                                Just buf' -> lex_scc cont (stepOverLexeme buf')
 659                                Nothing   -> lex_id cont exts buf
 660                     _ -> lex_id cont exts buf
 661            _    -> lex_id cont exts buf
 662
 663         -- Hexadecimal and octal constants
 664     '0'# | (ch `eqChar#` 'x'# || ch `eqChar#` 'X'#) && is_hexdigit ch2
 665                 -> readNum (after_lexnum cont exts) buf' is_hexdigit 16 hex
 666          | (ch `eqChar#` 'o'# || ch `eqChar#` 'O'#) && is_octdigit ch2
 667                 -> readNum (after_lexnum cont exts) buf' is_octdigit  8 oct_or_dec
 668         where ch   = lookAhead# buf 1#
 669               ch2  = lookAhead# buf 2#
 670               buf' = setCurrentPos# buf 2#
 671
 672     '\NUL'# ->
 673             if bufferExhausted (stepOn buf) then
 674                cont ITeof buf
 675             else
 676                trace "lexIface: misplaced NUL?" $
 677                cont (ITunknown "\NUL") (stepOn buf)
 678
 679     '?'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) ->
 680             lex_ip ITdupipvarid cont (incLexeme buf)
 681     '%'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) ->
 682             lex_ip ITsplitipvarid cont (incLexeme buf)
 683     c | is_digit  c -> lex_num cont exts 0 buf
 684       | is_symbol c -> lex_sym cont buf
 685       | is_upper  c -> lex_con cont exts buf
 686       | is_ident  c -> lex_id  cont exts buf
 687       | otherwise   -> lexError "illegal character" buf
 688
 689 -- Int# is unlifted, and therefore faster than Bool for flags.
 690 {-# INLINE flag #-}
 691 flag :: Int# -> Bool
 692 flag 0# = False
 693 flag _  = True
 694
 695 -------------------------------------------------------------------------------
 696 -- Pragmas
 697
 698 lex_prag cont buf
 699   = case expandWhile# is_space buf of { buf1 ->
 700     case expandWhile# is_ident (stepOverLexeme buf1) of { buf2 ->
 701     let lexeme = mkFastString (map toUpper (lexemeToString buf2)) in
 702     case lookupUFM pragmaKeywordsFM lexeme of
 703         Just kw -> cont kw (mergeLexemes buf buf2)
 704         Nothing -> panic "lex_prag"
 705   }}
 706
 707 -------------------------------------------------------------------------------
 708 -- Strings & Chars
 709
 710 lex_string cont exts s buf
 711   = case currentChar# buf of
 712         '"'#{-"-} ->
 713            let buf' = incLexeme buf
 714                s' = mkFastStringNarrow (map chr (reverse s))
 715            in case currentChar# buf' of
 716                 '#'# | glaExtsEnabled exts -> if all (<= 0xFF) s
 717                     then cont (ITprimstring s') (incLexeme buf')
 718                     else lexError "primitive string literal must contain only characters <= \'\\xFF\'" buf'
 719                 _                   -> cont (ITstring s') buf'
 720
 721         -- ignore \& in a string, deal with string gaps
 722         '\\'# | next_ch `eqChar#` '&'#
 723                 -> lex_string cont exts s buf'
 724               | is_space next_ch
 725                 -> lex_stringgap cont exts s (incLexeme buf)
 726
 727             where next_ch = lookAhead# buf 1#
 728                   buf' = setCurrentPos# buf 2#
 729
 730         _ -> lex_char (lex_next_string cont s) exts buf
 731
 732 lex_stringgap cont exts s buf
 733   = let buf' = incLexeme buf in
 734     case currentChar# buf of
 735         '\n'# -> \st@PState{loc = loc} -> lex_stringgap cont exts s buf'
 736                   st{loc = incSrcLine loc}
 737         '\\'# -> lex_string cont exts s buf'
 738         c | is_space c -> lex_stringgap cont exts s buf'
 739         other -> charError buf'
 740
 741 lex_next_string cont s exts c buf = lex_string cont exts (c:s) buf
 742
 743 lex_char :: (Int# -> Int -> P a) -> Int# -> P a
 744 lex_char cont exts buf
 745   = case currentChar# buf of
 746         '\\'# -> lex_escape (cont exts) (incLexeme buf)
 747         c | is_any c -> cont exts (I# (ord# c)) (incLexeme buf)
 748         other -> charError buf
 749
 750 char_end cont exts c buf
 751   = case currentChar# buf of
 752         '\''# -> let buf' = incLexeme buf in
 753                  case currentChar# buf' of
 754                         '#'# | glaExtsEnabled exts
 755                                 -> cont (ITprimchar c) (incLexeme buf')
 756                         _       -> cont (ITchar c) buf'
 757         _     -> charError buf
 758
 759 lex_escape cont buf
 760   = let buf' = incLexeme buf in
 761     case currentChar# buf of
 762         'a'#       -> cont (ord '\a') buf'
 763         'b'#       -> cont (ord '\b') buf'
 764         'f'#       -> cont (ord '\f') buf'
 765         'n'#       -> cont (ord '\n') buf'
 766         'r'#       -> cont (ord '\r') buf'
 767         't'#       -> cont (ord '\t') buf'
 768         'v'#       -> cont (ord '\v') buf'
 769         '\\'#      -> cont (ord '\\') buf'
 770         '"'#       -> cont (ord '\"') buf'
 771         '\''#      -> cont (ord '\'') buf'
 772         '^'#       -> let c = currentChar# buf' in
 773                       if c `geChar#` '@'# && c `leChar#` '_'#
 774                         then cont (I# (ord# c -# ord# '@'#)) (incLexeme buf')
 775                         else charError buf'
 776
 777         'x'#      -> readNum (after_charnum cont) buf' is_hexdigit 16 hex
 778         'o'#      -> readNum (after_charnum cont) buf' is_octdigit  8 oct_or_dec
 779         x | is_digit x
 780                   -> readNum (after_charnum cont) buf is_digit    10 oct_or_dec
 781
 782         _          -> case [ (c,buf2) | (p,c) <- silly_escape_chars,
 783                                        Just buf2 <- [prefixMatch buf p] ] of
 784                             (c,buf2):_ -> cont (ord c) buf2
 785                             [] -> charError buf'
 786
 787 after_charnum cont i buf
 788   = if i >= 0 && i <= 0x10FFFF
 789         then cont (fromInteger i) buf
 790         else charError buf
 791
 792 readNum cont buf is_digit base conv = read buf 0
 793   where read buf i
 794           = case currentChar# buf of { c ->
 795             if is_digit c
 796                 then read (incLexeme buf) (i*base + (toInteger (I# (conv c))))
 797                 else cont i buf
 798             }
 799
 800 is_hexdigit c
 801         =  is_digit c
 802         || (c `geChar#` 'a'# && c `leChar#` 'f'#)
 803         || (c `geChar#` 'A'# && c `leChar#` 'F'#)
 804
 805 hex c | is_digit c = ord# c -# ord# '0'#
 806       | otherwise  = ord# (to_lower c) -# ord# 'a'# +# 10#
 807 oct_or_dec c = ord# c -# ord# '0'#
 808
 809 is_octdigit c = c `geChar#` '0'# && c `leChar#` '7'#
 810
 811 to_lower c
 812   | c `geChar#` 'A'# && c `leChar#` 'Z'#
 813         = chr# (ord# c -# (ord# 'A'# -# ord# 'a'#))
 814   | otherwise = c
 815
 816 charError buf = lexError "error in character literal" buf
 817
 818 silly_escape_chars = [
 819         ("NUL", '\NUL'),
 820         ("SOH", '\SOH'),
 821         ("STX", '\STX'),
 822         ("ETX", '\ETX'),
 823         ("EOT", '\EOT'),
 824         ("ENQ", '\ENQ'),
 825         ("ACK", '\ACK'),
 826         ("BEL", '\BEL'),
 827         ("BS", '\BS'),
 828         ("HT", '\HT'),
 829         ("LF", '\LF'),
 830         ("VT", '\VT'),
 831         ("FF", '\FF'),
 832         ("CR", '\CR'),
 833         ("SO", '\SO'),
 834         ("SI", '\SI'),
 835         ("DLE", '\DLE'),
 836         ("DC1", '\DC1'),
 837         ("DC2", '\DC2'),
 838         ("DC3", '\DC3'),
 839         ("DC4", '\DC4'),
 840         ("NAK", '\NAK'),
 841         ("SYN", '\SYN'),
 842         ("ETB", '\ETB'),
 843         ("CAN", '\CAN'),
 844         ("EM", '\EM'),
 845         ("SUB", '\SUB'),
 846         ("ESC", '\ESC'),
 847         ("FS", '\FS'),
 848         ("GS", '\GS'),
 849         ("RS", '\RS'),
 850         ("US", '\US'),
 851         ("SP", '\SP'),
 852         ("DEL", '\DEL')
 853         ]
 854
 855 -------------------------------------------------------------------------------
 856
 857 lex_demand cont buf =
 858  case read_em [] buf of { (ls,buf') ->
 859  case currentChar# buf' of
 860    'b'# -> cont (ITstrict (StrictSig (mkTopDmdType ls BotRes))) (incLexeme buf')
 861    'm'# -> cont (ITstrict (StrictSig (mkTopDmdType ls RetCPR))) (incLexeme buf')
 862    _    -> cont (ITstrict (StrictSig (mkTopDmdType ls TopRes))) buf'
 863  }
 864  where
 865   read_em acc buf =
 866    case currentChar# buf of
 867     'T'# -> read_em (Top     : acc) (stepOn buf)
 868     'L'# -> read_em (lazyDmd : acc) (stepOn buf)
 869     'A'# -> read_em (Abs     : acc) (stepOn buf)
 870     'V'# -> read_em (evalDmd : acc) (stepOn buf)        -- Temporary, until
 871                                                         -- we've recompiled prelude etc
 872     'C'# -> do_unary Call  acc (stepOnBy# buf 2#)       -- Skip 'C('
 873
 874     'U'# -> do_seq1 Eval         acc (stepOnBy# buf 1#)
 875     'D'# -> do_seq1 Defer        acc (stepOnBy# buf 1#)
 876     'S'# -> do_seq1 (Box . Eval) acc (stepOnBy# buf 1#)
 877
 878     _    -> (reverse acc, buf)
 879
 880   do_seq1 fn acc buf
 881     = case currentChar# buf of
 882         '('# -> do_seq2 fn acc (stepOnBy# buf 1#)
 883         _    -> read_em (fn (Poly Abs) : acc) buf
 884
 885   do_seq2 fn acc buf
 886     = case read_em [] buf of { (dmds, buf) ->
 887       case currentChar# buf of
 888         ')'# -> read_em (fn (Prod dmds) : acc)
 889                         (stepOn buf)
 890         '*'# -> ASSERT( length dmds == 1 )
 891                 read_em (fn (Poly (head dmds)) : acc)
 892                         (stepOnBy# buf 2#)      -- Skip '*)'
 893       }
 894
 895   do_unary fn acc buf
 896     = case read_em [] buf of
 897         ([dmd], rest) -> read_em (fn dmd : acc) (stepOn rest)   -- Skip ')'
 898
 899 ------------------
 900 lex_scc cont buf =
 901  case currentChar# buf of
 902   'C'# -> cont ITsccAllCafs (incLexeme buf)
 903   other -> cont ITscc buf
 904
 905 -----------------------------------------------------------------------------
 906 -- Numbers
 907
 908 lex_num :: (Token -> P a) -> Int# -> Integer -> P a
 909 lex_num cont exts acc buf =
 910  case scanNumLit acc buf of
 911      (acc',buf') ->
 912        case currentChar# buf' of
 913          '.'# | is_digit (lookAhead# buf' 1#) ->
 914              -- this case is not optimised at all, as the
 915              -- presence of floating point numbers in interface
 916              -- files is not that common. (ToDo)
 917             case expandWhile# is_digit (incLexeme buf') of
 918               buf2 -> -- points to first non digit char
 919
 920                 let l = case currentChar# buf2 of
 921                           'E'# -> do_exponent
 922                           'e'# -> do_exponent
 923                           _ -> buf2
 924
 925                     do_exponent
 926                         = let buf3 = incLexeme buf2 in
 927                           case currentChar# buf3 of
 928                                 '-'# -> expandWhile# is_digit (incLexeme buf3)
 929                                 '+'# -> expandWhile# is_digit (incLexeme buf3)
 930                                 x | is_digit x -> expandWhile# is_digit buf3
 931                                 _ -> buf2
 932
 933                     v = readRational__ (lexemeToString l)
 934
 935                 in case currentChar# l of -- glasgow exts only
 936                       '#'# | glaExtsEnabled exts -> let l' = incLexeme l in
 937                               case currentChar# l' of
 938                                 '#'# -> cont (ITprimdouble v) (incLexeme l')
 939                                 _    -> cont (ITprimfloat  v) l'
 940                       _ -> cont (ITrational v) l
 941
 942          _ -> after_lexnum cont exts acc' buf'
 943
 944 after_lexnum cont exts i buf
 945   = case currentChar# buf of
 946         '#'# | glaExtsEnabled exts -> cont (ITprimint i) (incLexeme buf)
 947         _                          -> cont (ITinteger i) buf
 948
 949 -----------------------------------------------------------------------------
 950 -- C "literal literal"s  (i.e. things like ``NULL'', ``stdout'' etc.)
 951
 952 -- we lexemeToFastString on the bit between the ``''s, but include the
 953 -- quotes in the full lexeme.
 954
 955 lex_cstring cont buf =
 956  case expandUntilMatch (stepOverLexeme buf) "\'\'" of
 957    Just buf' -> cont (ITlitlit (lexemeToFastString
 958                                 (setCurrentPos# buf' (negateInt# 2#))))
 959                    (mergeLexemes buf buf')
 960    Nothing   -> lexError "unterminated ``" buf
 961
 962 -----------------------------------------------------------------------------
 963 -- identifiers, symbols etc.
 964
 965 lex_ip ip_constr cont buf =
 966  case expandWhile# is_ident buf of
 967    buf' -> cont (ip_constr (tailFS lexeme)) buf'
 968         where lexeme = lexemeToFastString buf'
 969
 970 lex_id cont exts buf =
 971  let buf1 = expandWhile# is_ident buf in
 972  seq buf1 $
 973
 974  case (if glaExtsEnabled exts
 975         then expandWhile# (eqChar# '#'#) buf1 -- slurp trailing hashes
 976         else buf1)                              of { buf' ->
 977
 978  let lexeme  = lexemeToFastString buf' in
 979
 980  case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
 981         Just kwd_token -> --trace ("hkeywd: "++_UNPK_(lexeme)) $
 982                           cont kwd_token buf';
 983         Nothing        ->
 984
 985  let var_token = cont (ITvarid lexeme) buf' in
 986
 987  if not (glaExtsEnabled exts)
 988    then var_token
 989    else
 990
 991  case lookupUFM ghcExtensionKeywordsFM lexeme of {
 992         Just kwd_token -> cont kwd_token buf';
 993         Nothing        -> var_token
 994
 995  }}}
 996
 997 lex_sym cont buf =
 998  -- trace "lex_sym" $
 999  case expandWhile# is_symbol buf of
1000    buf' -> case lookupUFM haskellKeySymsFM lexeme of {
1001                 Just kwd_token -> --trace ("keysym: "++unpackFS lexeme) $
1002                                   cont kwd_token buf' ;
1003                 Nothing        -> --trace ("sym: "++unpackFS lexeme) $
1004                                   cont (mk_var_token lexeme) buf'
1005            }
1006         where lexeme = lexemeToFastString buf'
1007
1008
1009 -- lex_con recursively collects components of a qualified identifer.
1010 -- The argument buf is the StringBuffer representing the lexeme
1011 -- identified so far, where the next character is upper-case.
1012
1013 lex_con cont exts buf =
1014  -- trace ("con: "{-++unpackFS lexeme-}) $
1015  let empty_buf = stepOverLexeme buf in
1016  case expandWhile# is_ident empty_buf of { buf1 ->
1017  case slurp_trailing_hashes buf1 exts of { con_buf ->
1018
1019  let all_buf = mergeLexemes buf con_buf
1020
1021      con_lexeme = lexemeToFastString con_buf
1022      mod_lexeme = lexemeToFastString (decLexeme buf)
1023      all_lexeme = lexemeToFastString all_buf
1024
1025      just_a_conid
1026         | emptyLexeme buf = cont (ITconid con_lexeme)               all_buf
1027         | otherwise       = cont (ITqconid (mod_lexeme,con_lexeme)) all_buf
1028  in
1029
1030  case currentChar# all_buf of
1031      '.'# -> maybe_qualified cont exts all_lexeme
1032                 (incLexeme all_buf) just_a_conid
1033      _    -> just_a_conid
1034   }}
1035
1036
1037 maybe_qualified cont exts mod buf just_a_conid =
1038  -- trace ("qid: "{-++unpackFS lexeme-}) $
1039  case currentChar# buf of
1040   '['# ->       -- Special case for []
1041     case lookAhead# buf 1# of
1042      ']'# -> cont (ITqconid  (mod,SLIT("[]"))) (setCurrentPos# buf 2#)
1043      _    -> just_a_conid
1044
1045   '('# ->  -- Special case for (,,,)
1046            -- This *is* necessary to deal with e.g. "instance C PrelBase.(,,)"
1047     case lookAhead# buf 1# of
1048      '#'# | glaExtsEnabled exts -> case lookAhead# buf 2# of
1049                 ','# -> lex_ubx_tuple cont mod (setCurrentPos# buf 3#)
1050                                 just_a_conid
1051                 _    -> just_a_conid
1052      ')'# -> cont (ITqconid (mod,SLIT("()"))) (setCurrentPos# buf 2#)
1053      ','# -> lex_tuple cont mod (setCurrentPos# buf 2#) just_a_conid
1054      _    -> just_a_conid
1055
1056   '-'# -> case lookAhead# buf 1# of
1057             '>'# -> cont (ITqconid (mod,SLIT("(->)"))) (setCurrentPos# buf 2#)
1058             _    -> lex_id3 cont exts mod buf just_a_conid
1059
1060   _    -> lex_id3 cont exts mod buf just_a_conid
1061
1062
1063 lex_id3 cont exts mod buf just_a_conid
1064   | is_upper (currentChar# buf) =
1065      lex_con cont exts buf
1066
1067   | is_symbol (currentChar# buf) =
1068      let
1069         start_new_lexeme = stepOverLexeme buf
1070      in
1071      -- trace ("lex_id31 "{-++unpackFS lexeme-}) $
1072      case expandWhile# is_symbol start_new_lexeme of { buf' ->
1073      let
1074        lexeme  = lexemeToFastString buf'
1075         -- real lexeme is M.<sym>
1076        new_buf = mergeLexemes buf buf'
1077      in
1078      cont (mk_qvar_token mod lexeme) new_buf
1079         -- wrong, but arguably morally right: M... is now a qvarsym
1080      }
1081
1082   | otherwise   =
1083      let
1084         start_new_lexeme = stepOverLexeme buf
1085      in
1086      -- trace ("lex_id32 "{-++unpackFS lexeme-}) $
1087      case expandWhile# is_ident start_new_lexeme of { buf1 ->
1088      if emptyLexeme buf1
1089             then just_a_conid
1090             else
1091
1092      case slurp_trailing_hashes buf1 exts of { buf' ->
1093
1094      let
1095       lexeme      = lexemeToFastString buf'
1096       new_buf     = mergeLexemes buf buf'
1097       is_a_qvarid = cont (mk_qvar_token mod lexeme) new_buf
1098      in
1099      case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
1100             Nothing          -> is_a_qvarid ;
1101
1102             Just kwd_token | isSpecial kwd_token   -- special ids (as, qualified, hiding) shouldn't be
1103                            -> is_a_qvarid          --  recognised as keywords here.
1104                            | otherwise
1105                            -> just_a_conid         -- avoid M.where etc.
1106      }}}
1107
1108 slurp_trailing_hashes buf exts
1109   | glaExtsEnabled exts = expandWhile# (`eqChar#` '#'#) buf
1110   | otherwise           = buf
1111
1112
1113 mk_var_token pk_str
1114   | is_upper f          = ITconid pk_str
1115   | is_ident f          = ITvarid pk_str
1116   | f `eqChar#` ':'#    = ITconsym pk_str
1117   | otherwise           = ITvarsym pk_str
1118   where
1119       (C# f) = _HEAD_ pk_str
1120       -- tl     = _TAIL_ pk_str
1121
1122 mk_qvar_token m token =
1123 -- trace ("mk_qvar ") $
1124  case mk_var_token token of
1125    ITconid n  -> ITqconid  (m,n)
1126    ITvarid n  -> ITqvarid  (m,n)
1127    ITconsym n -> ITqconsym (m,n)
1128    ITvarsym n -> ITqvarsym (m,n)
1129    _          -> ITunknown (show token)
1130 \end{code}
1131
1132 ----------------------------------------------------------------------------
1133 Horrible stuff for dealing with M.(,,,)
1134
1135 \begin{code}
1136 lex_tuple cont mod buf back_off =
1137   go 2 buf
1138   where
1139    go n buf =
1140     case currentChar# buf of
1141       ','# -> go (n+1) (stepOn buf)
1142       ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Boxed n))) (stepOn buf)
1143       _    -> back_off
1144
1145 lex_ubx_tuple cont mod buf back_off =
1146   go 2 buf
1147   where
1148    go n buf =
1149     case currentChar# buf of
1150       ','# -> go (n+1) (stepOn buf)
1151       '#'# -> case lookAhead# buf 1# of
1152                 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Unboxed n)))
1153                                  (stepOnBy# buf 2#)
1154                 _    -> back_off
1155       _    -> back_off
1156 \end{code}
1157
1158 -----------------------------------------------------------------------------
1159 'lexPragma' rips along really fast, looking for a '##-}',
1160 indicating the end of the pragma we're skipping
1161
1162 \begin{code}
1163 lexPragma cont contf inStr buf =
1164  case currentChar# buf of
1165    '#'# | inStr ==# 0# ->
1166        case lookAhead# buf 1# of { '#'# ->
1167        case lookAhead# buf 2# of { '-'# ->
1168        case lookAhead# buf 3# of { '}'# ->
1169            contf cont (lexemeToBuffer buf)
1170                       (stepOverLexeme (setCurrentPos# buf 4#));
1171         _    -> lexPragma cont contf inStr (incLexeme buf) };
1172         _    -> lexPragma cont contf inStr (incLexeme buf) };
1173         _    -> lexPragma cont contf inStr (incLexeme buf) }
1174
1175    '"'# ->
1176        let
1177         odd_slashes buf flg i# =
1178           case lookAhead# buf i# of
1179            '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
1180            _     -> flg
1181
1182         not_inStr = if inStr ==# 0# then 1# else 0#
1183        in
1184        case lookAhead# buf (negateInt# 1#) of --backwards, actually
1185          '\\'# -> -- escaping something..
1186            if odd_slashes buf True (negateInt# 2#)
1187                 then  -- odd number of slashes, " is escaped.
1188                       lexPragma cont contf inStr (incLexeme buf)
1189                 else  -- even number of slashes, \ is escaped.
1190                       lexPragma cont contf not_inStr (incLexeme buf)
1191          _ -> lexPragma cont contf not_inStr (incLexeme buf)
1192
1193    '\''# | inStr ==# 0# ->
1194         case lookAhead# buf 1# of { '"'# ->
1195         case lookAhead# buf 2# of { '\''# ->
1196            lexPragma cont contf inStr (setCurrentPos# buf 3#);
1197         _ -> lexPragma cont contf inStr (incLexeme buf) };
1198         _ -> lexPragma cont contf inStr (incLexeme buf) }
1199
1200     -- a sign that the input is ill-formed, since pragmas are
1201     -- assumed to always be properly closed (in .hi files).
1202    '\NUL'# -> trace "lexPragma: unexpected end-of-file" $
1203               cont (ITunknown "\NUL") buf
1204
1205    _ -> lexPragma cont contf inStr (incLexeme buf)
1206
1207 \end{code}
1208
1209 -----------------------------------------------------------------------------
1210
1211 \begin{code}
1212 data LayoutContext
1213   = NoLayout
1214   | Layout Int#
1215
1216 data ParseResult a
1217   = POk PState a
1218   | PFailed Message
1219
1220 data PState = PState {
1221         loc        :: SrcLoc,
1222         extsBitmap :: Int#,     -- bitmap that determines permitted extensions
1223         bol        :: Int#,
1224         atbol      :: Int#,
1225         context    :: [LayoutContext]
1226      }
1227
1228 type P a = StringBuffer         -- Input string
1229         -> PState
1230         -> ParseResult a
1231
1232 returnP   :: a -> P a
1233 returnP a buf s = POk s a
1234
1235 thenP      :: P a -> (a -> P b) -> P b
1236 m `thenP` k = \ buf s ->
1237         case m buf s of
1238                 POk s1 a -> k a buf s1
1239                 PFailed err  -> PFailed err
1240
1241 thenP_     :: P a -> P b -> P b
1242 m `thenP_` k = m `thenP` \_ -> k
1243
1244 mapP :: (a -> P b) -> [a] -> P [b]
1245 mapP f [] = returnP []
1246 mapP f (a:as) =
1247      f a `thenP` \b ->
1248      mapP f as `thenP` \bs ->
1249      returnP (b:bs)
1250
1251 failP :: String -> P a
1252 failP msg buf s = PFailed (text msg)
1253
1254 failMsgP :: Message -> P a
1255 failMsgP msg buf s = PFailed msg
1256
1257 lexError :: String -> P a
1258 lexError str buf s@PState{ loc = loc }
1259   = failMsgP (hcat [ppr loc, text ": ", text str]) buf s
1260
1261 getSrcLocP :: P SrcLoc
1262 getSrcLocP buf s@(PState{ loc = loc }) = POk s loc
1263
1264 -- use a temporary SrcLoc for the duration of the argument
1265 setSrcLocP :: SrcLoc -> P a -> P a
1266 setSrcLocP new_loc p buf s =
1267   case p buf s{ loc=new_loc } of
1268       POk _ a   -> POk s a
1269       PFailed e -> PFailed e
1270
1271 getSrcFile :: P FAST_STRING
1272 getSrcFile buf s@(PState{ loc = loc }) = POk s (srcLocFile loc)
1273
1274 pushContext :: LayoutContext -> P ()
1275 pushContext ctxt buf s@(PState{ context = ctx }) = POk s{context = ctxt:ctx} ()
1276
1277 {-
1278
1279 This special case in layoutOn is to handle layout contexts with are
1280 indented the same or less than the current context.  This is illegal
1281 according to the Haskell spec, so we have to arrange to close the
1282 current context.  eg.
1283
1284 class Foo a where
1285 class Bar a
1286
1287 after the first 'where', the sequence of events is:
1288
1289         - layout system inserts a ';' (column 0)
1290         - parser begins a new context at column 0
1291         - parser shifts ';' (legal empty declaration)
1292         - parser sees 'class': parse error (we're still in the inner context)
1293
1294 trouble is, by the time we know we need a new context, the lexer has
1295 already generated the ';'.  Hacky solution is as follows: since we
1296 know the column of the next token (it's the column number of the new
1297 context), we set the ACTUAL column number of the new context to this
1298 numer plus one.  Hence the next time the lexer is called, a '}' will
1299 be generated to close the new context straight away.  Furthermore, we
1300 have to set the atbol flag so that the ';' that the parser shifted as
1301 part of the new context is re-generated.
1302
1303 when the new context is *less* indented than the current one:
1304
1305 f = f where g = g where
1306 h = h
1307
1308         - current context: column 12.
1309         - on seeing 'h' (column 0), the layout system inserts '}'
1310         - parser starts a new context, column 0
1311         - parser sees '}', uses it to close new context
1312         - we still need to insert another '}' followed by a ';',
1313           hence the atbol trick.
1314
1315 There's also a special hack in here to deal with
1316
1317         do
1318            ....
1319            e $ do
1320            blah
1321
1322 i.e. the inner context is at the same indentation level as the outer
1323 context.  This is strictly illegal according to Haskell 98, but
1324 there's a lot of existing code using this style and it doesn't make
1325 any sense to disallow it, since empty 'do' lists don't make sense.
1326 -}
1327
1328 layoutOn :: Bool -> P ()
1329 layoutOn strict buf s@(PState{ bol = bol, context = ctx }) =
1330     let offset = lexemeIndex buf -# bol in
1331     case ctx of
1332         Layout prev_off : _
1333            | if strict then prev_off >=# offset else prev_off ># offset ->
1334                 --trace ("layout on, column: " ++  show (I# offset)) $
1335                 POk s{ context = Layout (offset +# 1#) : ctx, atbol = 1# } ()
1336         other ->
1337                 --trace ("layout on, column: " ++  show (I# offset)) $
1338                 POk s{ context = Layout offset : ctx } ()
1339
1340 layoutOff :: P ()
1341 layoutOff buf s@(PState{ context = ctx }) =
1342     POk s{ context = NoLayout:ctx } ()
1343
1344 popContext :: P ()
1345 popContext = \ buf s@(PState{ context = ctx, loc = loc }) ->
1346   case ctx of
1347         (_:tl) -> POk s{ context = tl } ()
1348         []     -> PFailed (srcParseErr buf loc)
1349
1350 {-
1351  Note that if the name of the file we're processing ends
1352  with `hi-boot', we accept it on faith as having the right
1353  version. This is done so that .hi-boot files that comes
1354  with hsc don't have to be updated before every release,
1355  *and* it allows us to share .hi-boot files with versions
1356  of hsc that don't have .hi version checking (e.g., ghc-2.10's)
1357
1358  If the version number is 0, the checking is also turned off.
1359  (needed to deal with GHC.hi only!)
1360
1361  Once we can assume we're compiling with a version of ghc that
1362  supports interface file checking, we can drop the special
1363  pleading
1364 -}
1365 checkVersion :: Maybe Integer -> P ()
1366 checkVersion mb@(Just v) buf s@(PState{loc = loc})
1367  | (v==0) || (v == fromInt opt_HiVersion) || opt_NoHiCheck = POk s ()
1368  | otherwise = PFailed (ifaceVersionErr mb loc ([]::[Token]){-Todo-})
1369 checkVersion mb@Nothing  buf s@(PState{loc = loc})
1370  | "hi-boot" `isSuffixOf` (_UNPK_ (srcLocFile loc)) = POk s ()
1371  | otherwise = PFailed (ifaceVersionErr mb loc ([]::[Token]){-Todo-})
1372
1373
1374 -- for reasons of efficiency, flags indicating language extensions (eg,
1375 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1376 -- integer
1377
1378 glaExtsBit, ffiBit, parrBit :: Int
1379 glaExtsBit = 0
1380 ffiBit     = 1  -- FIXME: not used yet; still part of `glaExtsBit'
1381 parrBit    = 2
1382
1383 glaExtsEnabled, ffiEnabled, parrEnabled :: Int# -> Bool
1384 glaExtsEnabled flags = testBit (I# flags) glaExtsBit
1385 ffiEnabled     flags = testBit (I# flags) ffiBit
1386 parrEnabled    flags = testBit (I# flags) parrBit
1387
1388 -- convenient record-based bitmap for the interface to the rest of the world
1389 --
1390 data ExtFlags = ExtFlags {
1391                   glasgowExtsEF :: Bool,
1392 --                ffiEF         :: Bool,  -- commented out to avoid warnings
1393                   parrEF        :: Bool   -- while not used yet
1394                 }
1395
1396 -- create a parse state
1397 --
1398 mkPState          :: SrcLoc -> ExtFlags -> PState
1399 mkPState loc exts  = PState {
1400                        loc        = loc,
1401                        extsBitmap = case bitmap of {I# bits -> bits},
1402                        bol        = 0#,
1403                        atbol      = 1#,
1404                        context    = []
1405                      }
1406                      where
1407                        bitmap =     glaExtsBit `setBitIf` glasgowExtsEF exts
1408 --                              .|. ffiBit     `setBitIf` ffiEF         exts
1409                                 .|. parrBit    `setBitIf` parrEF        exts
1410                        --
1411                        b `setBitIf` cond | cond      = bit b
1412                                          | otherwise = 0
1413
1414
1415 -----------------------------------------------------------------
1416
1417 ifaceParseErr :: StringBuffer -> SrcLoc -> Message
1418 ifaceParseErr s l
1419   = hsep [ppr l, ptext SLIT("Interface file parse error; on input `"),
1420           text (lexemeToString s), char '\'']
1421
1422 ifaceVersionErr hi_vers l toks
1423   = hsep [ppr l, ptext SLIT("Interface file version error;"),
1424           ptext SLIT("Expected"), int opt_HiVersion,
1425           ptext SLIT("found "), pp_version]
1426     where
1427      pp_version =
1428       case hi_vers of
1429         Nothing -> ptext SLIT("pre ghc-3.02 version")
1430         Just v  -> ptext SLIT("version") <+> integer v
1431
1432 -----------------------------------------------------------------------------
1433
1434 srcParseErr :: StringBuffer -> SrcLoc -> Message
1435 srcParseErr s l
1436   = hcat [ppr l,
1437           if null token
1438              then ptext SLIT(": parse error (possibly incorrect indentation)")
1439              else hcat [ptext SLIT(": parse error on input "),
1440                         char '`', text token, char '\'']
1441     ]
1442   where
1443         token = lexemeToString s
1444
1445 \end{code}