ghc/compiler/parser/Lex.lhs

   1 %
   2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998
   3 %
   4 \section[Lexical analysis]{Lexical analysis}
   5
   6 --------------------------------------------------------
   7 [Jan 98]
   8 There's a known bug in here:
   9
  10         If an interface file ends prematurely, Lex tries to
  11         do headFS of an empty FastString.
  12
  13 An example that provokes the error is
  14
  15         f _:_ _forall_ [a] <<<END OF FILE>>>
  16 --------------------------------------------------------
  17
  18 \begin{code}
  19
  20 module Lex (
  21
  22         ifaceParseErr, srcParseErr,
  23
  24         -- Monad for parser
  25         Token(..), lexer, ParseResult(..), PState(..),
  26         checkVersion, ExtFlags(..), mkPState,
  27         StringBuffer,
  28
  29         P, thenP, thenP_, returnP, mapP, failP, failMsgP,
  30         getSrcLocP, setSrcLocP, getSrcFile,
  31         layoutOn, layoutOff, pushContext, popContext
  32     ) where
  33
  34 #include "HsVersions.h"
  35
  36 import Char             ( isSpace, toUpper )
  37 import List             ( isSuffixOf )
  38
  39 import PrelNames        ( mkTupNameStr )
  40 import CmdLineOpts      ( opt_HiVersion, opt_NoHiCheck )
  41 import ForeignCall      ( Safety(..) )
  42 import NewDemand        ( StrictSig(..), Demand(..), Demands(..),
  43                           DmdResult(..), mkTopDmdType, evalDmd, lazyDmd )
  44 import UniqFM           ( listToUFM, lookupUFM )
  45 import BasicTypes       ( Boxity(..) )
  46 import SrcLoc           ( SrcLoc, incSrcLine, srcLocFile, srcLocLine,
  47                           replaceSrcLine, mkSrcLoc )
  48
  49 import ErrUtils         ( Message )
  50 import Outputable
  51
  52 import FastString
  53 import StringBuffer
  54 import GlaExts
  55 import Ctype
  56 import Char             ( chr, ord )
  57 import PrelRead         ( readRational__ ) -- Glasgow non-std
  58 import PrelBits         ( Bits(..) )       -- non-std
  59 \end{code}
  60
  61 %************************************************************************
  62 %*                                                                      *
  63 \subsection{Data types}
  64 %*                                                                      *
  65 %************************************************************************
  66
  67 The token data type, fairly un-interesting except from one
  68 constructor, @ITidinfo@, which is used to lazily lex id info (arity,
  69 strictness, unfolding etc).
  70
  71 The Idea/Observation here is that the renamer needs to scan through
  72 all of an interface file before it can continue. But only a fraction
  73 of the information contained in the file turns out to be useful, so
  74 delaying as much as possible of the scanning and parsing of an
  75 interface file Makes Sense (Heap profiles of the compiler
  76 show a reduction in heap usage by at least a factor of two,
  77 post-renamer).
  78
  79 Hence, the interface file lexer spots when value declarations are
  80 being scanned and return the @ITidinfo@ and @ITtype@ constructors
  81 for the type and any other id info for that binding (unfolding, strictness
  82 etc). These constructors are applied to the result of lexing these sub-chunks.
  83
  84 The lexing of the type and id info is all done lazily, of course, so
  85 the scanning (and subsequent parsing) will be done *only* on the ids the
  86 renamer finds out that it is interested in. The rest will just be junked.
  87 Laziness, you know it makes sense :-)
  88
  89 \begin{code}
  90 data Token
  91   = ITas                        -- Haskell keywords
  92   | ITcase
  93   | ITclass
  94   | ITdata
  95   | ITdefault
  96   | ITderiving
  97   | ITdo
  98   | ITelse
  99   | IThiding
 100   | ITif
 101   | ITimport
 102   | ITin
 103   | ITinfix
 104   | ITinfixl
 105   | ITinfixr
 106   | ITinstance
 107   | ITlet
 108   | ITmodule
 109   | ITnewtype
 110   | ITof
 111   | ITqualified
 112   | ITthen
 113   | ITtype
 114   | ITwhere
 115   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 116
 117   | ITforall                    -- GHC extension keywords
 118   | ITforeign
 119   | ITexport
 120   | ITlabel
 121   | ITdynamic
 122   | ITsafe
 123   | ITunsafe
 124   | ITwith
 125   | ITstdcallconv
 126   | ITccallconv
 127   | ITdotnet
 128
 129   | ITinterface                 -- interface keywords
 130   | IT__export
 131   | ITdepends
 132   | IT__forall
 133   | ITletrec
 134   | ITcoerce
 135   | ITinlineMe
 136   | ITinlineCall
 137   | ITccall (Bool,Bool,Safety)  -- (is_dyn, is_casm, may_gc)
 138   | ITdefaultbranch
 139   | ITbottom
 140   | ITinteger_lit
 141   | ITfloat_lit
 142   | ITword_lit
 143   | ITword64_lit
 144   | ITint64_lit
 145   | ITrational_lit
 146   | ITaddr_lit
 147   | ITlabel_lit
 148   | ITlit_lit
 149   | ITstring_lit
 150   | ITtypeapp
 151   | ITusage
 152   | ITfuall
 153   | ITarity
 154   | ITspecialise
 155   | ITnocaf
 156   | ITunfold
 157   | ITstrict StrictSig
 158   | ITrules
 159   | ITcprinfo
 160   | ITdeprecated
 161   | IT__scc
 162   | ITsccAllCafs
 163
 164   | ITspecialise_prag           -- Pragmas
 165   | ITsource_prag
 166   | ITinline_prag
 167   | ITnoinline_prag
 168   | ITrules_prag
 169   | ITdeprecated_prag
 170   | ITline_prag
 171   | ITscc_prag
 172   | ITclose_prag
 173
 174   | ITdotdot                    -- reserved symbols
 175   | ITdcolon
 176   | ITequal
 177   | ITlam
 178   | ITvbar
 179   | ITlarrow
 180   | ITrarrow
 181   | ITat
 182   | ITtilde
 183   | ITdarrow
 184   | ITminus
 185   | ITbang
 186   | ITdot
 187
 188   | ITbiglam                    -- GHC-extension symbols
 189
 190   | ITocurly                    -- special symbols
 191   | ITccurly
 192   | ITocurlybar                 -- {|, for type applications
 193   | ITccurlybar                 -- |}, for type applications
 194   | ITvccurly
 195   | ITobrack
 196   | ITopabrack                  -- [:, for parallel arrays with -fparr
 197   | ITcpabrack                  -- :], for parallel arrays with -fparr
 198   | ITcbrack
 199   | IToparen
 200   | ITcparen
 201   | IToubxparen
 202   | ITcubxparen
 203   | ITsemi
 204   | ITcomma
 205   | ITunderscore
 206   | ITbackquote
 207
 208   | ITvarid   FAST_STRING       -- identifiers
 209   | ITconid   FAST_STRING
 210   | ITvarsym  FAST_STRING
 211   | ITconsym  FAST_STRING
 212   | ITqvarid  (FAST_STRING,FAST_STRING)
 213   | ITqconid  (FAST_STRING,FAST_STRING)
 214   | ITqvarsym (FAST_STRING,FAST_STRING)
 215   | ITqconsym (FAST_STRING,FAST_STRING)
 216
 217   | ITdupipvarid   FAST_STRING  -- GHC extension: implicit param: ?x
 218   | ITsplitipvarid FAST_STRING  -- GHC extension: implicit param: %x
 219
 220   | ITpragma StringBuffer
 221
 222   | ITchar       Int
 223   | ITstring     FAST_STRING
 224   | ITinteger    Integer
 225   | ITrational   Rational
 226
 227   | ITprimchar   Int
 228   | ITprimstring FAST_STRING
 229   | ITprimint    Integer
 230   | ITprimfloat  Rational
 231   | ITprimdouble Rational
 232   | ITlitlit     FAST_STRING
 233
 234   | ITunknown String            -- Used when the lexer can't make sense of it
 235   | ITeof                       -- end of file token
 236   deriving Show -- debugging
 237 \end{code}
 238
 239 -----------------------------------------------------------------------------
 240 Keyword Lists
 241
 242 \begin{code}
 243 pragmaKeywordsFM = listToUFM $
 244       map (\ (x,y) -> (_PK_ x,y))
 245        [( "SPECIALISE", ITspecialise_prag ),
 246         ( "SPECIALIZE", ITspecialise_prag ),
 247         ( "SOURCE",     ITsource_prag ),
 248         ( "INLINE",     ITinline_prag ),
 249         ( "NOINLINE",   ITnoinline_prag ),
 250         ( "NOTINLINE",  ITnoinline_prag ),
 251         ( "LINE",       ITline_prag ),
 252         ( "RULES",      ITrules_prag ),
 253         ( "RULEZ",      ITrules_prag ), -- american spelling :-)
 254         ( "SCC",        ITscc_prag ),
 255         ( "DEPRECATED", ITdeprecated_prag )
 256         ]
 257
 258 haskellKeywordsFM = listToUFM $
 259       map (\ (x,y) -> (_PK_ x,y))
 260        [( "_",          ITunderscore ),
 261         ( "as",         ITas ),
 262         ( "case",       ITcase ),
 263         ( "class",      ITclass ),
 264         ( "data",       ITdata ),
 265         ( "default",    ITdefault ),
 266         ( "deriving",   ITderiving ),
 267         ( "do",         ITdo ),
 268         ( "else",       ITelse ),
 269         ( "hiding",     IThiding ),
 270         ( "if",         ITif ),
 271         ( "import",     ITimport ),
 272         ( "in",         ITin ),
 273         ( "infix",      ITinfix ),
 274         ( "infixl",     ITinfixl ),
 275         ( "infixr",     ITinfixr ),
 276         ( "instance",   ITinstance ),
 277         ( "let",        ITlet ),
 278         ( "module",     ITmodule ),
 279         ( "newtype",    ITnewtype ),
 280         ( "of",         ITof ),
 281         ( "qualified",  ITqualified ),
 282         ( "then",       ITthen ),
 283         ( "type",       ITtype ),
 284         ( "where",      ITwhere ),
 285         ( "_scc_",      ITscc )         -- ToDo: remove
 286      ]
 287
 288 isSpecial :: Token -> Bool
 289 -- If we see M.x, where x is a keyword, but
 290 -- is special, we treat is as just plain M.x,
 291 -- not as a keyword.
 292 isSpecial ITas          = True
 293 isSpecial IThiding      = True
 294 isSpecial ITqualified   = True
 295 isSpecial ITforall      = True
 296 isSpecial ITexport      = True
 297 isSpecial ITlabel       = True
 298 isSpecial ITdynamic     = True
 299 isSpecial ITsafe        = True
 300 isSpecial ITunsafe      = True
 301 isSpecial ITwith        = True
 302 isSpecial ITccallconv   = True
 303 isSpecial ITstdcallconv = True
 304 isSpecial _             = False
 305
 306 -- IMPORTANT: Keep this in synch with ParseIface.y's var_fs production! (SUP)
 307 ghcExtensionKeywordsFM = listToUFM $
 308         map (\ (x,y) -> (_PK_ x,y))
 309      [  ( "forall",     ITforall ),
 310         ( "foreign",    ITforeign ),
 311         ( "export",     ITexport ),
 312         ( "label",      ITlabel ),
 313         ( "dynamic",    ITdynamic ),
 314         ( "safe",       ITunsafe ),
 315         ( "unsafe",     ITunsafe ),
 316         ( "with",       ITwith ),
 317         ( "stdcall",    ITstdcallconv),
 318         ( "ccall",      ITccallconv),
 319         ( "dotnet",     ITdotnet),
 320         ("_ccall_",     ITccall (False, False, PlayRisky)),
 321         ("_ccall_GC_",  ITccall (False, False, PlaySafe)),
 322         ("_casm_",      ITccall (False, True,  PlayRisky)),
 323         ("_casm_GC_",   ITccall (False, True,  PlaySafe)),
 324
 325         -- interface keywords
 326         ("__interface",         ITinterface),
 327         ("__export",            IT__export),
 328         ("__depends",           ITdepends),
 329         ("__forall",            IT__forall),
 330         ("__letrec",            ITletrec),
 331         ("__coerce",            ITcoerce),
 332         ("__inline_me",         ITinlineMe),
 333         ("__inline_call",       ITinlineCall),
 334         ("__depends",           ITdepends),
 335         ("__DEFAULT",           ITdefaultbranch),
 336         ("__bot",               ITbottom),
 337         ("__integer",           ITinteger_lit),
 338         ("__float",             ITfloat_lit),
 339         ("__int64",             ITint64_lit),
 340         ("__word",              ITword_lit),
 341         ("__word64",            ITword64_lit),
 342         ("__rational",          ITrational_lit),
 343         ("__addr",              ITaddr_lit),
 344         ("__label",             ITlabel_lit),
 345         ("__litlit",            ITlit_lit),
 346         ("__string",            ITstring_lit),
 347         ("__a",                 ITtypeapp),
 348         ("__u",                 ITusage),
 349         ("__fuall",             ITfuall),
 350         ("__A",                 ITarity),
 351         ("__P",                 ITspecialise),
 352         ("__C",                 ITnocaf),
 353         ("__R",                 ITrules),
 354         ("__D",                 ITdeprecated),
 355         ("__U",                 ITunfold),
 356
 357         ("__ccall",             ITccall (False, False, PlayRisky)),
 358         ("__ccall_GC",          ITccall (False, False, PlaySafe)),
 359         ("__dyn_ccall",         ITccall (True,  False, PlayRisky)),
 360         ("__dyn_ccall_GC",      ITccall (True,  False, PlaySafe)),
 361         ("__casm",              ITccall (False, True,  PlayRisky)),
 362         ("__dyn_casm",          ITccall (True,  True,  PlayRisky)),
 363         ("__casm_GC",           ITccall (False, True,  PlaySafe)),
 364         ("__dyn_casm_GC",       ITccall (True,  True,  PlaySafe)),
 365
 366         ("/\\",                 ITbiglam)
 367      ]
 368
 369
 370 haskellKeySymsFM = listToUFM $
 371         map (\ (x,y) -> (_PK_ x,y))
 372       [ ("..",          ITdotdot)
 373        ,("::",          ITdcolon)
 374        ,("=",           ITequal)
 375        ,("\\",          ITlam)
 376        ,("|",           ITvbar)
 377        ,("<-",          ITlarrow)
 378        ,("->",          ITrarrow)
 379        ,("@",           ITat)
 380        ,("~",           ITtilde)
 381        ,("=>",          ITdarrow)
 382        ,("-",           ITminus)
 383        ,("!",           ITbang)
 384        ,(".",           ITdot)          -- sadly, for 'forall a . t'
 385        ]
 386 \end{code}
 387
 388 -----------------------------------------------------------------------------
 389 The lexical analyser
 390
 391 Lexer state:
 392
 393         - (exts)  lexing a source with extensions, eg, an interface file or
 394                   with -fglasgow-exts
 395         - (bol)   pointer to beginning of line (for column calculations)
 396         - (buf)   pointer to beginning of token
 397         - (buf)   pointer to current char
 398         - (atbol) flag indicating whether we're at the beginning of a line
 399
 400 \begin{code}
 401 lexer :: (Token -> P a) -> P a
 402 lexer cont buf s@(PState{
 403                     loc = loc,
 404                     extsBitmap = exts,
 405                     bol = bol,
 406                     atbol = atbol,
 407                     context = ctx
 408                 })
 409
 410         -- first, start a new lexeme and lose all the whitespace
 411   =  _scc_ "Lexer"
 412   tab line bol atbol (stepOverLexeme buf)
 413   where
 414         line = srcLocLine loc
 415
 416         tab y bol atbol buf = --trace ("tab: " ++ show (I# y) ++ " : " ++ show (currentChar buf)) $
 417           case currentChar# buf of
 418
 419             '\NUL'# ->
 420                    if bufferExhausted (stepOn buf)
 421                         then cont ITeof buf s'
 422                         else trace "lexer: misplaced NUL?" $
 423                              tab y bol atbol (stepOn buf)
 424
 425             '\n'# -> let buf' = stepOn buf
 426                      in tab (y +# 1#) (currentIndex# buf') 1# buf'
 427
 428                 -- find comments.  This got harder in Haskell 98.
 429             '-'# ->  let trundle n =
 430                           let next = lookAhead# buf n in
 431                           if next `eqChar#` '-'# then trundle (n +# 1#)
 432                           else if is_symbol next || n <# 2#
 433                                 then is_a_token
 434                                 else tab y bol atbol
 435                                          (stepOnUntilChar# (stepOnBy# buf n) '\n'#)
 436                     in trundle 1#
 437
 438                 -- comments and pragmas.  We deal with LINE pragmas here,
 439                 -- and throw out any unrecognised pragmas as comments.  Any
 440                 -- pragmas we know about are dealt with later (after any layout
 441                 -- processing if necessary).
 442             '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
 443                 if lookAhead# buf 2# `eqChar#` '#'# then
 444                   if lookAhead# buf 3# `eqChar#` '#'# then is_a_token else
 445                   case expandWhile# is_space (setCurrentPos# buf 3#) of { buf1->
 446                   case expandWhile# is_ident (stepOverLexeme buf1)   of { buf2->
 447                   let lexeme = mkFastString -- ToDo: too slow
 448                                   (map toUpper (lexemeToString buf2)) in
 449                   case lookupUFM pragmaKeywordsFM lexeme of
 450                         -- ignore RULES pragmas when -fglasgow-exts is off
 451                         Just ITrules_prag | not (glaExtsEnabled exts) ->
 452                            skip_to_end (stepOnBy# buf 2#) s'
 453                         Just ITline_prag ->
 454                            line_prag skip_to_end buf2 s'
 455                         Just other -> is_a_token
 456                         Nothing -> skip_to_end (stepOnBy# buf 2#) s'
 457                   }}
 458
 459                 else skip_to_end (stepOnBy# buf 2#) s'
 460                 where
 461                     skip_to_end = skipNestedComment (lexer cont)
 462
 463                 -- special GHC extension: we grok cpp-style #line pragmas
 464             '#'# | lexemeIndex buf ==# bol ->   -- the '#' must be in column 0
 465                 let buf1 | lookAhead# buf 1# `eqChar#` 'l'# &&
 466                            lookAhead# buf 2# `eqChar#` 'i'# &&
 467                            lookAhead# buf 3# `eqChar#` 'n'# &&
 468                            lookAhead# buf 4# `eqChar#` 'e'#  = stepOnBy# buf 5#
 469                          | otherwise = stepOn buf
 470                 in
 471                 case expandWhile# is_space buf1 of { buf2 ->
 472                 if is_digit (currentChar# buf2)
 473                         then line_prag next_line buf2 s'
 474                         else is_a_token
 475                 }
 476                 where
 477                 next_line buf = lexer cont (stepOnUntilChar# buf '\n'#)
 478
 479                 -- tabs have been expanded beforehand
 480             c | is_space c -> tab y bol atbol (stepOn buf)
 481               | otherwise  -> is_a_token
 482
 483            where s' = s{loc = replaceSrcLine loc y,
 484                         bol = bol,
 485                        atbol = atbol}
 486
 487                  is_a_token | atbol /=# 0# = lexBOL cont buf s'
 488                             | otherwise    = lexToken cont exts buf s'
 489
 490 -- {-# LINE .. #-} pragmas.  yeuch.
 491 line_prag cont buf s@PState{loc=loc} =
 492   case expandWhile# is_space buf                of { buf1 ->
 493   case scanNumLit 0 (stepOverLexeme buf1)       of { (line,buf2) ->
 494   -- subtract one: the line number refers to the *following* line.
 495   let real_line = line - 1 in
 496   case fromInteger real_line                    of { i@(I# l) ->
 497         -- ToDo, if no filename then we skip the newline.... d'oh
 498   case expandWhile# is_space buf2               of { buf3 ->
 499   case currentChar# buf3                        of
 500      '\"'#{-"-} ->
 501         case untilEndOfString# (stepOn (stepOverLexeme buf3)) of { buf4 ->
 502         let
 503             file = lexemeToFastString buf4
 504             new_buf = stepOn (stepOverLexeme buf4)
 505         in
 506         if nullFastString file
 507                 then cont new_buf s{loc = replaceSrcLine loc l}
 508                 else cont new_buf s{loc = mkSrcLoc file i}
 509         }
 510      _other -> cont (stepOverLexeme buf3) s{loc = replaceSrcLine loc l}
 511   }}}}
 512
 513 skipNestedComment :: P a -> P a
 514 skipNestedComment cont buf state = skipNestedComment' (loc state) cont buf state
 515
 516 skipNestedComment' :: SrcLoc -> P a -> P a
 517 skipNestedComment' orig_loc cont buf = loop buf
 518  where
 519    loop buf =
 520      case currentChar# buf of
 521         '-'# | lookAhead# buf 1# `eqChar#` '}'# -> cont (stepOnBy# buf 2#)
 522
 523         '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
 524               skipNestedComment
 525                 (skipNestedComment' orig_loc cont)
 526                 (stepOnBy# buf 2#)
 527
 528         '\n'# -> \ s@PState{loc=loc} ->
 529                  let buf' = stepOn buf in
 530                  loop buf' s{loc = incSrcLine loc,
 531                              bol = currentIndex# buf',
 532                              atbol = 1#}
 533
 534         -- pass the original SrcLoc to lexError so that the error is
 535         -- reported at the line it was originally on, not the line at
 536         -- the end of the file.
 537         '\NUL'# | bufferExhausted (stepOn buf) ->
 538                 \s -> lexError "unterminated `{-'" buf s{loc=orig_loc} -- -}
 539
 540         _   -> loop (stepOn buf)
 541
 542 -- When we are lexing the first token of a line, check whether we need to
 543 -- insert virtual semicolons or close braces due to layout.
 544
 545 lexBOL :: (Token -> P a) -> P a
 546 lexBOL cont buf s@(PState{
 547                     loc = loc,
 548                     extsBitmap = exts,
 549                     bol = bol,
 550                     atbol = atbol,
 551                     context = ctx
 552                   }) =
 553         if need_close_curly then
 554                 --trace ("col = " ++ show (I# col) ++ ", layout: inserting '}'") $
 555                 cont ITvccurly buf s{atbol = 1#, context = tail ctx}
 556         else if need_semi_colon then
 557                 --trace ("col = " ++ show (I# col) ++ ", layout: inserting ';'") $
 558                 cont ITsemi buf s{atbol = 0#}
 559         else
 560                 lexToken cont exts buf s{atbol = 0#}
 561   where
 562         col = currentIndex# buf -# bol
 563
 564         need_close_curly =
 565                 case ctx of
 566                         [] -> False
 567                         (i:_) -> case i of
 568                                     NoLayout -> False
 569                                     Layout n -> col <# n
 570         need_semi_colon =
 571                 case ctx of
 572                         [] -> False
 573                         (i:_) -> case i of
 574                                     NoLayout -> False
 575                                     Layout n -> col ==# n
 576
 577
 578 lexToken :: (Token -> P a) -> Int# -> P a
 579 lexToken cont exts buf =
 580 -- trace "lexToken" $
 581   case currentChar# buf of
 582
 583     -- special symbols ----------------------------------------------------
 584     '('# | glaExtsEnabled exts && lookAhead# buf 1# `eqChar#` '#'#
 585                 -> cont IToubxparen (setCurrentPos# buf 2#)
 586          | otherwise
 587                 -> cont IToparen (incLexeme buf)
 588
 589     ')'# -> cont ITcparen    (incLexeme buf)
 590     '['# | parrEnabled exts && lookAhead# buf 1# `eqChar#` ':'# ->
 591             cont ITopabrack  (setCurrentPos# buf 2#)
 592          | otherwise ->
 593             cont ITobrack    (incLexeme buf)
 594     ']'# -> cont ITcbrack    (incLexeme buf)
 595     ','# -> cont ITcomma     (incLexeme buf)
 596     ';'# -> cont ITsemi      (incLexeme buf)
 597     '}'# -> \ s@PState{context = ctx} ->
 598             case ctx of
 599                 (_:ctx') -> cont ITccurly (incLexeme buf) s{context=ctx'}
 600                 _        -> lexError "too many '}'s" buf s
 601     '|'# -> case lookAhead# buf 1# of
 602                  '}'#  | glaExtsEnabled exts -> cont ITccurlybar
 603                                                      (setCurrentPos# buf 2#)
 604                  _                           -> lex_sym cont (incLexeme buf)
 605     ':'# -> case lookAhead# buf 1# of
 606                  ']'#  | parrEnabled exts    -> cont ITcpabrack
 607                                                      (setCurrentPos# buf 2#)
 608                  _                           -> lex_sym cont (incLexeme buf)
 609
 610
 611     '#'# -> case lookAhead# buf 1# of
 612                 ')'#  | glaExtsEnabled exts
 613                      -> cont ITcubxparen (setCurrentPos# buf 2#)
 614                 '-'# -> case lookAhead# buf 2# of
 615                            '}'# -> cont ITclose_prag (setCurrentPos# buf 3#)
 616                            _    -> lex_sym cont (incLexeme buf)
 617                 _    -> lex_sym cont (incLexeme buf)
 618
 619     '`'# | glaExtsEnabled exts && lookAhead# buf 1# `eqChar#` '`'#
 620                 -> lex_cstring cont (setCurrentPos# buf 2#)
 621          | otherwise
 622                 -> cont ITbackquote (incLexeme buf)
 623
 624     '{'# ->     -- look for "{-##" special iface pragma   -- for Emacs: -}
 625             case lookAhead# buf 1# of
 626            '|'# | glaExtsEnabled exts
 627                 -> cont ITocurlybar (setCurrentPos# buf 2#)
 628            '-'# -> case lookAhead# buf 2# of
 629                     '#'# -> case lookAhead# buf 3# of
 630                                 '#'# ->
 631                                    lexPragma
 632                                       cont
 633                                       (\ cont lexeme buf' -> cont (ITpragma lexeme) buf')
 634                                       0#
 635                                       (stepOnBy# (stepOverLexeme buf) 4#)
 636                                 _ -> lex_prag cont (setCurrentPos# buf 3#)
 637                     _    -> cont ITocurly (incLexeme buf)
 638            _ -> (layoutOff `thenP_` cont ITocurly)  (incLexeme buf)
 639
 640     -- strings/characters -------------------------------------------------
 641     '\"'#{-"-} -> lex_string cont exts [] (incLexeme buf)
 642     '\''#      -> lex_char (char_end cont) exts (incLexeme buf)
 643
 644     -- strictness and cpr pragmas and __scc treated specially.
 645     '_'# | glaExtsEnabled exts ->
 646          case lookAhead# buf 1# of
 647            '_'# -> case lookAhead# buf 2# of
 648                     'S'# ->
 649                         lex_demand cont (stepOnUntil (not . isSpace)
 650                                         (stepOnBy# buf 3#)) -- past __S
 651                     'M'# ->
 652                         cont ITcprinfo (stepOnBy# buf 3#)       -- past __M
 653
 654                     's'# ->
 655                         case prefixMatch (stepOnBy# buf 3#) "cc" of
 656                                Just buf' -> lex_scc cont (stepOverLexeme buf')
 657                                Nothing   -> lex_id cont exts buf
 658                     _ -> lex_id cont exts buf
 659            _    -> lex_id cont exts buf
 660
 661         -- Hexadecimal and octal constants
 662     '0'# | (ch `eqChar#` 'x'# || ch `eqChar#` 'X'#) && is_hexdigit ch2
 663                 -> readNum (after_lexnum cont exts) buf' is_hexdigit 16 hex
 664          | (ch `eqChar#` 'o'# || ch `eqChar#` 'O'#) && is_octdigit ch2
 665                 -> readNum (after_lexnum cont exts) buf' is_octdigit  8 oct_or_dec
 666         where ch   = lookAhead# buf 1#
 667               ch2  = lookAhead# buf 2#
 668               buf' = setCurrentPos# buf 2#
 669
 670     '\NUL'# ->
 671             if bufferExhausted (stepOn buf) then
 672                cont ITeof buf
 673             else
 674                trace "lexIface: misplaced NUL?" $
 675                cont (ITunknown "\NUL") (stepOn buf)
 676
 677     '?'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) ->
 678             lex_ip ITdupipvarid cont (incLexeme buf)
 679     '%'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) ->
 680             lex_ip ITsplitipvarid cont (incLexeme buf)
 681     c | is_digit  c -> lex_num cont exts 0 buf
 682       | is_symbol c -> lex_sym cont buf
 683       | is_upper  c -> lex_con cont exts buf
 684       | is_ident  c -> lex_id  cont exts buf
 685       | otherwise   -> lexError "illegal character" buf
 686
 687 -- Int# is unlifted, and therefore faster than Bool for flags.
 688 {-# INLINE flag #-}
 689 flag :: Int# -> Bool
 690 flag 0# = False
 691 flag _  = True
 692
 693 -------------------------------------------------------------------------------
 694 -- Pragmas
 695
 696 lex_prag cont buf
 697   = case expandWhile# is_space buf of { buf1 ->
 698     case expandWhile# is_ident (stepOverLexeme buf1) of { buf2 ->
 699     let lexeme = mkFastString (map toUpper (lexemeToString buf2)) in
 700     case lookupUFM pragmaKeywordsFM lexeme of
 701         Just kw -> cont kw (mergeLexemes buf buf2)
 702         Nothing -> panic "lex_prag"
 703   }}
 704
 705 -------------------------------------------------------------------------------
 706 -- Strings & Chars
 707
 708 lex_string cont exts s buf
 709   = case currentChar# buf of
 710         '"'#{-"-} ->
 711            let buf' = incLexeme buf
 712                s' = mkFastStringNarrow (map chr (reverse s))
 713            in case currentChar# buf' of
 714                 '#'# | glaExtsEnabled exts -> if all (<= 0xFF) s
 715                     then cont (ITprimstring s') (incLexeme buf')
 716                     else lexError "primitive string literal must contain only characters <= \'\\xFF\'" buf'
 717                 _                   -> cont (ITstring s') buf'
 718
 719         -- ignore \& in a string, deal with string gaps
 720         '\\'# | next_ch `eqChar#` '&'#
 721                 -> lex_string cont exts s buf'
 722               | is_space next_ch
 723                 -> lex_stringgap cont exts s (incLexeme buf)
 724
 725             where next_ch = lookAhead# buf 1#
 726                   buf' = setCurrentPos# buf 2#
 727
 728         _ -> lex_char (lex_next_string cont s) exts buf
 729
 730 lex_stringgap cont exts s buf
 731   = let buf' = incLexeme buf in
 732     case currentChar# buf of
 733         '\n'# -> \st@PState{loc = loc} -> lex_stringgap cont exts s buf'
 734                   st{loc = incSrcLine loc}
 735         '\\'# -> lex_string cont exts s buf'
 736         c | is_space c -> lex_stringgap cont exts s buf'
 737         other -> charError buf'
 738
 739 lex_next_string cont s exts c buf = lex_string cont exts (c:s) buf
 740
 741 lex_char :: (Int# -> Int -> P a) -> Int# -> P a
 742 lex_char cont exts buf
 743   = case currentChar# buf of
 744         '\\'# -> lex_escape (cont exts) (incLexeme buf)
 745         c | is_any c -> cont exts (I# (ord# c)) (incLexeme buf)
 746         other -> charError buf
 747
 748 char_end cont exts c buf
 749   = case currentChar# buf of
 750         '\''# -> let buf' = incLexeme buf in
 751                  case currentChar# buf' of
 752                         '#'# | glaExtsEnabled exts
 753                                 -> cont (ITprimchar c) (incLexeme buf')
 754                         _       -> cont (ITchar c) buf'
 755         _     -> charError buf
 756
 757 lex_escape cont buf
 758   = let buf' = incLexeme buf in
 759     case currentChar# buf of
 760         'a'#       -> cont (ord '\a') buf'
 761         'b'#       -> cont (ord '\b') buf'
 762         'f'#       -> cont (ord '\f') buf'
 763         'n'#       -> cont (ord '\n') buf'
 764         'r'#       -> cont (ord '\r') buf'
 765         't'#       -> cont (ord '\t') buf'
 766         'v'#       -> cont (ord '\v') buf'
 767         '\\'#      -> cont (ord '\\') buf'
 768         '"'#       -> cont (ord '\"') buf'
 769         '\''#      -> cont (ord '\'') buf'
 770         '^'#       -> let c = currentChar# buf' in
 771                       if c `geChar#` '@'# && c `leChar#` '_'#
 772                         then cont (I# (ord# c -# ord# '@'#)) (incLexeme buf')
 773                         else charError buf'
 774
 775         'x'#      -> readNum (after_charnum cont) buf' is_hexdigit 16 hex
 776         'o'#      -> readNum (after_charnum cont) buf' is_octdigit  8 oct_or_dec
 777         x | is_digit x
 778                   -> readNum (after_charnum cont) buf is_digit    10 oct_or_dec
 779
 780         _          -> case [ (c,buf2) | (p,c) <- silly_escape_chars,
 781                                        Just buf2 <- [prefixMatch buf p] ] of
 782                             (c,buf2):_ -> cont (ord c) buf2
 783                             [] -> charError buf'
 784
 785 after_charnum cont i buf
 786   = if i >= 0 && i <= 0x10FFFF
 787         then cont (fromInteger i) buf
 788         else charError buf
 789
 790 readNum cont buf is_digit base conv = read buf 0
 791   where read buf i
 792           = case currentChar# buf of { c ->
 793             if is_digit c
 794                 then read (incLexeme buf) (i*base + (toInteger (I# (conv c))))
 795                 else cont i buf
 796             }
 797
 798 is_hexdigit c
 799         =  is_digit c
 800         || (c `geChar#` 'a'# && c `leChar#` 'f'#)
 801         || (c `geChar#` 'A'# && c `leChar#` 'F'#)
 802
 803 hex c | is_digit c = ord# c -# ord# '0'#
 804       | otherwise  = ord# (to_lower c) -# ord# 'a'# +# 10#
 805 oct_or_dec c = ord# c -# ord# '0'#
 806
 807 is_octdigit c = c `geChar#` '0'# && c `leChar#` '7'#
 808
 809 to_lower c
 810   | c `geChar#` 'A'# && c `leChar#` 'Z'#
 811         = chr# (ord# c -# (ord# 'A'# -# ord# 'a'#))
 812   | otherwise = c
 813
 814 charError buf = lexError "error in character literal" buf
 815
 816 silly_escape_chars = [
 817         ("NUL", '\NUL'),
 818         ("SOH", '\SOH'),
 819         ("STX", '\STX'),
 820         ("ETX", '\ETX'),
 821         ("EOT", '\EOT'),
 822         ("ENQ", '\ENQ'),
 823         ("ACK", '\ACK'),
 824         ("BEL", '\BEL'),
 825         ("BS", '\BS'),
 826         ("HT", '\HT'),
 827         ("LF", '\LF'),
 828         ("VT", '\VT'),
 829         ("FF", '\FF'),
 830         ("CR", '\CR'),
 831         ("SO", '\SO'),
 832         ("SI", '\SI'),
 833         ("DLE", '\DLE'),
 834         ("DC1", '\DC1'),
 835         ("DC2", '\DC2'),
 836         ("DC3", '\DC3'),
 837         ("DC4", '\DC4'),
 838         ("NAK", '\NAK'),
 839         ("SYN", '\SYN'),
 840         ("ETB", '\ETB'),
 841         ("CAN", '\CAN'),
 842         ("EM", '\EM'),
 843         ("SUB", '\SUB'),
 844         ("ESC", '\ESC'),
 845         ("FS", '\FS'),
 846         ("GS", '\GS'),
 847         ("RS", '\RS'),
 848         ("US", '\US'),
 849         ("SP", '\SP'),
 850         ("DEL", '\DEL')
 851         ]
 852
 853 -------------------------------------------------------------------------------
 854
 855 lex_demand cont buf =
 856  case read_em [] buf of { (ls,buf') ->
 857  case currentChar# buf' of
 858    'b'# -> cont (ITstrict (StrictSig (mkTopDmdType ls BotRes))) (incLexeme buf')
 859    'm'# -> cont (ITstrict (StrictSig (mkTopDmdType ls RetCPR))) (incLexeme buf')
 860    _    -> cont (ITstrict (StrictSig (mkTopDmdType ls TopRes))) buf'
 861  }
 862  where
 863   read_em acc buf =
 864    case currentChar# buf of
 865     'T'# -> read_em (Top     : acc) (stepOn buf)
 866     'L'# -> read_em (lazyDmd : acc) (stepOn buf)
 867     'A'# -> read_em (Abs     : acc) (stepOn buf)
 868     'V'# -> read_em (evalDmd : acc) (stepOn buf)        -- Temporary, until
 869                                                         -- we've recompiled prelude etc
 870     'C'# -> do_unary Call  acc (stepOnBy# buf 2#)       -- Skip 'C('
 871
 872     'U'# -> do_seq1 Eval         acc (stepOnBy# buf 1#)
 873     'D'# -> do_seq1 Defer        acc (stepOnBy# buf 1#)
 874     'S'# -> do_seq1 (Box . Eval) acc (stepOnBy# buf 1#)
 875
 876     _    -> (reverse acc, buf)
 877
 878   do_seq1 fn acc buf
 879     = case currentChar# buf of
 880         '('# -> do_seq2 fn acc (stepOnBy# buf 1#)
 881         _    -> read_em (fn (Poly Abs) : acc) buf
 882
 883   do_seq2 fn acc buf
 884     = case read_em [] buf of { (dmds, buf) ->
 885       case currentChar# buf of
 886         ')'# -> read_em (fn (Prod dmds) : acc)
 887                         (stepOn buf)
 888         '*'# -> ASSERT( length dmds == 1 )
 889                 read_em (fn (Poly (head dmds)) : acc)
 890                         (stepOnBy# buf 2#)      -- Skip '*)'
 891       }
 892
 893   do_unary fn acc buf
 894     = case read_em [] buf of
 895         ([dmd], rest) -> read_em (fn dmd : acc) (stepOn rest)   -- Skip ')'
 896
 897 ------------------
 898 lex_scc cont buf =
 899  case currentChar# buf of
 900   'C'# -> cont ITsccAllCafs (incLexeme buf)
 901   other -> cont ITscc buf
 902
 903 -----------------------------------------------------------------------------
 904 -- Numbers
 905
 906 lex_num :: (Token -> P a) -> Int# -> Integer -> P a
 907 lex_num cont exts acc buf =
 908  case scanNumLit acc buf of
 909      (acc',buf') ->
 910        case currentChar# buf' of
 911          '.'# | is_digit (lookAhead# buf' 1#) ->
 912              -- this case is not optimised at all, as the
 913              -- presence of floating point numbers in interface
 914              -- files is not that common. (ToDo)
 915             case expandWhile# is_digit (incLexeme buf') of
 916               buf2 -> -- points to first non digit char
 917
 918                 let l = case currentChar# buf2 of
 919                           'E'# -> do_exponent
 920                           'e'# -> do_exponent
 921                           _ -> buf2
 922
 923                     do_exponent
 924                         = let buf3 = incLexeme buf2 in
 925                           case currentChar# buf3 of
 926                                 '-'# -> expandWhile# is_digit (incLexeme buf3)
 927                                 '+'# -> expandWhile# is_digit (incLexeme buf3)
 928                                 x | is_digit x -> expandWhile# is_digit buf3
 929                                 _ -> buf2
 930
 931                     v = readRational__ (lexemeToString l)
 932
 933                 in case currentChar# l of -- glasgow exts only
 934                       '#'# | glaExtsEnabled exts -> let l' = incLexeme l in
 935                               case currentChar# l' of
 936                                 '#'# -> cont (ITprimdouble v) (incLexeme l')
 937                                 _    -> cont (ITprimfloat  v) l'
 938                       _ -> cont (ITrational v) l
 939
 940          _ -> after_lexnum cont exts acc' buf'
 941
 942 after_lexnum cont exts i buf
 943   = case currentChar# buf of
 944         '#'# | glaExtsEnabled exts -> cont (ITprimint i) (incLexeme buf)
 945         _                          -> cont (ITinteger i) buf
 946
 947 -----------------------------------------------------------------------------
 948 -- C "literal literal"s  (i.e. things like ``NULL'', ``stdout'' etc.)
 949
 950 -- we lexemeToFastString on the bit between the ``''s, but include the
 951 -- quotes in the full lexeme.
 952
 953 lex_cstring cont buf =
 954  case expandUntilMatch (stepOverLexeme buf) "\'\'" of
 955    Just buf' -> cont (ITlitlit (lexemeToFastString
 956                                 (setCurrentPos# buf' (negateInt# 2#))))
 957                    (mergeLexemes buf buf')
 958    Nothing   -> lexError "unterminated ``" buf
 959
 960 -----------------------------------------------------------------------------
 961 -- identifiers, symbols etc.
 962
 963 lex_ip ip_constr cont buf =
 964  case expandWhile# is_ident buf of
 965    buf' -> cont (ip_constr (tailFS lexeme)) buf'
 966         where lexeme = lexemeToFastString buf'
 967
 968 lex_id cont exts buf =
 969  let buf1 = expandWhile# is_ident buf in
 970  seq buf1 $
 971
 972  case (if glaExtsEnabled exts
 973         then expandWhile# (eqChar# '#'#) buf1 -- slurp trailing hashes
 974         else buf1)                              of { buf' ->
 975
 976  let lexeme  = lexemeToFastString buf' in
 977
 978  case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
 979         Just kwd_token -> --trace ("hkeywd: "++_UNPK_(lexeme)) $
 980                           cont kwd_token buf';
 981         Nothing        ->
 982
 983  let var_token = cont (ITvarid lexeme) buf' in
 984
 985  if not (glaExtsEnabled exts)
 986    then var_token
 987    else
 988
 989  case lookupUFM ghcExtensionKeywordsFM lexeme of {
 990         Just kwd_token -> cont kwd_token buf';
 991         Nothing        -> var_token
 992
 993  }}}
 994
 995 lex_sym cont buf =
 996  -- trace "lex_sym" $
 997  case expandWhile# is_symbol buf of
 998    buf' -> case lookupUFM haskellKeySymsFM lexeme of {
 999                 Just kwd_token -> --trace ("keysym: "++unpackFS lexeme) $
1000                                   cont kwd_token buf' ;
1001                 Nothing        -> --trace ("sym: "++unpackFS lexeme) $
1002                                   cont (mk_var_token lexeme) buf'
1003            }
1004         where lexeme = lexemeToFastString buf'
1005
1006
1007 -- lex_con recursively collects components of a qualified identifer.
1008 -- The argument buf is the StringBuffer representing the lexeme
1009 -- identified so far, where the next character is upper-case.
1010
1011 lex_con cont exts buf =
1012  -- trace ("con: "{-++unpackFS lexeme-}) $
1013  let empty_buf = stepOverLexeme buf in
1014  case expandWhile# is_ident empty_buf of { buf1 ->
1015  case slurp_trailing_hashes buf1 exts of { con_buf ->
1016
1017  let all_buf = mergeLexemes buf con_buf
1018
1019      con_lexeme = lexemeToFastString con_buf
1020      mod_lexeme = lexemeToFastString (decLexeme buf)
1021      all_lexeme = lexemeToFastString all_buf
1022
1023      just_a_conid
1024         | emptyLexeme buf = cont (ITconid con_lexeme)               all_buf
1025         | otherwise       = cont (ITqconid (mod_lexeme,con_lexeme)) all_buf
1026  in
1027
1028  case currentChar# all_buf of
1029      '.'# -> maybe_qualified cont exts all_lexeme
1030                 (incLexeme all_buf) just_a_conid
1031      _    -> just_a_conid
1032   }}
1033
1034
1035 maybe_qualified cont exts mod buf just_a_conid =
1036  -- trace ("qid: "{-++unpackFS lexeme-}) $
1037  case currentChar# buf of
1038   '['# ->       -- Special case for []
1039     case lookAhead# buf 1# of
1040      ']'# -> cont (ITqconid  (mod,SLIT("[]"))) (setCurrentPos# buf 2#)
1041      _    -> just_a_conid
1042
1043   '('# ->  -- Special case for (,,,)
1044            -- This *is* necessary to deal with e.g. "instance C PrelBase.(,,)"
1045     case lookAhead# buf 1# of
1046      '#'# | glaExtsEnabled exts -> case lookAhead# buf 2# of
1047                 ','# -> lex_ubx_tuple cont mod (setCurrentPos# buf 3#)
1048                                 just_a_conid
1049                 _    -> just_a_conid
1050      ')'# -> cont (ITqconid (mod,SLIT("()"))) (setCurrentPos# buf 2#)
1051      ','# -> lex_tuple cont mod (setCurrentPos# buf 2#) just_a_conid
1052      _    -> just_a_conid
1053
1054   '-'# -> case lookAhead# buf 1# of
1055             '>'# -> cont (ITqconid (mod,SLIT("(->)"))) (setCurrentPos# buf 2#)
1056             _    -> lex_id3 cont exts mod buf just_a_conid
1057
1058   _    -> lex_id3 cont exts mod buf just_a_conid
1059
1060
1061 lex_id3 cont exts mod buf just_a_conid
1062   | is_upper (currentChar# buf) =
1063      lex_con cont exts buf
1064
1065   | is_symbol (currentChar# buf) =
1066      let
1067         start_new_lexeme = stepOverLexeme buf
1068      in
1069      -- trace ("lex_id31 "{-++unpackFS lexeme-}) $
1070      case expandWhile# is_symbol start_new_lexeme of { buf' ->
1071      let
1072        lexeme  = lexemeToFastString buf'
1073         -- real lexeme is M.<sym>
1074        new_buf = mergeLexemes buf buf'
1075      in
1076      cont (mk_qvar_token mod lexeme) new_buf
1077         -- wrong, but arguably morally right: M... is now a qvarsym
1078      }
1079
1080   | otherwise   =
1081      let
1082         start_new_lexeme = stepOverLexeme buf
1083      in
1084      -- trace ("lex_id32 "{-++unpackFS lexeme-}) $
1085      case expandWhile# is_ident start_new_lexeme of { buf1 ->
1086      if emptyLexeme buf1
1087             then just_a_conid
1088             else
1089
1090      case slurp_trailing_hashes buf1 exts of { buf' ->
1091
1092      let
1093       lexeme      = lexemeToFastString buf'
1094       new_buf     = mergeLexemes buf buf'
1095       is_a_qvarid = cont (mk_qvar_token mod lexeme) new_buf
1096      in
1097      case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
1098             Nothing          -> is_a_qvarid ;
1099
1100             Just kwd_token | isSpecial kwd_token   -- special ids (as, qualified, hiding) shouldn't be
1101                            -> is_a_qvarid          --  recognised as keywords here.
1102                            | otherwise
1103                            -> just_a_conid         -- avoid M.where etc.
1104      }}}
1105
1106 slurp_trailing_hashes buf exts
1107   | glaExtsEnabled exts = expandWhile# (`eqChar#` '#'#) buf
1108   | otherwise           = buf
1109
1110
1111 mk_var_token pk_str
1112   | is_upper f          = ITconid pk_str
1113   | is_ident f          = ITvarid pk_str
1114   | f `eqChar#` ':'#    = ITconsym pk_str
1115   | otherwise           = ITvarsym pk_str
1116   where
1117       (C# f) = _HEAD_ pk_str
1118       -- tl     = _TAIL_ pk_str
1119
1120 mk_qvar_token m token =
1121 -- trace ("mk_qvar ") $
1122  case mk_var_token token of
1123    ITconid n  -> ITqconid  (m,n)
1124    ITvarid n  -> ITqvarid  (m,n)
1125    ITconsym n -> ITqconsym (m,n)
1126    ITvarsym n -> ITqvarsym (m,n)
1127    _          -> ITunknown (show token)
1128 \end{code}
1129
1130 ----------------------------------------------------------------------------
1131 Horrible stuff for dealing with M.(,,,)
1132
1133 \begin{code}
1134 lex_tuple cont mod buf back_off =
1135   go 2 buf
1136   where
1137    go n buf =
1138     case currentChar# buf of
1139       ','# -> go (n+1) (stepOn buf)
1140       ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Boxed n))) (stepOn buf)
1141       _    -> back_off
1142
1143 lex_ubx_tuple cont mod buf back_off =
1144   go 2 buf
1145   where
1146    go n buf =
1147     case currentChar# buf of
1148       ','# -> go (n+1) (stepOn buf)
1149       '#'# -> case lookAhead# buf 1# of
1150                 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Unboxed n)))
1151                                  (stepOnBy# buf 2#)
1152                 _    -> back_off
1153       _    -> back_off
1154 \end{code}
1155
1156 -----------------------------------------------------------------------------
1157 'lexPragma' rips along really fast, looking for a '##-}',
1158 indicating the end of the pragma we're skipping
1159
1160 \begin{code}
1161 lexPragma cont contf inStr buf =
1162  case currentChar# buf of
1163    '#'# | inStr ==# 0# ->
1164        case lookAhead# buf 1# of { '#'# ->
1165        case lookAhead# buf 2# of { '-'# ->
1166        case lookAhead# buf 3# of { '}'# ->
1167            contf cont (lexemeToBuffer buf)
1168                       (stepOverLexeme (setCurrentPos# buf 4#));
1169         _    -> lexPragma cont contf inStr (incLexeme buf) };
1170         _    -> lexPragma cont contf inStr (incLexeme buf) };
1171         _    -> lexPragma cont contf inStr (incLexeme buf) }
1172
1173    '"'# ->
1174        let
1175         odd_slashes buf flg i# =
1176           case lookAhead# buf i# of
1177            '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
1178            _     -> flg
1179
1180         not_inStr = if inStr ==# 0# then 1# else 0#
1181        in
1182        case lookAhead# buf (negateInt# 1#) of --backwards, actually
1183          '\\'# -> -- escaping something..
1184            if odd_slashes buf True (negateInt# 2#)
1185                 then  -- odd number of slashes, " is escaped.
1186                       lexPragma cont contf inStr (incLexeme buf)
1187                 else  -- even number of slashes, \ is escaped.
1188                       lexPragma cont contf not_inStr (incLexeme buf)
1189          _ -> lexPragma cont contf not_inStr (incLexeme buf)
1190
1191    '\''# | inStr ==# 0# ->
1192         case lookAhead# buf 1# of { '"'# ->
1193         case lookAhead# buf 2# of { '\''# ->
1194            lexPragma cont contf inStr (setCurrentPos# buf 3#);
1195         _ -> lexPragma cont contf inStr (incLexeme buf) };
1196         _ -> lexPragma cont contf inStr (incLexeme buf) }
1197
1198     -- a sign that the input is ill-formed, since pragmas are
1199     -- assumed to always be properly closed (in .hi files).
1200    '\NUL'# -> trace "lexPragma: unexpected end-of-file" $
1201               cont (ITunknown "\NUL") buf
1202
1203    _ -> lexPragma cont contf inStr (incLexeme buf)
1204
1205 \end{code}
1206
1207 -----------------------------------------------------------------------------
1208
1209 \begin{code}
1210 data LayoutContext
1211   = NoLayout
1212   | Layout Int#
1213
1214 data ParseResult a
1215   = POk PState a
1216   | PFailed Message
1217
1218 data PState = PState {
1219         loc        :: SrcLoc,
1220         extsBitmap :: Int#,     -- bitmap that determines permitted extensions
1221         bol        :: Int#,
1222         atbol      :: Int#,
1223         context    :: [LayoutContext]
1224      }
1225
1226 type P a = StringBuffer         -- Input string
1227         -> PState
1228         -> ParseResult a
1229
1230 returnP   :: a -> P a
1231 returnP a buf s = POk s a
1232
1233 thenP      :: P a -> (a -> P b) -> P b
1234 m `thenP` k = \ buf s ->
1235         case m buf s of
1236                 POk s1 a -> k a buf s1
1237                 PFailed err  -> PFailed err
1238
1239 thenP_     :: P a -> P b -> P b
1240 m `thenP_` k = m `thenP` \_ -> k
1241
1242 mapP :: (a -> P b) -> [a] -> P [b]
1243 mapP f [] = returnP []
1244 mapP f (a:as) =
1245      f a `thenP` \b ->
1246      mapP f as `thenP` \bs ->
1247      returnP (b:bs)
1248
1249 failP :: String -> P a
1250 failP msg buf s = PFailed (text msg)
1251
1252 failMsgP :: Message -> P a
1253 failMsgP msg buf s = PFailed msg
1254
1255 lexError :: String -> P a
1256 lexError str buf s@PState{ loc = loc }
1257   = failMsgP (hcat [ppr loc, text ": ", text str]) buf s
1258
1259 getSrcLocP :: P SrcLoc
1260 getSrcLocP buf s@(PState{ loc = loc }) = POk s loc
1261
1262 -- use a temporary SrcLoc for the duration of the argument
1263 setSrcLocP :: SrcLoc -> P a -> P a
1264 setSrcLocP new_loc p buf s =
1265   case p buf s{ loc=new_loc } of
1266       POk _ a   -> POk s a
1267       PFailed e -> PFailed e
1268
1269 getSrcFile :: P FAST_STRING
1270 getSrcFile buf s@(PState{ loc = loc }) = POk s (srcLocFile loc)
1271
1272 pushContext :: LayoutContext -> P ()
1273 pushContext ctxt buf s@(PState{ context = ctx }) = POk s{context = ctxt:ctx} ()
1274
1275 {-
1276
1277 This special case in layoutOn is to handle layout contexts with are
1278 indented the same or less than the current context.  This is illegal
1279 according to the Haskell spec, so we have to arrange to close the
1280 current context.  eg.
1281
1282 class Foo a where
1283 class Bar a
1284
1285 after the first 'where', the sequence of events is:
1286
1287         - layout system inserts a ';' (column 0)
1288         - parser begins a new context at column 0
1289         - parser shifts ';' (legal empty declaration)
1290         - parser sees 'class': parse error (we're still in the inner context)
1291
1292 trouble is, by the time we know we need a new context, the lexer has
1293 already generated the ';'.  Hacky solution is as follows: since we
1294 know the column of the next token (it's the column number of the new
1295 context), we set the ACTUAL column number of the new context to this
1296 numer plus one.  Hence the next time the lexer is called, a '}' will
1297 be generated to close the new context straight away.  Furthermore, we
1298 have to set the atbol flag so that the ';' that the parser shifted as
1299 part of the new context is re-generated.
1300
1301 when the new context is *less* indented than the current one:
1302
1303 f = f where g = g where
1304 h = h
1305
1306         - current context: column 12.
1307         - on seeing 'h' (column 0), the layout system inserts '}'
1308         - parser starts a new context, column 0
1309         - parser sees '}', uses it to close new context
1310         - we still need to insert another '}' followed by a ';',
1311           hence the atbol trick.
1312
1313 There's also a special hack in here to deal with
1314
1315         do
1316            ....
1317            e $ do
1318            blah
1319
1320 i.e. the inner context is at the same indentation level as the outer
1321 context.  This is strictly illegal according to Haskell 98, but
1322 there's a lot of existing code using this style and it doesn't make
1323 any sense to disallow it, since empty 'do' lists don't make sense.
1324 -}
1325
1326 layoutOn :: Bool -> P ()
1327 layoutOn strict buf s@(PState{ bol = bol, context = ctx }) =
1328     let offset = lexemeIndex buf -# bol in
1329     case ctx of
1330         Layout prev_off : _
1331            | if strict then prev_off >=# offset else prev_off ># offset ->
1332                 --trace ("layout on, column: " ++  show (I# offset)) $
1333                 POk s{ context = Layout (offset +# 1#) : ctx, atbol = 1# } ()
1334         other ->
1335                 --trace ("layout on, column: " ++  show (I# offset)) $
1336                 POk s{ context = Layout offset : ctx } ()
1337
1338 layoutOff :: P ()
1339 layoutOff buf s@(PState{ context = ctx }) =
1340     POk s{ context = NoLayout:ctx } ()
1341
1342 popContext :: P ()
1343 popContext = \ buf s@(PState{ context = ctx, loc = loc }) ->
1344   case ctx of
1345         (_:tl) -> POk s{ context = tl } ()
1346         []     -> PFailed (srcParseErr buf loc)
1347
1348 {-
1349  Note that if the name of the file we're processing ends
1350  with `hi-boot', we accept it on faith as having the right
1351  version. This is done so that .hi-boot files that comes
1352  with hsc don't have to be updated before every release,
1353  *and* it allows us to share .hi-boot files with versions
1354  of hsc that don't have .hi version checking (e.g., ghc-2.10's)
1355
1356  If the version number is 0, the checking is also turned off.
1357  (needed to deal with GHC.hi only!)
1358
1359  Once we can assume we're compiling with a version of ghc that
1360  supports interface file checking, we can drop the special
1361  pleading
1362 -}
1363 checkVersion :: Maybe Integer -> P ()
1364 checkVersion mb@(Just v) buf s@(PState{loc = loc})
1365  | (v==0) || (v == fromInt opt_HiVersion) || opt_NoHiCheck = POk s ()
1366  | otherwise = PFailed (ifaceVersionErr mb loc ([]::[Token]){-Todo-})
1367 checkVersion mb@Nothing  buf s@(PState{loc = loc})
1368  | "hi-boot" `isSuffixOf` (_UNPK_ (srcLocFile loc)) = POk s ()
1369  | otherwise = PFailed (ifaceVersionErr mb loc ([]::[Token]){-Todo-})
1370
1371
1372 -- for reasons of efficiency, flags indicating language extensions (eg,
1373 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1374 -- integer
1375
1376 glaExtsBit, ffiBit, parrBit :: Int
1377 glaExtsBit = 0
1378 ffiBit     = 1  -- FIXME: not used yet; still part of `glaExtsBit'
1379 parrBit    = 2
1380
1381 glaExtsEnabled, ffiEnabled, parrEnabled :: Int# -> Bool
1382 glaExtsEnabled flags = testBit (I# flags) glaExtsBit
1383 ffiEnabled     flags = testBit (I# flags) ffiBit
1384 parrEnabled    flags = testBit (I# flags) parrBit
1385
1386 -- convenient record-based bitmap for the interface to the rest of the world
1387 --
1388 data ExtFlags = ExtFlags {
1389                   glasgowExtsEF :: Bool,
1390 --                ffiEF         :: Bool,  -- commented out to avoid warnings
1391                   parrEF        :: Bool   -- while not used yet
1392                 }
1393
1394 -- create a parse state
1395 --
1396 mkPState          :: SrcLoc -> ExtFlags -> PState
1397 mkPState loc exts  = PState {
1398                        loc        = loc,
1399                        extsBitmap = case bitmap of {I# bits -> bits},
1400                        bol        = 0#,
1401                        atbol      = 1#,
1402                        context    = []
1403                      }
1404                      where
1405                        bitmap =     glaExtsBit `setBitIf` glasgowExtsEF exts
1406 --                              .|. ffiBit     `setBitIf` ffiEF         exts
1407                                 .|. parrBit    `setBitIf` parrEF        exts
1408                        --
1409                        b `setBitIf` cond | cond      = bit b
1410                                          | otherwise = 0
1411
1412
1413 -----------------------------------------------------------------
1414
1415 ifaceParseErr :: StringBuffer -> SrcLoc -> Message
1416 ifaceParseErr s l
1417   = hsep [ppr l, ptext SLIT("Interface file parse error; on input `"),
1418           text (lexemeToString s), char '\'']
1419
1420 ifaceVersionErr hi_vers l toks
1421   = hsep [ppr l, ptext SLIT("Interface file version error;"),
1422           ptext SLIT("Expected"), int opt_HiVersion,
1423           ptext SLIT("found "), pp_version]
1424     where
1425      pp_version =
1426       case hi_vers of
1427         Nothing -> ptext SLIT("pre ghc-3.02 version")
1428         Just v  -> ptext SLIT("version") <+> integer v
1429
1430 -----------------------------------------------------------------------------
1431
1432 srcParseErr :: StringBuffer -> SrcLoc -> Message
1433 srcParseErr s l
1434   = hcat [ppr l,
1435           if null token
1436              then ptext SLIT(": parse error (possibly incorrect indentation)")
1437              else hcat [ptext SLIT(": parse error on input "),
1438                         char '`', text token, char '\'']
1439     ]
1440   where
1441         token = lexemeToString s
1442
1443 \end{code}