ghc/compiler/parser/Lex.lhs

   1 %
   2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998
   3 %
   4 \section[Lexical analysis]{Lexical analysis}
   5
   6 --------------------------------------------------------
   7 [Jan 98]
   8 There's a known bug in here:
   9
  10         If an interface file ends prematurely, Lex tries to
  11         do headFS of an empty FastString.
  12
  13 An example that provokes the error is
  14
  15         f _:_ _forall_ [a] <<<END OF FILE>>>
  16 --------------------------------------------------------
  17
  18 \begin{code}
  19
  20 module Lex (
  21
  22         ifaceParseErr, srcParseErr,
  23
  24         -- Monad for parser
  25         Token(..), lexer, ParseResult(..), PState(..),
  26         checkVersion, ExtFlags(..), mkPState,
  27         StringBuffer,
  28
  29         P, thenP, thenP_, returnP, mapP, failP, failMsgP,
  30         getSrcLocP, setSrcLocP, getSrcFile,
  31         layoutOn, layoutOff, pushContext, popContext
  32     ) where
  33
  34 #include "HsVersions.h"
  35
  36 import Char             ( isSpace, toUpper )
  37 import List             ( isSuffixOf )
  38
  39 import PrelNames        ( mkTupNameStr )
  40 import CmdLineOpts      ( opt_HiVersion, opt_NoHiCheck )
  41 import ForeignCall      ( Safety(..) )
  42 import NewDemand        ( StrictSig(..), Demand(..), Demands(..),
  43                           DmdResult(..), mkTopDmdType, evalDmd, lazyDmd )
  44 import UniqFM           ( listToUFM, lookupUFM )
  45 import BasicTypes       ( Boxity(..) )
  46 import SrcLoc           ( SrcLoc, incSrcLine, srcLocFile, srcLocLine,
  47                           replaceSrcLine, mkSrcLoc )
  48
  49 import ErrUtils         ( Message )
  50 import Outputable
  51
  52 import FastString
  53 import StringBuffer
  54 import GlaExts
  55 import Ctype
  56 import Char             ( chr, ord )
  57
  58 import Bits             ( Bits(..) )       -- non-std
  59
  60 #if __GLASGOW_HASKELL__ >= 503
  61 import GHC.Read         ( readRational__ ) -- Glasgow non-std
  62 #else
  63 import PrelRead         ( readRational__ ) -- Glasgow non-std
  64 #endif
  65 \end{code}
  66
  67 %************************************************************************
  68 %*                                                                      *
  69 \subsection{Data types}
  70 %*                                                                      *
  71 %************************************************************************
  72
  73 The token data type, fairly un-interesting except from one
  74 constructor, @ITidinfo@, which is used to lazily lex id info (arity,
  75 strictness, unfolding etc).
  76
  77 The Idea/Observation here is that the renamer needs to scan through
  78 all of an interface file before it can continue. But only a fraction
  79 of the information contained in the file turns out to be useful, so
  80 delaying as much as possible of the scanning and parsing of an
  81 interface file Makes Sense (Heap profiles of the compiler
  82 show a reduction in heap usage by at least a factor of two,
  83 post-renamer).
  84
  85 Hence, the interface file lexer spots when value declarations are
  86 being scanned and return the @ITidinfo@ and @ITtype@ constructors
  87 for the type and any other id info for that binding (unfolding, strictness
  88 etc). These constructors are applied to the result of lexing these sub-chunks.
  89
  90 The lexing of the type and id info is all done lazily, of course, so
  91 the scanning (and subsequent parsing) will be done *only* on the ids the
  92 renamer finds out that it is interested in. The rest will just be junked.
  93 Laziness, you know it makes sense :-)
  94
  95 \begin{code}
  96 data Token
  97   = ITas                        -- Haskell keywords
  98   | ITcase
  99   | ITclass
 100   | ITdata
 101   | ITdefault
 102   | ITderiving
 103   | ITdo
 104   | ITelse
 105   | IThiding
 106   | ITif
 107   | ITimport
 108   | ITin
 109   | ITinfix
 110   | ITinfixl
 111   | ITinfixr
 112   | ITinstance
 113   | ITlet
 114   | ITmodule
 115   | ITnewtype
 116   | ITof
 117   | ITqualified
 118   | ITthen
 119   | ITtype
 120   | ITwhere
 121   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 122
 123   | ITforall                    -- GHC extension keywords
 124   | ITforeign
 125   | ITexport
 126   | ITlabel
 127   | ITdynamic
 128   | ITsafe
 129   | ITunsafe
 130   | ITwith
 131   | ITstdcallconv
 132   | ITccallconv
 133   | ITdotnet
 134
 135   | ITinterface                 -- interface keywords
 136   | IT__export
 137   | ITdepends
 138   | IT__forall
 139   | ITletrec
 140   | ITcoerce
 141   | ITinlineMe
 142   | ITinlineCall
 143   | ITccall (Bool,Bool,Safety)  -- (is_dyn, is_casm, may_gc)
 144   | ITdefaultbranch
 145   | ITbottom
 146   | ITinteger_lit
 147   | ITfloat_lit
 148   | ITword_lit
 149   | ITword64_lit
 150   | ITint64_lit
 151   | ITrational_lit
 152   | ITaddr_lit
 153   | ITlabel_lit
 154   | ITlit_lit
 155   | ITstring_lit
 156   | ITtypeapp
 157   | ITusage
 158   | ITfuall
 159   | ITarity
 160   | ITspecialise
 161   | ITnocaf
 162   | ITunfold
 163   | ITstrict StrictSig
 164   | ITrules
 165   | ITcprinfo
 166   | ITdeprecated
 167   | IT__scc
 168   | ITsccAllCafs
 169
 170   | ITspecialise_prag           -- Pragmas
 171   | ITsource_prag
 172   | ITinline_prag
 173   | ITnoinline_prag
 174   | ITrules_prag
 175   | ITdeprecated_prag
 176   | ITline_prag
 177   | ITscc_prag
 178   | ITclose_prag
 179
 180   | ITdotdot                    -- reserved symbols
 181   | ITdcolon
 182   | ITequal
 183   | ITlam
 184   | ITvbar
 185   | ITlarrow
 186   | ITrarrow
 187   | ITat
 188   | ITtilde
 189   | ITdarrow
 190   | ITminus
 191   | ITbang
 192   | ITstar
 193   | ITdot
 194
 195   | ITbiglam                    -- GHC-extension symbols
 196
 197   | ITocurly                    -- special symbols
 198   | ITccurly
 199   | ITocurlybar                 -- {|, for type applications
 200   | ITccurlybar                 -- |}, for type applications
 201   | ITvccurly
 202   | ITobrack
 203   | ITopabrack                  -- [:, for parallel arrays with -fparr
 204   | ITcpabrack                  -- :], for parallel arrays with -fparr
 205   | ITcbrack
 206   | IToparen
 207   | ITcparen
 208   | IToubxparen
 209   | ITcubxparen
 210   | ITsemi
 211   | ITcomma
 212   | ITunderscore
 213   | ITbackquote
 214
 215   | ITvarid   FAST_STRING       -- identifiers
 216   | ITconid   FAST_STRING
 217   | ITvarsym  FAST_STRING
 218   | ITconsym  FAST_STRING
 219   | ITqvarid  (FAST_STRING,FAST_STRING)
 220   | ITqconid  (FAST_STRING,FAST_STRING)
 221   | ITqvarsym (FAST_STRING,FAST_STRING)
 222   | ITqconsym (FAST_STRING,FAST_STRING)
 223
 224   | ITdupipvarid   FAST_STRING  -- GHC extension: implicit param: ?x
 225   | ITsplitipvarid FAST_STRING  -- GHC extension: implicit param: %x
 226
 227   | ITpragma StringBuffer
 228
 229   | ITchar       Int
 230   | ITstring     FAST_STRING
 231   | ITinteger    Integer
 232   | ITrational   Rational
 233
 234   | ITprimchar   Int
 235   | ITprimstring FAST_STRING
 236   | ITprimint    Integer
 237   | ITprimfloat  Rational
 238   | ITprimdouble Rational
 239   | ITlitlit     FAST_STRING
 240
 241   | ITunknown String            -- Used when the lexer can't make sense of it
 242   | ITeof                       -- end of file token
 243   deriving Show -- debugging
 244 \end{code}
 245
 246 -----------------------------------------------------------------------------
 247 Keyword Lists
 248
 249 \begin{code}
 250 pragmaKeywordsFM = listToUFM $
 251       map (\ (x,y) -> (_PK_ x,y))
 252        [( "SPECIALISE", ITspecialise_prag ),
 253         ( "SPECIALIZE", ITspecialise_prag ),
 254         ( "SOURCE",     ITsource_prag ),
 255         ( "INLINE",     ITinline_prag ),
 256         ( "NOINLINE",   ITnoinline_prag ),
 257         ( "NOTINLINE",  ITnoinline_prag ),
 258         ( "LINE",       ITline_prag ),
 259         ( "RULES",      ITrules_prag ),
 260         ( "RULEZ",      ITrules_prag ), -- american spelling :-)
 261         ( "SCC",        ITscc_prag ),
 262         ( "DEPRECATED", ITdeprecated_prag )
 263         ]
 264
 265 haskellKeywordsFM = listToUFM $
 266       map (\ (x,y) -> (_PK_ x,y))
 267        [( "_",          ITunderscore ),
 268         ( "as",         ITas ),
 269         ( "case",       ITcase ),
 270         ( "class",      ITclass ),
 271         ( "data",       ITdata ),
 272         ( "default",    ITdefault ),
 273         ( "deriving",   ITderiving ),
 274         ( "do",         ITdo ),
 275         ( "else",       ITelse ),
 276         ( "hiding",     IThiding ),
 277         ( "if",         ITif ),
 278         ( "import",     ITimport ),
 279         ( "in",         ITin ),
 280         ( "infix",      ITinfix ),
 281         ( "infixl",     ITinfixl ),
 282         ( "infixr",     ITinfixr ),
 283         ( "instance",   ITinstance ),
 284         ( "let",        ITlet ),
 285         ( "module",     ITmodule ),
 286         ( "newtype",    ITnewtype ),
 287         ( "of",         ITof ),
 288         ( "qualified",  ITqualified ),
 289         ( "then",       ITthen ),
 290         ( "type",       ITtype ),
 291         ( "where",      ITwhere ),
 292         ( "_scc_",      ITscc )         -- ToDo: remove
 293      ]
 294
 295 isSpecial :: Token -> Bool
 296 -- If we see M.x, where x is a keyword, but
 297 -- is special, we treat is as just plain M.x,
 298 -- not as a keyword.
 299 isSpecial ITas          = True
 300 isSpecial IThiding      = True
 301 isSpecial ITqualified   = True
 302 isSpecial ITforall      = True
 303 isSpecial ITexport      = True
 304 isSpecial ITlabel       = True
 305 isSpecial ITdynamic     = True
 306 isSpecial ITsafe        = True
 307 isSpecial ITunsafe      = True
 308 isSpecial ITwith        = True
 309 isSpecial ITccallconv   = True
 310 isSpecial ITstdcallconv = True
 311 isSpecial _             = False
 312
 313 -- IMPORTANT: Keep this in synch with ParseIface.y's var_fs production! (SUP)
 314 ghcExtensionKeywordsFM = listToUFM $
 315         map (\ (x,y) -> (_PK_ x,y))
 316      [  ( "forall",     ITforall ),
 317         ( "foreign",    ITforeign ),
 318         ( "export",     ITexport ),
 319         ( "label",      ITlabel ),
 320         ( "dynamic",    ITdynamic ),
 321         ( "safe",       ITunsafe ),
 322         ( "unsafe",     ITunsafe ),
 323         ( "with",       ITwith ),
 324         ( "stdcall",    ITstdcallconv),
 325         ( "ccall",      ITccallconv),
 326         ( "dotnet",     ITdotnet),
 327         ("_ccall_",     ITccall (False, False, PlayRisky)),
 328         ("_ccall_GC_",  ITccall (False, False, PlaySafe)),
 329         ("_casm_",      ITccall (False, True,  PlayRisky)),
 330         ("_casm_GC_",   ITccall (False, True,  PlaySafe)),
 331
 332         -- interface keywords
 333         ("__interface",         ITinterface),
 334         ("__export",            IT__export),
 335         ("__depends",           ITdepends),
 336         ("__forall",            IT__forall),
 337         ("__letrec",            ITletrec),
 338         ("__coerce",            ITcoerce),
 339         ("__inline_me",         ITinlineMe),
 340         ("__inline_call",       ITinlineCall),
 341         ("__depends",           ITdepends),
 342         ("__DEFAULT",           ITdefaultbranch),
 343         ("__bot",               ITbottom),
 344         ("__integer",           ITinteger_lit),
 345         ("__float",             ITfloat_lit),
 346         ("__int64",             ITint64_lit),
 347         ("__word",              ITword_lit),
 348         ("__word64",            ITword64_lit),
 349         ("__rational",          ITrational_lit),
 350         ("__addr",              ITaddr_lit),
 351         ("__label",             ITlabel_lit),
 352         ("__litlit",            ITlit_lit),
 353         ("__string",            ITstring_lit),
 354         ("__a",                 ITtypeapp),
 355         ("__u",                 ITusage),
 356         ("__fuall",             ITfuall),
 357         ("__A",                 ITarity),
 358         ("__P",                 ITspecialise),
 359         ("__C",                 ITnocaf),
 360         ("__R",                 ITrules),
 361         ("__D",                 ITdeprecated),
 362         ("__U",                 ITunfold),
 363
 364         ("__ccall",             ITccall (False, False, PlayRisky)),
 365         ("__ccall_GC",          ITccall (False, False, PlaySafe)),
 366         ("__dyn_ccall",         ITccall (True,  False, PlayRisky)),
 367         ("__dyn_ccall_GC",      ITccall (True,  False, PlaySafe)),
 368         ("__casm",              ITccall (False, True,  PlayRisky)),
 369         ("__dyn_casm",          ITccall (True,  True,  PlayRisky)),
 370         ("__casm_GC",           ITccall (False, True,  PlaySafe)),
 371         ("__dyn_casm_GC",       ITccall (True,  True,  PlaySafe)),
 372
 373         ("/\\",                 ITbiglam)
 374      ]
 375
 376
 377 haskellKeySymsFM = listToUFM $
 378         map (\ (x,y) -> (_PK_ x,y))
 379       [ ("..",          ITdotdot)
 380        ,("::",          ITdcolon)
 381        ,("=",           ITequal)
 382        ,("\\",          ITlam)
 383        ,("|",           ITvbar)
 384        ,("<-",          ITlarrow)
 385        ,("->",          ITrarrow)
 386        ,("@",           ITat)
 387        ,("~",           ITtilde)
 388        ,("=>",          ITdarrow)
 389        ,("-",           ITminus)
 390        ,("!",           ITbang)
 391        ,("*",           ITstar)
 392        ,(".",           ITdot)          -- sadly, for 'forall a . t'
 393        ]
 394 \end{code}
 395
 396 -----------------------------------------------------------------------------
 397 The lexical analyser
 398
 399 Lexer state:
 400
 401         - (exts)  lexing a source with extensions, eg, an interface file or
 402                   with -fglasgow-exts
 403         - (bol)   pointer to beginning of line (for column calculations)
 404         - (buf)   pointer to beginning of token
 405         - (buf)   pointer to current char
 406         - (atbol) flag indicating whether we're at the beginning of a line
 407
 408 \begin{code}
 409 lexer :: (Token -> P a) -> P a
 410 lexer cont buf s@(PState{
 411                     loc = loc,
 412                     extsBitmap = exts,
 413                     bol = bol,
 414                     atbol = atbol,
 415                     context = ctx
 416                 })
 417
 418         -- first, start a new lexeme and lose all the whitespace
 419   =  _scc_ "Lexer"
 420   tab line bol atbol (stepOverLexeme buf)
 421   where
 422         line = srcLocLine loc
 423
 424         tab y bol atbol buf = --trace ("tab: " ++ show (I# y) ++ " : " ++ show (currentChar buf)) $
 425           case currentChar# buf of
 426
 427             '\NUL'# ->
 428                    if bufferExhausted (stepOn buf)
 429                         then cont ITeof buf s'
 430                         else trace "lexer: misplaced NUL?" $
 431                              tab y bol atbol (stepOn buf)
 432
 433             '\n'# -> let buf' = stepOn buf
 434                      in tab (y +# 1#) (currentIndex# buf') 1# buf'
 435
 436                 -- find comments.  This got harder in Haskell 98.
 437             '-'# ->  let trundle n =
 438                           let next = lookAhead# buf n in
 439                           if next `eqChar#` '-'# then trundle (n +# 1#)
 440                           else if is_symbol next || n <# 2#
 441                                 then is_a_token
 442                                 else tab y bol atbol
 443                                          (stepOnUntilChar# (stepOnBy# buf n) '\n'#)
 444                     in trundle 1#
 445
 446                 -- comments and pragmas.  We deal with LINE pragmas here,
 447                 -- and throw out any unrecognised pragmas as comments.  Any
 448                 -- pragmas we know about are dealt with later (after any layout
 449                 -- processing if necessary).
 450             '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
 451                 if lookAhead# buf 2# `eqChar#` '#'# then
 452                   if lookAhead# buf 3# `eqChar#` '#'# then is_a_token else
 453                   case expandWhile# is_space (setCurrentPos# buf 3#) of { buf1->
 454                   case expandWhile# is_ident (stepOverLexeme buf1)   of { buf2->
 455                   let lexeme = mkFastString -- ToDo: too slow
 456                                   (map toUpper (lexemeToString buf2)) in
 457                   case lookupUFM pragmaKeywordsFM lexeme of
 458                         -- ignore RULES pragmas when -fglasgow-exts is off
 459                         Just ITrules_prag | not (glaExtsEnabled exts) ->
 460                            skip_to_end (stepOnBy# buf 2#) s'
 461                         Just ITline_prag ->
 462                            line_prag skip_to_end buf2 s'
 463                         Just other -> is_a_token
 464                         Nothing -> skip_to_end (stepOnBy# buf 2#) s'
 465                   }}
 466
 467                 else skip_to_end (stepOnBy# buf 2#) s'
 468                 where
 469                     skip_to_end = skipNestedComment (lexer cont)
 470
 471                 -- special GHC extension: we grok cpp-style #line pragmas
 472             '#'# | lexemeIndex buf ==# bol ->   -- the '#' must be in column 0
 473                 let buf1 | lookAhead# buf 1# `eqChar#` 'l'# &&
 474                            lookAhead# buf 2# `eqChar#` 'i'# &&
 475                            lookAhead# buf 3# `eqChar#` 'n'# &&
 476                            lookAhead# buf 4# `eqChar#` 'e'#  = stepOnBy# buf 5#
 477                          | otherwise = stepOn buf
 478                 in
 479                 case expandWhile# is_space buf1 of { buf2 ->
 480                 if is_digit (currentChar# buf2)
 481                         then line_prag next_line buf2 s'
 482                         else is_a_token
 483                 }
 484                 where
 485                 next_line buf = lexer cont (stepOnUntilChar# buf '\n'#)
 486
 487                 -- tabs have been expanded beforehand
 488             c | is_space c -> tab y bol atbol (stepOn buf)
 489               | otherwise  -> is_a_token
 490
 491            where s' = s{loc = replaceSrcLine loc y,
 492                         bol = bol,
 493                        atbol = atbol}
 494
 495                  is_a_token | atbol /=# 0# = lexBOL cont buf s'
 496                             | otherwise    = lexToken cont exts buf s'
 497
 498 -- {-# LINE .. #-} pragmas.  yeuch.
 499 line_prag cont buf s@PState{loc=loc} =
 500   case expandWhile# is_space buf                of { buf1 ->
 501   case scanNumLit 0 (stepOverLexeme buf1)       of { (line,buf2) ->
 502   -- subtract one: the line number refers to the *following* line.
 503   let real_line = line - 1 in
 504   case fromInteger real_line                    of { i@(I# l) ->
 505         -- ToDo, if no filename then we skip the newline.... d'oh
 506   case expandWhile# is_space buf2               of { buf3 ->
 507   case currentChar# buf3                        of
 508      '\"'#{-"-} ->
 509         case untilEndOfString# (stepOn (stepOverLexeme buf3)) of { buf4 ->
 510         let
 511             file = lexemeToFastString buf4
 512             new_buf = stepOn (stepOverLexeme buf4)
 513         in
 514         if nullFastString file
 515                 then cont new_buf s{loc = replaceSrcLine loc l}
 516                 else cont new_buf s{loc = mkSrcLoc file i}
 517         }
 518      _other -> cont (stepOverLexeme buf3) s{loc = replaceSrcLine loc l}
 519   }}}}
 520
 521 skipNestedComment :: P a -> P a
 522 skipNestedComment cont buf state = skipNestedComment' (loc state) cont buf state
 523
 524 skipNestedComment' :: SrcLoc -> P a -> P a
 525 skipNestedComment' orig_loc cont buf = loop buf
 526  where
 527    loop buf =
 528      case currentChar# buf of
 529         '-'# | lookAhead# buf 1# `eqChar#` '}'# -> cont (stepOnBy# buf 2#)
 530
 531         '{'# | lookAhead# buf 1# `eqChar#` '-'# ->
 532               skipNestedComment
 533                 (skipNestedComment' orig_loc cont)
 534                 (stepOnBy# buf 2#)
 535
 536         '\n'# -> \ s@PState{loc=loc} ->
 537                  let buf' = stepOn buf in
 538                  loop buf' s{loc = incSrcLine loc,
 539                              bol = currentIndex# buf',
 540                              atbol = 1#}
 541
 542         -- pass the original SrcLoc to lexError so that the error is
 543         -- reported at the line it was originally on, not the line at
 544         -- the end of the file.
 545         '\NUL'# | bufferExhausted (stepOn buf) ->
 546                 \s -> lexError "unterminated `{-'" buf s{loc=orig_loc} -- -}
 547
 548         _   -> loop (stepOn buf)
 549
 550 -- When we are lexing the first token of a line, check whether we need to
 551 -- insert virtual semicolons or close braces due to layout.
 552
 553 lexBOL :: (Token -> P a) -> P a
 554 lexBOL cont buf s@(PState{
 555                     loc = loc,
 556                     extsBitmap = exts,
 557                     bol = bol,
 558                     atbol = atbol,
 559                     context = ctx
 560                   }) =
 561         if need_close_curly then
 562                 --trace ("col = " ++ show (I# col) ++ ", layout: inserting '}'") $
 563                 cont ITvccurly buf s{atbol = 1#, context = tail ctx}
 564         else if need_semi_colon then
 565                 --trace ("col = " ++ show (I# col) ++ ", layout: inserting ';'") $
 566                 cont ITsemi buf s{atbol = 0#}
 567         else
 568                 lexToken cont exts buf s{atbol = 0#}
 569   where
 570         col = currentIndex# buf -# bol
 571
 572         need_close_curly =
 573                 case ctx of
 574                         [] -> False
 575                         (i:_) -> case i of
 576                                     NoLayout -> False
 577                                     Layout n -> col <# n
 578         need_semi_colon =
 579                 case ctx of
 580                         [] -> False
 581                         (i:_) -> case i of
 582                                     NoLayout -> False
 583                                     Layout n -> col ==# n
 584
 585
 586 lexToken :: (Token -> P a) -> Int# -> P a
 587 lexToken cont exts buf =
 588 -- trace "lexToken" $
 589   case currentChar# buf of
 590
 591     -- special symbols ----------------------------------------------------
 592     '('# | glaExtsEnabled exts && lookAhead# buf 1# `eqChar#` '#'#
 593                 -> cont IToubxparen (setCurrentPos# buf 2#)
 594          | otherwise
 595                 -> cont IToparen (incLexeme buf)
 596
 597     ')'# -> cont ITcparen    (incLexeme buf)
 598     '['# | parrEnabled exts && lookAhead# buf 1# `eqChar#` ':'# ->
 599             cont ITopabrack  (setCurrentPos# buf 2#)
 600          | otherwise ->
 601             cont ITobrack    (incLexeme buf)
 602     ']'# -> cont ITcbrack    (incLexeme buf)
 603     ','# -> cont ITcomma     (incLexeme buf)
 604     ';'# -> cont ITsemi      (incLexeme buf)
 605     '}'# -> \ s@PState{context = ctx} ->
 606             case ctx of
 607                 (_:ctx') -> cont ITccurly (incLexeme buf) s{context=ctx'}
 608                 _        -> lexError "too many '}'s" buf s
 609     '|'# -> case lookAhead# buf 1# of
 610                  '}'#  | glaExtsEnabled exts -> cont ITccurlybar
 611                                                      (setCurrentPos# buf 2#)
 612                  _                           -> lex_sym cont (incLexeme buf)
 613     ':'# -> case lookAhead# buf 1# of
 614                  ']'#  | parrEnabled exts    -> cont ITcpabrack
 615                                                      (setCurrentPos# buf 2#)
 616                  _                           -> lex_sym cont (incLexeme buf)
 617
 618
 619     '#'# -> case lookAhead# buf 1# of
 620                 ')'#  | glaExtsEnabled exts
 621                      -> cont ITcubxparen (setCurrentPos# buf 2#)
 622                 '-'# -> case lookAhead# buf 2# of
 623                            '}'# -> cont ITclose_prag (setCurrentPos# buf 3#)
 624                            _    -> lex_sym cont (incLexeme buf)
 625                 _    -> lex_sym cont (incLexeme buf)
 626
 627     '`'# | glaExtsEnabled exts && lookAhead# buf 1# `eqChar#` '`'#
 628                 -> lex_cstring cont (setCurrentPos# buf 2#)
 629          | otherwise
 630                 -> cont ITbackquote (incLexeme buf)
 631
 632     '{'# ->     -- look for "{-##" special iface pragma   -- for Emacs: -}
 633             case lookAhead# buf 1# of
 634            '|'# | glaExtsEnabled exts
 635                 -> cont ITocurlybar (setCurrentPos# buf 2#)
 636            '-'# -> case lookAhead# buf 2# of
 637                     '#'# -> case lookAhead# buf 3# of
 638                                 '#'# ->
 639                                    lexPragma
 640                                       cont
 641                                       (\ cont lexeme buf' -> cont (ITpragma lexeme) buf')
 642                                       0#
 643                                       (stepOnBy# (stepOverLexeme buf) 4#)
 644                                 _ -> lex_prag cont (setCurrentPos# buf 3#)
 645                     _    -> cont ITocurly (incLexeme buf)
 646            _ -> (layoutOff `thenP_` cont ITocurly)  (incLexeme buf)
 647
 648     -- strings/characters -------------------------------------------------
 649     '\"'#{-"-} -> lex_string cont exts [] (incLexeme buf)
 650     '\''#      -> lex_char (char_end cont) exts (incLexeme buf)
 651
 652     -- strictness and cpr pragmas and __scc treated specially.
 653     '_'# | glaExtsEnabled exts ->
 654          case lookAhead# buf 1# of
 655            '_'# -> case lookAhead# buf 2# of
 656                     'S'# ->
 657                         lex_demand cont (stepOnUntil (not . isSpace)
 658                                         (stepOnBy# buf 3#)) -- past __S
 659                     'M'# ->
 660                         cont ITcprinfo (stepOnBy# buf 3#)       -- past __M
 661
 662                     's'# ->
 663                         case prefixMatch (stepOnBy# buf 3#) "cc" of
 664                                Just buf' -> lex_scc cont (stepOverLexeme buf')
 665                                Nothing   -> lex_id cont exts buf
 666                     _ -> lex_id cont exts buf
 667            _    -> lex_id cont exts buf
 668
 669         -- Hexadecimal and octal constants
 670     '0'# | (ch `eqChar#` 'x'# || ch `eqChar#` 'X'#) && is_hexdigit ch2
 671                 -> readNum (after_lexnum cont exts) buf' is_hexdigit 16 hex
 672          | (ch `eqChar#` 'o'# || ch `eqChar#` 'O'#) && is_octdigit ch2
 673                 -> readNum (after_lexnum cont exts) buf' is_octdigit  8 oct_or_dec
 674         where ch   = lookAhead# buf 1#
 675               ch2  = lookAhead# buf 2#
 676               buf' = setCurrentPos# buf 2#
 677
 678     '\NUL'# ->
 679             if bufferExhausted (stepOn buf) then
 680                cont ITeof buf
 681             else
 682                trace "lexIface: misplaced NUL?" $
 683                cont (ITunknown "\NUL") (stepOn buf)
 684
 685     '?'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) ->
 686             lex_ip ITdupipvarid cont (incLexeme buf)
 687     '%'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) ->
 688             lex_ip ITsplitipvarid cont (incLexeme buf)
 689     c | is_digit  c -> lex_num cont exts 0 buf
 690       | is_symbol c -> lex_sym cont buf
 691       | is_upper  c -> lex_con cont exts buf
 692       | is_ident  c -> lex_id  cont exts buf
 693       | otherwise   -> lexError "illegal character" buf
 694
 695 -- Int# is unlifted, and therefore faster than Bool for flags.
 696 {-# INLINE flag #-}
 697 flag :: Int# -> Bool
 698 flag 0# = False
 699 flag _  = True
 700
 701 -------------------------------------------------------------------------------
 702 -- Pragmas
 703
 704 lex_prag cont buf
 705   = case expandWhile# is_space buf of { buf1 ->
 706     case expandWhile# is_ident (stepOverLexeme buf1) of { buf2 ->
 707     let lexeme = mkFastString (map toUpper (lexemeToString buf2)) in
 708     case lookupUFM pragmaKeywordsFM lexeme of
 709         Just kw -> cont kw (mergeLexemes buf buf2)
 710         Nothing -> panic "lex_prag"
 711   }}
 712
 713 -------------------------------------------------------------------------------
 714 -- Strings & Chars
 715
 716 lex_string cont exts s buf
 717   = case currentChar# buf of
 718         '"'#{-"-} ->
 719            let buf' = incLexeme buf
 720                s' = mkFastStringNarrow (map chr (reverse s))
 721            in case currentChar# buf' of
 722                 '#'# | glaExtsEnabled exts -> if all (<= 0xFF) s
 723                     then cont (ITprimstring s') (incLexeme buf')
 724                     else lexError "primitive string literal must contain only characters <= \'\\xFF\'" buf'
 725                 _                   -> cont (ITstring s') buf'
 726
 727         -- ignore \& in a string, deal with string gaps
 728         '\\'# | next_ch `eqChar#` '&'#
 729                 -> lex_string cont exts s buf'
 730               | is_space next_ch
 731                 -> lex_stringgap cont exts s (incLexeme buf)
 732
 733             where next_ch = lookAhead# buf 1#
 734                   buf' = setCurrentPos# buf 2#
 735
 736         _ -> lex_char (lex_next_string cont s) exts buf
 737
 738 lex_stringgap cont exts s buf
 739   = let buf' = incLexeme buf in
 740     case currentChar# buf of
 741         '\n'# -> \st@PState{loc = loc} -> lex_stringgap cont exts s buf'
 742                   st{loc = incSrcLine loc}
 743         '\\'# -> lex_string cont exts s buf'
 744         c | is_space c -> lex_stringgap cont exts s buf'
 745         other -> charError buf'
 746
 747 lex_next_string cont s exts c buf = lex_string cont exts (c:s) buf
 748
 749 lex_char :: (Int# -> Int -> P a) -> Int# -> P a
 750 lex_char cont exts buf
 751   = case currentChar# buf of
 752         '\\'# -> lex_escape (cont exts) (incLexeme buf)
 753         c | is_any c -> cont exts (I# (ord# c)) (incLexeme buf)
 754         other -> charError buf
 755
 756 char_end cont exts c buf
 757   = case currentChar# buf of
 758         '\''# -> let buf' = incLexeme buf in
 759                  case currentChar# buf' of
 760                         '#'# | glaExtsEnabled exts
 761                                 -> cont (ITprimchar c) (incLexeme buf')
 762                         _       -> cont (ITchar c) buf'
 763         _     -> charError buf
 764
 765 lex_escape cont buf
 766   = let buf' = incLexeme buf in
 767     case currentChar# buf of
 768         'a'#       -> cont (ord '\a') buf'
 769         'b'#       -> cont (ord '\b') buf'
 770         'f'#       -> cont (ord '\f') buf'
 771         'n'#       -> cont (ord '\n') buf'
 772         'r'#       -> cont (ord '\r') buf'
 773         't'#       -> cont (ord '\t') buf'
 774         'v'#       -> cont (ord '\v') buf'
 775         '\\'#      -> cont (ord '\\') buf'
 776         '"'#       -> cont (ord '\"') buf'
 777         '\''#      -> cont (ord '\'') buf'
 778         '^'#       -> let c = currentChar# buf' in
 779                       if c `geChar#` '@'# && c `leChar#` '_'#
 780                         then cont (I# (ord# c -# ord# '@'#)) (incLexeme buf')
 781                         else charError buf'
 782
 783         'x'#      -> readNum (after_charnum cont) buf' is_hexdigit 16 hex
 784         'o'#      -> readNum (after_charnum cont) buf' is_octdigit  8 oct_or_dec
 785         x | is_digit x
 786                   -> readNum (after_charnum cont) buf is_digit    10 oct_or_dec
 787
 788         _          -> case [ (c,buf2) | (p,c) <- silly_escape_chars,
 789                                        Just buf2 <- [prefixMatch buf p] ] of
 790                             (c,buf2):_ -> cont (ord c) buf2
 791                             [] -> charError buf'
 792
 793 after_charnum cont i buf
 794   = if i >= 0 && i <= 0x10FFFF
 795         then cont (fromInteger i) buf
 796         else charError buf
 797
 798 readNum cont buf is_digit base conv = read buf 0
 799   where read buf i
 800           = case currentChar# buf of { c ->
 801             if is_digit c
 802                 then read (incLexeme buf) (i*base + (toInteger (I# (conv c))))
 803                 else cont i buf
 804             }
 805
 806 is_hexdigit c
 807         =  is_digit c
 808         || (c `geChar#` 'a'# && c `leChar#` 'f'#)
 809         || (c `geChar#` 'A'# && c `leChar#` 'F'#)
 810
 811 hex c | is_digit c = ord# c -# ord# '0'#
 812       | otherwise  = ord# (to_lower c) -# ord# 'a'# +# 10#
 813 oct_or_dec c = ord# c -# ord# '0'#
 814
 815 is_octdigit c = c `geChar#` '0'# && c `leChar#` '7'#
 816
 817 to_lower c
 818   | c `geChar#` 'A'# && c `leChar#` 'Z'#
 819         = chr# (ord# c -# (ord# 'A'# -# ord# 'a'#))
 820   | otherwise = c
 821
 822 charError buf = lexError "error in character literal" buf
 823
 824 silly_escape_chars = [
 825         ("NUL", '\NUL'),
 826         ("SOH", '\SOH'),
 827         ("STX", '\STX'),
 828         ("ETX", '\ETX'),
 829         ("EOT", '\EOT'),
 830         ("ENQ", '\ENQ'),
 831         ("ACK", '\ACK'),
 832         ("BEL", '\BEL'),
 833         ("BS", '\BS'),
 834         ("HT", '\HT'),
 835         ("LF", '\LF'),
 836         ("VT", '\VT'),
 837         ("FF", '\FF'),
 838         ("CR", '\CR'),
 839         ("SO", '\SO'),
 840         ("SI", '\SI'),
 841         ("DLE", '\DLE'),
 842         ("DC1", '\DC1'),
 843         ("DC2", '\DC2'),
 844         ("DC3", '\DC3'),
 845         ("DC4", '\DC4'),
 846         ("NAK", '\NAK'),
 847         ("SYN", '\SYN'),
 848         ("ETB", '\ETB'),
 849         ("CAN", '\CAN'),
 850         ("EM", '\EM'),
 851         ("SUB", '\SUB'),
 852         ("ESC", '\ESC'),
 853         ("FS", '\FS'),
 854         ("GS", '\GS'),
 855         ("RS", '\RS'),
 856         ("US", '\US'),
 857         ("SP", '\SP'),
 858         ("DEL", '\DEL')
 859         ]
 860
 861 -------------------------------------------------------------------------------
 862
 863 lex_demand cont buf =
 864  case read_em [] buf of { (ls,buf') ->
 865  case currentChar# buf' of
 866    'b'# -> cont (ITstrict (StrictSig (mkTopDmdType ls BotRes))) (incLexeme buf')
 867    'm'# -> cont (ITstrict (StrictSig (mkTopDmdType ls RetCPR))) (incLexeme buf')
 868    _    -> cont (ITstrict (StrictSig (mkTopDmdType ls TopRes))) buf'
 869  }
 870  where
 871   read_em acc buf =
 872    case currentChar# buf of
 873     'T'# -> read_em (Top     : acc) (stepOn buf)
 874     'L'# -> read_em (lazyDmd : acc) (stepOn buf)
 875     'A'# -> read_em (Abs     : acc) (stepOn buf)
 876     'V'# -> read_em (evalDmd : acc) (stepOn buf)        -- Temporary, until
 877                                                         -- we've recompiled prelude etc
 878     'C'# -> do_unary Call  acc (stepOnBy# buf 2#)       -- Skip 'C('
 879
 880     'U'# -> do_seq1 Eval         acc (stepOnBy# buf 1#)
 881     'D'# -> do_seq1 Defer        acc (stepOnBy# buf 1#)
 882     'S'# -> do_seq1 (Box . Eval) acc (stepOnBy# buf 1#)
 883
 884     _    -> (reverse acc, buf)
 885
 886   do_seq1 fn acc buf
 887     = case currentChar# buf of
 888         '('# -> do_seq2 fn acc (stepOnBy# buf 1#)
 889         _    -> read_em (fn (Poly Abs) : acc) buf
 890
 891   do_seq2 fn acc buf
 892     = case read_em [] buf of { (dmds, buf) ->
 893       case currentChar# buf of
 894         ')'# -> read_em (fn (Prod dmds) : acc)
 895                         (stepOn buf)
 896         '*'# -> ASSERT( length dmds == 1 )
 897                 read_em (fn (Poly (head dmds)) : acc)
 898                         (stepOnBy# buf 2#)      -- Skip '*)'
 899       }
 900
 901   do_unary fn acc buf
 902     = case read_em [] buf of
 903         ([dmd], rest) -> read_em (fn dmd : acc) (stepOn rest)   -- Skip ')'
 904
 905 ------------------
 906 lex_scc cont buf =
 907  case currentChar# buf of
 908   'C'# -> cont ITsccAllCafs (incLexeme buf)
 909   other -> cont ITscc buf
 910
 911 -----------------------------------------------------------------------------
 912 -- Numbers
 913
 914 lex_num :: (Token -> P a) -> Int# -> Integer -> P a
 915 lex_num cont exts acc buf =
 916  case scanNumLit acc buf of
 917      (acc',buf') ->
 918        case currentChar# buf' of
 919          '.'# | is_digit (lookAhead# buf' 1#) ->
 920              -- this case is not optimised at all, as the
 921              -- presence of floating point numbers in interface
 922              -- files is not that common. (ToDo)
 923             case expandWhile# is_digit (incLexeme buf') of
 924               buf2 -> -- points to first non digit char
 925
 926                 let l = case currentChar# buf2 of
 927                           'E'# -> do_exponent
 928                           'e'# -> do_exponent
 929                           _ -> buf2
 930
 931                     do_exponent
 932                         = let buf3 = incLexeme buf2 in
 933                           case currentChar# buf3 of
 934                                 '-'# | is_digit (lookAhead# buf3 1#)
 935                                    -> expandWhile# is_digit (incLexeme buf3)
 936                                 '+'# | is_digit (lookAhead# buf3 1#)
 937                                    -> expandWhile# is_digit (incLexeme buf3)
 938                                 x | is_digit x -> expandWhile# is_digit buf3
 939                                 _ -> buf2
 940
 941                     v = readRational__ (lexemeToString l)
 942
 943                 in case currentChar# l of -- glasgow exts only
 944                       '#'# | glaExtsEnabled exts -> let l' = incLexeme l in
 945                               case currentChar# l' of
 946                                 '#'# -> cont (ITprimdouble v) (incLexeme l')
 947                                 _    -> cont (ITprimfloat  v) l'
 948                       _ -> cont (ITrational v) l
 949
 950          _ -> after_lexnum cont exts acc' buf'
 951
 952 after_lexnum cont exts i buf
 953   = case currentChar# buf of
 954         '#'# | glaExtsEnabled exts -> cont (ITprimint i) (incLexeme buf)
 955         _                          -> cont (ITinteger i) buf
 956
 957 -----------------------------------------------------------------------------
 958 -- C "literal literal"s  (i.e. things like ``NULL'', ``stdout'' etc.)
 959
 960 -- we lexemeToFastString on the bit between the ``''s, but include the
 961 -- quotes in the full lexeme.
 962
 963 lex_cstring cont buf =
 964  case expandUntilMatch (stepOverLexeme buf) "\'\'" of
 965    Just buf' -> cont (ITlitlit (lexemeToFastString
 966                                 (setCurrentPos# buf' (negateInt# 2#))))
 967                    (mergeLexemes buf buf')
 968    Nothing   -> lexError "unterminated ``" buf
 969
 970 -----------------------------------------------------------------------------
 971 -- identifiers, symbols etc.
 972
 973 lex_ip ip_constr cont buf =
 974  case expandWhile# is_ident buf of
 975    buf' -> cont (ip_constr (tailFS lexeme)) buf'
 976         where lexeme = lexemeToFastString buf'
 977
 978 lex_id cont exts buf =
 979  let buf1 = expandWhile# is_ident buf in
 980  seq buf1 $
 981
 982  case (if glaExtsEnabled exts
 983         then expandWhile# (eqChar# '#'#) buf1 -- slurp trailing hashes
 984         else buf1)                              of { buf' ->
 985
 986  let lexeme  = lexemeToFastString buf' in
 987
 988  case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
 989         Just kwd_token -> --trace ("hkeywd: "++_UNPK_(lexeme)) $
 990                           cont kwd_token buf';
 991         Nothing        ->
 992
 993  let var_token = cont (ITvarid lexeme) buf' in
 994
 995  if not (glaExtsEnabled exts)
 996    then var_token
 997    else
 998
 999  case lookupUFM ghcExtensionKeywordsFM lexeme of {
1000         Just kwd_token -> cont kwd_token buf';
1001         Nothing        -> var_token
1002
1003  }}}
1004
1005 lex_sym cont buf =
1006  -- trace "lex_sym" $
1007  case expandWhile# is_symbol buf of
1008    buf' -> case lookupUFM haskellKeySymsFM lexeme of {
1009                 Just kwd_token -> --trace ("keysym: "++unpackFS lexeme) $
1010                                   cont kwd_token buf' ;
1011                 Nothing        -> --trace ("sym: "++unpackFS lexeme) $
1012                                   cont (mk_var_token lexeme) buf'
1013            }
1014         where lexeme = lexemeToFastString buf'
1015
1016
1017 -- lex_con recursively collects components of a qualified identifer.
1018 -- The argument buf is the StringBuffer representing the lexeme
1019 -- identified so far, where the next character is upper-case.
1020
1021 lex_con cont exts buf =
1022  -- trace ("con: "{-++unpackFS lexeme-}) $
1023  let empty_buf = stepOverLexeme buf in
1024  case expandWhile# is_ident empty_buf of { buf1 ->
1025  case slurp_trailing_hashes buf1 exts of { con_buf ->
1026
1027  let all_buf = mergeLexemes buf con_buf
1028
1029      con_lexeme = lexemeToFastString con_buf
1030      mod_lexeme = lexemeToFastString (decLexeme buf)
1031      all_lexeme = lexemeToFastString all_buf
1032
1033      just_a_conid
1034         | emptyLexeme buf = cont (ITconid con_lexeme)               all_buf
1035         | otherwise       = cont (ITqconid (mod_lexeme,con_lexeme)) all_buf
1036  in
1037
1038  case currentChar# all_buf of
1039      '.'# -> maybe_qualified cont exts all_lexeme
1040                 (incLexeme all_buf) just_a_conid
1041      _    -> just_a_conid
1042   }}
1043
1044
1045 maybe_qualified cont exts mod buf just_a_conid =
1046  -- trace ("qid: "{-++unpackFS lexeme-}) $
1047  case currentChar# buf of
1048   '['# ->       -- Special case for []
1049     case lookAhead# buf 1# of
1050      ']'# -> cont (ITqconid  (mod,SLIT("[]"))) (setCurrentPos# buf 2#)
1051      _    -> just_a_conid
1052
1053   '('# ->  -- Special case for (,,,)
1054            -- This *is* necessary to deal with e.g. "instance C PrelBase.(,,)"
1055     case lookAhead# buf 1# of
1056      '#'# | glaExtsEnabled exts -> case lookAhead# buf 2# of
1057                 ','# -> lex_ubx_tuple cont mod (setCurrentPos# buf 3#)
1058                                 just_a_conid
1059                 _    -> just_a_conid
1060      ')'# -> cont (ITqconid (mod,SLIT("()"))) (setCurrentPos# buf 2#)
1061      ','# -> lex_tuple cont mod (setCurrentPos# buf 2#) just_a_conid
1062      _    -> just_a_conid
1063
1064   '-'# -> case lookAhead# buf 1# of
1065             '>'# -> cont (ITqconid (mod,SLIT("(->)"))) (setCurrentPos# buf 2#)
1066             _    -> lex_id3 cont exts mod buf just_a_conid
1067
1068   _    -> lex_id3 cont exts mod buf just_a_conid
1069
1070
1071 lex_id3 cont exts mod buf just_a_conid
1072   | is_upper (currentChar# buf) =
1073      lex_con cont exts buf
1074
1075   | is_symbol (currentChar# buf) =
1076      let
1077         start_new_lexeme = stepOverLexeme buf
1078      in
1079      -- trace ("lex_id31 "{-++unpackFS lexeme-}) $
1080      case expandWhile# is_symbol start_new_lexeme of { buf' ->
1081      let
1082        lexeme  = lexemeToFastString buf'
1083         -- real lexeme is M.<sym>
1084        new_buf = mergeLexemes buf buf'
1085      in
1086      cont (mk_qvar_token mod lexeme) new_buf
1087         -- wrong, but arguably morally right: M... is now a qvarsym
1088      }
1089
1090   | otherwise   =
1091      let
1092         start_new_lexeme = stepOverLexeme buf
1093      in
1094      -- trace ("lex_id32 "{-++unpackFS lexeme-}) $
1095      case expandWhile# is_ident start_new_lexeme of { buf1 ->
1096      if emptyLexeme buf1
1097             then just_a_conid
1098             else
1099
1100      case slurp_trailing_hashes buf1 exts of { buf' ->
1101
1102      let
1103       lexeme      = lexemeToFastString buf'
1104       new_buf     = mergeLexemes buf buf'
1105       is_a_qvarid = cont (mk_qvar_token mod lexeme) new_buf
1106      in
1107      case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
1108             Nothing          -> is_a_qvarid ;
1109
1110             Just kwd_token | isSpecial kwd_token   -- special ids (as, qualified, hiding) shouldn't be
1111                            -> is_a_qvarid          --  recognised as keywords here.
1112                            | otherwise
1113                            -> just_a_conid         -- avoid M.where etc.
1114      }}}
1115
1116 slurp_trailing_hashes buf exts
1117   | glaExtsEnabled exts = expandWhile# (`eqChar#` '#'#) buf
1118   | otherwise           = buf
1119
1120
1121 mk_var_token pk_str
1122   | is_upper f          = ITconid pk_str
1123   | is_ident f          = ITvarid pk_str
1124   | f `eqChar#` ':'#    = ITconsym pk_str
1125   | otherwise           = ITvarsym pk_str
1126   where
1127       (C# f) = _HEAD_ pk_str
1128       -- tl     = _TAIL_ pk_str
1129
1130 mk_qvar_token m token =
1131 -- trace ("mk_qvar ") $
1132  case mk_var_token token of
1133    ITconid n  -> ITqconid  (m,n)
1134    ITvarid n  -> ITqvarid  (m,n)
1135    ITconsym n -> ITqconsym (m,n)
1136    ITvarsym n -> ITqvarsym (m,n)
1137    _          -> ITunknown (show token)
1138 \end{code}
1139
1140 ----------------------------------------------------------------------------
1141 Horrible stuff for dealing with M.(,,,)
1142
1143 \begin{code}
1144 lex_tuple cont mod buf back_off =
1145   go 2 buf
1146   where
1147    go n buf =
1148     case currentChar# buf of
1149       ','# -> go (n+1) (stepOn buf)
1150       ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Boxed n))) (stepOn buf)
1151       _    -> back_off
1152
1153 lex_ubx_tuple cont mod buf back_off =
1154   go 2 buf
1155   where
1156    go n buf =
1157     case currentChar# buf of
1158       ','# -> go (n+1) (stepOn buf)
1159       '#'# -> case lookAhead# buf 1# of
1160                 ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Unboxed n)))
1161                                  (stepOnBy# buf 2#)
1162                 _    -> back_off
1163       _    -> back_off
1164 \end{code}
1165
1166 -----------------------------------------------------------------------------
1167 'lexPragma' rips along really fast, looking for a '##-}',
1168 indicating the end of the pragma we're skipping
1169
1170 \begin{code}
1171 lexPragma cont contf inStr buf =
1172  case currentChar# buf of
1173    '#'# | inStr ==# 0# ->
1174        case lookAhead# buf 1# of { '#'# ->
1175        case lookAhead# buf 2# of { '-'# ->
1176        case lookAhead# buf 3# of { '}'# ->
1177            contf cont (lexemeToBuffer buf)
1178                       (stepOverLexeme (setCurrentPos# buf 4#));
1179         _    -> lexPragma cont contf inStr (incLexeme buf) };
1180         _    -> lexPragma cont contf inStr (incLexeme buf) };
1181         _    -> lexPragma cont contf inStr (incLexeme buf) }
1182
1183    '"'# ->
1184        let
1185         odd_slashes buf flg i# =
1186           case lookAhead# buf i# of
1187            '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
1188            _     -> flg
1189
1190         not_inStr = if inStr ==# 0# then 1# else 0#
1191        in
1192        case lookAhead# buf (negateInt# 1#) of --backwards, actually
1193          '\\'# -> -- escaping something..
1194            if odd_slashes buf True (negateInt# 2#)
1195                 then  -- odd number of slashes, " is escaped.
1196                       lexPragma cont contf inStr (incLexeme buf)
1197                 else  -- even number of slashes, \ is escaped.
1198                       lexPragma cont contf not_inStr (incLexeme buf)
1199          _ -> lexPragma cont contf not_inStr (incLexeme buf)
1200
1201    '\''# | inStr ==# 0# ->
1202         case lookAhead# buf 1# of { '"'# ->
1203         case lookAhead# buf 2# of { '\''# ->
1204            lexPragma cont contf inStr (setCurrentPos# buf 3#);
1205         _ -> lexPragma cont contf inStr (incLexeme buf) };
1206         _ -> lexPragma cont contf inStr (incLexeme buf) }
1207
1208     -- a sign that the input is ill-formed, since pragmas are
1209     -- assumed to always be properly closed (in .hi files).
1210    '\NUL'# -> trace "lexPragma: unexpected end-of-file" $
1211               cont (ITunknown "\NUL") buf
1212
1213    _ -> lexPragma cont contf inStr (incLexeme buf)
1214
1215 \end{code}
1216
1217 -----------------------------------------------------------------------------
1218
1219 \begin{code}
1220 data LayoutContext
1221   = NoLayout
1222   | Layout Int#
1223
1224 data ParseResult a
1225   = POk PState a
1226   | PFailed Message
1227
1228 data PState = PState {
1229         loc        :: SrcLoc,
1230         extsBitmap :: Int#,     -- bitmap that determines permitted extensions
1231         bol        :: Int#,
1232         atbol      :: Int#,
1233         context    :: [LayoutContext]
1234      }
1235
1236 type P a = StringBuffer         -- Input string
1237         -> PState
1238         -> ParseResult a
1239
1240 returnP   :: a -> P a
1241 returnP a buf s = POk s a
1242
1243 thenP      :: P a -> (a -> P b) -> P b
1244 m `thenP` k = \ buf s ->
1245         case m buf s of
1246                 POk s1 a -> k a buf s1
1247                 PFailed err  -> PFailed err
1248
1249 thenP_     :: P a -> P b -> P b
1250 m `thenP_` k = m `thenP` \_ -> k
1251
1252 mapP :: (a -> P b) -> [a] -> P [b]
1253 mapP f [] = returnP []
1254 mapP f (a:as) =
1255      f a `thenP` \b ->
1256      mapP f as `thenP` \bs ->
1257      returnP (b:bs)
1258
1259 failP :: String -> P a
1260 failP msg buf s = PFailed (text msg)
1261
1262 failMsgP :: Message -> P a
1263 failMsgP msg buf s = PFailed msg
1264
1265 lexError :: String -> P a
1266 lexError str buf s@PState{ loc = loc }
1267   = failMsgP (hcat [ppr loc, text ": ", text str]) buf s
1268
1269 getSrcLocP :: P SrcLoc
1270 getSrcLocP buf s@(PState{ loc = loc }) = POk s loc
1271
1272 -- use a temporary SrcLoc for the duration of the argument
1273 setSrcLocP :: SrcLoc -> P a -> P a
1274 setSrcLocP new_loc p buf s =
1275   case p buf s{ loc=new_loc } of
1276       POk _ a   -> POk s a
1277       PFailed e -> PFailed e
1278
1279 getSrcFile :: P FAST_STRING
1280 getSrcFile buf s@(PState{ loc = loc }) = POk s (srcLocFile loc)
1281
1282 pushContext :: LayoutContext -> P ()
1283 pushContext ctxt buf s@(PState{ context = ctx }) = POk s{context = ctxt:ctx} ()
1284
1285 {-
1286
1287 This special case in layoutOn is to handle layout contexts with are
1288 indented the same or less than the current context.  This is illegal
1289 according to the Haskell spec, so we have to arrange to close the
1290 current context.  eg.
1291
1292 class Foo a where
1293 class Bar a
1294
1295 after the first 'where', the sequence of events is:
1296
1297         - layout system inserts a ';' (column 0)
1298         - parser begins a new context at column 0
1299         - parser shifts ';' (legal empty declaration)
1300         - parser sees 'class': parse error (we're still in the inner context)
1301
1302 trouble is, by the time we know we need a new context, the lexer has
1303 already generated the ';'.  Hacky solution is as follows: since we
1304 know the column of the next token (it's the column number of the new
1305 context), we set the ACTUAL column number of the new context to this
1306 numer plus one.  Hence the next time the lexer is called, a '}' will
1307 be generated to close the new context straight away.  Furthermore, we
1308 have to set the atbol flag so that the ';' that the parser shifted as
1309 part of the new context is re-generated.
1310
1311 when the new context is *less* indented than the current one:
1312
1313 f = f where g = g where
1314 h = h
1315
1316         - current context: column 12.
1317         - on seeing 'h' (column 0), the layout system inserts '}'
1318         - parser starts a new context, column 0
1319         - parser sees '}', uses it to close new context
1320         - we still need to insert another '}' followed by a ';',
1321           hence the atbol trick.
1322
1323 There's also a special hack in here to deal with
1324
1325         do
1326            ....
1327            e $ do
1328            blah
1329
1330 i.e. the inner context is at the same indentation level as the outer
1331 context.  This is strictly illegal according to Haskell 98, but
1332 there's a lot of existing code using this style and it doesn't make
1333 any sense to disallow it, since empty 'do' lists don't make sense.
1334 -}
1335
1336 layoutOn :: Bool -> P ()
1337 layoutOn strict buf s@(PState{ bol = bol, context = ctx }) =
1338     let offset = lexemeIndex buf -# bol in
1339     case ctx of
1340         Layout prev_off : _
1341            | if strict then prev_off >=# offset else prev_off ># offset ->
1342                 --trace ("layout on, column: " ++  show (I# offset)) $
1343                 POk s{ context = Layout (offset +# 1#) : ctx, atbol = 1# } ()
1344         other ->
1345                 --trace ("layout on, column: " ++  show (I# offset)) $
1346                 POk s{ context = Layout offset : ctx } ()
1347
1348 layoutOff :: P ()
1349 layoutOff buf s@(PState{ context = ctx }) =
1350     POk s{ context = NoLayout:ctx } ()
1351
1352 popContext :: P ()
1353 popContext = \ buf s@(PState{ context = ctx, loc = loc }) ->
1354   case ctx of
1355         (_:tl) -> POk s{ context = tl } ()
1356         []     -> PFailed (srcParseErr buf loc)
1357
1358 {-
1359  Note that if the name of the file we're processing ends
1360  with `hi-boot', we accept it on faith as having the right
1361  version. This is done so that .hi-boot files that comes
1362  with hsc don't have to be updated before every release,
1363  *and* it allows us to share .hi-boot files with versions
1364  of hsc that don't have .hi version checking (e.g., ghc-2.10's)
1365
1366  If the version number is 0, the checking is also turned off.
1367  (needed to deal with GHC.hi only!)
1368
1369  Once we can assume we're compiling with a version of ghc that
1370  supports interface file checking, we can drop the special
1371  pleading
1372 -}
1373 checkVersion :: Maybe Integer -> P ()
1374 checkVersion mb@(Just v) buf s@(PState{loc = loc})
1375  | (v==0) || (v == fromIntegral opt_HiVersion) || opt_NoHiCheck = POk s ()
1376  | otherwise = PFailed (ifaceVersionErr mb loc ([]::[Token]){-Todo-})
1377 checkVersion mb@Nothing  buf s@(PState{loc = loc})
1378  | "hi-boot" `isSuffixOf` (_UNPK_ (srcLocFile loc)) = POk s ()
1379  | otherwise = PFailed (ifaceVersionErr mb loc ([]::[Token]){-Todo-})
1380
1381
1382 -- for reasons of efficiency, flags indicating language extensions (eg,
1383 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1384 -- integer
1385
1386 glaExtsBit, ffiBit, parrBit :: Int
1387 glaExtsBit = 0
1388 ffiBit     = 1  -- FIXME: not used yet; still part of `glaExtsBit'
1389 parrBit    = 2
1390
1391 glaExtsEnabled, ffiEnabled, parrEnabled :: Int# -> Bool
1392 glaExtsEnabled flags = testBit (I# flags) glaExtsBit
1393 ffiEnabled     flags = testBit (I# flags) ffiBit
1394 parrEnabled    flags = testBit (I# flags) parrBit
1395
1396 -- convenient record-based bitmap for the interface to the rest of the world
1397 --
1398 data ExtFlags = ExtFlags {
1399                   glasgowExtsEF :: Bool,
1400 --                ffiEF         :: Bool,  -- commented out to avoid warnings
1401                   parrEF        :: Bool   -- while not used yet
1402                 }
1403
1404 -- create a parse state
1405 --
1406 mkPState          :: SrcLoc -> ExtFlags -> PState
1407 mkPState loc exts  = PState {
1408                        loc        = loc,
1409                        extsBitmap = case bitmap of {I# bits -> bits},
1410                        bol        = 0#,
1411                        atbol      = 1#,
1412                        context    = []
1413                      }
1414                      where
1415                        bitmap =     glaExtsBit `setBitIf` glasgowExtsEF exts
1416 --                              .|. ffiBit     `setBitIf` ffiEF         exts
1417                                 .|. parrBit    `setBitIf` parrEF        exts
1418                        --
1419                        b `setBitIf` cond | cond      = bit b
1420                                          | otherwise = 0
1421
1422
1423 -----------------------------------------------------------------
1424
1425 ifaceParseErr :: StringBuffer -> SrcLoc -> Message
1426 ifaceParseErr s l
1427   = hsep [ppr l, ptext SLIT("Interface file parse error; on input `"),
1428           text (lexemeToString s), char '\'']
1429
1430 ifaceVersionErr hi_vers l toks
1431   = hsep [ppr l, ptext SLIT("Interface file version error;"),
1432           ptext SLIT("Expected"), int opt_HiVersion,
1433           ptext SLIT("found "), pp_version]
1434     where
1435      pp_version =
1436       case hi_vers of
1437         Nothing -> ptext SLIT("pre ghc-3.02 version")
1438         Just v  -> ptext SLIT("version") <+> integer v
1439
1440 -----------------------------------------------------------------------------
1441
1442 srcParseErr :: StringBuffer -> SrcLoc -> Message
1443 srcParseErr s l
1444   = hcat [ppr l,
1445           if null token
1446              then ptext SLIT(": parse error (possibly incorrect indentation)")
1447              else hcat [ptext SLIT(": parse error on input "),
1448                         char '`', text token, char '\'']
1449     ]
1450   where
1451         token = lexemeToString s
1452
1453 \end{code}