ghc/compiler/reader/Lex.lhs

   1 --------------------------------------------------------
   2 [Jan 98]
   3 There's a known bug in here:
   4
   5         If an interface file ends prematurely, Lex tries to
   6         do headFS of an empty FastString.
   7
   8 An example that provokes the error is
   9
  10         f _:_ _forall_ [a] <<<END OF FILE>>>
  11 --------------------------------------------------------
  12
  13
  14 %
  15 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1996
  16 %
  17 \section[Lexical analysis]{Lexical analysis}
  18
  19 \begin{code}
  20 module Lex (
  21
  22         isLexCon, isLexVar, isLexId, isLexSym,
  23         isLexConId, isLexConSym, isLexVarId, isLexVarSym,
  24         mkTupNameStr, ifaceParseErr,
  25
  26         -- Monad for parser
  27         IfaceToken(..), lexIface, IfM, thenIf, returnIf, getSrcLocIf,
  28         checkVersion,
  29         happyError,
  30         StringBuffer
  31
  32     ) where
  33
  34 #include "HsVersions.h"
  35
  36 import Char             (isDigit, isAlphanum, isUpper,isLower, isSpace, ord )
  37 import List             ( isSuffixOf )
  38
  39 import {-# SOURCE #-} CostCentre
  40
  41 import CmdLineOpts      ( opt_IgnoreIfacePragmas, opt_NoHiCheck )
  42 import Demand           ( Demand(..) {- instance Read -} )
  43 import UniqFM           ( UniqFM, listToUFM, lookupUFM)
  44 import BasicTypes       ( NewOrData(..), IfaceFlavour(..) )
  45 import SrcLoc           ( SrcLoc, incSrcLine, srcLocFile )
  46
  47 import Maybes           ( MaybeErr(..) )
  48 import ErrUtils         ( ErrMsg(..) )
  49 import Outputable
  50 import Util             ( nOfThem, panic )
  51
  52 import FastString
  53 import StringBuffer
  54 import GlaExts
  55 import ST               ( runST )
  56
  57 import PrelRead                 ( readRational__ ) -- Glasgow non-std
  58 \end{code}
  59
  60 %************************************************************************
  61 %*                                                                      *
  62 \subsection{Lexical categories}
  63 %*                                                                      *
  64 %************************************************************************
  65
  66 These functions test strings to see if they fit the lexical categories
  67 defined in the Haskell report.  Normally applied as in e.g. @isCon
  68 (getLocalName foo)@.
  69
  70 \begin{code}
  71 isLexCon, isLexVar, isLexId, isLexSym, isLexConId, isLexConSym,
  72  isLexVarId, isLexVarSym  :: FAST_STRING -> Bool
  73
  74 isLexCon cs = isLexConId  cs || isLexConSym cs
  75 isLexVar cs = isLexVarId  cs || isLexVarSym cs
  76
  77 isLexId  cs = isLexConId  cs || isLexVarId  cs
  78 isLexSym cs = isLexConSym cs || isLexVarSym cs
  79
  80 -------------
  81
  82 isLexConId cs
  83   | _NULL_ cs        = False
  84   | cs == SLIT("[]") = True
  85   | c  == '('        = True     -- (), (,), (,,), ...
  86   | otherwise        = isUpper c || isUpperISO c
  87   where
  88     c = _HEAD_ cs
  89
  90 isLexVarId cs
  91   | _NULL_ cs    = False
  92   | otherwise    = isLower c || isLowerISO c
  93   where
  94     c = _HEAD_ cs
  95
  96 isLexConSym cs
  97   | _NULL_ cs   = False
  98   | otherwise   = c  == ':'
  99                || cs == SLIT("->")
 100   where
 101     c = _HEAD_ cs
 102
 103 isLexVarSym cs
 104   | _NULL_ cs = False
 105   | otherwise = isSymbolASCII c
 106              || isSymbolISO c
 107   where
 108     c = _HEAD_ cs
 109
 110 -------------
 111 isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
 112 isSymbolISO   c = ord c `elem` (0xd7 : 0xf7 : [0xa1 .. 0xbf])
 113 isUpperISO    (C# c#) = c# `geChar#` '\xc0'# && c# `leChar#` '\xde'# && c# `neChar#` '\xd7'#
 114 --0xc0 <= oc && oc <= 0xde && oc /= 0xd7 where oc = ord c
 115 isLowerISO    (C# c#) = c# `geChar#` '\xdf'# && c# `leChar#` '\xff'# && c# `neChar#` '\xf7'#
 116 --0xdf <= oc && oc <= 0xff && oc /= 0xf7 where oc = ord c
 117 \end{code}
 118
 119
 120 %************************************************************************
 121 %*                                                                      *
 122 \subsection{Tuple strings -- ugh!}
 123 %*                                                                      *
 124 %************************************************************************
 125
 126 \begin{code}
 127 mkTupNameStr 0 = SLIT("()")
 128 mkTupNameStr 1 = panic "Name.mkTupNameStr: 1 ???"
 129 mkTupNameStr 2 = _PK_ "(,)"   -- not strictly necessary
 130 mkTupNameStr 3 = _PK_ "(,,)"  -- ditto
 131 mkTupNameStr 4 = _PK_ "(,,,)" -- ditto
 132 mkTupNameStr n = _PK_ ("(" ++ nOfThem (n-1) ',' ++ ")")
 133 \end{code}
 134
 135
 136
 137 %************************************************************************
 138 %*                                                                      *
 139 \subsection{Data types}
 140 %*                                                                      *
 141 %************************************************************************
 142
 143 The token data type, fairly un-interesting except from two constructors,
 144 @ITidinfo@ and @ITtype@, which are used to lazily lex id info (arity,
 145 strictness, unfolding etc) and types for id decls.
 146
 147 The Idea/Observation here is that the renamer needs to scan through
 148 all of an interface file before it can continue. But only a fraction
 149 of the information contained in the file turns out to be useful, so
 150 delaying as much as possible of the scanning and parsing of an
 151 interface file Makes Sense (Heap profiles of the compiler
 152 show a reduction in heap usage by at least a factor of two,
 153 post-renamer).
 154
 155 Hence, the interface file lexer spots when value declarations are
 156 being scanned and return the @ITidinfo@ and @ITtype@ constructors
 157 for the type and any other id info for that binding (unfolding, strictness
 158 etc). These constructors are applied to the result of lexing these sub-chunks.
 159
 160 The lexing of the type and id info is all done lazily, of course, so
 161 the scanning (and subsequent parsing) will be done *only* on the ids the
 162 renamer finds out that it is interested in. The rest will just be junked.
 163 Laziness, you know it makes sense :-)
 164
 165 \begin{code}
 166 data IfaceToken
 167   = ITinterface         -- keywords
 168   | ITusages
 169   | ITversions
 170   | ITexports
 171   | ITinstance_modules
 172   | ITinstances
 173   | ITfixities
 174   | ITdeclarations
 175   | ITpragmas
 176   | ITdata
 177   | ITtype
 178   | ITnewtype
 179   | ITderiving
 180   | ITclass
 181   | ITwhere
 182   | ITinstance
 183   | ITinfixl
 184   | ITinfixr
 185   | ITinfix
 186   | ITforall
 187   | ITbang              -- magic symbols
 188   | ITvbar
 189   | ITdcolon
 190   | ITcomma
 191   | ITdarrow
 192   | ITdotdot
 193   | ITequal
 194   | ITocurly
 195   | ITobrack
 196   | IToparen
 197   | ITrarrow
 198   | ITccurly
 199   | ITcbrack
 200   | ITcparen
 201   | ITsemi
 202   | ITvarid   FAST_STRING
 203   | ITconid   FAST_STRING
 204   | ITvarsym  FAST_STRING
 205   | ITconsym  FAST_STRING
 206   | ITqvarid  (FAST_STRING,FAST_STRING,IfaceFlavour)
 207   | ITqconid  (FAST_STRING,FAST_STRING,IfaceFlavour)
 208   | ITqvarsym (FAST_STRING,FAST_STRING,IfaceFlavour)
 209   | ITqconsym (FAST_STRING,FAST_STRING,IfaceFlavour)
 210
 211   | ITtysig StringBuffer (Maybe StringBuffer)
 212                            -- lazily return the stream of tokens for
 213                            -- the info attached to an id.
 214         -- Stuff for reading unfoldings
 215   | ITarity
 216   | ITunfold Bool               -- True <=> there's an INLINE pragma on this Id
 217   | ITstrict [Demand] | ITbottom
 218   | ITspecialise
 219   | ITlam | ITbiglam | ITcase | ITprim_case | ITlet | ITletrec | ITin | ITof
 220   | ITcoerce | ITinline | ITatsign
 221   | ITccall (Bool,Bool)         -- (is_casm, may_gc)
 222   | ITscc CostCentre
 223   | ITchar Char | ITstring FAST_STRING
 224   | ITinteger Integer | ITrational Rational
 225   | ITinteger_lit | ITfloat_lit | ITrational_lit | ITaddr_lit | ITlit_lit | ITstring_lit
 226   | ITunknown String            -- Used when the lexer can't make sense of it
 227   | ITeof                               -- end of file token
 228   deriving Text -- debugging
 229
 230 instance Text CostCentre -- cheat!
 231
 232 \end{code}
 233
 234 %************************************************************************
 235 %*                                                                      *
 236 \subsection{The lexical analyser}
 237 %*                                                                      *
 238 %************************************************************************
 239
 240 \begin{code}
 241 lexIface :: (IfaceToken -> IfM a) -> IfM a
 242 lexIface cont buf =
 243  _scc_ "Lexer"
 244 -- if bufferExhausted buf then
 245 --  []
 246 -- else
 247 --  _trace ("Lexer: "++[C# (currentChar# buf)]) $
 248   case currentChar# buf of
 249       -- whitespace and comments, ignore.
 250     ' '#  -> lexIface cont (stepOn buf)
 251     '\t'# -> lexIface cont (stepOn buf)
 252     '\n'# -> \ loc -> lexIface cont (stepOn buf) (incSrcLine loc)
 253
 254 -- Numbers and comments
 255     '-'#  ->
 256       case lookAhead# buf 1# of
 257         '-'# -> lex_comment cont (stepOnBy# buf 2#)
 258         c    ->
 259           if isDigit (C# c)
 260           then lex_num cont (negate) (ord# c -# ord# '0'#) (incLexeme (incLexeme buf))
 261           else lex_id cont buf
 262
 263 -- Leave out nested comments for now; danger of finding { and - juxtaposed by mistake?
 264 --    '{' : '-' : cs -> lex_nested_comment 1{-one seen-} cs
 265
 266     '('# ->
 267          case prefixMatch (stepOn buf) "..)" of
 268            Just buf' ->  cont ITdotdot (stepOverLexeme buf')
 269            Nothing ->
 270             case lookAhead# buf 1# of
 271               ','# -> lex_tuple cont Nothing  (stepOnBy# buf 2#)
 272               ')'# -> cont (ITconid SLIT("()")) (stepOnBy# buf 2#)
 273               _    -> cont IToparen (stepOn buf)
 274
 275     '{'# -> cont ITocurly (stepOn buf)
 276     '}'# -> cont ITccurly (stepOn buf)
 277     ')'# -> cont ITcparen (stepOn buf)
 278     '['# ->
 279       case lookAhead# buf 1# of
 280         ']'# -> cont (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 281         _    -> cont ITobrack (stepOn buf)
 282     ']'# -> cont ITcbrack (stepOn buf)
 283     ','# -> cont ITcomma  (stepOn buf)
 284     ';'#  -> cont ITsemi (stepOn buf)
 285     '\"'# -> case untilEndOfString# (stepOn buf) of
 286               buf' ->
 287                   -- the string literal does *not* include the dquotes
 288                 case lexemeToFastString buf' of
 289                  v -> cont (ITstring v) (stepOn (stepOverLexeme buf'))
 290
 291     '\''# -> --
 292              -- untilEndOfChar# extends the current lexeme until
 293              -- it hits a non-escaped single quote. The lexeme of the
 294              -- StringBuffer returned does *not* include the closing quote,
 295              -- hence we augment the lexeme and make sure to add the
 296              -- starting quote, before `read'ing the string.
 297              --
 298              case untilEndOfChar# (stepOn buf) of
 299                buf' -> case reads ('\'':lexemeToString (incLexeme buf')) of
 300                         [  (ch, rest)] -> cont (ITchar ch) (stepOverLexeme (incLexeme buf'))
 301
 302 -- ``thingy'' form for casm
 303     '`'# ->
 304             case lookAhead# buf 1# of
 305               '`'# -> lex_cstring cont (stepOnBy# buf 2#) -- remove the `` and go.
 306               _    -> lex_id cont (incLexeme buf)         -- add ` to lexeme and assume
 307                                                      -- scanning an id of some sort.
 308 -- Keywords
 309     '_'# ->
 310          case lookAhead# buf 1# of
 311            'S'# -> case lookAhead# buf 2# of
 312                     '_'# ->
 313                             lex_demand cont (stepOnUntil (not . isSpace)
 314                                             (stepOnBy# buf 3#)) -- past _S_
 315            's'# -> case prefixMatch (stepOnBy# buf 2#) "cc_" of
 316                      Just buf' -> lex_scc cont (stepOnUntil (not . isSpace) (stepOverLexeme buf'))
 317                      Nothing   -> lex_keyword cont (stepOnBy# buf 1#) -- drop the '_' and assume
 318                                                                  -- it is a keyword.
 319            _    -> lex_keyword cont (stepOn buf)
 320
 321     '\NUL'# ->
 322             if bufferExhausted (stepOn buf) then
 323                cont ITeof buf
 324             else
 325                lex_id cont buf
 326     c ->
 327         if isDigit (C# c) then
 328            lex_num cont (id) (ord# c -# ord# '0'#) (incLexeme buf)
 329         else
 330            lex_id cont buf
 331 --  where
 332 lex_comment cont buf =
 333 --   _trace ("comment: "++[C# (currentChar# buf)]) $
 334    case untilChar# buf '\n'# of {buf' -> lexIface cont (stepOverLexeme buf')}
 335
 336 ------------------
 337 lex_demand cont buf =
 338  case read_em [] buf of { (ls,buf') -> cont (ITstrict ls) (stepOverLexeme buf')}
 339  where
 340    -- code snatched from Demand.lhs
 341   read_em acc buf =
 342    case currentChar# buf of
 343     'L'# -> read_em (WwLazy False : acc) (stepOn buf)
 344     'A'# -> read_em (WwLazy True  : acc) (stepOn buf)
 345     'S'# -> read_em (WwStrict     : acc) (stepOn buf)
 346     'P'# -> read_em (WwPrim       : acc) (stepOn buf)
 347     'E'# -> read_em (WwEnum       : acc) (stepOn buf)
 348     ')'# -> (reverse acc, stepOn buf)
 349     'U'# -> do_unpack DataType True  acc (stepOnBy# buf 2#)
 350     'u'# -> do_unpack DataType False acc (stepOnBy# buf 2#)
 351     'N'# -> do_unpack NewType True  acc (stepOnBy# buf 2#)
 352     'n'# -> do_unpack NewType False acc (stepOnBy# buf 2#)
 353     _    -> (reverse acc, buf)
 354
 355   do_unpack new_or_data wrapper_unpacks acc buf
 356    = case read_em [] buf of
 357       (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
 358
 359 ------------------
 360 lex_scc cont buf =
 361  case currentChar# buf of
 362   '"'# ->
 363       -- YUCK^2
 364      case prefixMatch (stepOn buf) "NO_CC\"" of
 365       Just buf' -> cont (ITscc noCostCentre) (stepOverLexeme buf')
 366       Nothing ->
 367        case prefixMatch (stepOn buf) "CURRENT_CC\"" of
 368         Just buf' -> cont (ITscc useCurrentCostCentre) (stepOverLexeme buf')
 369         Nothing   ->
 370          case prefixMatch (stepOn buf) "OVERHEAD\"" of
 371          Just buf' -> cont (ITscc overheadCostCentre) (stepOverLexeme buf')
 372          Nothing   ->
 373           case prefixMatch (stepOn buf) "DONT_CARE\"" of
 374            Just buf' -> cont (ITscc dontCareCostCentre) (stepOverLexeme buf')
 375            Nothing   ->
 376             case prefixMatch (stepOn buf) "SUBSUMED\"" of
 377              Just buf' -> cont (ITscc subsumedCosts) (stepOverLexeme buf')
 378              Nothing ->
 379               case prefixMatch (stepOn buf) "CAFs_in_...\"" of
 380                Just buf' -> cont (ITscc preludeCafsCostCentre) (stepOverLexeme buf')
 381                Nothing ->
 382                 case prefixMatch (stepOn buf) "CC_CAFs_in_..." of
 383                  Just buf' ->
 384                   case untilChar# (stepOverLexeme buf') '\"'# of
 385                    buf'' -> cont (ITscc (mkAllCafsCC ({-module-}lexemeToFastString buf'') _NIL_)) (stepOn (stepOverLexeme buf''))
 386                  Nothing ->
 387                   case prefixMatch (stepOn buf) "DICTs_in_...\"" of
 388                    Just buf' -> cont (ITscc (preludeDictsCostCentre True)) (stepOverLexeme buf')
 389                    Nothing ->
 390                     case prefixMatch (stepOn buf) "CC_DICTs_in_..." of
 391                      Just buf' ->
 392                       case untilChar# (stepOverLexeme buf') '\"'# of
 393                        buf'' -> cont (ITscc (mkAllDictsCC (lexemeToFastString buf'') _NIL_ True))
 394                                 (stepOn (stepOverLexeme buf''))
 395                      Nothing ->
 396                       let
 397                        match_user_cc buf =
 398                         case untilChar# buf '/'# of
 399                          buf' ->
 400                           let mod_name = lexemeToFastString buf' in
 401 --                        case untilChar# (stepOn (stepOverLexeme buf')) '/'# of
 402 --                         buf'' ->
 403 --                            let grp_name = lexemeToFastString buf'' in
 404                             case untilChar# (stepOn (stepOverLexeme buf')) '\"'# of
 405                              buf'' ->
 406                                -- The label may contain arbitrary characters, so it
 407                                -- may have been escaped etc., hence we `read' it in to get
 408                                -- rid of these meta-chars in the string and then pack it (again.)
 409                                -- ToDo: do the same for module name (single quotes allowed in m-names).
 410                                -- BTW, the code in this module is totally gruesome..
 411                                let upk_label = _UNPK_ (lexemeToFastString buf'') in
 412                                case reads ('"':upk_label++"\"") of
 413                                 ((cc_label,_):_) ->
 414                                     let cc_name = _PK_ cc_label in
 415                                     (mkUserCC cc_name mod_name _NIL_{-grp_name-},
 416                                      stepOn (stepOverLexeme buf''))
 417                                 _ ->
 418                                   trace ("trouble lexing scc label: " ++ upk_label ++ " , ignoring")
 419                                   (mkUserCC _NIL_ mod_name _NIL_{-grp_name-},
 420                                    stepOn (stepOverLexeme buf''))
 421                       in
 422                       case prefixMatch (stepOn buf) "CAF:" of
 423                        Just buf' ->
 424                          case match_user_cc (stepOverLexeme buf') of
 425                           (cc, buf'') -> cont (ITscc (cafifyCC cc)) buf''
 426                        Nothing ->
 427                          case match_user_cc (stepOn buf) of
 428                           (cc, buf'') -> cont (ITscc cc) buf''
 429   c -> cont (ITunknown [C# c]) (stepOn buf)
 430
 431
 432 -----------
 433 lex_num :: (IfaceToken -> IfM a) ->
 434         (Int -> Int) -> Int# -> IfM a
 435 lex_num cont minus acc# buf =
 436 -- _trace ("lex_num: "++[C# (currentChar# buf)]) $
 437  case scanNumLit (I# acc#) buf of
 438      (acc',buf') ->
 439        case currentChar# buf' of
 440          '.'# ->
 441              -- this case is not optimised at all, as the
 442              -- presence of floating point numbers in interface
 443              -- files is not that common. (ToDo)
 444             case expandWhile (isDigit) (incLexeme buf') of
 445               buf2 -> -- points to first non digit char
 446                 let l = case currentChar# buf2 of
 447                           'e'# -> let buf3 = incLexeme buf2 in
 448                               case currentChar# buf3 of
 449                                 '-'# -> expandWhile (isDigit) (incLexeme buf3)
 450                                 _    -> expandWhile (isDigit) buf3
 451                           _ -> buf2
 452                 in let v = readRational__ (lexemeToString l) in
 453                    cont (ITrational v) (stepOverLexeme l)
 454
 455          _ -> cont (ITinteger (fromInt (minus acc'))) (stepOverLexeme buf')
 456
 457
 458
 459 ------------
 460 lex_keyword cont buf =
 461 -- _trace ("lex_keyword: "++[C# (currentChar# buf)]) $
 462  case currentChar# buf of
 463   ':'# -> case lookAhead# buf 1# of
 464             '_'# -> -- a binding, type (and other id-info) follows,
 465                     -- to make the parser ever so slightly, we push
 466                     --
 467                 lex_decl cont (stepOnBy# buf 2#)
 468             v# -> cont (ITunknown (['_',':',C# v#])) (stepOnBy# buf 2#)
 469   _ ->
 470     case expandWhile (is_kwd_char) buf of
 471      buf' ->
 472       let kw = lexemeToFastString buf' in
 473 --    _trace ("kw: "++lexemeToString buf') $
 474       case lookupUFM ifaceKeywordsFM kw of
 475        Nothing -> cont (ITunknown (_UNPK_ kw)) -- (minor) sigh
 476                   (stepOverLexeme buf')
 477        Just xx -> cont xx (stepOverLexeme buf')
 478
 479 lex_decl cont buf =
 480  case doDiscard False buf of -- spin until ;; is found
 481    buf' ->
 482       {- _trace (show (lexemeToString buf')) $ -}
 483       case currentChar# buf' of
 484        '\n'# -> -- newline, no id info.
 485            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 486                 (stepOverLexeme buf')
 487        '\r'# -> -- just to be sure for those Win* boxes..
 488            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 489                 (stepOverLexeme buf')
 490        '\NUL'# ->
 491            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 492                 (stepOverLexeme buf')
 493        c     -> -- run all over the id info
 494          case doDiscard False (stepOverLexeme buf') of -- spin until ;; is found (outside a string!)
 495            buf'' ->
 496                     --_trace ((C# c):show (lexemeToString (decLexeme buf')))  $
 497                     --_trace (show (lexemeToString (decLexeme buf''))) $
 498                     let idinfo =
 499                             if opt_IgnoreIfacePragmas then
 500                                 Nothing
 501                             else
 502                                 Just (lexemeToBuffer (decLexeme buf''))
 503                         --_trace (show is) $
 504                     in
 505                     cont (ITtysig (lexemeToBuffer (decLexeme buf')) idinfo)
 506                         (stepOverLexeme buf'')
 507
 508 -- ToDo: hammer!
 509 is_kwd_char c@(C# c#) =
 510  isAlphanum c || -- OLD: c `elem` "_@/\\"
 511  (case c# of
 512    '_'#  -> True
 513    '@'#  -> True
 514    '/'#  -> True
 515    '\\'# -> True
 516    _     -> False)
 517
 518
 519
 520 -----------
 521 lex_cstring cont buf =
 522  case expandUntilMatch buf "\'\'" of
 523    buf' -> cont (ITstring (lexemeToFastString (setCurrentPos# buf' (negateInt# 2#))))
 524            (stepOverLexeme buf')
 525
 526 -----------
 527 lex_tuple cont module_dot buf =
 528   go 2 buf
 529   where
 530    go n buf =
 531     case currentChar# buf of
 532       ','# -> go (n+1) (stepOn buf)
 533       ')'# -> end_lex_id cont module_dot (ITconid (mkTupNameStr n)) (stepOn buf)
 534       _    -> cont (ITunknown ("tuple " ++ show n)) buf
 535
 536 -- Similarly ' itself is ok inside an identifier, but not at the start
 537
 538 -- id_arr is an array of bytes, indexed by characters,
 539 -- containing 0 if the character isn't a valid character from an identifier
 540 -- and 1 if it is.  It's just a memo table for is_id_char.
 541 id_arr :: ByteArray Int
 542 id_arr =
 543  runST (
 544   newCharArray (0,255) >>= \ barr ->
 545   let
 546    loop 256# = return ()
 547    loop i# =
 548     if isAlphanum (C# (chr# i#)) || is_sym (chr# i#) then
 549        writeCharArray barr (I# i#) '\1'         >>
 550        loop (i# +# 1#)
 551     else
 552        writeCharArray barr (I# i#) '\0'         >>
 553        loop (i# +# 1#)
 554   in
 555   loop 0#                                       >>
 556   unsafeFreezeByteArray barr)
 557
 558 is_id_char (C# c#) =
 559  let
 560   ByteArray _ arr# = id_arr
 561  in
 562  case ord# (indexCharArray# arr# (ord# c#)) of
 563   0# -> False
 564   1# -> True
 565
 566 --OLD: is_id_char c@(C# c#)  = isAlphanum c || is_sym c#
 567
 568 is_sym c# =
 569  case c# of {
 570    ':'# -> True; '_'#  -> True; '\''# -> True; '!'# -> True;
 571    '#'# -> True; '$'#  -> True; '%'# -> True;
 572    '&'# -> True; '*'#  -> True; '+'#  -> True; '.'# -> True;
 573    '/'# -> True; '<'#  -> True; '='#  -> True; '>'# -> True;
 574    '?'# -> True; '\\'# -> True; '^'#  -> True; '|'# -> True;
 575    '-'# -> True; '~'#  -> True; '@'#  -> True; _    -> False }
 576
 577 --isAlphanum c || c `elem` ":_'!#$%&*+./<=>?@\\^|-~" -- ToDo: add ISOgraphic
 578
 579
 580 -- mod_arr is an array of bytes, indexed by characters,
 581 -- containing 0 if the character isn't a valid character from a module name,
 582 -- and 1 if it is.
 583 mod_arr :: ByteArray Int
 584 mod_arr =
 585  runST (
 586   newCharArray (0,255) >>= \ barr ->
 587   let
 588    loop 256# = return ()
 589    loop i# =
 590     if isAlphanum (C# (chr# i#)) || i# ==# (ord# '_'#) || i# ==# (ord# '\''#) then
 591        writeCharArray barr (I# i#) '\1'         >>
 592        loop (i# +# 1#)
 593     else
 594        writeCharArray barr (I# i#) '\0'         >>
 595        loop (i# +# 1#)
 596   in
 597   loop 0#                                       >>
 598   unsafeFreezeByteArray barr)
 599
 600
 601 is_mod_char (C# c#) =
 602  let
 603   ByteArray _ arr# = mod_arr
 604  in
 605  case ord# (indexCharArray# arr# (ord# c#)) of
 606   0# -> False
 607   1# -> True
 608
 609 --isAlphanum c || c == '_' || c== '\'' --`elem` "_'"
 610
 611 lex_id cont buf =
 612 -- _trace ("lex_id: "++[C# (currentChar# buf)]) $
 613  case expandWhile (is_mod_char) buf of
 614    buf' ->
 615     case currentChar# buf' of
 616      '.'# -> munch buf' HiFile
 617      '!'# -> munch buf' HiBootFile
 618      _    -> lex_id2 cont Nothing buf'
 619    where
 620     munch buf' hif =
 621         if not (emptyLexeme buf') then
 622 --         _trace ("lex_id: "++(C# (currentChar# (stepOverLexeme buf'))):show (lexemeToFastString buf')) $
 623            case lexemeToFastString buf' of
 624              l@(FastString u# l# ba#) -> lex_id2 cont (Just (FastString u# l# ba#, hif))
 625                                                  (stepOn (stepOverLexeme buf'))
 626         else
 627            lex_id2 cont Nothing buf'
 628
 629
 630 -- Dealt with the Module.part
 631 lex_id2 cont module_dot buf =
 632 -- _trace ("lex_id2: "++[C# (currentChar# buf)]) $
 633  case currentChar# buf of
 634
 635   '['# ->       -- Special case for []
 636     case lookAhead# buf 1# of
 637      ']'# -> end_lex_id cont module_dot (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 638      _    -> lex_id3 cont module_dot buf
 639
 640   '('# ->       -- Special case for (,,,)
 641     case lookAhead# buf 1# of
 642      ')'# -> end_lex_id cont module_dot (ITconid SLIT("()")) (stepOnBy# buf 2#)
 643      ','# -> lex_tuple cont module_dot (stepOnBy# buf 2#)
 644      _    -> lex_id3 cont module_dot buf
 645   ':'# -> lex_id3 cont module_dot (incLexeme buf)
 646   '-'# ->
 647      case module_dot of
 648        Nothing  -> lex_id3 cont module_dot buf
 649        Just ghc -> -- this should be "GHC" (current home of (->))
 650          case lookAhead# buf 1# of
 651           '>'# -> end_lex_id cont module_dot (ITconid SLIT("->"))
 652                         (stepOnBy# buf 2#)
 653           _    -> lex_id3 cont module_dot buf
 654   _    -> lex_id3 cont module_dot buf
 655
 656
 657
 658 -- Dealt with [], (), : special cases
 659
 660 lex_id3 cont module_dot buf =
 661  case expandWhile (is_id_char) buf of
 662   buf' ->
 663     case module_dot of
 664      Just _ ->
 665        end_lex_id cont module_dot (mk_var_token lexeme) new_buf
 666      Nothing ->
 667        case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of
 668          Just kwd_token -> cont kwd_token new_buf
 669          Nothing        -> cont (mk_var_token lexeme) new_buf
 670     where
 671      lexeme  = lexemeToFastString buf'
 672      new_buf = stepOverLexeme buf'
 673
 674
 675 -- Dealt with [], (), : special cases
 676 mk_var_token pk_str =
 677      let
 678       f = _HEAD_ pk_str
 679      in
 680      --
 681      -- These tests assume a non- ISO-8859-1 impl of isUpper&isLower,
 682      -- remove the second half of disjunction when using a 1.3 prelude.
 683      --
 684      if      isUpper f    then ITconid pk_str
 685      else if isLower f    then ITvarid pk_str
 686      else if f == ':'     then ITconsym pk_str
 687      else if isLowerISO f then ITvarid pk_str
 688      else if isUpperISO f then ITconid pk_str
 689      else ITvarsym pk_str
 690
 691 end_lex_id cont Nothing token buf  = cont token buf
 692 end_lex_id cont (Just (m,hif)) token buf =
 693  case token of
 694    ITconid n  -> cont (ITqconid  (m,n,hif))         buf
 695    ITvarid n  -> cont (ITqvarid  (m,n,hif))         buf
 696    ITconsym n -> cont (ITqconsym (m,n,hif))         buf
 697
 698         -- Special case for ->
 699         -- "->" by itself is a special token (ITrarrow),
 700         -- but M.-> is a ITqconid
 701    ITvarsym n |  n == SLIT("->")
 702               -> cont (ITqconsym (m,n,hif))         buf
 703
 704    ITvarsym n -> cont (ITqvarsym (m,n,hif))         buf
 705
 706 -- ITbang can't happen here I think
 707 --   ITbang     -> cont (ITqvarsym (m,SLIT("!"),hif)) buf
 708
 709    _          -> cont (ITunknown (show token))      buf
 710
 711 ------------
 712 ifaceKeywordsFM :: UniqFM IfaceToken
 713 ifaceKeywordsFM = listToUFM $
 714       map (\ (x,y) -> (_PK_ x,y))
 715        [("/\\_",                ITbiglam)
 716        ,("@_",                  ITatsign)
 717        ,("letrec_",             ITletrec)
 718        ,("interface_",          ITinterface)
 719        ,("usages_",             ITusages)
 720        ,("versions_",           ITversions)
 721        ,("exports_",            ITexports)
 722        ,("instance_modules_",   ITinstance_modules)
 723        ,("instances_",          ITinstances)
 724        ,("fixities_",           ITfixities)
 725        ,("declarations_",       ITdeclarations)
 726        ,("pragmas_",            ITpragmas)
 727        ,("forall_",             ITforall)
 728        ,("u_",                  ITunfold False)
 729        ,("U_",                  ITunfold True)
 730        ,("A_",                  ITarity)
 731        ,("P_",                  ITspecialise)
 732        ,("coerce_",             ITcoerce)
 733        ,("inline_",             ITinline)
 734        ,("bot_",                ITbottom)
 735        ,("integer_",            ITinteger_lit)
 736        ,("rational_",           ITrational_lit)
 737        ,("addr_",               ITaddr_lit)
 738        ,("float_",              ITfloat_lit)
 739        ,("string_",             ITstring_lit)
 740        ,("litlit_",             ITlit_lit)
 741        ,("ccall_",              ITccall (False, False))
 742        ,("ccall_GC_",           ITccall (False, True))
 743        ,("casm_",               ITccall (True,  False))
 744        ,("casm_GC_",            ITccall (True,  True))
 745        ]
 746
 747 haskellKeywordsFM = listToUFM $
 748       map (\ (x,y) -> (_PK_ x,y))
 749       [ ("data",                ITdata)
 750        ,("type",                ITtype)
 751        ,("newtype",             ITnewtype)
 752        ,("class",               ITclass)
 753        ,("where",               ITwhere)
 754        ,("instance",            ITinstance)
 755        ,("infixl",              ITinfixl)
 756        ,("infixr",              ITinfixr)
 757        ,("infix",               ITinfix)
 758        ,("case",                ITcase)
 759        ,("case#",               ITprim_case)
 760        ,("of",                  ITof)
 761        ,("in",                  ITin)
 762        ,("let",                 ITlet)
 763        ,("deriving",            ITderiving)
 764
 765        ,("->",                  ITrarrow)
 766        ,("\\",                  ITlam)
 767        ,("|",                   ITvbar)
 768        ,("!",                   ITbang)
 769        ,("=>",                  ITdarrow)
 770        ,("=",                   ITequal)
 771        ,("::",                  ITdcolon)
 772        ]
 773
 774
 775 -- doDiscard rips along really fast, looking for a double semicolon,
 776 -- indicating the end of the pragma we're skipping
 777 doDiscard inStr buf =
 778 -- _trace (show (C# (currentChar# buf))) $
 779  case currentChar# buf of
 780    ';'# ->
 781      if not inStr then
 782        case lookAhead# buf 1# of
 783         ';'# -> incLexeme (incLexeme buf)
 784         _    -> doDiscard inStr (incLexeme buf)
 785      else
 786        doDiscard inStr (incLexeme buf)
 787    '"'# ->
 788        let
 789         odd_slashes buf flg i# =
 790           case lookAhead# buf i# of
 791            '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
 792            _     -> flg
 793        in
 794        case lookAhead# buf (negateInt# 1#) of --backwards, actually
 795          '\\'# -> -- escaping something..
 796            if odd_slashes buf True (negateInt# 2#) then
 797                -- odd number of slashes, " is escaped.
 798               doDiscard inStr (incLexeme buf)
 799            else
 800                -- even number of slashes, \ is escaped.
 801               doDiscard (not inStr) (incLexeme buf)
 802          _ -> case inStr of -- forced to avoid build-up
 803                True  -> doDiscard False (incLexeme buf)
 804                False -> doDiscard True  (incLexeme buf)
 805    _ -> doDiscard inStr (incLexeme buf)
 806
 807 \end{code}
 808
 809 begin{code}
 810 my_span :: (a -> Bool) -> [a] -> ([a],[a])
 811 my_span p xs = go [] xs
 812   where
 813     go so_far (x:xs') | p x = go (x:so_far) xs'
 814     go so_far xs            = (reverse so_far, xs)
 815
 816 my_span' :: (a -> Bool) -> [a] -> ([a],Int,[a])
 817 my_span' p xs = go [] 0 xs
 818   where
 819     go so_far n (x:xs') | p x = go (x:so_far) (n+1) xs'
 820     go so_far n xs            = (reverse so_far,n, xs)
 821 end{code}
 822
 823
 824 %************************************************************************
 825 %*                                                                      *
 826 \subsection{Other utility functions
 827 %*                                                                      *
 828 %************************************************************************
 829
 830 \begin{code}
 831 type IfM a = StringBuffer       -- Input string
 832           -> SrcLoc
 833           -> MaybeErr a ErrMsg
 834
 835 returnIf   :: a -> IfM a
 836 returnIf a s l = Succeeded a
 837
 838 thenIf     :: IfM a -> (a -> IfM b) -> IfM b
 839 m `thenIf` k = \s l ->
 840         case m s l of
 841                 Succeeded a -> k a s l
 842                 Failed err  -> Failed err
 843
 844 getSrcLocIf :: IfM SrcLoc
 845 getSrcLocIf s l = Succeeded l
 846
 847 happyError :: IfM a
 848 happyError s l = Failed (ifaceParseErr l ([]::[IfaceToken]){-Todo-})
 849
 850
 851 {-
 852  Note that if the name of the file we're processing ends
 853  with `hi-boot', we accept it on faith as having the right
 854  version. This is done so that .hi-boot files that comes
 855  with hsc don't have to be updated before every release,
 856  *and* it allows us to share .hi-boot files with versions
 857  of hsc that don't have .hi version checking (e.g., ghc-2.10's)
 858
 859  If the version number is 0, the checking is also turned off.
 860  (needed to deal with GHC.hi only!)
 861
 862  Once we can assume we're compiling with a version of ghc that
 863  supports interface file checking, we can drop the special
 864  pleading
 865 -}
 866 checkVersion :: Maybe Integer -> IfM ()
 867 checkVersion mb@(Just v) s l
 868  | (v==0) || (v == PROJECTVERSION) || opt_NoHiCheck = Succeeded ()
 869  | otherwise = Failed (ifaceVersionErr mb l ([]::[IfaceToken]){-Todo-})
 870 checkVersion mb@Nothing  s l
 871  | "hi-boot" `isSuffixOf` (_UNPK_ (srcLocFile l)) = Succeeded ()
 872  | otherwise = Failed (ifaceVersionErr mb l ([]::[IfaceToken]){-Todo-})
 873
 874 -----------------------------------------------------------------
 875
 876 ifaceParseErr l toks
 877   = hsep [ppr l, ptext SLIT("Interface-file parse error;"),
 878           ptext SLIT("toks="), text (show (take 10 toks))]
 879
 880 ifaceVersionErr hi_vers l toks
 881   = hsep [ppr l, ptext SLIT("Interface file version error;"),
 882           ptext SLIT("Expected"), int PROJECTVERSION,
 883           ptext SLIT(" found "), pp_version]
 884     where
 885      pp_version =
 886       case hi_vers of
 887         Nothing -> ptext SLIT("pre ghc-3.02 version")
 888         Just v  -> ptext SLIT("version") <+> integer v
 889
 890 \end{code}