ghc/compiler/reader/Lex.lhs

   1 --------------------------------------------------------
   2 [Jan 98]
   3 There's a known bug in here:
   4
   5         If an interface file ends prematurely, Lex tries to
   6         do headFS of an empty FastString.
   7
   8 An example that provokes the error is
   9
  10         f _:_ _forall_ [a] <<<END OF FILE>>>
  11 --------------------------------------------------------
  12
  13
  14 %
  15 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1996
  16 %
  17 \section[Lexical analysis]{Lexical analysis}
  18
  19 \begin{code}
  20 module Lex (
  21
  22         isLexCon, isLexVar, isLexId, isLexSym,
  23         isLexConId, isLexConSym, isLexVarId, isLexVarSym,
  24         mkTupNameStr, ifaceParseErr,
  25
  26         -- Monad for parser
  27         IfaceToken(..), lexIface, IfM, thenIf, returnIf, getSrcLocIf,
  28         checkVersion,
  29         happyError,
  30         StringBuffer
  31
  32     ) where
  33
  34 #include "HsVersions.h"
  35
  36 import Char             (isDigit, isAlphanum, isUpper,isLower, isSpace, ord )
  37 import List             ( isSuffixOf )
  38
  39 import {-# SOURCE #-} CostCentre
  40
  41 import CmdLineOpts      ( opt_IgnoreIfacePragmas, opt_HiVersion, opt_NoHiCheck )
  42 import Demand           ( Demand(..) {- instance Read -} )
  43 import UniqFM           ( UniqFM, listToUFM, lookupUFM)
  44 import BasicTypes       ( NewOrData(..), IfaceFlavour(..) )
  45 import SrcLoc           ( SrcLoc, incSrcLine, srcLocFile )
  46
  47 import Maybes           ( MaybeErr(..) )
  48 import ErrUtils         ( ErrMsg )
  49 import Outputable
  50 import Util             ( nOfThem, panic )
  51
  52 import FastString
  53 import StringBuffer
  54 import GlaExts
  55 import ST               ( runST )
  56
  57 import PrelRead                 ( readRational__ ) -- Glasgow non-std
  58 \end{code}
  59
  60 %************************************************************************
  61 %*                                                                      *
  62 \subsection{Lexical categories}
  63 %*                                                                      *
  64 %************************************************************************
  65
  66 These functions test strings to see if they fit the lexical categories
  67 defined in the Haskell report.  Normally applied as in e.g. @isCon
  68 (getLocalName foo)@.
  69
  70 \begin{code}
  71 isLexCon, isLexVar, isLexId, isLexSym, isLexConId, isLexConSym,
  72  isLexVarId, isLexVarSym  :: FAST_STRING -> Bool
  73
  74 isLexCon cs = isLexConId  cs || isLexConSym cs
  75 isLexVar cs = isLexVarId  cs || isLexVarSym cs
  76
  77 isLexId  cs = isLexConId  cs || isLexVarId  cs
  78 isLexSym cs = isLexConSym cs || isLexVarSym cs
  79
  80 -------------
  81
  82 isLexConId cs
  83   | _NULL_ cs        = False
  84   | cs == SLIT("[]") = True
  85   | c  == '('        = True     -- (), (,), (,,), ...
  86   | otherwise        = isUpper c || isUpperISO c
  87   where
  88     c = _HEAD_ cs
  89
  90 isLexVarId cs
  91   | _NULL_ cs    = False
  92   | otherwise    = isLower c || isLowerISO c
  93   where
  94     c = _HEAD_ cs
  95
  96 isLexConSym cs
  97   | _NULL_ cs   = False
  98   | otherwise   = c  == ':'
  99                || cs == SLIT("->")
 100   where
 101     c = _HEAD_ cs
 102
 103 isLexVarSym cs
 104   | _NULL_ cs = False
 105   | otherwise = isSymbolASCII c
 106              || isSymbolISO c
 107   where
 108     c = _HEAD_ cs
 109
 110 -------------
 111 isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
 112 isSymbolISO   c = ord c `elem` (0xd7 : 0xf7 : [0xa1 .. 0xbf])
 113 isUpperISO    (C# c#) = c# `geChar#` '\xc0'# && c# `leChar#` '\xde'# && c# `neChar#` '\xd7'#
 114 --0xc0 <= oc && oc <= 0xde && oc /= 0xd7 where oc = ord c
 115 isLowerISO    (C# c#) = c# `geChar#` '\xdf'# && c# `leChar#` '\xff'# && c# `neChar#` '\xf7'#
 116 --0xdf <= oc && oc <= 0xff && oc /= 0xf7 where oc = ord c
 117 \end{code}
 118
 119
 120 %************************************************************************
 121 %*                                                                      *
 122 \subsection{Tuple strings -- ugh!}
 123 %*                                                                      *
 124 %************************************************************************
 125
 126 \begin{code}
 127 mkTupNameStr 0 = SLIT("()")
 128 mkTupNameStr 1 = panic "Name.mkTupNameStr: 1 ???"
 129 mkTupNameStr 2 = _PK_ "(,)"   -- not strictly necessary
 130 mkTupNameStr 3 = _PK_ "(,,)"  -- ditto
 131 mkTupNameStr 4 = _PK_ "(,,,)" -- ditto
 132 mkTupNameStr n = _PK_ ("(" ++ nOfThem (n-1) ',' ++ ")")
 133 \end{code}
 134
 135
 136
 137 %************************************************************************
 138 %*                                                                      *
 139 \subsection{Data types}
 140 %*                                                                      *
 141 %************************************************************************
 142
 143 The token data type, fairly un-interesting except from two constructors,
 144 @ITidinfo@ and @ITtype@, which are used to lazily lex id info (arity,
 145 strictness, unfolding etc) and types for id decls.
 146
 147 The Idea/Observation here is that the renamer needs to scan through
 148 all of an interface file before it can continue. But only a fraction
 149 of the information contained in the file turns out to be useful, so
 150 delaying as much as possible of the scanning and parsing of an
 151 interface file Makes Sense (Heap profiles of the compiler
 152 show a reduction in heap usage by at least a factor of two,
 153 post-renamer).
 154
 155 Hence, the interface file lexer spots when value declarations are
 156 being scanned and return the @ITidinfo@ and @ITtype@ constructors
 157 for the type and any other id info for that binding (unfolding, strictness
 158 etc). These constructors are applied to the result of lexing these sub-chunks.
 159
 160 The lexing of the type and id info is all done lazily, of course, so
 161 the scanning (and subsequent parsing) will be done *only* on the ids the
 162 renamer finds out that it is interested in. The rest will just be junked.
 163 Laziness, you know it makes sense :-)
 164
 165 \begin{code}
 166 data IfaceToken
 167   = ITinterface         -- keywords
 168   | ITusages
 169   | ITversions
 170   | ITexports
 171   | ITinstance_modules
 172   | ITinstances
 173   | ITfixities
 174   | ITdeclarations
 175   | ITpragmas
 176   | ITdata
 177   | ITtype
 178   | ITnewtype
 179   | ITclass
 180   | ITwhere
 181   | ITinstance
 182   | ITinfixl
 183   | ITinfixr
 184   | ITinfix
 185   | ITforall
 186   | ITbang              -- magic symbols
 187   | ITvbar
 188   | ITdcolon
 189   | ITcomma
 190   | ITdarrow
 191   | ITdotdot
 192   | ITequal
 193   | ITocurly
 194   | ITobrack
 195   | IToparen
 196   | ITrarrow
 197   | ITccurly
 198   | ITcbrack
 199   | ITcparen
 200   | ITsemi
 201   | ITvarid   FAST_STRING
 202   | ITconid   FAST_STRING
 203   | ITvarsym  FAST_STRING
 204   | ITconsym  FAST_STRING
 205   | ITqvarid  (FAST_STRING,FAST_STRING,IfaceFlavour)
 206   | ITqconid  (FAST_STRING,FAST_STRING,IfaceFlavour)
 207   | ITqvarsym (FAST_STRING,FAST_STRING,IfaceFlavour)
 208   | ITqconsym (FAST_STRING,FAST_STRING,IfaceFlavour)
 209
 210   | ITtysig StringBuffer (Maybe StringBuffer)
 211                            -- lazily return the stream of tokens for
 212                            -- the info attached to an id.
 213         -- Stuff for reading unfoldings
 214   | ITarity
 215   | ITunfold Bool               -- True <=> there's an INLINE pragma on this Id
 216   | ITstrict [Demand] | ITbottom
 217   | ITspecialise
 218   | ITlam | ITbiglam | ITcase | ITprim_case | ITlet | ITletrec | ITin | ITof
 219   | ITcoerce | ITinline | ITatsign
 220   | ITccall (Bool,Bool)         -- (is_casm, may_gc)
 221   | ITscc CostCentre
 222   | ITchar Char | ITstring FAST_STRING
 223   | ITinteger Integer | ITrational Rational
 224   | ITinteger_lit | ITfloat_lit | ITrational_lit | ITaddr_lit | ITlit_lit | ITstring_lit
 225   | ITunknown String            -- Used when the lexer can't make sense of it
 226   | ITeof                               -- end of file token
 227   deriving Text -- debugging
 228
 229 instance Text CostCentre -- cheat!
 230
 231 \end{code}
 232
 233 %************************************************************************
 234 %*                                                                      *
 235 \subsection{The lexical analyser}
 236 %*                                                                      *
 237 %************************************************************************
 238
 239 \begin{code}
 240 lexIface :: (IfaceToken -> IfM a) -> IfM a
 241 lexIface cont buf =
 242  _scc_ "Lexer"
 243 -- if bufferExhausted buf then
 244 --  []
 245 -- else
 246 --  _trace ("Lexer: "++[C# (currentChar# buf)]) $
 247   case currentChar# buf of
 248       -- whitespace and comments, ignore.
 249     ' '#  -> lexIface cont (stepOn buf)
 250     '\t'# -> lexIface cont (stepOn buf)
 251     '\n'# -> \ loc -> lexIface cont (stepOn buf) (incSrcLine loc)
 252
 253 -- Numbers and comments
 254     '-'#  ->
 255       case lookAhead# buf 1# of
 256         '-'# -> lex_comment cont (stepOnBy# buf 2#)
 257         c    ->
 258           if isDigit (C# c)
 259           then lex_num cont (negate) (ord# c -# ord# '0'#) (incLexeme (incLexeme buf))
 260           else lex_id cont buf
 261
 262 -- Leave out nested comments for now; danger of finding { and - juxtaposed by mistake?
 263 --    '{' : '-' : cs -> lex_nested_comment 1{-one seen-} cs
 264
 265     '('# ->
 266          case prefixMatch (stepOn buf) "..)" of
 267            Just buf' ->  cont ITdotdot (stepOverLexeme buf')
 268            Nothing ->
 269             case lookAhead# buf 1# of
 270               ','# -> lex_tuple cont Nothing  (stepOnBy# buf 2#)
 271               ')'# -> cont (ITconid SLIT("()")) (stepOnBy# buf 2#)
 272               _    -> cont IToparen (stepOn buf)
 273
 274     '{'# -> cont ITocurly (stepOn buf)
 275     '}'# -> cont ITccurly (stepOn buf)
 276     ')'# -> cont ITcparen (stepOn buf)
 277     '['# ->
 278       case lookAhead# buf 1# of
 279         ']'# -> cont (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 280         _    -> cont ITobrack (stepOn buf)
 281     ']'# -> cont ITcbrack (stepOn buf)
 282     ','# -> cont ITcomma  (stepOn buf)
 283     ';'#  -> cont ITsemi (stepOn buf)
 284     '\"'# -> case untilEndOfString# (stepOn buf) of
 285               buf' ->
 286                   -- the string literal does *not* include the dquotes
 287                 case lexemeToFastString buf' of
 288                  v -> cont (ITstring v) (stepOn (stepOverLexeme buf'))
 289
 290     '\''# -> --
 291              -- untilEndOfChar# extends the current lexeme until
 292              -- it hits a non-escaped single quote. The lexeme of the
 293              -- StringBuffer returned does *not* include the closing quote,
 294              -- hence we augment the lexeme and make sure to add the
 295              -- starting quote, before `read'ing the string.
 296              --
 297              case untilEndOfChar# (stepOn buf) of
 298                buf' -> case reads ('\'':lexemeToString (incLexeme buf')) of
 299                         [  (ch, rest)] -> cont (ITchar ch) (stepOverLexeme (incLexeme buf'))
 300
 301 -- ``thingy'' form for casm
 302     '`'# ->
 303             case lookAhead# buf 1# of
 304               '`'# -> lex_cstring cont (stepOnBy# buf 2#) -- remove the `` and go.
 305               _    -> lex_id cont (incLexeme buf)         -- add ` to lexeme and assume
 306                                                      -- scanning an id of some sort.
 307 -- Keywords
 308     '_'# ->
 309          case lookAhead# buf 1# of
 310            'S'# -> case lookAhead# buf 2# of
 311                     '_'# ->
 312                             lex_demand cont (stepOnUntil (not . isSpace)
 313                                             (stepOnBy# buf 3#)) -- past _S_
 314            's'# -> case prefixMatch (stepOnBy# buf 2#) "cc_" of
 315                      Just buf' -> lex_scc cont (stepOnUntil (not . isSpace) (stepOverLexeme buf'))
 316                      Nothing   -> lex_keyword cont (stepOnBy# buf 1#) -- drop the '_' and assume
 317                                                                  -- it is a keyword.
 318            _    -> lex_keyword cont (stepOn buf)
 319
 320     '\NUL'# ->
 321             if bufferExhausted (stepOn buf) then
 322                cont ITeof buf
 323             else
 324                lex_id cont buf
 325     c ->
 326         if isDigit (C# c) then
 327            lex_num cont (id) (ord# c -# ord# '0'#) (incLexeme buf)
 328         else
 329            lex_id cont buf
 330 --  where
 331 lex_comment cont buf =
 332 --   _trace ("comment: "++[C# (currentChar# buf)]) $
 333    case untilChar# buf '\n'# of {buf' -> lexIface cont (stepOverLexeme buf')}
 334
 335 ------------------
 336 lex_demand cont buf =
 337  case read_em [] buf of { (ls,buf') -> cont (ITstrict ls) (stepOverLexeme buf')}
 338  where
 339    -- code snatched from Demand.lhs
 340   read_em acc buf =
 341    case currentChar# buf of
 342     'L'# -> read_em (WwLazy False : acc) (stepOn buf)
 343     'A'# -> read_em (WwLazy True  : acc) (stepOn buf)
 344     'S'# -> read_em (WwStrict     : acc) (stepOn buf)
 345     'P'# -> read_em (WwPrim       : acc) (stepOn buf)
 346     'E'# -> read_em (WwEnum       : acc) (stepOn buf)
 347     ')'# -> (reverse acc, stepOn buf)
 348     'U'# -> do_unpack DataType True  acc (stepOnBy# buf 2#)
 349     'u'# -> do_unpack DataType False acc (stepOnBy# buf 2#)
 350     'N'# -> do_unpack NewType True  acc (stepOnBy# buf 2#)
 351     'n'# -> do_unpack NewType False acc (stepOnBy# buf 2#)
 352     _    -> (reverse acc, buf)
 353
 354   do_unpack new_or_data wrapper_unpacks acc buf
 355    = case read_em [] buf of
 356       (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
 357
 358 ------------------
 359 lex_scc cont buf =
 360  case currentChar# buf of
 361   '"'# ->
 362       -- YUCK^2
 363      case prefixMatch (stepOn buf) "NO_CC\"" of
 364       Just buf' -> cont (ITscc noCostCentre) (stepOverLexeme buf')
 365       Nothing ->
 366        case prefixMatch (stepOn buf) "CURRENT_CC\"" of
 367         Just buf' -> cont (ITscc useCurrentCostCentre) (stepOverLexeme buf')
 368         Nothing   ->
 369          case prefixMatch (stepOn buf) "OVERHEAD\"" of
 370          Just buf' -> cont (ITscc overheadCostCentre) (stepOverLexeme buf')
 371          Nothing   ->
 372           case prefixMatch (stepOn buf) "DONT_CARE\"" of
 373            Just buf' -> cont (ITscc dontCareCostCentre) (stepOverLexeme buf')
 374            Nothing   ->
 375             case prefixMatch (stepOn buf) "SUBSUMED\"" of
 376              Just buf' -> cont (ITscc subsumedCosts) (stepOverLexeme buf')
 377              Nothing ->
 378               case prefixMatch (stepOn buf) "CAFs_in_...\"" of
 379                Just buf' -> cont (ITscc preludeCafsCostCentre) (stepOverLexeme buf')
 380                Nothing ->
 381                 case prefixMatch (stepOn buf) "CC_CAFs_in_..." of
 382                  Just buf' ->
 383                   case untilChar# (stepOverLexeme buf') '\"'# of
 384                    buf'' -> cont (ITscc (mkAllCafsCC ({-module-}lexemeToFastString buf'') _NIL_)) (stepOn (stepOverLexeme buf''))
 385                  Nothing ->
 386                   case prefixMatch (stepOn buf) "DICTs_in_...\"" of
 387                    Just buf' -> cont (ITscc (preludeDictsCostCentre True)) (stepOverLexeme buf')
 388                    Nothing ->
 389                     case prefixMatch (stepOn buf) "CC_DICTs_in_..." of
 390                      Just buf' ->
 391                       case untilChar# (stepOverLexeme buf') '\"'# of
 392                        buf'' -> cont (ITscc (mkAllDictsCC (lexemeToFastString buf'') _NIL_ True))
 393                                 (stepOn (stepOverLexeme buf''))
 394                      Nothing ->
 395                       let
 396                        match_user_cc buf =
 397                         case untilChar# buf '/'# of
 398                          buf' ->
 399                           let mod_name = lexemeToFastString buf' in
 400 --                        case untilChar# (stepOn (stepOverLexeme buf')) '/'# of
 401 --                         buf'' ->
 402 --                            let grp_name = lexemeToFastString buf'' in
 403                             case untilChar# (stepOn (stepOverLexeme buf')) '\"'# of
 404                              buf'' ->
 405                                -- The label may contain arbitrary characters, so it
 406                                -- may have been escaped etc., hence we `read' it in to get
 407                                -- rid of these meta-chars in the string and then pack it (again.)
 408                                -- ToDo: do the same for module name (single quotes allowed in m-names).
 409                                -- BTW, the code in this module is totally gruesome..
 410                                let upk_label = _UNPK_ (lexemeToFastString buf'') in
 411                                case reads ('"':upk_label++"\"") of
 412                                 ((cc_label,_):_) ->
 413                                     let cc_name = _PK_ cc_label in
 414                                     (mkUserCC cc_name mod_name _NIL_{-grp_name-},
 415                                      stepOn (stepOverLexeme buf''))
 416                                 _ ->
 417                                   trace ("trouble lexing scc label: " ++ upk_label ++ " , ignoring")
 418                                   (mkUserCC _NIL_ mod_name _NIL_{-grp_name-},
 419                                    stepOn (stepOverLexeme buf''))
 420                       in
 421                       case prefixMatch (stepOn buf) "CAF:" of
 422                        Just buf' ->
 423                          case match_user_cc (stepOverLexeme buf') of
 424                           (cc, buf'') -> cont (ITscc (cafifyCC cc)) buf''
 425                        Nothing ->
 426                          case match_user_cc (stepOn buf) of
 427                           (cc, buf'') -> cont (ITscc cc) buf''
 428   c -> cont (ITunknown [C# c]) (stepOn buf)
 429
 430
 431 -----------
 432 lex_num :: (IfaceToken -> IfM a) ->
 433         (Int -> Int) -> Int# -> IfM a
 434 lex_num cont minus acc# buf =
 435 -- _trace ("lex_num: "++[C# (currentChar# buf)]) $
 436  case scanNumLit (I# acc#) buf of
 437      (acc',buf') ->
 438        case currentChar# buf' of
 439          '.'# ->
 440              -- this case is not optimised at all, as the
 441              -- presence of floating point numbers in interface
 442              -- files is not that common. (ToDo)
 443             case expandWhile (isDigit) (incLexeme buf') of
 444               buf2 -> -- points to first non digit char
 445                 let l = case currentChar# buf2 of
 446                           'e'# -> let buf3 = incLexeme buf2 in
 447                               case currentChar# buf3 of
 448                                 '-'# -> expandWhile (isDigit) (incLexeme buf3)
 449                                 _    -> expandWhile (isDigit) buf3
 450                           _ -> buf2
 451                 in let v = readRational__ (lexemeToString l) in
 452                    cont (ITrational v) (stepOverLexeme l)
 453
 454          _ -> cont (ITinteger (fromInt (minus acc'))) (stepOverLexeme buf')
 455
 456
 457
 458 ------------
 459 lex_keyword cont buf =
 460 -- _trace ("lex_keyword: "++[C# (currentChar# buf)]) $
 461  case currentChar# buf of
 462   ':'# -> case lookAhead# buf 1# of
 463             '_'# -> -- a binding, type (and other id-info) follows,
 464                     -- to make the parser ever so slightly, we push
 465                     --
 466                 lex_decl cont (stepOnBy# buf 2#)
 467             v# -> cont (ITunknown (['_',':',C# v#])) (stepOnBy# buf 2#)
 468   _ ->
 469     case expandWhile (is_kwd_char) buf of
 470      buf' ->
 471       let kw = lexemeToFastString buf' in
 472 --    _trace ("kw: "++lexemeToString buf') $
 473       case lookupUFM ifaceKeywordsFM kw of
 474        Nothing -> cont (ITunknown (_UNPK_ kw)) -- (minor) sigh
 475                   (stepOverLexeme buf')
 476        Just xx -> cont xx (stepOverLexeme buf')
 477
 478 lex_decl cont buf =
 479  case doDiscard False buf of -- spin until ;; is found
 480    buf' ->
 481       {- _trace (show (lexemeToString buf')) $ -}
 482       case currentChar# buf' of
 483        '\n'# -> -- newline, no id info.
 484            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 485                 (stepOverLexeme buf')
 486        '\r'# -> -- just to be sure for those Win* boxes..
 487            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 488                 (stepOverLexeme buf')
 489        '\NUL'# ->
 490            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 491                 (stepOverLexeme buf')
 492        c     -> -- run all over the id info
 493          case doDiscard False (stepOverLexeme buf') of -- spin until ;; is found (outside a string!)
 494            buf'' ->
 495                     --_trace ((C# c):show (lexemeToString (decLexeme buf')))  $
 496                     --_trace (show (lexemeToString (decLexeme buf''))) $
 497                     let idinfo =
 498                             if opt_IgnoreIfacePragmas then
 499                                 Nothing
 500                             else
 501                                 Just (lexemeToBuffer (decLexeme buf''))
 502                         --_trace (show is) $
 503                     in
 504                     cont (ITtysig (lexemeToBuffer (decLexeme buf')) idinfo)
 505                         (stepOverLexeme buf'')
 506
 507 -- ToDo: hammer!
 508 is_kwd_char c@(C# c#) =
 509  isAlphanum c || -- OLD: c `elem` "_@/\\"
 510  (case c# of
 511    '_'#  -> True
 512    '@'#  -> True
 513    '/'#  -> True
 514    '\\'# -> True
 515    _     -> False)
 516
 517
 518
 519 -----------
 520 lex_cstring cont buf =
 521  case expandUntilMatch buf "\'\'" of
 522    buf' -> cont (ITstring (lexemeToFastString (setCurrentPos# buf' (negateInt# 2#))))
 523            (stepOverLexeme buf')
 524
 525 -----------
 526 lex_tuple cont module_dot buf =
 527   go 2 buf
 528   where
 529    go n buf =
 530     case currentChar# buf of
 531       ','# -> go (n+1) (stepOn buf)
 532       ')'# -> end_lex_id cont module_dot (ITconid (mkTupNameStr n)) (stepOn buf)
 533       _    -> cont (ITunknown ("tuple " ++ show n)) buf
 534
 535 -- Similarly ' itself is ok inside an identifier, but not at the start
 536
 537 -- id_arr is an array of bytes, indexed by characters,
 538 -- containing 0 if the character isn't a valid character from an identifier
 539 -- and 1 if it is.  It's just a memo table for is_id_char.
 540 id_arr :: ByteArray Int
 541 id_arr =
 542  runST (
 543   newCharArray (0,255) >>= \ barr ->
 544   let
 545    loop 256# = return ()
 546    loop i# =
 547     if isAlphanum (C# (chr# i#)) || is_sym (chr# i#) then
 548        writeCharArray barr (I# i#) '\1'         >>
 549        loop (i# +# 1#)
 550     else
 551        writeCharArray barr (I# i#) '\0'         >>
 552        loop (i# +# 1#)
 553   in
 554   loop 0#                                       >>
 555   unsafeFreezeByteArray barr)
 556
 557 is_id_char (C# c#) =
 558  let
 559   ByteArray _ arr# = id_arr
 560  in
 561  case ord# (indexCharArray# arr# (ord# c#)) of
 562   0# -> False
 563   1# -> True
 564
 565 --OLD: is_id_char c@(C# c#)  = isAlphanum c || is_sym c#
 566
 567 is_sym c# =
 568  case c# of {
 569    ':'# -> True; '_'#  -> True; '\''# -> True; '!'# -> True;
 570    '#'# -> True; '$'#  -> True; '%'# -> True;
 571    '&'# -> True; '*'#  -> True; '+'#  -> True; '.'# -> True;
 572    '/'# -> True; '<'#  -> True; '='#  -> True; '>'# -> True;
 573    '?'# -> True; '\\'# -> True; '^'#  -> True; '|'# -> True;
 574    '-'# -> True; '~'#  -> True; '@'#  -> True; _    -> False }
 575
 576 --isAlphanum c || c `elem` ":_'!#$%&*+./<=>?@\\^|-~" -- ToDo: add ISOgraphic
 577
 578
 579 -- mod_arr is an array of bytes, indexed by characters,
 580 -- containing 0 if the character isn't a valid character from a module name,
 581 -- and 1 if it is.
 582 mod_arr :: ByteArray Int
 583 mod_arr =
 584  runST (
 585   newCharArray (0,255) >>= \ barr ->
 586   let
 587    loop 256# = return ()
 588    loop i# =
 589     if isAlphanum (C# (chr# i#)) || i# ==# (ord# '_'#) || i# ==# (ord# '\''#) then
 590        writeCharArray barr (I# i#) '\1'         >>
 591        loop (i# +# 1#)
 592     else
 593        writeCharArray barr (I# i#) '\0'         >>
 594        loop (i# +# 1#)
 595   in
 596   loop 0#                                       >>
 597   unsafeFreezeByteArray barr)
 598
 599
 600 is_mod_char (C# c#) =
 601  let
 602   ByteArray _ arr# = mod_arr
 603  in
 604  case ord# (indexCharArray# arr# (ord# c#)) of
 605   0# -> False
 606   1# -> True
 607
 608 --isAlphanum c || c == '_' || c== '\'' --`elem` "_'"
 609
 610 lex_id cont buf =
 611 -- _trace ("lex_id: "++[C# (currentChar# buf)]) $
 612  case expandWhile (is_mod_char) buf of
 613    buf' ->
 614     case currentChar# buf' of
 615      '.'# -> munch buf' HiFile
 616      '!'# -> munch buf' HiBootFile
 617      _    -> lex_id2 cont Nothing buf'
 618    where
 619     munch buf' hif =
 620         if not (emptyLexeme buf') then
 621 --         _trace ("lex_id: "++(C# (currentChar# (stepOverLexeme buf'))):show (lexemeToFastString buf')) $
 622            case lexemeToFastString buf' of
 623              l@(FastString u# l# ba#) -> lex_id2 cont (Just (FastString u# l# ba#, hif))
 624                                                  (stepOn (stepOverLexeme buf'))
 625         else
 626            lex_id2 cont Nothing buf'
 627
 628
 629 -- Dealt with the Module.part
 630 lex_id2 cont module_dot buf =
 631 -- _trace ("lex_id2: "++[C# (currentChar# buf)]) $
 632  case currentChar# buf of
 633
 634   '['# ->       -- Special case for []
 635     case lookAhead# buf 1# of
 636      ']'# -> end_lex_id cont module_dot (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 637      _    -> lex_id3 cont module_dot buf
 638
 639   '('# ->       -- Special case for (,,,)
 640     case lookAhead# buf 1# of
 641      ')'# -> end_lex_id cont module_dot (ITconid SLIT("()")) (stepOnBy# buf 2#)
 642      ','# -> lex_tuple cont module_dot (stepOnBy# buf 2#)
 643      _    -> lex_id3 cont module_dot buf
 644   ':'# -> lex_id3 cont module_dot (incLexeme buf)
 645   '-'# ->
 646      case module_dot of
 647        Nothing  -> lex_id3 cont module_dot buf
 648        Just ghc -> -- this should be "GHC" (current home of (->))
 649          case lookAhead# buf 1# of
 650           '>'# -> end_lex_id cont module_dot (ITconid SLIT("->"))
 651                         (stepOnBy# buf 2#)
 652           _    -> lex_id3 cont module_dot buf
 653   _    -> lex_id3 cont module_dot buf
 654
 655
 656
 657 -- Dealt with [], (), : special cases
 658
 659 lex_id3 cont module_dot buf =
 660  case expandWhile (is_id_char) buf of
 661   buf' ->
 662     case module_dot of
 663      Just _ ->
 664        end_lex_id cont module_dot (mk_var_token lexeme) new_buf
 665      Nothing ->
 666        case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of
 667          Just kwd_token -> cont kwd_token new_buf
 668          Nothing        -> cont (mk_var_token lexeme) new_buf
 669     where
 670      lexeme  = lexemeToFastString buf'
 671      new_buf = stepOverLexeme buf'
 672
 673
 674 -- Dealt with [], (), : special cases
 675 mk_var_token pk_str =
 676      let
 677       f = _HEAD_ pk_str
 678      in
 679      --
 680      -- These tests assume a non- ISO-8859-1 impl of isUpper&isLower,
 681      -- remove the second half of disjunction when using a 1.3 prelude.
 682      --
 683      if      isUpper f    then ITconid pk_str
 684      else if isLower f    then ITvarid pk_str
 685      else if f == ':'     then ITconsym pk_str
 686      else if isLowerISO f then ITvarid pk_str
 687      else if isUpperISO f then ITconid pk_str
 688      else ITvarsym pk_str
 689
 690 end_lex_id cont Nothing token buf  = cont token buf
 691 end_lex_id cont (Just (m,hif)) token buf =
 692  case token of
 693    ITconid n  -> cont (ITqconid  (m,n,hif))         buf
 694    ITvarid n  -> cont (ITqvarid  (m,n,hif))         buf
 695    ITconsym n -> cont (ITqconsym (m,n,hif))         buf
 696
 697         -- Special case for ->
 698         -- "->" by itself is a special token (ITrarrow),
 699         -- but M.-> is a ITqconid
 700    ITvarsym n |  n == SLIT("->")
 701               -> cont (ITqconsym (m,n,hif))         buf
 702
 703    ITvarsym n -> cont (ITqvarsym (m,n,hif))         buf
 704
 705 -- ITbang can't happen here I think
 706 --   ITbang     -> cont (ITqvarsym (m,SLIT("!"),hif)) buf
 707
 708    _          -> cont (ITunknown (show token))      buf
 709
 710 ------------
 711 ifaceKeywordsFM :: UniqFM IfaceToken
 712 ifaceKeywordsFM = listToUFM $
 713       map (\ (x,y) -> (_PK_ x,y))
 714        [("/\\_",                ITbiglam)
 715        ,("@_",                  ITatsign)
 716        ,("letrec_",             ITletrec)
 717        ,("interface_",          ITinterface)
 718        ,("usages_",             ITusages)
 719        ,("versions_",           ITversions)
 720        ,("exports_",            ITexports)
 721        ,("instance_modules_",   ITinstance_modules)
 722        ,("instances_",          ITinstances)
 723        ,("fixities_",           ITfixities)
 724        ,("declarations_",       ITdeclarations)
 725        ,("pragmas_",            ITpragmas)
 726        ,("forall_",             ITforall)
 727        ,("u_",                  ITunfold False)
 728        ,("U_",                  ITunfold True)
 729        ,("A_",                  ITarity)
 730        ,("P_",                  ITspecialise)
 731        ,("coerce_",             ITcoerce)
 732        ,("inline_",             ITinline)
 733        ,("bot_",                ITbottom)
 734        ,("integer_",            ITinteger_lit)
 735        ,("rational_",           ITrational_lit)
 736        ,("addr_",               ITaddr_lit)
 737        ,("float_",              ITfloat_lit)
 738        ,("string_",             ITstring_lit)
 739        ,("litlit_",             ITlit_lit)
 740        ,("ccall_",              ITccall (False, False))
 741        ,("ccall_GC_",           ITccall (False, True))
 742        ,("casm_",               ITccall (True,  False))
 743        ,("casm_GC_",            ITccall (True,  True))
 744        ]
 745
 746 haskellKeywordsFM = listToUFM $
 747       map (\ (x,y) -> (_PK_ x,y))
 748       [ ("data",                ITdata)
 749        ,("type",                ITtype)
 750        ,("newtype",             ITnewtype)
 751        ,("class",               ITclass)
 752        ,("where",               ITwhere)
 753        ,("instance",            ITinstance)
 754        ,("infixl",              ITinfixl)
 755        ,("infixr",              ITinfixr)
 756        ,("infix",               ITinfix)
 757        ,("case",                ITcase)
 758        ,("case#",               ITprim_case)
 759        ,("of",                  ITof)
 760        ,("in",                  ITin)
 761        ,("let",                 ITlet)
 762
 763        ,("->",                  ITrarrow)
 764        ,("\\",                  ITlam)
 765        ,("|",                   ITvbar)
 766        ,("!",                   ITbang)
 767        ,("=>",                  ITdarrow)
 768        ,("=",                   ITequal)
 769        ,("::",                  ITdcolon)
 770        ]
 771
 772
 773 -- doDiscard rips along really fast, looking for a double semicolon,
 774 -- indicating the end of the pragma we're skipping
 775 doDiscard inStr buf =
 776 -- _trace (show (C# (currentChar# buf))) $
 777  case currentChar# buf of
 778    ';'# ->
 779      if not inStr then
 780        case lookAhead# buf 1# of
 781         ';'# -> incLexeme (incLexeme buf)
 782         _    -> doDiscard inStr (incLexeme buf)
 783      else
 784        doDiscard inStr (incLexeme buf)
 785    '"'# ->
 786        let
 787         odd_slashes buf flg i# =
 788           case lookAhead# buf i# of
 789            '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
 790            _     -> flg
 791        in
 792        case lookAhead# buf (negateInt# 1#) of --backwards, actually
 793          '\\'# -> -- escaping something..
 794            if odd_slashes buf True (negateInt# 2#) then
 795                -- odd number of slashes, " is escaped.
 796               doDiscard inStr (incLexeme buf)
 797            else
 798                -- even number of slashes, \ is escaped.
 799               doDiscard (not inStr) (incLexeme buf)
 800          _ -> case inStr of -- forced to avoid build-up
 801                True  -> doDiscard False (incLexeme buf)
 802                False -> doDiscard True  (incLexeme buf)
 803    _ -> doDiscard inStr (incLexeme buf)
 804
 805 \end{code}
 806
 807 begin{code}
 808 my_span :: (a -> Bool) -> [a] -> ([a],[a])
 809 my_span p xs = go [] xs
 810   where
 811     go so_far (x:xs') | p x = go (x:so_far) xs'
 812     go so_far xs            = (reverse so_far, xs)
 813
 814 my_span' :: (a -> Bool) -> [a] -> ([a],Int,[a])
 815 my_span' p xs = go [] 0 xs
 816   where
 817     go so_far n (x:xs') | p x = go (x:so_far) (n+1) xs'
 818     go so_far n xs            = (reverse so_far,n, xs)
 819 end{code}
 820
 821
 822 %************************************************************************
 823 %*                                                                      *
 824 \subsection{Other utility functions
 825 %*                                                                      *
 826 %************************************************************************
 827
 828 \begin{code}
 829 type IfM a = StringBuffer       -- Input string
 830           -> SrcLoc
 831           -> MaybeErr a ErrMsg
 832
 833 returnIf   :: a -> IfM a
 834 returnIf a s l = Succeeded a
 835
 836 thenIf     :: IfM a -> (a -> IfM b) -> IfM b
 837 m `thenIf` k = \s l ->
 838         case m s l of
 839                 Succeeded a -> k a s l
 840                 Failed err  -> Failed err
 841
 842 getSrcLocIf :: IfM SrcLoc
 843 getSrcLocIf s l = Succeeded l
 844
 845 happyError :: IfM a
 846 happyError s l = Failed (ifaceParseErr l ([]::[IfaceToken]){-Todo-})
 847
 848
 849 {-
 850  Note that if the name of the file we're processing ends
 851  with `hi-boot', we accept it on faith as having the right
 852  version. This is done so that .hi-boot files that comes
 853  with hsc don't have to be updated before every release,
 854  *and* it allows us to share .hi-boot files with versions
 855  of hsc that don't have .hi version checking (e.g., ghc-2.10's)
 856
 857  If the version number is 0, the checking is also turned off.
 858  (needed to deal with GHC.hi only!)
 859
 860  Once we can assume we're compiling with a version of ghc that
 861  supports interface file checking, we can drop the special
 862  pleading
 863 -}
 864 checkVersion :: Maybe Integer -> IfM ()
 865 checkVersion mb@(Just v) s l
 866  | (v==0) || (v == fromInt opt_HiVersion) || opt_NoHiCheck = Succeeded ()
 867  | otherwise = Failed (ifaceVersionErr mb l ([]::[IfaceToken]){-Todo-})
 868 checkVersion mb@Nothing  s l
 869  | "hi-boot" `isSuffixOf` (_UNPK_ (srcLocFile l)) = Succeeded ()
 870  | otherwise = Failed (ifaceVersionErr mb l ([]::[IfaceToken]){-Todo-})
 871
 872 -----------------------------------------------------------------
 873
 874 ifaceParseErr l toks
 875   = hsep [ppr l, ptext SLIT("Interface-file parse error;"),
 876           ptext SLIT("toks="), text (show (take 10 toks))]
 877
 878 ifaceVersionErr hi_vers l toks
 879   = hsep [ppr l, ptext SLIT("Interface file version error;"),
 880           ptext SLIT("Expected"), int opt_HiVersion,
 881           ptext SLIT(" found "), pp_version]
 882     where
 883      pp_version =
 884       case hi_vers of
 885         Nothing -> ptext SLIT("pre ghc-3.02 version")
 886         Just v  -> ptext SLIT("version") <+> integer v
 887
 888 \end{code}