ghc/compiler/reader/Lex.lhs

   1 --------------------------------------------------------
   2 [Jan 98]
   3 There's a known bug in here:
   4
   5         If an interface file ends prematurely, Lex tries to
   6         do headFS of an empty FastString.
   7
   8 An example that provokes the error is
   9
  10         f _:_ _forall_ [a] <<<END OF FILE>>>
  11 --------------------------------------------------------
  12
  13
  14 %
  15 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1996
  16 %
  17 \section[Lexical analysis]{Lexical analysis}
  18
  19 \begin{code}
  20 module Lex (
  21
  22         isLexCon, isLexVar, isLexId, isLexSym,
  23         isLexConId, isLexConSym, isLexVarId, isLexVarSym,
  24         mkTupNameStr, ifaceParseErr,
  25
  26         -- Monad for parser
  27         IfaceToken(..), lexIface, IfM, thenIf, returnIf, getSrcLocIf,
  28         checkVersion,
  29         happyError,
  30         StringBuffer
  31
  32     ) where
  33
  34 #include "HsVersions.h"
  35
  36 import Char             (isDigit, isAlphanum, isUpper,isLower, isSpace, ord )
  37 import List             ( isSuffixOf )
  38
  39 import {-# SOURCE #-} CostCentre
  40
  41 import CmdLineOpts      ( opt_IgnoreIfacePragmas )
  42 import Demand           ( Demand(..) {- instance Read -} )
  43 import UniqFM           ( UniqFM, listToUFM, lookupUFM)
  44 import BasicTypes       ( NewOrData(..), IfaceFlavour(..) )
  45 import SrcLoc           ( SrcLoc, incSrcLine, srcLocFile )
  46
  47 import Maybes           ( MaybeErr(..) )
  48 import ErrUtils         ( ErrMsg(..) )
  49 import Outputable
  50 import Util             ( nOfThem, panic )
  51
  52 import FastString
  53 import StringBuffer
  54 import GlaExts
  55 import ST               ( runST )
  56 \end{code}
  57
  58 %************************************************************************
  59 %*                                                                      *
  60 \subsection{Lexical categories}
  61 %*                                                                      *
  62 %************************************************************************
  63
  64 These functions test strings to see if they fit the lexical categories
  65 defined in the Haskell report.  Normally applied as in e.g. @isCon
  66 (getLocalName foo)@.
  67
  68 \begin{code}
  69 isLexCon, isLexVar, isLexId, isLexSym, isLexConId, isLexConSym,
  70  isLexVarId, isLexVarSym  :: FAST_STRING -> Bool
  71
  72 isLexCon cs = isLexConId  cs || isLexConSym cs
  73 isLexVar cs = isLexVarId  cs || isLexVarSym cs
  74
  75 isLexId  cs = isLexConId  cs || isLexVarId  cs
  76 isLexSym cs = isLexConSym cs || isLexVarSym cs
  77
  78 -------------
  79
  80 isLexConId cs
  81   | _NULL_ cs        = False
  82   | cs == SLIT("[]") = True
  83   | c  == '('        = True     -- (), (,), (,,), ...
  84   | otherwise        = isUpper c || isUpperISO c
  85   where
  86     c = _HEAD_ cs
  87
  88 isLexVarId cs
  89   | _NULL_ cs    = False
  90   | otherwise    = isLower c || isLowerISO c
  91   where
  92     c = _HEAD_ cs
  93
  94 isLexConSym cs
  95   | _NULL_ cs   = False
  96   | otherwise   = c  == ':'
  97                || cs == SLIT("->")
  98   where
  99     c = _HEAD_ cs
 100
 101 isLexVarSym cs
 102   | _NULL_ cs = False
 103   | otherwise = isSymbolASCII c
 104              || isSymbolISO c
 105   where
 106     c = _HEAD_ cs
 107
 108 -------------
 109 isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
 110 isSymbolISO   c = ord c `elem` (0xd7 : 0xf7 : [0xa1 .. 0xbf])
 111 isUpperISO    (C# c#) = c# `geChar#` '\xc0'# && c# `leChar#` '\xde'# && c# `neChar#` '\xd7'#
 112 --0xc0 <= oc && oc <= 0xde && oc /= 0xd7 where oc = ord c
 113 isLowerISO    (C# c#) = c# `geChar#` '\xdf'# && c# `leChar#` '\xff'# && c# `neChar#` '\xf7'#
 114 --0xdf <= oc && oc <= 0xff && oc /= 0xf7 where oc = ord c
 115 \end{code}
 116
 117
 118 %************************************************************************
 119 %*                                                                      *
 120 \subsection{Tuple strings -- ugh!}
 121 %*                                                                      *
 122 %************************************************************************
 123
 124 \begin{code}
 125 mkTupNameStr 0 = SLIT("()")
 126 mkTupNameStr 1 = panic "Name.mkTupNameStr: 1 ???"
 127 mkTupNameStr 2 = _PK_ "(,)"   -- not strictly necessary
 128 mkTupNameStr 3 = _PK_ "(,,)"  -- ditto
 129 mkTupNameStr 4 = _PK_ "(,,,)" -- ditto
 130 mkTupNameStr n = _PK_ ("(" ++ nOfThem (n-1) ',' ++ ")")
 131 \end{code}
 132
 133
 134
 135 %************************************************************************
 136 %*                                                                      *
 137 \subsection{Data types}
 138 %*                                                                      *
 139 %************************************************************************
 140
 141 The token data type, fairly un-interesting except from two constructors,
 142 @ITidinfo@ and @ITtype@, which are used to lazily lex id info (arity,
 143 strictness, unfolding etc) and types for id decls.
 144
 145 The Idea/Observation here is that the renamer needs to scan through
 146 all of an interface file before it can continue. But only a fraction
 147 of the information contained in the file turns out to be useful, so
 148 delaying as much as possible of the scanning and parsing of an
 149 interface file Makes Sense (Heap profiles of the compiler
 150 show at a reduction in heap usage by at least a factor of two,
 151 post-renamer).
 152
 153 Hence, the interface file lexer spots when value declarations are
 154 being scanned and return the @ITidinfo@ and @ITtype@ constructors
 155 for the type and any other id info for that binding (unfolding, strictness
 156 etc). These constructors are applied to the result of lexing these sub-chunks.
 157
 158 The lexing of the type and id info is all done lazily, of course, so
 159 the scanning (and subsequent parsing) will be done *only* on the ids the
 160 renamer finds out that it is interested in. The rest will just be junked.
 161 Laziness, you know it makes sense :-)
 162
 163 \begin{code}
 164 data IfaceToken
 165   = ITinterface         -- keywords
 166   | ITusages
 167   | ITversions
 168   | ITexports
 169   | ITinstance_modules
 170   | ITinstances
 171   | ITfixities
 172   | ITdeclarations
 173   | ITpragmas
 174   | ITdata
 175   | ITtype
 176   | ITnewtype
 177   | ITderiving
 178   | ITclass
 179   | ITwhere
 180   | ITinstance
 181   | ITinfixl
 182   | ITinfixr
 183   | ITinfix
 184   | ITforall
 185   | ITbang              -- magic symbols
 186   | ITvbar
 187   | ITdcolon
 188   | ITcomma
 189   | ITdarrow
 190   | ITdotdot
 191   | ITequal
 192   | ITocurly
 193   | ITobrack
 194   | IToparen
 195   | ITrarrow
 196   | ITccurly
 197   | ITcbrack
 198   | ITcparen
 199   | ITsemi
 200   | ITvarid   FAST_STRING
 201   | ITconid   FAST_STRING
 202   | ITvarsym  FAST_STRING
 203   | ITconsym  FAST_STRING
 204   | ITqvarid  (FAST_STRING,FAST_STRING,IfaceFlavour)
 205   | ITqconid  (FAST_STRING,FAST_STRING,IfaceFlavour)
 206   | ITqvarsym (FAST_STRING,FAST_STRING,IfaceFlavour)
 207   | ITqconsym (FAST_STRING,FAST_STRING,IfaceFlavour)
 208
 209   | ITtysig StringBuffer (Maybe StringBuffer)
 210                            -- lazily return the stream of tokens for
 211                            -- the info attached to an id.
 212         -- Stuff for reading unfoldings
 213   | ITarity
 214   | ITunfold Bool               -- True <=> there's an INLINE pragma on this Id
 215   | ITstrict [Demand] | ITbottom
 216   | ITlam | ITbiglam | ITcase | ITprim_case | ITlet | ITletrec | ITin | ITof
 217   | ITcoerce_in | ITcoerce_out | ITatsign
 218   | ITccall (Bool,Bool)         -- (is_casm, may_gc)
 219   | ITscc CostCentre
 220   | ITchar Char | ITstring FAST_STRING
 221   | ITinteger Integer | ITdouble Double
 222   | ITinteger_lit | ITfloat_lit | ITrational_lit | ITaddr_lit | ITlit_lit | ITstring_lit
 223   | ITunknown String            -- Used when the lexer can't make sense of it
 224   | ITeof                               -- end of file token
 225   deriving Text -- debugging
 226
 227 instance Text CostCentre -- cheat!
 228
 229 \end{code}
 230
 231 %************************************************************************
 232 %*                                                                      *
 233 \subsection{The lexical analyser}
 234 %*                                                                      *
 235 %************************************************************************
 236
 237 \begin{code}
 238 lexIface :: (IfaceToken -> IfM a) -> IfM a
 239 lexIface cont buf =
 240  _scc_ "Lexer"
 241 -- if bufferExhausted buf then
 242 --  []
 243 -- else
 244 --  _trace ("Lexer: "++[C# (currentChar# buf)]) $
 245   case currentChar# buf of
 246       -- whitespace and comments, ignore.
 247     ' '#  -> lexIface cont (stepOn buf)
 248     '\t'# -> lexIface cont (stepOn buf)
 249     '\n'# -> \ loc -> lexIface cont (stepOn buf) (incSrcLine loc)
 250
 251 -- Numbers and comments
 252     '-'#  ->
 253       case lookAhead# buf 1# of
 254         '-'# -> lex_comment cont (stepOnBy# buf 2#)
 255         c    ->
 256           if isDigit (C# c)
 257           then lex_num cont (negate) (ord# c -# ord# '0'#) (incLexeme (incLexeme buf))
 258           else lex_id cont buf
 259
 260 -- Leave out nested comments for now; danger of finding { and - juxtaposed by mistake?
 261 --    '{' : '-' : cs -> lex_nested_comment 1{-one seen-} cs
 262
 263     '('# ->
 264          case prefixMatch (stepOn buf) "..)" of
 265            Just buf' ->  cont ITdotdot (stepOverLexeme buf')
 266            Nothing ->
 267             case lookAhead# buf 1# of
 268               ','# -> lex_tuple cont Nothing  (stepOnBy# buf 2#)
 269               ')'# -> cont (ITconid SLIT("()")) (stepOnBy# buf 2#)
 270               _    -> cont IToparen (stepOn buf)
 271
 272     '{'# -> cont ITocurly (stepOn buf)
 273     '}'# -> cont ITccurly (stepOn buf)
 274     ')'# -> cont ITcparen (stepOn buf)
 275     '['# ->
 276       case lookAhead# buf 1# of
 277         ']'# -> cont (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 278         _    -> cont ITobrack (stepOn buf)
 279     ']'# -> cont ITcbrack (stepOn buf)
 280     ','# -> cont ITcomma  (stepOn buf)
 281     ';'#  -> cont ITsemi (stepOn buf)
 282     '\"'# -> case untilEndOfString# (stepOn buf) of
 283               buf' ->
 284                   -- the string literal does *not* include the dquotes
 285                 case lexemeToFastString buf' of
 286                  v -> cont (ITstring v) (stepOn (stepOverLexeme buf'))
 287
 288     '\''# -> --
 289              -- untilEndOfChar# extends the current lexeme until
 290              -- it hits a non-escaped single quote. The lexeme of the
 291              -- StringBuffer returned does *not* include the closing quote,
 292              -- hence we augment the lexeme and make sure to add the
 293              -- starting quote, before `read'ing the string.
 294              --
 295              case untilEndOfChar# (stepOn buf) of
 296                buf' -> case reads ('\'':lexemeToString (incLexeme buf')) of
 297                         [  (ch, rest)] -> cont (ITchar ch) (stepOverLexeme (incLexeme buf'))
 298
 299 -- ``thingy'' form for casm
 300     '`'# ->
 301             case lookAhead# buf 1# of
 302               '`'# -> lex_cstring cont (stepOnBy# buf 2#) -- remove the `` and go.
 303               _    -> lex_id cont (incLexeme buf)         -- add ` to lexeme and assume
 304                                                      -- scanning an id of some sort.
 305 -- Keywords
 306     '_'# ->
 307          case lookAhead# buf 1# of
 308            'S'# -> case lookAhead# buf 2# of
 309                     '_'# ->
 310                             lex_demand cont (stepOnUntil (not . isSpace)
 311                                             (stepOnBy# buf 3#)) -- past _S_
 312            's'# -> case prefixMatch (stepOnBy# buf 2#) "cc_" of
 313                      Just buf' -> lex_scc cont (stepOnUntil (not . isSpace) (stepOverLexeme buf'))
 314                      Nothing   -> lex_keyword cont (stepOnBy# buf 1#) -- drop the '_' and assume
 315                                                                  -- it is a keyword.
 316            _    -> lex_keyword cont (stepOn buf)
 317
 318     '\NUL'# ->
 319             if bufferExhausted (stepOn buf) then
 320                cont ITeof buf
 321             else
 322                lex_id cont buf
 323     c ->
 324         if isDigit (C# c) then
 325            lex_num cont (id) (ord# c -# ord# '0'#) (incLexeme buf)
 326         else
 327            lex_id cont buf
 328 --  where
 329 lex_comment cont buf =
 330 --   _trace ("comment: "++[C# (currentChar# buf)]) $
 331    case untilChar# buf '\n'# of {buf' -> lexIface cont (stepOverLexeme buf')}
 332
 333 ------------------
 334 lex_demand cont buf =
 335 -- _trace ("demand: "++[C# (currentChar# buf)]) $
 336  case read_em [] buf of { (ls,buf') -> cont (ITstrict ls) (stepOverLexeme buf')}
 337  where
 338    -- code snatched from Demand.lhs
 339   read_em acc buf =
 340 --   _trace ("read_em: "++[C# (currentChar# buf)]) $
 341    case currentChar# buf of
 342     'L'# -> read_em (WwLazy False : acc) (stepOn buf)
 343     'A'# -> read_em (WwLazy True  : acc) (stepOn buf)
 344     'S'# -> read_em (WwStrict     : acc) (stepOn buf)
 345     'P'# -> read_em (WwPrim       : acc) (stepOn buf)
 346     'E'# -> read_em (WwEnum       : acc) (stepOn buf)
 347     ')'# -> (reverse acc, stepOn buf)
 348     'U'# -> do_unpack DataType True  acc (stepOnBy# buf 2#)
 349     'u'# -> do_unpack DataType False acc (stepOnBy# buf 2#)
 350     'N'# -> do_unpack NewType True  acc (stepOnBy# buf 2#)
 351     'n'# -> do_unpack NewType False acc (stepOnBy# buf 2#)
 352     _    -> (reverse acc, buf)
 353
 354   do_unpack new_or_data wrapper_unpacks acc buf
 355    = case read_em [] buf of
 356       (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
 357
 358 ------------------
 359 lex_scc cont buf =
 360 -- _trace ("scc: "++[C# (currentChar# buf)]) $
 361  case currentChar# buf of
 362   '"'# ->
 363       -- YUCK^2
 364      case prefixMatch (stepOn buf) "NO_CC\"" of
 365       Just buf' -> cont (ITscc noCostCentre) (stepOverLexeme buf')
 366       Nothing ->
 367        case prefixMatch (stepOn buf) "CURRENT_CC\"" of
 368         Just buf' -> cont (ITscc useCurrentCostCentre) (stepOverLexeme buf')
 369         Nothing   ->
 370          case prefixMatch (stepOn buf) "OVERHEAD\"" of
 371          Just buf' -> cont (ITscc overheadCostCentre) (stepOverLexeme buf')
 372          Nothing   ->
 373           case prefixMatch (stepOn buf) "DONT_CARE\"" of
 374            Just buf' -> cont (ITscc dontCareCostCentre) (stepOverLexeme buf')
 375            Nothing   ->
 376             case prefixMatch (stepOn buf) "SUBSUMED\"" of
 377              Just buf' -> cont (ITscc subsumedCosts) (stepOverLexeme buf')
 378              Nothing ->
 379               case prefixMatch (stepOn buf) "CAFs_in_...\"" of
 380                Just buf' -> cont (ITscc preludeCafsCostCentre) (stepOverLexeme buf')
 381                Nothing ->
 382                 case prefixMatch (stepOn buf) "CC_CAFs_in_..." of
 383                  Just buf' ->
 384                   case untilChar# (stepOverLexeme buf') '\"'# of
 385                    buf'' -> cont (ITscc (mkAllCafsCC ({-module-}lexemeToFastString buf'') _NIL_)) (stepOn (stepOverLexeme buf''))
 386                  Nothing ->
 387                   case prefixMatch (stepOn buf) "DICTs_in_...\"" of
 388                    Just buf' -> cont (ITscc (preludeDictsCostCentre True)) (stepOverLexeme buf')
 389                    Nothing ->
 390                     case prefixMatch (stepOn buf) "CC_DICTs_in_..." of
 391                      Just buf' ->
 392                       case untilChar# (stepOverLexeme buf') '\"'# of
 393                        buf'' -> cont (ITscc (mkAllDictsCC (lexemeToFastString buf'') _NIL_ True))
 394                                 (stepOn (stepOverLexeme buf''))
 395                      Nothing ->
 396                       let
 397                        match_user_cc buf =
 398                         case untilChar# buf '/'# of
 399                          buf' ->
 400                           let mod_name = lexemeToFastString buf' in
 401 --                        case untilChar# (stepOn (stepOverLexeme buf')) '/'# of
 402 --                         buf'' ->
 403 --                            let grp_name = lexemeToFastString buf'' in
 404                             case untilChar# (stepOn (stepOverLexeme buf')) '\"'# of
 405                              buf'' ->
 406                                -- The label may contain arbitrary characters, so it
 407                                -- may have been escaped etc., hence we `read' it in to get
 408                                -- rid of these meta-chars in the string and then pack it (again.)
 409                                -- ToDo: do the same for module name (single quotes allowed in m-names).
 410                                -- BTW, the code in this module is totally gruesome..
 411                                let upk_label = _UNPK_ (lexemeToFastString buf'') in
 412                                case reads ('"':upk_label++"\"") of
 413                                 ((cc_label,_):_) ->
 414                                     let cc_name = _PK_ cc_label in
 415                                     (mkUserCC cc_name mod_name _NIL_{-grp_name-},
 416                                      stepOn (stepOverLexeme buf''))
 417                                 _ ->
 418                                   trace ("trouble lexing scc label: " ++ upk_label ++ " , ignoring")
 419                                   (mkUserCC _NIL_ mod_name _NIL_{-grp_name-},
 420                                    stepOn (stepOverLexeme buf''))
 421                       in
 422                       case prefixMatch (stepOn buf) "CAF:" of
 423                        Just buf' ->
 424                          case match_user_cc (stepOverLexeme buf') of
 425                           (cc, buf'') -> cont (ITscc (cafifyCC cc)) buf''
 426                        Nothing ->
 427                          case match_user_cc (stepOn buf) of
 428                           (cc, buf'') -> cont (ITscc cc) buf''
 429   c -> cont (ITunknown [C# c]) (stepOn buf)
 430
 431
 432 -----------
 433 lex_num :: (IfaceToken -> IfM a) ->
 434         (Int -> Int) -> Int# -> IfM a
 435 lex_num cont minus acc# buf =
 436 -- _trace ("lex_num: "++[C# (currentChar# buf)]) $
 437  case scanNumLit (I# acc#) buf of
 438      (acc',buf') ->
 439        case currentChar# buf' of
 440          '.'# ->
 441              -- this case is not optimised at all, as the
 442              -- presence of floating point numbers in interface
 443              -- files is not that common. (ToDo)
 444             case expandWhile (isDigit) (incLexeme buf') of
 445               buf'' -> -- points to first non digit char
 446                 case reads (lexemeToString buf'') of
 447                   [(v,_)] -> cont (ITdouble v) (stepOverLexeme buf'')
 448          _ -> cont (ITinteger (fromInt (minus acc'))) (stepOverLexeme buf')
 449
 450 --         case reads (lexemeToString buf') of
 451 --           [(i,_)] -> cont (ITinteger i) (stepOverLexeme buf')
 452
 453 ------------
 454 lex_keyword cont buf =
 455 -- _trace ("lex_keyword: "++[C# (currentChar# buf)]) $
 456  case currentChar# buf of
 457   ':'# -> case lookAhead# buf 1# of
 458             '_'# -> -- a binding, type (and other id-info) follows,
 459                     -- to make the parser ever so slightly, we push
 460                     --
 461                 lex_decl cont (stepOnBy# buf 2#)
 462             v# -> cont (ITunknown (['_',':',C# v#])) (stepOnBy# buf 2#)
 463   _ ->
 464     case expandWhile (is_kwd_char) buf of
 465      buf' ->
 466       let kw = lexemeToFastString buf' in
 467 --    _trace ("kw: "++lexemeToString buf') $
 468       case lookupUFM ifaceKeywordsFM kw of
 469        Nothing -> cont (ITunknown (_UNPK_ kw)) -- (minor) sigh
 470                   (stepOverLexeme buf')
 471        Just xx -> cont xx (stepOverLexeme buf')
 472
 473 lex_decl cont buf =
 474  case doDiscard False buf of -- spin until ;; is found
 475    buf' ->
 476       {- _trace (show (lexemeToString buf')) $ -}
 477       case currentChar# buf' of
 478        '\n'# -> -- newline, no id info.
 479            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 480                 (stepOverLexeme buf')
 481        '\r'# -> -- just to be sure for those Win* boxes..
 482            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 483                 (stepOverLexeme buf')
 484        '\NUL'# ->
 485            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 486                 (stepOverLexeme buf')
 487        c     -> -- run all over the id info
 488          case doDiscard False (stepOverLexeme buf') of -- spin until ;; is found (outside a string!)
 489            buf'' ->
 490                     --_trace ((C# c):show (lexemeToString (decLexeme buf')))  $
 491                     --_trace (show (lexemeToString (decLexeme buf''))) $
 492                     let idinfo =
 493                             if opt_IgnoreIfacePragmas then
 494                                 Nothing
 495                             else
 496                                 Just (lexemeToBuffer (decLexeme buf''))
 497                         --_trace (show is) $
 498                     in
 499                     cont (ITtysig (lexemeToBuffer (decLexeme buf')) idinfo)
 500                         (stepOverLexeme buf'')
 501
 502 -- ToDo: hammer!
 503 is_kwd_char c@(C# c#) =
 504  isAlphanum c || -- OLD: c `elem` "_@/\\"
 505  (case c# of
 506    '_'#  -> True
 507    '@'#  -> True
 508    '/'#  -> True
 509    '\\'# -> True
 510    _     -> False)
 511
 512
 513
 514 -----------
 515 lex_cstring cont buf =
 516 -- _trace ("lex_cstring: "++[C# (currentChar# buf)]) $
 517  case expandUntilMatch buf "\'\'" of
 518    buf' -> cont (ITstring (lexemeToFastString (setCurrentPos# buf' (negateInt# 2#))))
 519            (stepOverLexeme buf')
 520
 521 -----------
 522 lex_tuple cont module_dot buf =
 523 -- _trace ("lex_tuple: "++[C# (currentChar# buf)]) $
 524   go 2 buf
 525   where
 526    go n buf =
 527     case currentChar# buf of
 528       ','# -> go (n+1) (stepOn buf)
 529       ')'# -> end_lex_id cont module_dot (ITconid (mkTupNameStr n)) (stepOn buf)
 530       _    -> cont (ITunknown ("tuple " ++ show n)) buf
 531
 532 -- Similarly ' itself is ok inside an identifier, but not at the start
 533
 534 -- id_arr is an array of bytes, indexed by characters,
 535 -- containing 0 if the character isn't a valid character from an identifier
 536 -- and 1 if it is.  It's just a memo table for is_id_char.
 537 id_arr :: ByteArray Int
 538 id_arr =
 539  runST (
 540   newCharArray (0,255) >>= \ barr ->
 541   let
 542    loop 256# = return ()
 543    loop i# =
 544     if isAlphanum (C# (chr# i#)) || is_sym (chr# i#) then
 545        writeCharArray barr (I# i#) '\1'         >>
 546        loop (i# +# 1#)
 547     else
 548        writeCharArray barr (I# i#) '\0'         >>
 549        loop (i# +# 1#)
 550   in
 551   loop 0#                                       >>
 552   unsafeFreezeByteArray barr)
 553
 554 is_id_char (C# c#) =
 555  let
 556   ByteArray _ arr# = id_arr
 557  in
 558  case ord# (indexCharArray# arr# (ord# c#)) of
 559   0# -> False
 560   1# -> True
 561
 562 --OLD: is_id_char c@(C# c#)  = isAlphanum c || is_sym c#
 563
 564 is_sym c# =
 565  case c# of {
 566    ':'# -> True; '_'#  -> True; '\''# -> True; '!'# -> True;
 567    '#'# -> True; '$'#  -> True; '%'# -> True;
 568    '&'# -> True; '*'#  -> True; '+'#  -> True; '.'# -> True;
 569    '/'# -> True; '<'#  -> True; '='#  -> True; '>'# -> True;
 570    '?'# -> True; '\\'# -> True; '^'#  -> True; '|'# -> True;
 571    '-'# -> True; '~'#  -> True; '@'#  -> True; _    -> False }
 572
 573 --isAlphanum c || c `elem` ":_'!#$%&*+./<=>?@\\^|-~" -- ToDo: add ISOgraphic
 574
 575
 576 -- mod_arr is an array of bytes, indexed by characters,
 577 -- containing 0 if the character isn't a valid character from a module name,
 578 -- and 1 if it is.
 579 mod_arr :: ByteArray Int
 580 mod_arr =
 581  runST (
 582   newCharArray (0,255) >>= \ barr ->
 583   let
 584    loop 256# = return ()
 585    loop i# =
 586     if isAlphanum (C# (chr# i#)) || i# ==# (ord# '_'#) || i# ==# (ord# '\''#) then
 587        writeCharArray barr (I# i#) '\1'         >>
 588        loop (i# +# 1#)
 589     else
 590        writeCharArray barr (I# i#) '\0'         >>
 591        loop (i# +# 1#)
 592   in
 593   loop 0#                                       >>
 594   unsafeFreezeByteArray barr)
 595
 596
 597 is_mod_char (C# c#) =
 598  let
 599   ByteArray _ arr# = mod_arr
 600  in
 601  case ord# (indexCharArray# arr# (ord# c#)) of
 602   0# -> False
 603   1# -> True
 604
 605 --isAlphanum c || c == '_' || c== '\'' --`elem` "_'"
 606
 607 lex_id cont buf =
 608 -- _trace ("lex_id: "++[C# (currentChar# buf)]) $
 609  case expandWhile (is_mod_char) buf of
 610    buf' ->
 611     case currentChar# buf' of
 612      '.'# -> munch buf' HiFile
 613      '!'# -> munch buf' HiBootFile
 614      _    -> lex_id2 cont Nothing buf'
 615    where
 616     munch buf' hif =
 617         if not (emptyLexeme buf') then
 618 --         _trace ("lex_id: "++(C# (currentChar# (stepOverLexeme buf'))):show (lexemeToFastString buf')) $
 619            case lexemeToFastString buf' of
 620              l@(FastString u# l# ba#) -> lex_id2 cont (Just (FastString u# l# ba#, hif))
 621                                                  (stepOn (stepOverLexeme buf'))
 622         else
 623            lex_id2 cont Nothing buf'
 624
 625
 626 -- Dealt with the Module.part
 627 lex_id2 cont module_dot buf =
 628 -- _trace ("lex_id2: "++[C# (currentChar# buf)]) $
 629  case currentChar# buf of
 630
 631   '['# ->       -- Special case for []
 632     case lookAhead# buf 1# of
 633      ']'# -> end_lex_id cont module_dot (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 634      _    -> lex_id3 cont module_dot buf
 635
 636   '('# ->       -- Special case for (,,,)
 637     case lookAhead# buf 1# of
 638      ')'# -> end_lex_id cont module_dot (ITconid SLIT("()")) (stepOnBy# buf 2#)
 639      ','# -> lex_tuple cont module_dot (stepOnBy# buf 2#)
 640      _    -> lex_id3 cont module_dot buf
 641   ':'# -> lex_id3 cont module_dot (incLexeme buf)
 642   '-'# ->
 643      case module_dot of
 644        Nothing  -> lex_id3 cont module_dot buf
 645        Just ghc -> -- this should be "GHC" (current home of (->))
 646          case lookAhead# buf 1# of
 647           '>'# -> end_lex_id cont module_dot (ITconid SLIT("->"))
 648                         (stepOnBy# buf 2#)
 649           _    -> lex_id3 cont module_dot buf
 650   _    -> lex_id3 cont module_dot buf
 651
 652
 653
 654 -- Dealt with [], (), : special cases
 655
 656 lex_id3 cont module_dot buf =
 657 -- _trace ("lex_id3: "++[C# (currentChar# buf)]) $
 658  case expandWhile (is_id_char) buf of
 659   buf' ->
 660     case module_dot of
 661      Just _ ->
 662        end_lex_id cont module_dot (mk_var_token lexeme) new_buf
 663      Nothing ->
 664        case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of
 665          Just kwd_token -> cont kwd_token new_buf
 666          Nothing        -> cont (mk_var_token lexeme) new_buf
 667     where
 668      lexeme  = lexemeToFastString buf'
 669      new_buf = stepOverLexeme buf'
 670
 671
 672 {- OLD:
 673 lex_id2 module_dot [] ('[': ']': cs) = end_lex_id module_dot (ITconid SLIT("[]")) cs
 674 lex_id2 module_dot [] ('(': ')': cs) = end_lex_id module_dot (ITconid SLIT("()")) cs
 675 lex_id2 module_dot [] ('(': ',': cs) = lex_tuple module_dot cs
 676 lex_id2 module_dot [] (':' : cs)     = lex_id3 module_dot [':'] cs
 677 lex_id2 module_dot xs cs             = lex_id3 module_dot xs cs
 678 -}
 679
 680 -- Dealt with [], (), : special cases
 681
 682 {-
 683 lex_id3 module_dot len_xs xs cs =
 684  case my_span' (is_id_char) cs of
 685    (xs1,len_xs1,rest) ->
 686     case module_dot of
 687      Just m  -> end_lex_id (Just m) (mk_var_token rxs) rest --OLD:_PK_ (reverse xs))) rest
 688      Nothing ->
 689       case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM rxs of
 690        Just kwd_token -> kwd_token          : lexIface rest
 691        other          -> token : lexIface cs end_lex_id Nothing (mk_var_token rxs) rest
 692     where
 693      rxs = packNChars (len_xs+len_xs1) (xs++xs1) -- OLD: _PK_ (reverse xs)
 694 -}
 695 mk_var_token pk_str =
 696      let
 697       f = _HEAD_ pk_str
 698      in
 699      --
 700      -- These tests assume a non- ISO-8859-1 impl of isUpper&isLower,
 701      -- remove the second half of disjunction when using a 1.3 prelude.
 702      --
 703      if      isUpper f    then ITconid pk_str
 704      else if isLower f    then ITvarid pk_str
 705      else if f == ':'     then ITconsym pk_str
 706      else if isLowerISO f then ITvarid pk_str
 707      else if isUpperISO f then ITconid pk_str
 708      else ITvarsym pk_str
 709
 710 {-
 711     mk_var_token xs@(f:_) | isUpper f || isUpperISO f = ITconid n
 712                           | f == ':'              = ITconsym n
 713                           | isAlpha f             = ITvarid n
 714                           | otherwise             = ITvarsym n
 715                 where
 716                       n = _PK_ xs
 717 -}
 718
 719 end_lex_id cont Nothing token buf  = cont token buf
 720 end_lex_id cont (Just (m,hif)) token buf =
 721  case token of
 722    ITconid n  -> cont (ITqconid  (m,n,hif))         buf
 723    ITvarid n  -> cont (ITqvarid  (m,n,hif))         buf
 724    ITconsym n -> cont (ITqconsym (m,n,hif))         buf
 725
 726         -- Special case for ->
 727         -- "->" by itself is a special token (ITrarrow),
 728         -- but M.-> is a ITqconid
 729    ITvarsym n |  n == SLIT("->")
 730               -> cont (ITqconsym (m,n,hif))         buf
 731
 732    ITvarsym n -> cont (ITqvarsym (m,n,hif))         buf
 733
 734 -- ITbang can't happen here I think
 735 --   ITbang     -> cont (ITqvarsym (m,SLIT("!"),hif)) buf
 736
 737    _          -> cont (ITunknown (show token))      buf
 738
 739 ------------
 740 ifaceKeywordsFM :: UniqFM IfaceToken
 741 ifaceKeywordsFM = listToUFM $
 742       map (\ (x,y) -> (_PK_ x,y))
 743        [("/\\_",                ITbiglam)
 744        ,("@_",                  ITatsign)
 745        ,("letrec_",             ITletrec)
 746        ,("interface_",          ITinterface)
 747        ,("usages_",             ITusages)
 748        ,("versions_",           ITversions)
 749        ,("exports_",            ITexports)
 750        ,("instance_modules_",   ITinstance_modules)
 751        ,("instances_",          ITinstances)
 752        ,("fixities_",           ITfixities)
 753        ,("declarations_",       ITdeclarations)
 754        ,("pragmas_",            ITpragmas)
 755        ,("forall_",             ITforall)
 756        ,("U_",                  ITunfold False)
 757        ,("U!_",                 ITunfold True)
 758        ,("A_",                  ITarity)
 759        ,("coerce_in_",          ITcoerce_in)
 760        ,("coerce_out_",         ITcoerce_out)
 761        ,("bot_",                ITbottom)
 762        ,("integer_",            ITinteger_lit)
 763        ,("rational_",           ITrational_lit)
 764        ,("addr_",               ITaddr_lit)
 765        ,("float_",              ITfloat_lit)
 766        ,("string_",             ITstring_lit)
 767        ,("litlit_",             ITlit_lit)
 768        ,("ccall_",              ITccall (False, False))
 769        ,("ccall_GC_",           ITccall (False, True))
 770        ,("casm_",               ITccall (True,  False))
 771        ,("casm_GC_",            ITccall (True,  True))
 772        ]
 773
 774 haskellKeywordsFM = listToUFM $
 775       map (\ (x,y) -> (_PK_ x,y))
 776       [ ("data",                ITdata)
 777        ,("type",                ITtype)
 778        ,("newtype",             ITnewtype)
 779        ,("class",               ITclass)
 780        ,("where",               ITwhere)
 781        ,("instance",            ITinstance)
 782        ,("infixl",              ITinfixl)
 783        ,("infixr",              ITinfixr)
 784        ,("infix",               ITinfix)
 785        ,("case",                ITcase)
 786        ,("case#",               ITprim_case)
 787        ,("of",                  ITof)
 788        ,("in",                  ITin)
 789        ,("let",                 ITlet)
 790        ,("deriving",            ITderiving)
 791
 792        ,("->",                  ITrarrow)
 793        ,("\\",                  ITlam)
 794        ,("|",                   ITvbar)
 795        ,("!",                   ITbang)
 796        ,("=>",                  ITdarrow)
 797        ,("=",                   ITequal)
 798        ,("::",                  ITdcolon)
 799        ]
 800
 801
 802 -- doDiscard rips along really fast, looking for a double semicolon,
 803 -- indicating the end of the pragma we're skipping
 804 doDiscard inStr buf =
 805 -- _trace (show (C# (currentChar# buf))) $
 806  case currentChar# buf of
 807    ';'# ->
 808      if not inStr then
 809        case lookAhead# buf 1# of
 810         ';'# -> incLexeme (incLexeme buf)
 811         _    -> doDiscard inStr (incLexeme buf)
 812      else
 813        doDiscard inStr (incLexeme buf)
 814    '"'# ->
 815        let
 816         odd_slashes buf flg i# =
 817           case lookAhead# buf i# of
 818            '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
 819            _     -> flg
 820        in
 821        case lookAhead# buf (negateInt# 1#) of --backwards, actually
 822          '\\'# -> -- escaping something..
 823            if odd_slashes buf True (negateInt# 2#) then
 824                -- odd number of slashes, " is escaped.
 825               doDiscard inStr (incLexeme buf)
 826            else
 827                -- even number of slashes, \ is escaped.
 828               doDiscard (not inStr) (incLexeme buf)
 829          _ -> case inStr of -- forced to avoid build-up
 830                True  -> doDiscard False (incLexeme buf)
 831                False -> doDiscard True  (incLexeme buf)
 832    _ -> doDiscard inStr (incLexeme buf)
 833
 834 \end{code}
 835
 836 begin{code}
 837 my_span :: (a -> Bool) -> [a] -> ([a],[a])
 838 my_span p xs = go [] xs
 839   where
 840     go so_far (x:xs') | p x = go (x:so_far) xs'
 841     go so_far xs            = (reverse so_far, xs)
 842
 843 my_span' :: (a -> Bool) -> [a] -> ([a],Int,[a])
 844 my_span' p xs = go [] 0 xs
 845   where
 846     go so_far n (x:xs') | p x = go (x:so_far) (n+1) xs'
 847     go so_far n xs            = (reverse so_far,n, xs)
 848 end{code}
 849
 850
 851 %************************************************************************
 852 %*                                                                      *
 853 \subsection{Other utility functions
 854 %*                                                                      *
 855 %************************************************************************
 856
 857 \begin{code}
 858 type IfM a = StringBuffer       -- Input string
 859           -> SrcLoc
 860           -> MaybeErr a ErrMsg
 861
 862 returnIf   :: a -> IfM a
 863 returnIf a s l = Succeeded a
 864
 865 thenIf     :: IfM a -> (a -> IfM b) -> IfM b
 866 m `thenIf` k = \s l ->
 867         case m s l of
 868                 Succeeded a -> k a s l
 869                 Failed err  -> Failed err
 870
 871 getSrcLocIf :: IfM SrcLoc
 872 getSrcLocIf s l = Succeeded l
 873
 874 happyError :: IfM a
 875 happyError s l = Failed (ifaceParseErr l ([]::[IfaceToken]){-Todo-})
 876
 877
 878 {-
 879  Note that if the file we're processing ends with `hi-boot',
 880  we accept it on faith as having the right version.
 881  This is done so that .hi-boot files  that comes with hsc
 882  don't have to be updated before every release, and it
 883  allows us to share .hi-boot files with versions of hsc
 884  that don't have .hi version checking (e.g., ghc-2.10's)
 885
 886  If the version number is 0, the checking is also turned off.
 887 -}
 888 checkVersion :: Maybe Integer -> IfM ()
 889 checkVersion mb@(Just v) s l
 890  | (v==0) || (v == PROJECTVERSION) = Succeeded ()
 891  | otherwise = Failed (ifaceVersionErr mb l ([]::[IfaceToken]){-Todo-})
 892 checkVersion mb@Nothing  s l
 893  | "hi-boot" `isSuffixOf` (_UNPK_ (srcLocFile l)) = Succeeded ()
 894  | otherwise = Failed (ifaceVersionErr mb l ([]::[IfaceToken]){-Todo-})
 895
 896 -----------------------------------------------------------------
 897
 898 ifaceParseErr l toks
 899   = hsep [ppr l, ptext SLIT("Interface-file parse error;"),
 900           ptext SLIT("toks="), text (show (take 10 toks))]
 901
 902 ifaceVersionErr hi_vers l toks
 903   = hsep [ppr l, ptext SLIT("Interface file version error;"),
 904           ptext SLIT("Expected"), int PROJECTVERSION,
 905           ptext SLIT(" found "), pp_version]
 906     where
 907      pp_version =
 908       case hi_vers of
 909         Nothing -> ptext SLIT("pre ghc-3.02 version")
 910         Just v  -> ptext SLIT("version") <+> integer v
 911
 912 \end{code}