ghc/compiler/reader/Lex.lhs

   1 --------------------------------------------------------
   2 [Jan 98]
   3 There's a known bug in here:
   4
   5         If an interface file ends prematurely, Lex tries to
   6         do headFS of an empty FastString.
   7
   8 An example that provokes the error is
   9
  10         f _:_ _forall_ [a] <<<END OF FILE>>>
  11 --------------------------------------------------------
  12
  13
  14 %
  15 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1996
  16 %
  17 \section[Lexical analysis]{Lexical analysis}
  18
  19 \begin{code}
  20 module Lex (
  21
  22         isLexCon, isLexVar, isLexId, isLexSym,
  23         isLexConId, isLexConSym, isLexVarId, isLexVarSym,
  24         mkTupNameStr, ifaceParseErr,
  25
  26         -- Monad for parser
  27         IfaceToken(..), lexIface, IfM, thenIf, returnIf, getSrcLocIf,
  28         happyError,
  29         StringBuffer
  30
  31     ) where
  32
  33 #include "HsVersions.h"
  34
  35 import Char             (isDigit, isAlpha, isAlphanum, isUpper,isLower, isSpace, ord )
  36
  37 import {-# SOURCE #-} CostCentre
  38
  39 import CmdLineOpts      ( opt_IgnoreIfacePragmas )
  40 import Demand           ( Demand(..) {- instance Read -} )
  41 import UniqFM           ( UniqFM, listToUFM, lookupUFM)
  42 import BasicTypes       ( NewOrData(..), IfaceFlavour(..) )
  43 import SrcLoc           ( SrcLoc, incSrcLine )
  44
  45 import Maybes           ( MaybeErr(..) )
  46 import ErrUtils         ( ErrMsg(..) )
  47 import Outputable
  48 import Util             ( nOfThem, panic )
  49
  50 import FastString
  51 import StringBuffer
  52 import GlaExts
  53 import ST               ( runST )
  54 \end{code}
  55
  56 %************************************************************************
  57 %*                                                                      *
  58 \subsection{Lexical categories}
  59 %*                                                                      *
  60 %************************************************************************
  61
  62 These functions test strings to see if they fit the lexical categories
  63 defined in the Haskell report.  Normally applied as in e.g. @isCon
  64 (getLocalName foo)@.
  65
  66 \begin{code}
  67 isLexCon, isLexVar, isLexId, isLexSym, isLexConId, isLexConSym,
  68  isLexVarId, isLexVarSym  :: FAST_STRING -> Bool
  69
  70 isLexCon cs = isLexConId  cs || isLexConSym cs
  71 isLexVar cs = isLexVarId  cs || isLexVarSym cs
  72
  73 isLexId  cs = isLexConId  cs || isLexVarId  cs
  74 isLexSym cs = isLexConSym cs || isLexVarSym cs
  75
  76 -------------
  77
  78 isLexConId cs
  79   | _NULL_ cs        = False
  80   | cs == SLIT("[]") = True
  81   | c  == '('        = True     -- (), (,), (,,), ...
  82   | otherwise        = isUpper c || isUpperISO c
  83   where
  84     c = _HEAD_ cs
  85
  86 isLexVarId cs
  87   | _NULL_ cs    = False
  88   | otherwise    = isLower c || isLowerISO c
  89   where
  90     c = _HEAD_ cs
  91
  92 isLexConSym cs
  93   | _NULL_ cs   = False
  94   | otherwise   = c  == ':'
  95                || cs == SLIT("->")
  96   where
  97     c = _HEAD_ cs
  98
  99 isLexVarSym cs
 100   | _NULL_ cs = False
 101   | otherwise = isSymbolASCII c
 102              || isSymbolISO c
 103   where
 104     c = _HEAD_ cs
 105
 106 -------------
 107 isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
 108 isSymbolISO   c = ord c `elem` (0xd7 : 0xf7 : [0xa1 .. 0xbf])
 109 isUpperISO    (C# c#) = c# `geChar#` '\xc0'# && c# `leChar#` '\xde'# && c# `neChar#` '\xd7'#
 110 --0xc0 <= oc && oc <= 0xde && oc /= 0xd7 where oc = ord c
 111 isLowerISO    (C# c#) = c# `geChar#` '\xdf'# && c# `leChar#` '\xff'# && c# `neChar#` '\xf7'#
 112 --0xdf <= oc && oc <= 0xff && oc /= 0xf7 where oc = ord c
 113 \end{code}
 114
 115
 116 %************************************************************************
 117 %*                                                                      *
 118 \subsection{Tuple strings -- ugh!}
 119 %*                                                                      *
 120 %************************************************************************
 121
 122 \begin{code}
 123 mkTupNameStr 0 = SLIT("()")
 124 mkTupNameStr 1 = panic "Name.mkTupNameStr: 1 ???"
 125 mkTupNameStr 2 = _PK_ "(,)"   -- not strictly necessary
 126 mkTupNameStr 3 = _PK_ "(,,)"  -- ditto
 127 mkTupNameStr 4 = _PK_ "(,,,)" -- ditto
 128 mkTupNameStr n = _PK_ ("(" ++ nOfThem (n-1) ',' ++ ")")
 129 \end{code}
 130
 131
 132
 133 %************************************************************************
 134 %*                                                                      *
 135 \subsection{Data types}
 136 %*                                                                      *
 137 %************************************************************************
 138
 139 The token data type, fairly un-interesting except from two constructors,
 140 @ITidinfo@ and @ITtype@, which are used to lazily lex id info (arity,
 141 strictness, unfolding etc) and types for id decls.
 142
 143 The Idea/Observation here is that the renamer needs to scan through
 144 all of an interface file before it can continue. But only a fraction
 145 of the information contained in the file turns out to be useful, so
 146 delaying as much as possible of the scanning and parsing of an
 147 interface file Makes Sense (Heap profiles of the compiler
 148 show at a reduction in heap usage by at least a factor of two,
 149 post-renamer).
 150
 151 Hence, the interface file lexer spots when value declarations are
 152 being scanned and return the @ITidinfo@ and @ITtype@ constructors
 153 for the type and any other id info for that binding (unfolding, strictness
 154 etc). These constructors are applied to the result of lexing these sub-chunks.
 155
 156 The lexing of the type and id info is all done lazily, of course, so
 157 the scanning (and subsequent parsing) will be done *only* on the ids the
 158 renamer finds out that it is interested in. The rest will just be junked.
 159 Laziness, you know it makes sense :-)
 160
 161 \begin{code}
 162 data IfaceToken
 163   = ITinterface         -- keywords
 164   | ITusages
 165   | ITversions
 166   | ITexports
 167   | ITinstance_modules
 168   | ITinstances
 169   | ITfixities
 170   | ITdeclarations
 171   | ITpragmas
 172   | ITdata
 173   | ITtype
 174   | ITnewtype
 175   | ITderiving
 176   | ITclass
 177   | ITwhere
 178   | ITinstance
 179   | ITinfixl
 180   | ITinfixr
 181   | ITinfix
 182   | ITforall
 183   | ITbang              -- magic symbols
 184   | ITvbar
 185   | ITdcolon
 186   | ITcomma
 187   | ITdarrow
 188   | ITdotdot
 189   | ITequal
 190   | ITocurly
 191   | ITobrack
 192   | IToparen
 193   | ITrarrow
 194   | ITccurly
 195   | ITcbrack
 196   | ITcparen
 197   | ITsemi
 198   | ITvarid   FAST_STRING
 199   | ITconid   FAST_STRING
 200   | ITvarsym  FAST_STRING
 201   | ITconsym  FAST_STRING
 202   | ITqvarid  (FAST_STRING,FAST_STRING,IfaceFlavour)
 203   | ITqconid  (FAST_STRING,FAST_STRING,IfaceFlavour)
 204   | ITqvarsym (FAST_STRING,FAST_STRING,IfaceFlavour)
 205   | ITqconsym (FAST_STRING,FAST_STRING,IfaceFlavour)
 206
 207   | ITtysig StringBuffer (Maybe StringBuffer)
 208                            -- lazily return the stream of tokens for
 209                            -- the info attached to an id.
 210         -- Stuff for reading unfoldings
 211   | ITarity
 212   | ITunfold Bool               -- True <=> there's an INLINE pragma on this Id
 213   | ITstrict [Demand] | ITbottom
 214   | ITlam | ITbiglam | ITcase | ITprim_case | ITlet | ITletrec | ITin | ITof
 215   | ITcoerce_in | ITcoerce_out | ITatsign
 216   | ITccall (Bool,Bool)         -- (is_casm, may_gc)
 217   | ITscc CostCentre
 218   | ITchar Char | ITstring FAST_STRING
 219   | ITinteger Integer | ITdouble Double
 220   | ITinteger_lit | ITfloat_lit | ITrational_lit | ITaddr_lit | ITlit_lit | ITstring_lit
 221   | ITunknown String            -- Used when the lexer can't make sense of it
 222   | ITeof                               -- end of file token
 223   deriving Text -- debugging
 224
 225 instance Text CostCentre -- cheat!
 226
 227 \end{code}
 228
 229 %************************************************************************
 230 %*                                                                      *
 231 \subsection{The lexical analyser}
 232 %*                                                                      *
 233 %************************************************************************
 234
 235 \begin{code}
 236 lexIface :: (IfaceToken -> IfM a) -> IfM a
 237 lexIface cont buf =
 238  _scc_ "Lexer"
 239 -- if bufferExhausted buf then
 240 --  []
 241 -- else
 242 --  _trace ("Lexer: "++[C# (currentChar# buf)]) $
 243   case currentChar# buf of
 244       -- whitespace and comments, ignore.
 245     ' '#  -> lexIface cont (stepOn buf)
 246     '\t'# -> lexIface cont (stepOn buf)
 247     '\n'# -> \ loc -> lexIface cont (stepOn buf) (incSrcLine loc)
 248
 249 -- Numbers and comments
 250     '-'#  ->
 251       case lookAhead# buf 1# of
 252         '-'# -> lex_comment cont (stepOnBy# buf 2#)
 253         c    ->
 254           if isDigit (C# c)
 255           then lex_num cont (negate) (ord# c -# ord# '0'#) (incLexeme (incLexeme buf))
 256           else lex_id cont buf
 257
 258 -- Leave out nested comments for now; danger of finding { and - juxtaposed by mistake?
 259 --    '{' : '-' : cs -> lex_nested_comment 1{-one seen-} cs
 260
 261     '('# ->
 262          case prefixMatch (stepOn buf) "..)" of
 263            Just buf' ->  cont ITdotdot (stepOverLexeme buf')
 264            Nothing ->
 265             case lookAhead# buf 1# of
 266               ','# -> lex_tuple cont Nothing  (stepOnBy# buf 2#)
 267               ')'# -> cont (ITconid SLIT("()")) (stepOnBy# buf 2#)
 268               _    -> cont IToparen (stepOn buf)
 269
 270     '{'# -> cont ITocurly (stepOn buf)
 271     '}'# -> cont ITccurly (stepOn buf)
 272     ')'# -> cont ITcparen (stepOn buf)
 273     '['# ->
 274       case lookAhead# buf 1# of
 275         ']'# -> cont (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 276         _    -> cont ITobrack (stepOn buf)
 277     ']'# -> cont ITcbrack (stepOn buf)
 278     ','# -> cont ITcomma  (stepOn buf)
 279     ';'#  -> cont ITsemi (stepOn buf)
 280     '\"'# -> case untilEndOfString# (stepOn buf) of
 281               buf' ->
 282                   -- the string literal does *not* include the dquotes
 283                 case lexemeToFastString buf' of
 284                  v -> cont (ITstring v) (stepOn (stepOverLexeme buf'))
 285
 286     '\''# -> --
 287              -- untilEndOfChar# extends the current lexeme until
 288              -- it hits a non-escaped single quote. The lexeme of the
 289              -- StringBuffer returned does *not* include the closing quote,
 290              -- hence we augment the lexeme and make sure to add the
 291              -- starting quote, before `read'ing the string.
 292              --
 293              case untilEndOfChar# (stepOn buf) of
 294                buf' -> case reads ('\'':lexemeToString (incLexeme buf')) of
 295                         [  (ch, rest)] -> cont (ITchar ch) (stepOverLexeme (incLexeme buf'))
 296
 297 -- ``thingy'' form for casm
 298     '`'# ->
 299             case lookAhead# buf 1# of
 300               '`'# -> lex_cstring cont (stepOnBy# buf 2#) -- remove the `` and go.
 301               _    -> lex_id cont (incLexeme buf)         -- add ` to lexeme and assume
 302                                                      -- scanning an id of some sort.
 303 -- Keywords
 304     '_'# ->
 305          case lookAhead# buf 1# of
 306            'S'# -> case lookAhead# buf 2# of
 307                     '_'# ->
 308                             lex_demand cont (stepOnUntil (not . isSpace)
 309                                             (stepOnBy# buf 3#)) -- past _S_
 310            's'# -> case prefixMatch (stepOnBy# buf 2#) "cc_" of
 311                      Just buf' -> lex_scc cont (stepOnUntil (not . isSpace) (stepOverLexeme buf'))
 312                      Nothing   -> lex_keyword cont (stepOnBy# buf 1#) -- drop the '_' and assume
 313                                                                  -- it is a keyword.
 314            _    -> lex_keyword cont (stepOn buf)
 315
 316     '\NUL'# ->
 317             if bufferExhausted (stepOn buf) then
 318                cont ITeof buf
 319             else
 320                lex_id cont buf
 321     c ->
 322         if isDigit (C# c) then
 323            lex_num cont (id) (ord# c -# ord# '0'#) (incLexeme buf)
 324         else
 325            lex_id cont buf
 326 --  where
 327 lex_comment cont buf =
 328 --   _trace ("comment: "++[C# (currentChar# buf)]) $
 329    case untilChar# buf '\n'# of {buf' -> lexIface cont (stepOverLexeme buf')}
 330
 331 ------------------
 332 lex_demand cont buf =
 333 -- _trace ("demand: "++[C# (currentChar# buf)]) $
 334  case read_em [] buf of { (ls,buf') -> cont (ITstrict ls) (stepOverLexeme buf')}
 335  where
 336    -- code snatched from Demand.lhs
 337   read_em acc buf =
 338 --   _trace ("read_em: "++[C# (currentChar# buf)]) $
 339    case currentChar# buf of
 340     'L'# -> read_em (WwLazy False : acc) (stepOn buf)
 341     'A'# -> read_em (WwLazy True  : acc) (stepOn buf)
 342     'S'# -> read_em (WwStrict     : acc) (stepOn buf)
 343     'P'# -> read_em (WwPrim       : acc) (stepOn buf)
 344     'E'# -> read_em (WwEnum       : acc) (stepOn buf)
 345     ')'# -> (reverse acc, stepOn buf)
 346     'U'# -> do_unpack DataType True  acc (stepOnBy# buf 2#)
 347     'u'# -> do_unpack DataType False acc (stepOnBy# buf 2#)
 348     'N'# -> do_unpack NewType True  acc (stepOnBy# buf 2#)
 349     'n'# -> do_unpack NewType False acc (stepOnBy# buf 2#)
 350     _    -> (reverse acc, buf)
 351
 352   do_unpack new_or_data wrapper_unpacks acc buf
 353    = case read_em [] buf of
 354       (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
 355
 356 ------------------
 357 lex_scc cont buf =
 358 -- _trace ("scc: "++[C# (currentChar# buf)]) $
 359  case currentChar# buf of
 360   '"'# ->
 361       -- YUCK^2
 362      case prefixMatch (stepOn buf) "NO_CC\"" of
 363       Just buf' -> cont (ITscc noCostCentre) (stepOverLexeme buf')
 364       Nothing ->
 365        case prefixMatch (stepOn buf) "CURRENT_CC\"" of
 366         Just buf' -> cont (ITscc useCurrentCostCentre) (stepOverLexeme buf')
 367         Nothing   ->
 368          case prefixMatch (stepOn buf) "OVERHEAD\"" of
 369          Just buf' -> cont (ITscc overheadCostCentre) (stepOverLexeme buf')
 370          Nothing   ->
 371           case prefixMatch (stepOn buf) "DONT_CARE\"" of
 372            Just buf' -> cont (ITscc dontCareCostCentre) (stepOverLexeme buf')
 373            Nothing   ->
 374             case prefixMatch (stepOn buf) "SUBSUMED\"" of
 375              Just buf' -> cont (ITscc subsumedCosts) (stepOverLexeme buf')
 376              Nothing ->
 377               case prefixMatch (stepOn buf) "CAFs_in_...\"" of
 378                Just buf' -> cont (ITscc preludeCafsCostCentre) (stepOverLexeme buf')
 379                Nothing ->
 380                 case prefixMatch (stepOn buf) "CC_CAFs_in_..." of
 381                  Just buf' ->
 382                   case untilChar# (stepOverLexeme buf') '\"'# of
 383                    buf'' -> cont (ITscc (mkAllCafsCC ({-module-}lexemeToFastString buf'') _NIL_)) (stepOn (stepOverLexeme buf''))
 384                  Nothing ->
 385                   case prefixMatch (stepOn buf) "DICTs_in_...\"" of
 386                    Just buf' -> cont (ITscc (preludeDictsCostCentre True)) (stepOverLexeme buf')
 387                    Nothing ->
 388                     case prefixMatch (stepOn buf) "CC_DICTs_in_..." of
 389                      Just buf' ->
 390                       case untilChar# (stepOverLexeme buf') '\"'# of
 391                        buf'' -> cont (ITscc (mkAllDictsCC (lexemeToFastString buf'') _NIL_ True))
 392                                 (stepOn (stepOverLexeme buf''))
 393                      Nothing ->
 394                       let
 395                        match_user_cc buf =
 396                         case untilChar# buf '/'# of
 397                          buf' ->
 398                           let mod_name = lexemeToFastString buf' in
 399 --                        case untilChar# (stepOn (stepOverLexeme buf')) '/'# of
 400 --                         buf'' ->
 401 --                            let grp_name = lexemeToFastString buf'' in
 402                             case untilChar# (stepOn (stepOverLexeme buf')) '\"'# of
 403                              buf'' ->
 404                                -- The label may contain arbitrary characters, so it
 405                                -- may have been escaped etc., hence we `read' it in to get
 406                                -- rid of these meta-chars in the string and then pack it (again.)
 407                                -- ToDo: do the same for module name (single quotes allowed in m-names).
 408                                -- BTW, the code in this module is totally gruesome..
 409                                let upk_label = _UNPK_ (lexemeToFastString buf'') in
 410                                case reads ('"':upk_label++"\"") of
 411                                 ((cc_label,_):_) ->
 412                                     let cc_name = _PK_ cc_label in
 413                                     (mkUserCC cc_name mod_name _NIL_{-grp_name-},
 414                                      stepOn (stepOverLexeme buf''))
 415                                 _ ->
 416                                   trace ("trouble lexing scc label: " ++ upk_label ++ " , ignoring")
 417                                   (mkUserCC _NIL_ mod_name _NIL_{-grp_name-},
 418                                    stepOn (stepOverLexeme buf''))
 419                       in
 420                       case prefixMatch (stepOn buf) "CAF:" of
 421                        Just buf' ->
 422                          case match_user_cc (stepOverLexeme buf') of
 423                           (cc, buf'') -> cont (ITscc (cafifyCC cc)) buf''
 424                        Nothing ->
 425                          case match_user_cc (stepOn buf) of
 426                           (cc, buf'') -> cont (ITscc cc) buf''
 427   c -> cont (ITunknown [C# c]) (stepOn buf)
 428
 429
 430 -----------
 431 lex_num :: (IfaceToken -> IfM a) ->
 432         (Int -> Int) -> Int# -> IfM a
 433 lex_num cont minus acc# buf =
 434 -- _trace ("lex_num: "++[C# (currentChar# buf)]) $
 435  case scanNumLit (I# acc#) buf of
 436      (acc',buf') ->
 437        case currentChar# buf' of
 438          '.'# ->
 439              -- this case is not optimised at all, as the
 440              -- presence of floating point numbers in interface
 441              -- files is not that common. (ToDo)
 442             case expandWhile (isDigit) (incLexeme buf') of
 443               buf'' -> -- points to first non digit char
 444                 case reads (lexemeToString buf'') of
 445                   [(v,_)] -> cont (ITdouble v) (stepOverLexeme buf'')
 446          _ -> cont (ITinteger (fromInt (minus acc'))) (stepOverLexeme buf')
 447
 448 --         case reads (lexemeToString buf') of
 449 --           [(i,_)] -> cont (ITinteger i) (stepOverLexeme buf')
 450
 451 ------------
 452 lex_keyword cont buf =
 453 -- _trace ("lex_keyword: "++[C# (currentChar# buf)]) $
 454  case currentChar# buf of
 455   ':'# -> case lookAhead# buf 1# of
 456             '_'# -> -- a binding, type (and other id-info) follows,
 457                     -- to make the parser ever so slightly, we push
 458                     --
 459                 lex_decl cont (stepOnBy# buf 2#)
 460             v# -> cont (ITunknown (['_',':',C# v#])) (stepOnBy# buf 2#)
 461   _ ->
 462     case expandWhile (is_kwd_char) buf of
 463      buf' ->
 464       let kw = lexemeToFastString buf' in
 465 --    _trace ("kw: "++lexemeToString buf') $
 466       case lookupUFM ifaceKeywordsFM kw of
 467        Nothing -> cont (ITunknown (_UNPK_ kw)) -- (minor) sigh
 468                   (stepOverLexeme buf')
 469        Just xx -> cont xx (stepOverLexeme buf')
 470
 471 lex_decl cont buf =
 472  case doDiscard False buf of -- spin until ;; is found
 473    buf' ->
 474       {- _trace (show (lexemeToString buf')) $ -}
 475       case currentChar# buf' of
 476        '\n'# -> -- newline, no id info.
 477            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 478                 (stepOverLexeme buf')
 479        '\r'# -> -- just to be sure for those Win* boxes..
 480            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 481                 (stepOverLexeme buf')
 482        '\NUL'# ->
 483            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 484                 (stepOverLexeme buf')
 485        c     -> -- run all over the id info
 486          case doDiscard False (stepOverLexeme buf') of -- spin until ;; is found (outside a string!)
 487            buf'' ->
 488                     --_trace ((C# c):show (lexemeToString (decLexeme buf')))  $
 489                     --_trace (show (lexemeToString (decLexeme buf''))) $
 490                     let idinfo =
 491                             if opt_IgnoreIfacePragmas then
 492                                 Nothing
 493                             else
 494                                 Just (lexemeToBuffer (decLexeme buf''))
 495                         --_trace (show is) $
 496                     in
 497                     cont (ITtysig (lexemeToBuffer (decLexeme buf')) idinfo)
 498                         (stepOverLexeme buf'')
 499
 500 -- ToDo: hammer!
 501 is_kwd_char c@(C# c#) =
 502  isAlphanum c || -- OLD: c `elem` "_@/\\"
 503  (case c# of
 504    '_'#  -> True
 505    '@'#  -> True
 506    '/'#  -> True
 507    '\\'# -> True
 508    _     -> False)
 509
 510
 511
 512 -----------
 513 lex_cstring cont buf =
 514 -- _trace ("lex_cstring: "++[C# (currentChar# buf)]) $
 515  case expandUntilMatch buf "\'\'" of
 516    buf' -> cont (ITstring (lexemeToFastString (setCurrentPos# buf' (negateInt# 2#))))
 517            (stepOverLexeme buf')
 518
 519 -----------
 520 lex_tuple cont module_dot buf =
 521 -- _trace ("lex_tuple: "++[C# (currentChar# buf)]) $
 522   go 2 buf
 523   where
 524    go n buf =
 525     case currentChar# buf of
 526       ','# -> go (n+1) (stepOn buf)
 527       ')'# -> end_lex_id cont module_dot (ITconid (mkTupNameStr n)) (stepOn buf)
 528       _    -> cont (ITunknown ("tuple " ++ show n)) buf
 529
 530 -- Similarly ' itself is ok inside an identifier, but not at the start
 531
 532 -- id_arr is an array of bytes, indexed by characters,
 533 -- containing 0 if the character isn't a valid character from an identifier
 534 -- and 1 if it is.  It's just a memo table for is_id_char.
 535 id_arr :: ByteArray Int
 536 id_arr =
 537  runST (
 538   newCharArray (0,255) >>= \ barr ->
 539   let
 540    loop 256# = return ()
 541    loop i# =
 542     if isAlphanum (C# (chr# i#)) || is_sym (chr# i#) then
 543        writeCharArray barr (I# i#) '\1'         >>
 544        loop (i# +# 1#)
 545     else
 546        writeCharArray barr (I# i#) '\0'         >>
 547        loop (i# +# 1#)
 548   in
 549   loop 0#                                       >>
 550   unsafeFreezeByteArray barr)
 551
 552 is_id_char (C# c#) =
 553  let
 554   ByteArray _ arr# = id_arr
 555  in
 556  case ord# (indexCharArray# arr# (ord# c#)) of
 557   0# -> False
 558   1# -> True
 559
 560 --OLD: is_id_char c@(C# c#)  = isAlphanum c || is_sym c#
 561
 562 is_sym c# =
 563  case c# of {
 564    ':'# -> True; '_'#  -> True; '\''# -> True; '!'# -> True;
 565    '#'# -> True; '$'#  -> True; '%'# -> True;
 566    '&'# -> True; '*'#  -> True; '+'#  -> True; '.'# -> True;
 567    '/'# -> True; '<'#  -> True; '='#  -> True; '>'# -> True;
 568    '?'# -> True; '\\'# -> True; '^'#  -> True; '|'# -> True;
 569    '-'# -> True; '~'#  -> True; '@'#  -> True; _    -> False }
 570
 571 --isAlphanum c || c `elem` ":_'!#$%&*+./<=>?@\\^|-~" -- ToDo: add ISOgraphic
 572
 573
 574 -- mod_arr is an array of bytes, indexed by characters,
 575 -- containing 0 if the character isn't a valid character from a module name,
 576 -- and 1 if it is.
 577 mod_arr :: ByteArray Int
 578 mod_arr =
 579  runST (
 580   newCharArray (0,255) >>= \ barr ->
 581   let
 582    loop 256# = return ()
 583    loop i# =
 584     if isAlphanum (C# (chr# i#)) || i# ==# (ord# '_'#) || i# ==# (ord# '\''#) then
 585        writeCharArray barr (I# i#) '\1'         >>
 586        loop (i# +# 1#)
 587     else
 588        writeCharArray barr (I# i#) '\0'         >>
 589        loop (i# +# 1#)
 590   in
 591   loop 0#                                       >>
 592   unsafeFreezeByteArray barr)
 593
 594
 595 is_mod_char (C# c#) =
 596  let
 597   ByteArray _ arr# = mod_arr
 598  in
 599  case ord# (indexCharArray# arr# (ord# c#)) of
 600   0# -> False
 601   1# -> True
 602
 603 --isAlphanum c || c == '_' || c== '\'' --`elem` "_'"
 604
 605 lex_id cont buf =
 606 -- _trace ("lex_id: "++[C# (currentChar# buf)]) $
 607  case expandWhile (is_mod_char) buf of
 608    buf' ->
 609     case currentChar# buf' of
 610      '.'# -> munch buf' HiFile
 611      '!'# -> munch buf' HiBootFile
 612      _    -> lex_id2 cont Nothing buf'
 613    where
 614     munch buf' hif =
 615         if not (emptyLexeme buf') then
 616 --         _trace ("lex_id: "++(C# (currentChar# (stepOverLexeme buf'))):show (lexemeToFastString buf')) $
 617            case lexemeToFastString buf' of
 618              l@(FastString u# l# ba#) -> lex_id2 cont (Just (FastString u# l# ba#, hif))
 619                                                  (stepOn (stepOverLexeme buf'))
 620         else
 621            lex_id2 cont Nothing buf'
 622
 623
 624 -- Dealt with the Module.part
 625 lex_id2 cont module_dot buf =
 626 -- _trace ("lex_id2: "++[C# (currentChar# buf)]) $
 627  case currentChar# buf of
 628
 629   '['# ->       -- Special case for []
 630     case lookAhead# buf 1# of
 631      ']'# -> end_lex_id cont module_dot (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 632      _    -> lex_id3 cont module_dot buf
 633
 634   '('# ->       -- Special case for (,,,)
 635     case lookAhead# buf 1# of
 636      ')'# -> end_lex_id cont module_dot (ITconid SLIT("()")) (stepOnBy# buf 2#)
 637      ','# -> lex_tuple cont module_dot (stepOnBy# buf 2#)
 638      _    -> lex_id3 cont module_dot buf
 639   ':'# -> lex_id3 cont module_dot (incLexeme buf)
 640   '-'# ->
 641      case module_dot of
 642        Nothing  -> lex_id3 cont module_dot buf
 643        Just ghc -> -- this should be "GHC" (current home of (->))
 644          case lookAhead# buf 1# of
 645           '>'# -> end_lex_id cont module_dot (ITconid SLIT("->"))
 646                         (stepOnBy# buf 2#)
 647           _    -> lex_id3 cont module_dot buf
 648   _    -> lex_id3 cont module_dot buf
 649
 650
 651
 652 -- Dealt with [], (), : special cases
 653
 654 lex_id3 cont module_dot buf =
 655 -- _trace ("lex_id3: "++[C# (currentChar# buf)]) $
 656  case expandWhile (is_id_char) buf of
 657   buf' ->
 658     case module_dot of
 659      Just _ ->
 660        end_lex_id cont module_dot (mk_var_token lexeme) new_buf
 661      Nothing ->
 662        case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of
 663          Just kwd_token -> cont kwd_token new_buf
 664          Nothing        -> cont (mk_var_token lexeme) new_buf
 665     where
 666      lexeme  = lexemeToFastString buf'
 667      new_buf = stepOverLexeme buf'
 668
 669
 670 {- OLD:
 671 lex_id2 module_dot [] ('[': ']': cs) = end_lex_id module_dot (ITconid SLIT("[]")) cs
 672 lex_id2 module_dot [] ('(': ')': cs) = end_lex_id module_dot (ITconid SLIT("()")) cs
 673 lex_id2 module_dot [] ('(': ',': cs) = lex_tuple module_dot cs
 674 lex_id2 module_dot [] (':' : cs)     = lex_id3 module_dot [':'] cs
 675 lex_id2 module_dot xs cs             = lex_id3 module_dot xs cs
 676 -}
 677
 678 -- Dealt with [], (), : special cases
 679
 680 {-
 681 lex_id3 module_dot len_xs xs cs =
 682  case my_span' (is_id_char) cs of
 683    (xs1,len_xs1,rest) ->
 684     case module_dot of
 685      Just m  -> end_lex_id (Just m) (mk_var_token rxs) rest --OLD:_PK_ (reverse xs))) rest
 686      Nothing ->
 687       case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM rxs of
 688        Just kwd_token -> kwd_token          : lexIface rest
 689        other          -> token : lexIface cs end_lex_id Nothing (mk_var_token rxs) rest
 690     where
 691      rxs = packNChars (len_xs+len_xs1) (xs++xs1) -- OLD: _PK_ (reverse xs)
 692 -}
 693 mk_var_token pk_str =
 694      let
 695       f = _HEAD_ pk_str
 696      in
 697      --
 698      -- These tests assume a non- ISO-8859-1 impl of isUpper&isLower,
 699      -- remove the second half of disjunction when using a 1.3 prelude.
 700      --
 701      if      isUpper f    then ITconid pk_str
 702      else if isLower f    then ITvarid pk_str
 703      else if f == ':'     then ITconsym pk_str
 704      else if isLowerISO f then ITvarid pk_str
 705      else if isUpperISO f then ITconid pk_str
 706      else ITvarsym pk_str
 707
 708 {-
 709     mk_var_token xs@(f:_) | isUpper f || isUpperISO f = ITconid n
 710                           | f == ':'              = ITconsym n
 711                           | isAlpha f             = ITvarid n
 712                           | otherwise             = ITvarsym n
 713                 where
 714                       n = _PK_ xs
 715 -}
 716
 717 end_lex_id cont Nothing token buf  = cont token buf
 718 end_lex_id cont (Just (m,hif)) token buf =
 719  case token of
 720    ITconid n  -> cont (ITqconid  (m,n,hif))         buf
 721    ITvarid n  -> cont (ITqvarid  (m,n,hif))         buf
 722    ITconsym n -> cont (ITqconsym (m,n,hif))         buf
 723
 724         -- Special case for ->
 725         -- "->" by itself is a special token (ITrarrow),
 726         -- but M.-> is a ITqconid
 727    ITvarsym n |  n == SLIT("->")
 728               -> cont (ITqconsym (m,n,hif))         buf
 729
 730    ITvarsym n -> cont (ITqvarsym (m,n,hif))         buf
 731
 732 -- ITbang can't happen here I think
 733 --   ITbang     -> cont (ITqvarsym (m,SLIT("!"),hif)) buf
 734
 735    _          -> cont (ITunknown (show token))      buf
 736
 737 ------------
 738 ifaceKeywordsFM :: UniqFM IfaceToken
 739 ifaceKeywordsFM = listToUFM $
 740       map (\ (x,y) -> (_PK_ x,y))
 741        [("/\\_",                ITbiglam)
 742        ,("@_",                  ITatsign)
 743        ,("letrec_",             ITletrec)
 744        ,("interface_",          ITinterface)
 745        ,("usages_",             ITusages)
 746        ,("versions_",           ITversions)
 747        ,("exports_",            ITexports)
 748        ,("instance_modules_",   ITinstance_modules)
 749        ,("instances_",          ITinstances)
 750        ,("fixities_",           ITfixities)
 751        ,("declarations_",       ITdeclarations)
 752        ,("pragmas_",            ITpragmas)
 753        ,("forall_",             ITforall)
 754        ,("U_",                  ITunfold False)
 755        ,("U!_",                 ITunfold True)
 756        ,("A_",                  ITarity)
 757        ,("coerce_in_",          ITcoerce_in)
 758        ,("coerce_out_",         ITcoerce_out)
 759        ,("bot_",                ITbottom)
 760        ,("integer_",            ITinteger_lit)
 761        ,("rational_",           ITrational_lit)
 762        ,("addr_",               ITaddr_lit)
 763        ,("float_",              ITfloat_lit)
 764        ,("string_",             ITstring_lit)
 765        ,("litlit_",             ITlit_lit)
 766        ,("ccall_",              ITccall (False, False))
 767        ,("ccall_GC_",           ITccall (False, True))
 768        ,("casm_",               ITccall (True,  False))
 769        ,("casm_GC_",            ITccall (True,  True))
 770        ]
 771
 772 haskellKeywordsFM = listToUFM $
 773       map (\ (x,y) -> (_PK_ x,y))
 774       [ ("data",                ITdata)
 775        ,("type",                ITtype)
 776        ,("newtype",             ITnewtype)
 777        ,("class",               ITclass)
 778        ,("where",               ITwhere)
 779        ,("instance",            ITinstance)
 780        ,("infixl",              ITinfixl)
 781        ,("infixr",              ITinfixr)
 782        ,("infix",               ITinfix)
 783        ,("case",                ITcase)
 784        ,("case#",               ITprim_case)
 785        ,("of",                  ITof)
 786        ,("in",                  ITin)
 787        ,("let",                 ITlet)
 788        ,("deriving",            ITderiving)
 789
 790        ,("->",                  ITrarrow)
 791        ,("\\",                  ITlam)
 792        ,("|",                   ITvbar)
 793        ,("!",                   ITbang)
 794        ,("=>",                  ITdarrow)
 795        ,("=",                   ITequal)
 796        ,("::",                  ITdcolon)
 797        ]
 798
 799
 800 -- doDiscard rips along really fast, looking for a double semicolon,
 801 -- indicating the end of the pragma we're skipping
 802 doDiscard inStr buf =
 803 -- _trace (show (C# (currentChar# buf))) $
 804  case currentChar# buf of
 805    ';'# ->
 806      if not inStr then
 807        case lookAhead# buf 1# of
 808         ';'# -> incLexeme (incLexeme buf)
 809         _    -> doDiscard inStr (incLexeme buf)
 810      else
 811        doDiscard inStr (incLexeme buf)
 812    '"'# ->
 813        let
 814         odd_slashes buf flg i# =
 815           case lookAhead# buf i# of
 816            '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
 817            _     -> flg
 818        in
 819        case lookAhead# buf (negateInt# 1#) of --backwards, actually
 820          '\\'# -> -- escaping something..
 821            if odd_slashes buf True (negateInt# 2#) then
 822                -- odd number of slashes, " is escaped.
 823               doDiscard inStr (incLexeme buf)
 824            else
 825                -- even number of slashes, \ is escaped.
 826               doDiscard (not inStr) (incLexeme buf)
 827          _ -> case inStr of -- forced to avoid build-up
 828                True  -> doDiscard False (incLexeme buf)
 829                False -> doDiscard True  (incLexeme buf)
 830    _ -> doDiscard inStr (incLexeme buf)
 831
 832 \end{code}
 833
 834 begin{code}
 835 my_span :: (a -> Bool) -> [a] -> ([a],[a])
 836 my_span p xs = go [] xs
 837   where
 838     go so_far (x:xs') | p x = go (x:so_far) xs'
 839     go so_far xs            = (reverse so_far, xs)
 840
 841 my_span' :: (a -> Bool) -> [a] -> ([a],Int,[a])
 842 my_span' p xs = go [] 0 xs
 843   where
 844     go so_far n (x:xs') | p x = go (x:so_far) (n+1) xs'
 845     go so_far n xs            = (reverse so_far,n, xs)
 846 end{code}
 847
 848
 849 %************************************************************************
 850 %*                                                                      *
 851 \subsection{Other utility functions
 852 %*                                                                      *
 853 %************************************************************************
 854
 855 \begin{code}
 856 type IfM a = StringBuffer       -- Input string
 857           -> SrcLoc
 858           -> MaybeErr a ErrMsg
 859
 860 returnIf   :: a -> IfM a
 861 returnIf a s l = Succeeded a
 862
 863 thenIf     :: IfM a -> (a -> IfM b) -> IfM b
 864 m `thenIf` k = \s l ->
 865         case m s l of
 866                 Succeeded a -> k a s l
 867                 Failed err  -> Failed err
 868
 869 getSrcLocIf :: IfM SrcLoc
 870 getSrcLocIf s l = Succeeded l
 871
 872 happyError :: IfM a
 873 happyError s l = Failed (ifaceParseErr l ([]::[IfaceToken]){-Todo-})
 874
 875 -----------------------------------------------------------------
 876
 877 ifaceParseErr l toks
 878   = hsep [ppr l, ptext SLIT("Interface-file parse error;"),
 879           ptext SLIT("toks="), text (show (take 10 toks))]
 880 \end{code}