ghc/compiler/reader/Lex.lhs

   1 --------------------------------------------------------
   2 [Jan 98]
   3 There's a known bug in here:
   4
   5         If an interface file ends prematurely, Lex tries to
   6         do headFS of an empty FastString.
   7
   8 An example that provokes the error is
   9
  10         f _:_ _forall_ [a] <<<END OF FILE>>>
  11 --------------------------------------------------------
  12
  13
  14 %
  15 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1996
  16 %
  17 \section[Lexical analysis]{Lexical analysis}
  18
  19 \begin{code}
  20 module Lex (
  21
  22         isLexCon, isLexVar, isLexId, isLexSym,
  23         isLexConId, isLexConSym, isLexVarId, isLexVarSym,
  24         mkTupNameStr, ifaceParseErr,
  25
  26         -- Monad for parser
  27         IfaceToken(..), lexIface, IfM, thenIf, returnIf, getSrcLocIf,
  28         checkVersion,
  29         happyError,
  30         StringBuffer
  31
  32     ) where
  33
  34 #include "HsVersions.h"
  35
  36 import Char             (isDigit, isAlphanum, isUpper,isLower, isSpace, ord )
  37 import List             ( isSuffixOf )
  38
  39 import {-# SOURCE #-} CostCentre
  40
  41 import CmdLineOpts      ( opt_IgnoreIfacePragmas )
  42 import Demand           ( Demand(..) {- instance Read -} )
  43 import UniqFM           ( UniqFM, listToUFM, lookupUFM)
  44 import BasicTypes       ( NewOrData(..), IfaceFlavour(..) )
  45 import SrcLoc           ( SrcLoc, incSrcLine, srcLocFile )
  46
  47 import Maybes           ( MaybeErr(..) )
  48 import ErrUtils         ( ErrMsg(..) )
  49 import Outputable
  50 import Util             ( nOfThem, panic )
  51
  52 import FastString
  53 import StringBuffer
  54 import GlaExts
  55 import ST               ( runST )
  56 \end{code}
  57
  58 %************************************************************************
  59 %*                                                                      *
  60 \subsection{Lexical categories}
  61 %*                                                                      *
  62 %************************************************************************
  63
  64 These functions test strings to see if they fit the lexical categories
  65 defined in the Haskell report.  Normally applied as in e.g. @isCon
  66 (getLocalName foo)@.
  67
  68 \begin{code}
  69 isLexCon, isLexVar, isLexId, isLexSym, isLexConId, isLexConSym,
  70  isLexVarId, isLexVarSym  :: FAST_STRING -> Bool
  71
  72 isLexCon cs = isLexConId  cs || isLexConSym cs
  73 isLexVar cs = isLexVarId  cs || isLexVarSym cs
  74
  75 isLexId  cs = isLexConId  cs || isLexVarId  cs
  76 isLexSym cs = isLexConSym cs || isLexVarSym cs
  77
  78 -------------
  79
  80 isLexConId cs
  81   | _NULL_ cs        = False
  82   | cs == SLIT("[]") = True
  83   | c  == '('        = True     -- (), (,), (,,), ...
  84   | otherwise        = isUpper c || isUpperISO c
  85   where
  86     c = _HEAD_ cs
  87
  88 isLexVarId cs
  89   | _NULL_ cs    = False
  90   | otherwise    = isLower c || isLowerISO c
  91   where
  92     c = _HEAD_ cs
  93
  94 isLexConSym cs
  95   | _NULL_ cs   = False
  96   | otherwise   = c  == ':'
  97                || cs == SLIT("->")
  98   where
  99     c = _HEAD_ cs
 100
 101 isLexVarSym cs
 102   | _NULL_ cs = False
 103   | otherwise = isSymbolASCII c
 104              || isSymbolISO c
 105   where
 106     c = _HEAD_ cs
 107
 108 -------------
 109 isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
 110 isSymbolISO   c = ord c `elem` (0xd7 : 0xf7 : [0xa1 .. 0xbf])
 111 isUpperISO    (C# c#) = c# `geChar#` '\xc0'# && c# `leChar#` '\xde'# && c# `neChar#` '\xd7'#
 112 --0xc0 <= oc && oc <= 0xde && oc /= 0xd7 where oc = ord c
 113 isLowerISO    (C# c#) = c# `geChar#` '\xdf'# && c# `leChar#` '\xff'# && c# `neChar#` '\xf7'#
 114 --0xdf <= oc && oc <= 0xff && oc /= 0xf7 where oc = ord c
 115 \end{code}
 116
 117
 118 %************************************************************************
 119 %*                                                                      *
 120 \subsection{Tuple strings -- ugh!}
 121 %*                                                                      *
 122 %************************************************************************
 123
 124 \begin{code}
 125 mkTupNameStr 0 = SLIT("()")
 126 mkTupNameStr 1 = panic "Name.mkTupNameStr: 1 ???"
 127 mkTupNameStr 2 = _PK_ "(,)"   -- not strictly necessary
 128 mkTupNameStr 3 = _PK_ "(,,)"  -- ditto
 129 mkTupNameStr 4 = _PK_ "(,,,)" -- ditto
 130 mkTupNameStr n = _PK_ ("(" ++ nOfThem (n-1) ',' ++ ")")
 131 \end{code}
 132
 133
 134
 135 %************************************************************************
 136 %*                                                                      *
 137 \subsection{Data types}
 138 %*                                                                      *
 139 %************************************************************************
 140
 141 The token data type, fairly un-interesting except from two constructors,
 142 @ITidinfo@ and @ITtype@, which are used to lazily lex id info (arity,
 143 strictness, unfolding etc) and types for id decls.
 144
 145 The Idea/Observation here is that the renamer needs to scan through
 146 all of an interface file before it can continue. But only a fraction
 147 of the information contained in the file turns out to be useful, so
 148 delaying as much as possible of the scanning and parsing of an
 149 interface file Makes Sense (Heap profiles of the compiler
 150 show at a reduction in heap usage by at least a factor of two,
 151 post-renamer).
 152
 153 Hence, the interface file lexer spots when value declarations are
 154 being scanned and return the @ITidinfo@ and @ITtype@ constructors
 155 for the type and any other id info for that binding (unfolding, strictness
 156 etc). These constructors are applied to the result of lexing these sub-chunks.
 157
 158 The lexing of the type and id info is all done lazily, of course, so
 159 the scanning (and subsequent parsing) will be done *only* on the ids the
 160 renamer finds out that it is interested in. The rest will just be junked.
 161 Laziness, you know it makes sense :-)
 162
 163 \begin{code}
 164 data IfaceToken
 165   = ITinterface         -- keywords
 166   | ITusages
 167   | ITversions
 168   | ITexports
 169   | ITinstance_modules
 170   | ITinstances
 171   | ITfixities
 172   | ITdeclarations
 173   | ITpragmas
 174   | ITdata
 175   | ITtype
 176   | ITnewtype
 177   | ITderiving
 178   | ITclass
 179   | ITwhere
 180   | ITinstance
 181   | ITinfixl
 182   | ITinfixr
 183   | ITinfix
 184   | ITforall
 185   | ITbang              -- magic symbols
 186   | ITvbar
 187   | ITdcolon
 188   | ITcomma
 189   | ITdarrow
 190   | ITdotdot
 191   | ITequal
 192   | ITocurly
 193   | ITobrack
 194   | IToparen
 195   | ITrarrow
 196   | ITccurly
 197   | ITcbrack
 198   | ITcparen
 199   | ITsemi
 200   | ITvarid   FAST_STRING
 201   | ITconid   FAST_STRING
 202   | ITvarsym  FAST_STRING
 203   | ITconsym  FAST_STRING
 204   | ITqvarid  (FAST_STRING,FAST_STRING,IfaceFlavour)
 205   | ITqconid  (FAST_STRING,FAST_STRING,IfaceFlavour)
 206   | ITqvarsym (FAST_STRING,FAST_STRING,IfaceFlavour)
 207   | ITqconsym (FAST_STRING,FAST_STRING,IfaceFlavour)
 208
 209   | ITtysig StringBuffer (Maybe StringBuffer)
 210                            -- lazily return the stream of tokens for
 211                            -- the info attached to an id.
 212         -- Stuff for reading unfoldings
 213   | ITarity
 214   | ITunfold Bool               -- True <=> there's an INLINE pragma on this Id
 215   | ITstrict [Demand] | ITbottom
 216   | ITspecialise
 217   | ITlam | ITbiglam | ITcase | ITprim_case | ITlet | ITletrec | ITin | ITof
 218   | ITcoerce | ITinline | ITatsign
 219   | ITccall (Bool,Bool)         -- (is_casm, may_gc)
 220   | ITscc CostCentre
 221   | ITchar Char | ITstring FAST_STRING
 222   | ITinteger Integer | ITdouble Double
 223   | ITinteger_lit | ITfloat_lit | ITrational_lit | ITaddr_lit | ITlit_lit | ITstring_lit
 224   | ITunknown String            -- Used when the lexer can't make sense of it
 225   | ITeof                               -- end of file token
 226   deriving Text -- debugging
 227
 228 instance Text CostCentre -- cheat!
 229
 230 \end{code}
 231
 232 %************************************************************************
 233 %*                                                                      *
 234 \subsection{The lexical analyser}
 235 %*                                                                      *
 236 %************************************************************************
 237
 238 \begin{code}
 239 lexIface :: (IfaceToken -> IfM a) -> IfM a
 240 lexIface cont buf =
 241  _scc_ "Lexer"
 242 -- if bufferExhausted buf then
 243 --  []
 244 -- else
 245 --  _trace ("Lexer: "++[C# (currentChar# buf)]) $
 246   case currentChar# buf of
 247       -- whitespace and comments, ignore.
 248     ' '#  -> lexIface cont (stepOn buf)
 249     '\t'# -> lexIface cont (stepOn buf)
 250     '\n'# -> \ loc -> lexIface cont (stepOn buf) (incSrcLine loc)
 251
 252 -- Numbers and comments
 253     '-'#  ->
 254       case lookAhead# buf 1# of
 255         '-'# -> lex_comment cont (stepOnBy# buf 2#)
 256         c    ->
 257           if isDigit (C# c)
 258           then lex_num cont (negate) (ord# c -# ord# '0'#) (incLexeme (incLexeme buf))
 259           else lex_id cont buf
 260
 261 -- Leave out nested comments for now; danger of finding { and - juxtaposed by mistake?
 262 --    '{' : '-' : cs -> lex_nested_comment 1{-one seen-} cs
 263
 264     '('# ->
 265          case prefixMatch (stepOn buf) "..)" of
 266            Just buf' ->  cont ITdotdot (stepOverLexeme buf')
 267            Nothing ->
 268             case lookAhead# buf 1# of
 269               ','# -> lex_tuple cont Nothing  (stepOnBy# buf 2#)
 270               ')'# -> cont (ITconid SLIT("()")) (stepOnBy# buf 2#)
 271               _    -> cont IToparen (stepOn buf)
 272
 273     '{'# -> cont ITocurly (stepOn buf)
 274     '}'# -> cont ITccurly (stepOn buf)
 275     ')'# -> cont ITcparen (stepOn buf)
 276     '['# ->
 277       case lookAhead# buf 1# of
 278         ']'# -> cont (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 279         _    -> cont ITobrack (stepOn buf)
 280     ']'# -> cont ITcbrack (stepOn buf)
 281     ','# -> cont ITcomma  (stepOn buf)
 282     ';'#  -> cont ITsemi (stepOn buf)
 283     '\"'# -> case untilEndOfString# (stepOn buf) of
 284               buf' ->
 285                   -- the string literal does *not* include the dquotes
 286                 case lexemeToFastString buf' of
 287                  v -> cont (ITstring v) (stepOn (stepOverLexeme buf'))
 288
 289     '\''# -> --
 290              -- untilEndOfChar# extends the current lexeme until
 291              -- it hits a non-escaped single quote. The lexeme of the
 292              -- StringBuffer returned does *not* include the closing quote,
 293              -- hence we augment the lexeme and make sure to add the
 294              -- starting quote, before `read'ing the string.
 295              --
 296              case untilEndOfChar# (stepOn buf) of
 297                buf' -> case reads ('\'':lexemeToString (incLexeme buf')) of
 298                         [  (ch, rest)] -> cont (ITchar ch) (stepOverLexeme (incLexeme buf'))
 299
 300 -- ``thingy'' form for casm
 301     '`'# ->
 302             case lookAhead# buf 1# of
 303               '`'# -> lex_cstring cont (stepOnBy# buf 2#) -- remove the `` and go.
 304               _    -> lex_id cont (incLexeme buf)         -- add ` to lexeme and assume
 305                                                      -- scanning an id of some sort.
 306 -- Keywords
 307     '_'# ->
 308          case lookAhead# buf 1# of
 309            'S'# -> case lookAhead# buf 2# of
 310                     '_'# ->
 311                             lex_demand cont (stepOnUntil (not . isSpace)
 312                                             (stepOnBy# buf 3#)) -- past _S_
 313            's'# -> case prefixMatch (stepOnBy# buf 2#) "cc_" of
 314                      Just buf' -> lex_scc cont (stepOnUntil (not . isSpace) (stepOverLexeme buf'))
 315                      Nothing   -> lex_keyword cont (stepOnBy# buf 1#) -- drop the '_' and assume
 316                                                                  -- it is a keyword.
 317            _    -> lex_keyword cont (stepOn buf)
 318
 319     '\NUL'# ->
 320             if bufferExhausted (stepOn buf) then
 321                cont ITeof buf
 322             else
 323                lex_id cont buf
 324     c ->
 325         if isDigit (C# c) then
 326            lex_num cont (id) (ord# c -# ord# '0'#) (incLexeme buf)
 327         else
 328            lex_id cont buf
 329 --  where
 330 lex_comment cont buf =
 331 --   _trace ("comment: "++[C# (currentChar# buf)]) $
 332    case untilChar# buf '\n'# of {buf' -> lexIface cont (stepOverLexeme buf')}
 333
 334 ------------------
 335 lex_demand cont buf =
 336 -- _trace ("demand: "++[C# (currentChar# buf)]) $
 337  case read_em [] buf of { (ls,buf') -> cont (ITstrict ls) (stepOverLexeme buf')}
 338  where
 339    -- code snatched from Demand.lhs
 340   read_em acc buf =
 341 --   _trace ("read_em: "++[C# (currentChar# buf)]) $
 342    case currentChar# buf of
 343     'L'# -> read_em (WwLazy False : acc) (stepOn buf)
 344     'A'# -> read_em (WwLazy True  : acc) (stepOn buf)
 345     'S'# -> read_em (WwStrict     : acc) (stepOn buf)
 346     'P'# -> read_em (WwPrim       : acc) (stepOn buf)
 347     'E'# -> read_em (WwEnum       : acc) (stepOn buf)
 348     ')'# -> (reverse acc, stepOn buf)
 349     'U'# -> do_unpack DataType True  acc (stepOnBy# buf 2#)
 350     'u'# -> do_unpack DataType False acc (stepOnBy# buf 2#)
 351     'N'# -> do_unpack NewType True  acc (stepOnBy# buf 2#)
 352     'n'# -> do_unpack NewType False acc (stepOnBy# buf 2#)
 353     _    -> (reverse acc, buf)
 354
 355   do_unpack new_or_data wrapper_unpacks acc buf
 356    = case read_em [] buf of
 357       (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
 358
 359 ------------------
 360 lex_scc cont buf =
 361 -- _trace ("scc: "++[C# (currentChar# buf)]) $
 362  case currentChar# buf of
 363   '"'# ->
 364       -- YUCK^2
 365      case prefixMatch (stepOn buf) "NO_CC\"" of
 366       Just buf' -> cont (ITscc noCostCentre) (stepOverLexeme buf')
 367       Nothing ->
 368        case prefixMatch (stepOn buf) "CURRENT_CC\"" of
 369         Just buf' -> cont (ITscc useCurrentCostCentre) (stepOverLexeme buf')
 370         Nothing   ->
 371          case prefixMatch (stepOn buf) "OVERHEAD\"" of
 372          Just buf' -> cont (ITscc overheadCostCentre) (stepOverLexeme buf')
 373          Nothing   ->
 374           case prefixMatch (stepOn buf) "DONT_CARE\"" of
 375            Just buf' -> cont (ITscc dontCareCostCentre) (stepOverLexeme buf')
 376            Nothing   ->
 377             case prefixMatch (stepOn buf) "SUBSUMED\"" of
 378              Just buf' -> cont (ITscc subsumedCosts) (stepOverLexeme buf')
 379              Nothing ->
 380               case prefixMatch (stepOn buf) "CAFs_in_...\"" of
 381                Just buf' -> cont (ITscc preludeCafsCostCentre) (stepOverLexeme buf')
 382                Nothing ->
 383                 case prefixMatch (stepOn buf) "CC_CAFs_in_..." of
 384                  Just buf' ->
 385                   case untilChar# (stepOverLexeme buf') '\"'# of
 386                    buf'' -> cont (ITscc (mkAllCafsCC ({-module-}lexemeToFastString buf'') _NIL_)) (stepOn (stepOverLexeme buf''))
 387                  Nothing ->
 388                   case prefixMatch (stepOn buf) "DICTs_in_...\"" of
 389                    Just buf' -> cont (ITscc (preludeDictsCostCentre True)) (stepOverLexeme buf')
 390                    Nothing ->
 391                     case prefixMatch (stepOn buf) "CC_DICTs_in_..." of
 392                      Just buf' ->
 393                       case untilChar# (stepOverLexeme buf') '\"'# of
 394                        buf'' -> cont (ITscc (mkAllDictsCC (lexemeToFastString buf'') _NIL_ True))
 395                                 (stepOn (stepOverLexeme buf''))
 396                      Nothing ->
 397                       let
 398                        match_user_cc buf =
 399                         case untilChar# buf '/'# of
 400                          buf' ->
 401                           let mod_name = lexemeToFastString buf' in
 402 --                        case untilChar# (stepOn (stepOverLexeme buf')) '/'# of
 403 --                         buf'' ->
 404 --                            let grp_name = lexemeToFastString buf'' in
 405                             case untilChar# (stepOn (stepOverLexeme buf')) '\"'# of
 406                              buf'' ->
 407                                -- The label may contain arbitrary characters, so it
 408                                -- may have been escaped etc., hence we `read' it in to get
 409                                -- rid of these meta-chars in the string and then pack it (again.)
 410                                -- ToDo: do the same for module name (single quotes allowed in m-names).
 411                                -- BTW, the code in this module is totally gruesome..
 412                                let upk_label = _UNPK_ (lexemeToFastString buf'') in
 413                                case reads ('"':upk_label++"\"") of
 414                                 ((cc_label,_):_) ->
 415                                     let cc_name = _PK_ cc_label in
 416                                     (mkUserCC cc_name mod_name _NIL_{-grp_name-},
 417                                      stepOn (stepOverLexeme buf''))
 418                                 _ ->
 419                                   trace ("trouble lexing scc label: " ++ upk_label ++ " , ignoring")
 420                                   (mkUserCC _NIL_ mod_name _NIL_{-grp_name-},
 421                                    stepOn (stepOverLexeme buf''))
 422                       in
 423                       case prefixMatch (stepOn buf) "CAF:" of
 424                        Just buf' ->
 425                          case match_user_cc (stepOverLexeme buf') of
 426                           (cc, buf'') -> cont (ITscc (cafifyCC cc)) buf''
 427                        Nothing ->
 428                          case match_user_cc (stepOn buf) of
 429                           (cc, buf'') -> cont (ITscc cc) buf''
 430   c -> cont (ITunknown [C# c]) (stepOn buf)
 431
 432
 433 -----------
 434 lex_num :: (IfaceToken -> IfM a) ->
 435         (Int -> Int) -> Int# -> IfM a
 436 lex_num cont minus acc# buf =
 437 -- _trace ("lex_num: "++[C# (currentChar# buf)]) $
 438  case scanNumLit (I# acc#) buf of
 439      (acc',buf') ->
 440        case currentChar# buf' of
 441          '.'# ->
 442              -- this case is not optimised at all, as the
 443              -- presence of floating point numbers in interface
 444              -- files is not that common. (ToDo)
 445             case expandWhile (isDigit) (incLexeme buf') of
 446               buf'' -> -- points to first non digit char
 447                 case reads (lexemeToString buf'') of
 448                   [(v,_)] -> cont (ITdouble v) (stepOverLexeme buf'')
 449          _ -> cont (ITinteger (fromInt (minus acc'))) (stepOverLexeme buf')
 450
 451 --         case reads (lexemeToString buf') of
 452 --           [(i,_)] -> cont (ITinteger i) (stepOverLexeme buf')
 453
 454 ------------
 455 lex_keyword cont buf =
 456 -- _trace ("lex_keyword: "++[C# (currentChar# buf)]) $
 457  case currentChar# buf of
 458   ':'# -> case lookAhead# buf 1# of
 459             '_'# -> -- a binding, type (and other id-info) follows,
 460                     -- to make the parser ever so slightly, we push
 461                     --
 462                 lex_decl cont (stepOnBy# buf 2#)
 463             v# -> cont (ITunknown (['_',':',C# v#])) (stepOnBy# buf 2#)
 464   _ ->
 465     case expandWhile (is_kwd_char) buf of
 466      buf' ->
 467       let kw = lexemeToFastString buf' in
 468 --    _trace ("kw: "++lexemeToString buf') $
 469       case lookupUFM ifaceKeywordsFM kw of
 470        Nothing -> cont (ITunknown (_UNPK_ kw)) -- (minor) sigh
 471                   (stepOverLexeme buf')
 472        Just xx -> cont xx (stepOverLexeme buf')
 473
 474 lex_decl cont buf =
 475  case doDiscard False buf of -- spin until ;; is found
 476    buf' ->
 477       {- _trace (show (lexemeToString buf')) $ -}
 478       case currentChar# buf' of
 479        '\n'# -> -- newline, no id info.
 480            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 481                 (stepOverLexeme buf')
 482        '\r'# -> -- just to be sure for those Win* boxes..
 483            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 484                 (stepOverLexeme buf')
 485        '\NUL'# ->
 486            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 487                 (stepOverLexeme buf')
 488        c     -> -- run all over the id info
 489          case doDiscard False (stepOverLexeme buf') of -- spin until ;; is found (outside a string!)
 490            buf'' ->
 491                     --_trace ((C# c):show (lexemeToString (decLexeme buf')))  $
 492                     --_trace (show (lexemeToString (decLexeme buf''))) $
 493                     let idinfo =
 494                             if opt_IgnoreIfacePragmas then
 495                                 Nothing
 496                             else
 497                                 Just (lexemeToBuffer (decLexeme buf''))
 498                         --_trace (show is) $
 499                     in
 500                     cont (ITtysig (lexemeToBuffer (decLexeme buf')) idinfo)
 501                         (stepOverLexeme buf'')
 502
 503 -- ToDo: hammer!
 504 is_kwd_char c@(C# c#) =
 505  isAlphanum c || -- OLD: c `elem` "_@/\\"
 506  (case c# of
 507    '_'#  -> True
 508    '@'#  -> True
 509    '/'#  -> True
 510    '\\'# -> True
 511    _     -> False)
 512
 513
 514
 515 -----------
 516 lex_cstring cont buf =
 517 -- _trace ("lex_cstring: "++[C# (currentChar# buf)]) $
 518  case expandUntilMatch buf "\'\'" of
 519    buf' -> cont (ITstring (lexemeToFastString (setCurrentPos# buf' (negateInt# 2#))))
 520            (stepOverLexeme buf')
 521
 522 -----------
 523 lex_tuple cont module_dot buf =
 524 -- _trace ("lex_tuple: "++[C# (currentChar# buf)]) $
 525   go 2 buf
 526   where
 527    go n buf =
 528     case currentChar# buf of
 529       ','# -> go (n+1) (stepOn buf)
 530       ')'# -> end_lex_id cont module_dot (ITconid (mkTupNameStr n)) (stepOn buf)
 531       _    -> cont (ITunknown ("tuple " ++ show n)) buf
 532
 533 -- Similarly ' itself is ok inside an identifier, but not at the start
 534
 535 -- id_arr is an array of bytes, indexed by characters,
 536 -- containing 0 if the character isn't a valid character from an identifier
 537 -- and 1 if it is.  It's just a memo table for is_id_char.
 538 id_arr :: ByteArray Int
 539 id_arr =
 540  runST (
 541   newCharArray (0,255) >>= \ barr ->
 542   let
 543    loop 256# = return ()
 544    loop i# =
 545     if isAlphanum (C# (chr# i#)) || is_sym (chr# i#) then
 546        writeCharArray barr (I# i#) '\1'         >>
 547        loop (i# +# 1#)
 548     else
 549        writeCharArray barr (I# i#) '\0'         >>
 550        loop (i# +# 1#)
 551   in
 552   loop 0#                                       >>
 553   unsafeFreezeByteArray barr)
 554
 555 is_id_char (C# c#) =
 556  let
 557   ByteArray _ arr# = id_arr
 558  in
 559  case ord# (indexCharArray# arr# (ord# c#)) of
 560   0# -> False
 561   1# -> True
 562
 563 --OLD: is_id_char c@(C# c#)  = isAlphanum c || is_sym c#
 564
 565 is_sym c# =
 566  case c# of {
 567    ':'# -> True; '_'#  -> True; '\''# -> True; '!'# -> True;
 568    '#'# -> True; '$'#  -> True; '%'# -> True;
 569    '&'# -> True; '*'#  -> True; '+'#  -> True; '.'# -> True;
 570    '/'# -> True; '<'#  -> True; '='#  -> True; '>'# -> True;
 571    '?'# -> True; '\\'# -> True; '^'#  -> True; '|'# -> True;
 572    '-'# -> True; '~'#  -> True; '@'#  -> True; _    -> False }
 573
 574 --isAlphanum c || c `elem` ":_'!#$%&*+./<=>?@\\^|-~" -- ToDo: add ISOgraphic
 575
 576
 577 -- mod_arr is an array of bytes, indexed by characters,
 578 -- containing 0 if the character isn't a valid character from a module name,
 579 -- and 1 if it is.
 580 mod_arr :: ByteArray Int
 581 mod_arr =
 582  runST (
 583   newCharArray (0,255) >>= \ barr ->
 584   let
 585    loop 256# = return ()
 586    loop i# =
 587     if isAlphanum (C# (chr# i#)) || i# ==# (ord# '_'#) || i# ==# (ord# '\''#) then
 588        writeCharArray barr (I# i#) '\1'         >>
 589        loop (i# +# 1#)
 590     else
 591        writeCharArray barr (I# i#) '\0'         >>
 592        loop (i# +# 1#)
 593   in
 594   loop 0#                                       >>
 595   unsafeFreezeByteArray barr)
 596
 597
 598 is_mod_char (C# c#) =
 599  let
 600   ByteArray _ arr# = mod_arr
 601  in
 602  case ord# (indexCharArray# arr# (ord# c#)) of
 603   0# -> False
 604   1# -> True
 605
 606 --isAlphanum c || c == '_' || c== '\'' --`elem` "_'"
 607
 608 lex_id cont buf =
 609 -- _trace ("lex_id: "++[C# (currentChar# buf)]) $
 610  case expandWhile (is_mod_char) buf of
 611    buf' ->
 612     case currentChar# buf' of
 613      '.'# -> munch buf' HiFile
 614      '!'# -> munch buf' HiBootFile
 615      _    -> lex_id2 cont Nothing buf'
 616    where
 617     munch buf' hif =
 618         if not (emptyLexeme buf') then
 619 --         _trace ("lex_id: "++(C# (currentChar# (stepOverLexeme buf'))):show (lexemeToFastString buf')) $
 620            case lexemeToFastString buf' of
 621              l@(FastString u# l# ba#) -> lex_id2 cont (Just (FastString u# l# ba#, hif))
 622                                                  (stepOn (stepOverLexeme buf'))
 623         else
 624            lex_id2 cont Nothing buf'
 625
 626
 627 -- Dealt with the Module.part
 628 lex_id2 cont module_dot buf =
 629 -- _trace ("lex_id2: "++[C# (currentChar# buf)]) $
 630  case currentChar# buf of
 631
 632   '['# ->       -- Special case for []
 633     case lookAhead# buf 1# of
 634      ']'# -> end_lex_id cont module_dot (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 635      _    -> lex_id3 cont module_dot buf
 636
 637   '('# ->       -- Special case for (,,,)
 638     case lookAhead# buf 1# of
 639      ')'# -> end_lex_id cont module_dot (ITconid SLIT("()")) (stepOnBy# buf 2#)
 640      ','# -> lex_tuple cont module_dot (stepOnBy# buf 2#)
 641      _    -> lex_id3 cont module_dot buf
 642   ':'# -> lex_id3 cont module_dot (incLexeme buf)
 643   '-'# ->
 644      case module_dot of
 645        Nothing  -> lex_id3 cont module_dot buf
 646        Just ghc -> -- this should be "GHC" (current home of (->))
 647          case lookAhead# buf 1# of
 648           '>'# -> end_lex_id cont module_dot (ITconid SLIT("->"))
 649                         (stepOnBy# buf 2#)
 650           _    -> lex_id3 cont module_dot buf
 651   _    -> lex_id3 cont module_dot buf
 652
 653
 654
 655 -- Dealt with [], (), : special cases
 656
 657 lex_id3 cont module_dot buf =
 658 -- _trace ("lex_id3: "++[C# (currentChar# buf)]) $
 659  case expandWhile (is_id_char) buf of
 660   buf' ->
 661     case module_dot of
 662      Just _ ->
 663        end_lex_id cont module_dot (mk_var_token lexeme) new_buf
 664      Nothing ->
 665        case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of
 666          Just kwd_token -> cont kwd_token new_buf
 667          Nothing        -> cont (mk_var_token lexeme) new_buf
 668     where
 669      lexeme  = lexemeToFastString buf'
 670      new_buf = stepOverLexeme buf'
 671
 672
 673 {- OLD:
 674 lex_id2 module_dot [] ('[': ']': cs) = end_lex_id module_dot (ITconid SLIT("[]")) cs
 675 lex_id2 module_dot [] ('(': ')': cs) = end_lex_id module_dot (ITconid SLIT("()")) cs
 676 lex_id2 module_dot [] ('(': ',': cs) = lex_tuple module_dot cs
 677 lex_id2 module_dot [] (':' : cs)     = lex_id3 module_dot [':'] cs
 678 lex_id2 module_dot xs cs             = lex_id3 module_dot xs cs
 679 -}
 680
 681 -- Dealt with [], (), : special cases
 682
 683 {-
 684 lex_id3 module_dot len_xs xs cs =
 685  case my_span' (is_id_char) cs of
 686    (xs1,len_xs1,rest) ->
 687     case module_dot of
 688      Just m  -> end_lex_id (Just m) (mk_var_token rxs) rest --OLD:_PK_ (reverse xs))) rest
 689      Nothing ->
 690       case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM rxs of
 691        Just kwd_token -> kwd_token          : lexIface rest
 692        other          -> token : lexIface cs end_lex_id Nothing (mk_var_token rxs) rest
 693     where
 694      rxs = packNChars (len_xs+len_xs1) (xs++xs1) -- OLD: _PK_ (reverse xs)
 695 -}
 696 mk_var_token pk_str =
 697      let
 698       f = _HEAD_ pk_str
 699      in
 700      --
 701      -- These tests assume a non- ISO-8859-1 impl of isUpper&isLower,
 702      -- remove the second half of disjunction when using a 1.3 prelude.
 703      --
 704      if      isUpper f    then ITconid pk_str
 705      else if isLower f    then ITvarid pk_str
 706      else if f == ':'     then ITconsym pk_str
 707      else if isLowerISO f then ITvarid pk_str
 708      else if isUpperISO f then ITconid pk_str
 709      else ITvarsym pk_str
 710
 711 {-
 712     mk_var_token xs@(f:_) | isUpper f || isUpperISO f = ITconid n
 713                           | f == ':'              = ITconsym n
 714                           | isAlpha f             = ITvarid n
 715                           | otherwise             = ITvarsym n
 716                 where
 717                       n = _PK_ xs
 718 -}
 719
 720 end_lex_id cont Nothing token buf  = cont token buf
 721 end_lex_id cont (Just (m,hif)) token buf =
 722  case token of
 723    ITconid n  -> cont (ITqconid  (m,n,hif))         buf
 724    ITvarid n  -> cont (ITqvarid  (m,n,hif))         buf
 725    ITconsym n -> cont (ITqconsym (m,n,hif))         buf
 726
 727         -- Special case for ->
 728         -- "->" by itself is a special token (ITrarrow),
 729         -- but M.-> is a ITqconid
 730    ITvarsym n |  n == SLIT("->")
 731               -> cont (ITqconsym (m,n,hif))         buf
 732
 733    ITvarsym n -> cont (ITqvarsym (m,n,hif))         buf
 734
 735 -- ITbang can't happen here I think
 736 --   ITbang     -> cont (ITqvarsym (m,SLIT("!"),hif)) buf
 737
 738    _          -> cont (ITunknown (show token))      buf
 739
 740 ------------
 741 ifaceKeywordsFM :: UniqFM IfaceToken
 742 ifaceKeywordsFM = listToUFM $
 743       map (\ (x,y) -> (_PK_ x,y))
 744        [("/\\_",                ITbiglam)
 745        ,("@_",                  ITatsign)
 746        ,("letrec_",             ITletrec)
 747        ,("interface_",          ITinterface)
 748        ,("usages_",             ITusages)
 749        ,("versions_",           ITversions)
 750        ,("exports_",            ITexports)
 751        ,("instance_modules_",   ITinstance_modules)
 752        ,("instances_",          ITinstances)
 753        ,("fixities_",           ITfixities)
 754        ,("declarations_",       ITdeclarations)
 755        ,("pragmas_",            ITpragmas)
 756        ,("forall_",             ITforall)
 757        ,("u_",                  ITunfold False)
 758        ,("U_",                  ITunfold True)
 759        ,("A_",                  ITarity)
 760        ,("P_",                  ITspecialise)
 761        ,("coerce_",             ITcoerce)
 762        ,("inline_",             ITinline)
 763        ,("bot_",                ITbottom)
 764        ,("integer_",            ITinteger_lit)
 765        ,("rational_",           ITrational_lit)
 766        ,("addr_",               ITaddr_lit)
 767        ,("float_",              ITfloat_lit)
 768        ,("string_",             ITstring_lit)
 769        ,("litlit_",             ITlit_lit)
 770        ,("ccall_",              ITccall (False, False))
 771        ,("ccall_GC_",           ITccall (False, True))
 772        ,("casm_",               ITccall (True,  False))
 773        ,("casm_GC_",            ITccall (True,  True))
 774        ]
 775
 776 haskellKeywordsFM = listToUFM $
 777       map (\ (x,y) -> (_PK_ x,y))
 778       [ ("data",                ITdata)
 779        ,("type",                ITtype)
 780        ,("newtype",             ITnewtype)
 781        ,("class",               ITclass)
 782        ,("where",               ITwhere)
 783        ,("instance",            ITinstance)
 784        ,("infixl",              ITinfixl)
 785        ,("infixr",              ITinfixr)
 786        ,("infix",               ITinfix)
 787        ,("case",                ITcase)
 788        ,("case#",               ITprim_case)
 789        ,("of",                  ITof)
 790        ,("in",                  ITin)
 791        ,("let",                 ITlet)
 792        ,("deriving",            ITderiving)
 793
 794        ,("->",                  ITrarrow)
 795        ,("\\",                  ITlam)
 796        ,("|",                   ITvbar)
 797        ,("!",                   ITbang)
 798        ,("=>",                  ITdarrow)
 799        ,("=",                   ITequal)
 800        ,("::",                  ITdcolon)
 801        ]
 802
 803
 804 -- doDiscard rips along really fast, looking for a double semicolon,
 805 -- indicating the end of the pragma we're skipping
 806 doDiscard inStr buf =
 807 -- _trace (show (C# (currentChar# buf))) $
 808  case currentChar# buf of
 809    ';'# ->
 810      if not inStr then
 811        case lookAhead# buf 1# of
 812         ';'# -> incLexeme (incLexeme buf)
 813         _    -> doDiscard inStr (incLexeme buf)
 814      else
 815        doDiscard inStr (incLexeme buf)
 816    '"'# ->
 817        let
 818         odd_slashes buf flg i# =
 819           case lookAhead# buf i# of
 820            '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
 821            _     -> flg
 822        in
 823        case lookAhead# buf (negateInt# 1#) of --backwards, actually
 824          '\\'# -> -- escaping something..
 825            if odd_slashes buf True (negateInt# 2#) then
 826                -- odd number of slashes, " is escaped.
 827               doDiscard inStr (incLexeme buf)
 828            else
 829                -- even number of slashes, \ is escaped.
 830               doDiscard (not inStr) (incLexeme buf)
 831          _ -> case inStr of -- forced to avoid build-up
 832                True  -> doDiscard False (incLexeme buf)
 833                False -> doDiscard True  (incLexeme buf)
 834    _ -> doDiscard inStr (incLexeme buf)
 835
 836 \end{code}
 837
 838 begin{code}
 839 my_span :: (a -> Bool) -> [a] -> ([a],[a])
 840 my_span p xs = go [] xs
 841   where
 842     go so_far (x:xs') | p x = go (x:so_far) xs'
 843     go so_far xs            = (reverse so_far, xs)
 844
 845 my_span' :: (a -> Bool) -> [a] -> ([a],Int,[a])
 846 my_span' p xs = go [] 0 xs
 847   where
 848     go so_far n (x:xs') | p x = go (x:so_far) (n+1) xs'
 849     go so_far n xs            = (reverse so_far,n, xs)
 850 end{code}
 851
 852
 853 %************************************************************************
 854 %*                                                                      *
 855 \subsection{Other utility functions
 856 %*                                                                      *
 857 %************************************************************************
 858
 859 \begin{code}
 860 type IfM a = StringBuffer       -- Input string
 861           -> SrcLoc
 862           -> MaybeErr a ErrMsg
 863
 864 returnIf   :: a -> IfM a
 865 returnIf a s l = Succeeded a
 866
 867 thenIf     :: IfM a -> (a -> IfM b) -> IfM b
 868 m `thenIf` k = \s l ->
 869         case m s l of
 870                 Succeeded a -> k a s l
 871                 Failed err  -> Failed err
 872
 873 getSrcLocIf :: IfM SrcLoc
 874 getSrcLocIf s l = Succeeded l
 875
 876 happyError :: IfM a
 877 happyError s l = Failed (ifaceParseErr l ([]::[IfaceToken]){-Todo-})
 878
 879
 880 {-
 881  Note that if the file we're processing ends with `hi-boot',
 882  we accept it on faith as having the right version.
 883  This is done so that .hi-boot files  that comes with hsc
 884  don't have to be updated before every release, and it
 885  allows us to share .hi-boot files with versions of hsc
 886  that don't have .hi version checking (e.g., ghc-2.10's)
 887
 888  If the version number is 0, the checking is also turned off.
 889 -}
 890 checkVersion :: Maybe Integer -> IfM ()
 891 checkVersion mb@(Just v) s l
 892  | (v==0) || (v == PROJECTVERSION) = Succeeded ()
 893  | otherwise = Failed (ifaceVersionErr mb l ([]::[IfaceToken]){-Todo-})
 894 checkVersion mb@Nothing  s l
 895  | "hi-boot" `isSuffixOf` (_UNPK_ (srcLocFile l)) = Succeeded ()
 896  | otherwise = Failed (ifaceVersionErr mb l ([]::[IfaceToken]){-Todo-})
 897
 898 -----------------------------------------------------------------
 899
 900 ifaceParseErr l toks
 901   = hsep [ppr l, ptext SLIT("Interface-file parse error;"),
 902           ptext SLIT("toks="), text (show (take 10 toks))]
 903
 904 ifaceVersionErr hi_vers l toks
 905   = hsep [ppr l, ptext SLIT("Interface file version error;"),
 906           ptext SLIT("Expected"), int PROJECTVERSION,
 907           ptext SLIT(" found "), pp_version]
 908     where
 909      pp_version =
 910       case hi_vers of
 911         Nothing -> ptext SLIT("pre ghc-3.02 version")
 912         Just v  -> ptext SLIT("version") <+> integer v
 913
 914 \end{code}