ghc/compiler/reader/Lex.lhs

   1 --------------------------------------------------------
   2 [Jan 98]
   3 There's a known bug in here:
   4
   5         If an interface file ends prematurely, Lex tries to
   6         do headFS of an empty FastString.
   7
   8 An example that provokes the error is
   9
  10         f _:_ _forall_ [a] <<<END OF FILE>>>
  11 --------------------------------------------------------
  12
  13
  14 %
  15 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1996
  16 %
  17 \section[Lexical analysis]{Lexical analysis}
  18
  19 \begin{code}
  20 module Lex (
  21
  22         isLexCon, isLexVar, isLexId, isLexSym,
  23         isLexConId, isLexConSym, isLexVarId, isLexVarSym,
  24         mkTupNameStr, ifaceParseErr,
  25
  26         -- Monad for parser
  27         IfaceToken(..), lexIface, IfM, thenIf, returnIf, getSrcLocIf,
  28         checkVersion,
  29         happyError,
  30         StringBuffer
  31
  32     ) where
  33
  34 #include "HsVersions.h"
  35
  36 import Char             (isDigit, isAlphanum, isUpper,isLower, isSpace, ord )
  37 import List             ( isSuffixOf )
  38
  39 import {-# SOURCE #-} CostCentre
  40
  41 import CmdLineOpts      ( opt_IgnoreIfacePragmas )
  42 import Demand           ( Demand(..) {- instance Read -} )
  43 import UniqFM           ( UniqFM, listToUFM, lookupUFM)
  44 import BasicTypes       ( NewOrData(..), IfaceFlavour(..) )
  45 import SrcLoc           ( SrcLoc, incSrcLine, srcLocFile )
  46
  47 import Maybes           ( MaybeErr(..) )
  48 import ErrUtils         ( ErrMsg(..) )
  49 import Outputable
  50 import Util             ( nOfThem, panic )
  51
  52 import FastString
  53 import StringBuffer
  54 import GlaExts
  55 import ST               ( runST )
  56
  57 import PrelRead                 ( readRational__ ) -- Glasgow non-std
  58 \end{code}
  59
  60 %************************************************************************
  61 %*                                                                      *
  62 \subsection{Lexical categories}
  63 %*                                                                      *
  64 %************************************************************************
  65
  66 These functions test strings to see if they fit the lexical categories
  67 defined in the Haskell report.  Normally applied as in e.g. @isCon
  68 (getLocalName foo)@.
  69
  70 \begin{code}
  71 isLexCon, isLexVar, isLexId, isLexSym, isLexConId, isLexConSym,
  72  isLexVarId, isLexVarSym  :: FAST_STRING -> Bool
  73
  74 isLexCon cs = isLexConId  cs || isLexConSym cs
  75 isLexVar cs = isLexVarId  cs || isLexVarSym cs
  76
  77 isLexId  cs = isLexConId  cs || isLexVarId  cs
  78 isLexSym cs = isLexConSym cs || isLexVarSym cs
  79
  80 -------------
  81
  82 isLexConId cs
  83   | _NULL_ cs        = False
  84   | cs == SLIT("[]") = True
  85   | c  == '('        = True     -- (), (,), (,,), ...
  86   | otherwise        = isUpper c || isUpperISO c
  87   where
  88     c = _HEAD_ cs
  89
  90 isLexVarId cs
  91   | _NULL_ cs    = False
  92   | otherwise    = isLower c || isLowerISO c
  93   where
  94     c = _HEAD_ cs
  95
  96 isLexConSym cs
  97   | _NULL_ cs   = False
  98   | otherwise   = c  == ':'
  99                || cs == SLIT("->")
 100   where
 101     c = _HEAD_ cs
 102
 103 isLexVarSym cs
 104   | _NULL_ cs = False
 105   | otherwise = isSymbolASCII c
 106              || isSymbolISO c
 107   where
 108     c = _HEAD_ cs
 109
 110 -------------
 111 isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
 112 isSymbolISO   c = ord c `elem` (0xd7 : 0xf7 : [0xa1 .. 0xbf])
 113 isUpperISO    (C# c#) = c# `geChar#` '\xc0'# && c# `leChar#` '\xde'# && c# `neChar#` '\xd7'#
 114 --0xc0 <= oc && oc <= 0xde && oc /= 0xd7 where oc = ord c
 115 isLowerISO    (C# c#) = c# `geChar#` '\xdf'# && c# `leChar#` '\xff'# && c# `neChar#` '\xf7'#
 116 --0xdf <= oc && oc <= 0xff && oc /= 0xf7 where oc = ord c
 117 \end{code}
 118
 119
 120 %************************************************************************
 121 %*                                                                      *
 122 \subsection{Tuple strings -- ugh!}
 123 %*                                                                      *
 124 %************************************************************************
 125
 126 \begin{code}
 127 mkTupNameStr 0 = SLIT("()")
 128 mkTupNameStr 1 = panic "Name.mkTupNameStr: 1 ???"
 129 mkTupNameStr 2 = _PK_ "(,)"   -- not strictly necessary
 130 mkTupNameStr 3 = _PK_ "(,,)"  -- ditto
 131 mkTupNameStr 4 = _PK_ "(,,,)" -- ditto
 132 mkTupNameStr n = _PK_ ("(" ++ nOfThem (n-1) ',' ++ ")")
 133 \end{code}
 134
 135
 136
 137 %************************************************************************
 138 %*                                                                      *
 139 \subsection{Data types}
 140 %*                                                                      *
 141 %************************************************************************
 142
 143 The token data type, fairly un-interesting except from two constructors,
 144 @ITidinfo@ and @ITtype@, which are used to lazily lex id info (arity,
 145 strictness, unfolding etc) and types for id decls.
 146
 147 The Idea/Observation here is that the renamer needs to scan through
 148 all of an interface file before it can continue. But only a fraction
 149 of the information contained in the file turns out to be useful, so
 150 delaying as much as possible of the scanning and parsing of an
 151 interface file Makes Sense (Heap profiles of the compiler
 152 show at a reduction in heap usage by at least a factor of two,
 153 post-renamer).
 154
 155 Hence, the interface file lexer spots when value declarations are
 156 being scanned and return the @ITidinfo@ and @ITtype@ constructors
 157 for the type and any other id info for that binding (unfolding, strictness
 158 etc). These constructors are applied to the result of lexing these sub-chunks.
 159
 160 The lexing of the type and id info is all done lazily, of course, so
 161 the scanning (and subsequent parsing) will be done *only* on the ids the
 162 renamer finds out that it is interested in. The rest will just be junked.
 163 Laziness, you know it makes sense :-)
 164
 165 \begin{code}
 166 data IfaceToken
 167   = ITinterface         -- keywords
 168   | ITusages
 169   | ITversions
 170   | ITexports
 171   | ITinstance_modules
 172   | ITinstances
 173   | ITfixities
 174   | ITdeclarations
 175   | ITpragmas
 176   | ITdata
 177   | ITtype
 178   | ITnewtype
 179   | ITderiving
 180   | ITclass
 181   | ITwhere
 182   | ITinstance
 183   | ITinfixl
 184   | ITinfixr
 185   | ITinfix
 186   | ITforall
 187   | ITbang              -- magic symbols
 188   | ITvbar
 189   | ITdcolon
 190   | ITcomma
 191   | ITdarrow
 192   | ITdotdot
 193   | ITequal
 194   | ITocurly
 195   | ITobrack
 196   | IToparen
 197   | ITrarrow
 198   | ITccurly
 199   | ITcbrack
 200   | ITcparen
 201   | ITsemi
 202   | ITvarid   FAST_STRING
 203   | ITconid   FAST_STRING
 204   | ITvarsym  FAST_STRING
 205   | ITconsym  FAST_STRING
 206   | ITqvarid  (FAST_STRING,FAST_STRING,IfaceFlavour)
 207   | ITqconid  (FAST_STRING,FAST_STRING,IfaceFlavour)
 208   | ITqvarsym (FAST_STRING,FAST_STRING,IfaceFlavour)
 209   | ITqconsym (FAST_STRING,FAST_STRING,IfaceFlavour)
 210
 211   | ITtysig StringBuffer (Maybe StringBuffer)
 212                            -- lazily return the stream of tokens for
 213                            -- the info attached to an id.
 214         -- Stuff for reading unfoldings
 215   | ITarity
 216   | ITunfold Bool               -- True <=> there's an INLINE pragma on this Id
 217   | ITstrict [Demand] | ITbottom
 218   | ITspecialise
 219   | ITlam | ITbiglam | ITcase | ITprim_case | ITlet | ITletrec | ITin | ITof
 220   | ITcoerce | ITinline | ITatsign
 221   | ITccall (Bool,Bool)         -- (is_casm, may_gc)
 222   | ITscc CostCentre
 223   | ITchar Char | ITstring FAST_STRING
 224   | ITinteger Integer | ITrational Rational
 225   | ITinteger_lit | ITfloat_lit | ITrational_lit | ITaddr_lit | ITlit_lit | ITstring_lit
 226   | ITunknown String            -- Used when the lexer can't make sense of it
 227   | ITeof                               -- end of file token
 228   deriving Text -- debugging
 229
 230 instance Text CostCentre -- cheat!
 231
 232 \end{code}
 233
 234 %************************************************************************
 235 %*                                                                      *
 236 \subsection{The lexical analyser}
 237 %*                                                                      *
 238 %************************************************************************
 239
 240 \begin{code}
 241 lexIface :: (IfaceToken -> IfM a) -> IfM a
 242 lexIface cont buf =
 243  _scc_ "Lexer"
 244 -- if bufferExhausted buf then
 245 --  []
 246 -- else
 247 --  _trace ("Lexer: "++[C# (currentChar# buf)]) $
 248   case currentChar# buf of
 249       -- whitespace and comments, ignore.
 250     ' '#  -> lexIface cont (stepOn buf)
 251     '\t'# -> lexIface cont (stepOn buf)
 252     '\n'# -> \ loc -> lexIface cont (stepOn buf) (incSrcLine loc)
 253
 254 -- Numbers and comments
 255     '-'#  ->
 256       case lookAhead# buf 1# of
 257         '-'# -> lex_comment cont (stepOnBy# buf 2#)
 258         c    ->
 259           if isDigit (C# c)
 260           then lex_num cont (negate) (ord# c -# ord# '0'#) (incLexeme (incLexeme buf))
 261           else lex_id cont buf
 262
 263 -- Leave out nested comments for now; danger of finding { and - juxtaposed by mistake?
 264 --    '{' : '-' : cs -> lex_nested_comment 1{-one seen-} cs
 265
 266     '('# ->
 267          case prefixMatch (stepOn buf) "..)" of
 268            Just buf' ->  cont ITdotdot (stepOverLexeme buf')
 269            Nothing ->
 270             case lookAhead# buf 1# of
 271               ','# -> lex_tuple cont Nothing  (stepOnBy# buf 2#)
 272               ')'# -> cont (ITconid SLIT("()")) (stepOnBy# buf 2#)
 273               _    -> cont IToparen (stepOn buf)
 274
 275     '{'# -> cont ITocurly (stepOn buf)
 276     '}'# -> cont ITccurly (stepOn buf)
 277     ')'# -> cont ITcparen (stepOn buf)
 278     '['# ->
 279       case lookAhead# buf 1# of
 280         ']'# -> cont (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 281         _    -> cont ITobrack (stepOn buf)
 282     ']'# -> cont ITcbrack (stepOn buf)
 283     ','# -> cont ITcomma  (stepOn buf)
 284     ';'#  -> cont ITsemi (stepOn buf)
 285     '\"'# -> case untilEndOfString# (stepOn buf) of
 286               buf' ->
 287                   -- the string literal does *not* include the dquotes
 288                 case lexemeToFastString buf' of
 289                  v -> cont (ITstring v) (stepOn (stepOverLexeme buf'))
 290
 291     '\''# -> --
 292              -- untilEndOfChar# extends the current lexeme until
 293              -- it hits a non-escaped single quote. The lexeme of the
 294              -- StringBuffer returned does *not* include the closing quote,
 295              -- hence we augment the lexeme and make sure to add the
 296              -- starting quote, before `read'ing the string.
 297              --
 298              case untilEndOfChar# (stepOn buf) of
 299                buf' -> case reads ('\'':lexemeToString (incLexeme buf')) of
 300                         [  (ch, rest)] -> cont (ITchar ch) (stepOverLexeme (incLexeme buf'))
 301
 302 -- ``thingy'' form for casm
 303     '`'# ->
 304             case lookAhead# buf 1# of
 305               '`'# -> lex_cstring cont (stepOnBy# buf 2#) -- remove the `` and go.
 306               _    -> lex_id cont (incLexeme buf)         -- add ` to lexeme and assume
 307                                                      -- scanning an id of some sort.
 308 -- Keywords
 309     '_'# ->
 310          case lookAhead# buf 1# of
 311            'S'# -> case lookAhead# buf 2# of
 312                     '_'# ->
 313                             lex_demand cont (stepOnUntil (not . isSpace)
 314                                             (stepOnBy# buf 3#)) -- past _S_
 315            's'# -> case prefixMatch (stepOnBy# buf 2#) "cc_" of
 316                      Just buf' -> lex_scc cont (stepOnUntil (not . isSpace) (stepOverLexeme buf'))
 317                      Nothing   -> lex_keyword cont (stepOnBy# buf 1#) -- drop the '_' and assume
 318                                                                  -- it is a keyword.
 319            _    -> lex_keyword cont (stepOn buf)
 320
 321     '\NUL'# ->
 322             if bufferExhausted (stepOn buf) then
 323                cont ITeof buf
 324             else
 325                lex_id cont buf
 326     c ->
 327         if isDigit (C# c) then
 328            lex_num cont (id) (ord# c -# ord# '0'#) (incLexeme buf)
 329         else
 330            lex_id cont buf
 331 --  where
 332 lex_comment cont buf =
 333 --   _trace ("comment: "++[C# (currentChar# buf)]) $
 334    case untilChar# buf '\n'# of {buf' -> lexIface cont (stepOverLexeme buf')}
 335
 336 ------------------
 337 lex_demand cont buf =
 338 -- _trace ("demand: "++[C# (currentChar# buf)]) $
 339  case read_em [] buf of { (ls,buf') -> cont (ITstrict ls) (stepOverLexeme buf')}
 340  where
 341    -- code snatched from Demand.lhs
 342   read_em acc buf =
 343 --   _trace ("read_em: "++[C# (currentChar# buf)]) $
 344    case currentChar# buf of
 345     'L'# -> read_em (WwLazy False : acc) (stepOn buf)
 346     'A'# -> read_em (WwLazy True  : acc) (stepOn buf)
 347     'S'# -> read_em (WwStrict     : acc) (stepOn buf)
 348     'P'# -> read_em (WwPrim       : acc) (stepOn buf)
 349     'E'# -> read_em (WwEnum       : acc) (stepOn buf)
 350     ')'# -> (reverse acc, stepOn buf)
 351     'U'# -> do_unpack DataType True  acc (stepOnBy# buf 2#)
 352     'u'# -> do_unpack DataType False acc (stepOnBy# buf 2#)
 353     'N'# -> do_unpack NewType True  acc (stepOnBy# buf 2#)
 354     'n'# -> do_unpack NewType False acc (stepOnBy# buf 2#)
 355     _    -> (reverse acc, buf)
 356
 357   do_unpack new_or_data wrapper_unpacks acc buf
 358    = case read_em [] buf of
 359       (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
 360
 361 ------------------
 362 lex_scc cont buf =
 363 -- _trace ("scc: "++[C# (currentChar# buf)]) $
 364  case currentChar# buf of
 365   '"'# ->
 366       -- YUCK^2
 367      case prefixMatch (stepOn buf) "NO_CC\"" of
 368       Just buf' -> cont (ITscc noCostCentre) (stepOverLexeme buf')
 369       Nothing ->
 370        case prefixMatch (stepOn buf) "CURRENT_CC\"" of
 371         Just buf' -> cont (ITscc useCurrentCostCentre) (stepOverLexeme buf')
 372         Nothing   ->
 373          case prefixMatch (stepOn buf) "OVERHEAD\"" of
 374          Just buf' -> cont (ITscc overheadCostCentre) (stepOverLexeme buf')
 375          Nothing   ->
 376           case prefixMatch (stepOn buf) "DONT_CARE\"" of
 377            Just buf' -> cont (ITscc dontCareCostCentre) (stepOverLexeme buf')
 378            Nothing   ->
 379             case prefixMatch (stepOn buf) "SUBSUMED\"" of
 380              Just buf' -> cont (ITscc subsumedCosts) (stepOverLexeme buf')
 381              Nothing ->
 382               case prefixMatch (stepOn buf) "CAFs_in_...\"" of
 383                Just buf' -> cont (ITscc preludeCafsCostCentre) (stepOverLexeme buf')
 384                Nothing ->
 385                 case prefixMatch (stepOn buf) "CC_CAFs_in_..." of
 386                  Just buf' ->
 387                   case untilChar# (stepOverLexeme buf') '\"'# of
 388                    buf'' -> cont (ITscc (mkAllCafsCC ({-module-}lexemeToFastString buf'') _NIL_)) (stepOn (stepOverLexeme buf''))
 389                  Nothing ->
 390                   case prefixMatch (stepOn buf) "DICTs_in_...\"" of
 391                    Just buf' -> cont (ITscc (preludeDictsCostCentre True)) (stepOverLexeme buf')
 392                    Nothing ->
 393                     case prefixMatch (stepOn buf) "CC_DICTs_in_..." of
 394                      Just buf' ->
 395                       case untilChar# (stepOverLexeme buf') '\"'# of
 396                        buf'' -> cont (ITscc (mkAllDictsCC (lexemeToFastString buf'') _NIL_ True))
 397                                 (stepOn (stepOverLexeme buf''))
 398                      Nothing ->
 399                       let
 400                        match_user_cc buf =
 401                         case untilChar# buf '/'# of
 402                          buf' ->
 403                           let mod_name = lexemeToFastString buf' in
 404 --                        case untilChar# (stepOn (stepOverLexeme buf')) '/'# of
 405 --                         buf'' ->
 406 --                            let grp_name = lexemeToFastString buf'' in
 407                             case untilChar# (stepOn (stepOverLexeme buf')) '\"'# of
 408                              buf'' ->
 409                                -- The label may contain arbitrary characters, so it
 410                                -- may have been escaped etc., hence we `read' it in to get
 411                                -- rid of these meta-chars in the string and then pack it (again.)
 412                                -- ToDo: do the same for module name (single quotes allowed in m-names).
 413                                -- BTW, the code in this module is totally gruesome..
 414                                let upk_label = _UNPK_ (lexemeToFastString buf'') in
 415                                case reads ('"':upk_label++"\"") of
 416                                 ((cc_label,_):_) ->
 417                                     let cc_name = _PK_ cc_label in
 418                                     (mkUserCC cc_name mod_name _NIL_{-grp_name-},
 419                                      stepOn (stepOverLexeme buf''))
 420                                 _ ->
 421                                   trace ("trouble lexing scc label: " ++ upk_label ++ " , ignoring")
 422                                   (mkUserCC _NIL_ mod_name _NIL_{-grp_name-},
 423                                    stepOn (stepOverLexeme buf''))
 424                       in
 425                       case prefixMatch (stepOn buf) "CAF:" of
 426                        Just buf' ->
 427                          case match_user_cc (stepOverLexeme buf') of
 428                           (cc, buf'') -> cont (ITscc (cafifyCC cc)) buf''
 429                        Nothing ->
 430                          case match_user_cc (stepOn buf) of
 431                           (cc, buf'') -> cont (ITscc cc) buf''
 432   c -> cont (ITunknown [C# c]) (stepOn buf)
 433
 434
 435 -----------
 436 lex_num :: (IfaceToken -> IfM a) ->
 437         (Int -> Int) -> Int# -> IfM a
 438 lex_num cont minus acc# buf =
 439 -- _trace ("lex_num: "++[C# (currentChar# buf)]) $
 440  case scanNumLit (I# acc#) buf of
 441      (acc',buf') ->
 442        case currentChar# buf' of
 443          '.'# ->
 444              -- this case is not optimised at all, as the
 445              -- presence of floating point numbers in interface
 446              -- files is not that common. (ToDo)
 447             case expandWhile (isDigit) (incLexeme buf') of
 448               buf2 -> -- points to first non digit char
 449                 let l = case currentChar# buf2 of
 450                           'e'# -> let buf3 = incLexeme buf2 in
 451                               case currentChar# buf3 of
 452                                 '-'# -> expandWhile (isDigit) (incLexeme buf3)
 453                                 _    -> expandWhile (isDigit) buf3
 454                           _ -> buf2
 455                 in let v = readRational__ (lexemeToString l) in
 456                    cont (ITrational v) (stepOverLexeme l)
 457
 458          _ -> cont (ITinteger (fromInt (minus acc'))) (stepOverLexeme buf')
 459
 460
 461
 462 ------------
 463 lex_keyword cont buf =
 464 -- _trace ("lex_keyword: "++[C# (currentChar# buf)]) $
 465  case currentChar# buf of
 466   ':'# -> case lookAhead# buf 1# of
 467             '_'# -> -- a binding, type (and other id-info) follows,
 468                     -- to make the parser ever so slightly, we push
 469                     --
 470                 lex_decl cont (stepOnBy# buf 2#)
 471             v# -> cont (ITunknown (['_',':',C# v#])) (stepOnBy# buf 2#)
 472   _ ->
 473     case expandWhile (is_kwd_char) buf of
 474      buf' ->
 475       let kw = lexemeToFastString buf' in
 476 --    _trace ("kw: "++lexemeToString buf') $
 477       case lookupUFM ifaceKeywordsFM kw of
 478        Nothing -> cont (ITunknown (_UNPK_ kw)) -- (minor) sigh
 479                   (stepOverLexeme buf')
 480        Just xx -> cont xx (stepOverLexeme buf')
 481
 482 lex_decl cont buf =
 483  case doDiscard False buf of -- spin until ;; is found
 484    buf' ->
 485       {- _trace (show (lexemeToString buf')) $ -}
 486       case currentChar# buf' of
 487        '\n'# -> -- newline, no id info.
 488            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 489                 (stepOverLexeme buf')
 490        '\r'# -> -- just to be sure for those Win* boxes..
 491            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 492                 (stepOverLexeme buf')
 493        '\NUL'# ->
 494            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 495                 (stepOverLexeme buf')
 496        c     -> -- run all over the id info
 497          case doDiscard False (stepOverLexeme buf') of -- spin until ;; is found (outside a string!)
 498            buf'' ->
 499                     --_trace ((C# c):show (lexemeToString (decLexeme buf')))  $
 500                     --_trace (show (lexemeToString (decLexeme buf''))) $
 501                     let idinfo =
 502                             if opt_IgnoreIfacePragmas then
 503                                 Nothing
 504                             else
 505                                 Just (lexemeToBuffer (decLexeme buf''))
 506                         --_trace (show is) $
 507                     in
 508                     cont (ITtysig (lexemeToBuffer (decLexeme buf')) idinfo)
 509                         (stepOverLexeme buf'')
 510
 511 -- ToDo: hammer!
 512 is_kwd_char c@(C# c#) =
 513  isAlphanum c || -- OLD: c `elem` "_@/\\"
 514  (case c# of
 515    '_'#  -> True
 516    '@'#  -> True
 517    '/'#  -> True
 518    '\\'# -> True
 519    _     -> False)
 520
 521
 522
 523 -----------
 524 lex_cstring cont buf =
 525 -- _trace ("lex_cstring: "++[C# (currentChar# buf)]) $
 526  case expandUntilMatch buf "\'\'" of
 527    buf' -> cont (ITstring (lexemeToFastString (setCurrentPos# buf' (negateInt# 2#))))
 528            (stepOverLexeme buf')
 529
 530 -----------
 531 lex_tuple cont module_dot buf =
 532 -- _trace ("lex_tuple: "++[C# (currentChar# buf)]) $
 533   go 2 buf
 534   where
 535    go n buf =
 536     case currentChar# buf of
 537       ','# -> go (n+1) (stepOn buf)
 538       ')'# -> end_lex_id cont module_dot (ITconid (mkTupNameStr n)) (stepOn buf)
 539       _    -> cont (ITunknown ("tuple " ++ show n)) buf
 540
 541 -- Similarly ' itself is ok inside an identifier, but not at the start
 542
 543 -- id_arr is an array of bytes, indexed by characters,
 544 -- containing 0 if the character isn't a valid character from an identifier
 545 -- and 1 if it is.  It's just a memo table for is_id_char.
 546 id_arr :: ByteArray Int
 547 id_arr =
 548  runST (
 549   newCharArray (0,255) >>= \ barr ->
 550   let
 551    loop 256# = return ()
 552    loop i# =
 553     if isAlphanum (C# (chr# i#)) || is_sym (chr# i#) then
 554        writeCharArray barr (I# i#) '\1'         >>
 555        loop (i# +# 1#)
 556     else
 557        writeCharArray barr (I# i#) '\0'         >>
 558        loop (i# +# 1#)
 559   in
 560   loop 0#                                       >>
 561   unsafeFreezeByteArray barr)
 562
 563 is_id_char (C# c#) =
 564  let
 565   ByteArray _ arr# = id_arr
 566  in
 567  case ord# (indexCharArray# arr# (ord# c#)) of
 568   0# -> False
 569   1# -> True
 570
 571 --OLD: is_id_char c@(C# c#)  = isAlphanum c || is_sym c#
 572
 573 is_sym c# =
 574  case c# of {
 575    ':'# -> True; '_'#  -> True; '\''# -> True; '!'# -> True;
 576    '#'# -> True; '$'#  -> True; '%'# -> True;
 577    '&'# -> True; '*'#  -> True; '+'#  -> True; '.'# -> True;
 578    '/'# -> True; '<'#  -> True; '='#  -> True; '>'# -> True;
 579    '?'# -> True; '\\'# -> True; '^'#  -> True; '|'# -> True;
 580    '-'# -> True; '~'#  -> True; '@'#  -> True; _    -> False }
 581
 582 --isAlphanum c || c `elem` ":_'!#$%&*+./<=>?@\\^|-~" -- ToDo: add ISOgraphic
 583
 584
 585 -- mod_arr is an array of bytes, indexed by characters,
 586 -- containing 0 if the character isn't a valid character from a module name,
 587 -- and 1 if it is.
 588 mod_arr :: ByteArray Int
 589 mod_arr =
 590  runST (
 591   newCharArray (0,255) >>= \ barr ->
 592   let
 593    loop 256# = return ()
 594    loop i# =
 595     if isAlphanum (C# (chr# i#)) || i# ==# (ord# '_'#) || i# ==# (ord# '\''#) then
 596        writeCharArray barr (I# i#) '\1'         >>
 597        loop (i# +# 1#)
 598     else
 599        writeCharArray barr (I# i#) '\0'         >>
 600        loop (i# +# 1#)
 601   in
 602   loop 0#                                       >>
 603   unsafeFreezeByteArray barr)
 604
 605
 606 is_mod_char (C# c#) =
 607  let
 608   ByteArray _ arr# = mod_arr
 609  in
 610  case ord# (indexCharArray# arr# (ord# c#)) of
 611   0# -> False
 612   1# -> True
 613
 614 --isAlphanum c || c == '_' || c== '\'' --`elem` "_'"
 615
 616 lex_id cont buf =
 617 -- _trace ("lex_id: "++[C# (currentChar# buf)]) $
 618  case expandWhile (is_mod_char) buf of
 619    buf' ->
 620     case currentChar# buf' of
 621      '.'# -> munch buf' HiFile
 622      '!'# -> munch buf' HiBootFile
 623      _    -> lex_id2 cont Nothing buf'
 624    where
 625     munch buf' hif =
 626         if not (emptyLexeme buf') then
 627 --         _trace ("lex_id: "++(C# (currentChar# (stepOverLexeme buf'))):show (lexemeToFastString buf')) $
 628            case lexemeToFastString buf' of
 629              l@(FastString u# l# ba#) -> lex_id2 cont (Just (FastString u# l# ba#, hif))
 630                                                  (stepOn (stepOverLexeme buf'))
 631         else
 632            lex_id2 cont Nothing buf'
 633
 634
 635 -- Dealt with the Module.part
 636 lex_id2 cont module_dot buf =
 637 -- _trace ("lex_id2: "++[C# (currentChar# buf)]) $
 638  case currentChar# buf of
 639
 640   '['# ->       -- Special case for []
 641     case lookAhead# buf 1# of
 642      ']'# -> end_lex_id cont module_dot (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 643      _    -> lex_id3 cont module_dot buf
 644
 645   '('# ->       -- Special case for (,,,)
 646     case lookAhead# buf 1# of
 647      ')'# -> end_lex_id cont module_dot (ITconid SLIT("()")) (stepOnBy# buf 2#)
 648      ','# -> lex_tuple cont module_dot (stepOnBy# buf 2#)
 649      _    -> lex_id3 cont module_dot buf
 650   ':'# -> lex_id3 cont module_dot (incLexeme buf)
 651   '-'# ->
 652      case module_dot of
 653        Nothing  -> lex_id3 cont module_dot buf
 654        Just ghc -> -- this should be "GHC" (current home of (->))
 655          case lookAhead# buf 1# of
 656           '>'# -> end_lex_id cont module_dot (ITconid SLIT("->"))
 657                         (stepOnBy# buf 2#)
 658           _    -> lex_id3 cont module_dot buf
 659   _    -> lex_id3 cont module_dot buf
 660
 661
 662
 663 -- Dealt with [], (), : special cases
 664
 665 lex_id3 cont module_dot buf =
 666 -- _trace ("lex_id3: "++[C# (currentChar# buf)]) $
 667  case expandWhile (is_id_char) buf of
 668   buf' ->
 669     case module_dot of
 670      Just _ ->
 671        end_lex_id cont module_dot (mk_var_token lexeme) new_buf
 672      Nothing ->
 673        case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of
 674          Just kwd_token -> cont kwd_token new_buf
 675          Nothing        -> cont (mk_var_token lexeme) new_buf
 676     where
 677      lexeme  = lexemeToFastString buf'
 678      new_buf = stepOverLexeme buf'
 679
 680
 681 {- OLD:
 682 lex_id2 module_dot [] ('[': ']': cs) = end_lex_id module_dot (ITconid SLIT("[]")) cs
 683 lex_id2 module_dot [] ('(': ')': cs) = end_lex_id module_dot (ITconid SLIT("()")) cs
 684 lex_id2 module_dot [] ('(': ',': cs) = lex_tuple module_dot cs
 685 lex_id2 module_dot [] (':' : cs)     = lex_id3 module_dot [':'] cs
 686 lex_id2 module_dot xs cs             = lex_id3 module_dot xs cs
 687 -}
 688
 689 -- Dealt with [], (), : special cases
 690
 691 {-
 692 lex_id3 module_dot len_xs xs cs =
 693  case my_span' (is_id_char) cs of
 694    (xs1,len_xs1,rest) ->
 695     case module_dot of
 696      Just m  -> end_lex_id (Just m) (mk_var_token rxs) rest --OLD:_PK_ (reverse xs))) rest
 697      Nothing ->
 698       case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM rxs of
 699        Just kwd_token -> kwd_token          : lexIface rest
 700        other          -> token : lexIface cs end_lex_id Nothing (mk_var_token rxs) rest
 701     where
 702      rxs = packNChars (len_xs+len_xs1) (xs++xs1) -- OLD: _PK_ (reverse xs)
 703 -}
 704 mk_var_token pk_str =
 705      let
 706       f = _HEAD_ pk_str
 707      in
 708      --
 709      -- These tests assume a non- ISO-8859-1 impl of isUpper&isLower,
 710      -- remove the second half of disjunction when using a 1.3 prelude.
 711      --
 712      if      isUpper f    then ITconid pk_str
 713      else if isLower f    then ITvarid pk_str
 714      else if f == ':'     then ITconsym pk_str
 715      else if isLowerISO f then ITvarid pk_str
 716      else if isUpperISO f then ITconid pk_str
 717      else ITvarsym pk_str
 718
 719 {-
 720     mk_var_token xs@(f:_) | isUpper f || isUpperISO f = ITconid n
 721                           | f == ':'              = ITconsym n
 722                           | isAlpha f             = ITvarid n
 723                           | otherwise             = ITvarsym n
 724                 where
 725                       n = _PK_ xs
 726 -}
 727
 728 end_lex_id cont Nothing token buf  = cont token buf
 729 end_lex_id cont (Just (m,hif)) token buf =
 730  case token of
 731    ITconid n  -> cont (ITqconid  (m,n,hif))         buf
 732    ITvarid n  -> cont (ITqvarid  (m,n,hif))         buf
 733    ITconsym n -> cont (ITqconsym (m,n,hif))         buf
 734
 735         -- Special case for ->
 736         -- "->" by itself is a special token (ITrarrow),
 737         -- but M.-> is a ITqconid
 738    ITvarsym n |  n == SLIT("->")
 739               -> cont (ITqconsym (m,n,hif))         buf
 740
 741    ITvarsym n -> cont (ITqvarsym (m,n,hif))         buf
 742
 743 -- ITbang can't happen here I think
 744 --   ITbang     -> cont (ITqvarsym (m,SLIT("!"),hif)) buf
 745
 746    _          -> cont (ITunknown (show token))      buf
 747
 748 ------------
 749 ifaceKeywordsFM :: UniqFM IfaceToken
 750 ifaceKeywordsFM = listToUFM $
 751       map (\ (x,y) -> (_PK_ x,y))
 752        [("/\\_",                ITbiglam)
 753        ,("@_",                  ITatsign)
 754        ,("letrec_",             ITletrec)
 755        ,("interface_",          ITinterface)
 756        ,("usages_",             ITusages)
 757        ,("versions_",           ITversions)
 758        ,("exports_",            ITexports)
 759        ,("instance_modules_",   ITinstance_modules)
 760        ,("instances_",          ITinstances)
 761        ,("fixities_",           ITfixities)
 762        ,("declarations_",       ITdeclarations)
 763        ,("pragmas_",            ITpragmas)
 764        ,("forall_",             ITforall)
 765        ,("u_",                  ITunfold False)
 766        ,("U_",                  ITunfold True)
 767        ,("A_",                  ITarity)
 768        ,("P_",                  ITspecialise)
 769        ,("coerce_",             ITcoerce)
 770        ,("inline_",             ITinline)
 771        ,("bot_",                ITbottom)
 772        ,("integer_",            ITinteger_lit)
 773        ,("rational_",           ITrational_lit)
 774        ,("addr_",               ITaddr_lit)
 775        ,("float_",              ITfloat_lit)
 776        ,("string_",             ITstring_lit)
 777        ,("litlit_",             ITlit_lit)
 778        ,("ccall_",              ITccall (False, False))
 779        ,("ccall_GC_",           ITccall (False, True))
 780        ,("casm_",               ITccall (True,  False))
 781        ,("casm_GC_",            ITccall (True,  True))
 782        ]
 783
 784 haskellKeywordsFM = listToUFM $
 785       map (\ (x,y) -> (_PK_ x,y))
 786       [ ("data",                ITdata)
 787        ,("type",                ITtype)
 788        ,("newtype",             ITnewtype)
 789        ,("class",               ITclass)
 790        ,("where",               ITwhere)
 791        ,("instance",            ITinstance)
 792        ,("infixl",              ITinfixl)
 793        ,("infixr",              ITinfixr)
 794        ,("infix",               ITinfix)
 795        ,("case",                ITcase)
 796        ,("case#",               ITprim_case)
 797        ,("of",                  ITof)
 798        ,("in",                  ITin)
 799        ,("let",                 ITlet)
 800        ,("deriving",            ITderiving)
 801
 802        ,("->",                  ITrarrow)
 803        ,("\\",                  ITlam)
 804        ,("|",                   ITvbar)
 805        ,("!",                   ITbang)
 806        ,("=>",                  ITdarrow)
 807        ,("=",                   ITequal)
 808        ,("::",                  ITdcolon)
 809        ]
 810
 811
 812 -- doDiscard rips along really fast, looking for a double semicolon,
 813 -- indicating the end of the pragma we're skipping
 814 doDiscard inStr buf =
 815 -- _trace (show (C# (currentChar# buf))) $
 816  case currentChar# buf of
 817    ';'# ->
 818      if not inStr then
 819        case lookAhead# buf 1# of
 820         ';'# -> incLexeme (incLexeme buf)
 821         _    -> doDiscard inStr (incLexeme buf)
 822      else
 823        doDiscard inStr (incLexeme buf)
 824    '"'# ->
 825        let
 826         odd_slashes buf flg i# =
 827           case lookAhead# buf i# of
 828            '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
 829            _     -> flg
 830        in
 831        case lookAhead# buf (negateInt# 1#) of --backwards, actually
 832          '\\'# -> -- escaping something..
 833            if odd_slashes buf True (negateInt# 2#) then
 834                -- odd number of slashes, " is escaped.
 835               doDiscard inStr (incLexeme buf)
 836            else
 837                -- even number of slashes, \ is escaped.
 838               doDiscard (not inStr) (incLexeme buf)
 839          _ -> case inStr of -- forced to avoid build-up
 840                True  -> doDiscard False (incLexeme buf)
 841                False -> doDiscard True  (incLexeme buf)
 842    _ -> doDiscard inStr (incLexeme buf)
 843
 844 \end{code}
 845
 846 begin{code}
 847 my_span :: (a -> Bool) -> [a] -> ([a],[a])
 848 my_span p xs = go [] xs
 849   where
 850     go so_far (x:xs') | p x = go (x:so_far) xs'
 851     go so_far xs            = (reverse so_far, xs)
 852
 853 my_span' :: (a -> Bool) -> [a] -> ([a],Int,[a])
 854 my_span' p xs = go [] 0 xs
 855   where
 856     go so_far n (x:xs') | p x = go (x:so_far) (n+1) xs'
 857     go so_far n xs            = (reverse so_far,n, xs)
 858 end{code}
 859
 860
 861 %************************************************************************
 862 %*                                                                      *
 863 \subsection{Other utility functions
 864 %*                                                                      *
 865 %************************************************************************
 866
 867 \begin{code}
 868 type IfM a = StringBuffer       -- Input string
 869           -> SrcLoc
 870           -> MaybeErr a ErrMsg
 871
 872 returnIf   :: a -> IfM a
 873 returnIf a s l = Succeeded a
 874
 875 thenIf     :: IfM a -> (a -> IfM b) -> IfM b
 876 m `thenIf` k = \s l ->
 877         case m s l of
 878                 Succeeded a -> k a s l
 879                 Failed err  -> Failed err
 880
 881 getSrcLocIf :: IfM SrcLoc
 882 getSrcLocIf s l = Succeeded l
 883
 884 happyError :: IfM a
 885 happyError s l = Failed (ifaceParseErr l ([]::[IfaceToken]){-Todo-})
 886
 887
 888 {-
 889  Note that if the file we're processing ends with `hi-boot',
 890  we accept it on faith as having the right version.
 891  This is done so that .hi-boot files  that comes with hsc
 892  don't have to be updated before every release, and it
 893  allows us to share .hi-boot files with versions of hsc
 894  that don't have .hi version checking (e.g., ghc-2.10's)
 895
 896  If the version number is 0, the checking is also turned off.
 897 -}
 898 checkVersion :: Maybe Integer -> IfM ()
 899 checkVersion mb@(Just v) s l
 900  | (v==0) || (v == PROJECTVERSION) = Succeeded ()
 901  | otherwise = Failed (ifaceVersionErr mb l ([]::[IfaceToken]){-Todo-})
 902 checkVersion mb@Nothing  s l
 903  | "hi-boot" `isSuffixOf` (_UNPK_ (srcLocFile l)) = Succeeded ()
 904  | otherwise = Failed (ifaceVersionErr mb l ([]::[IfaceToken]){-Todo-})
 905
 906 -----------------------------------------------------------------
 907
 908 ifaceParseErr l toks
 909   = hsep [ppr l, ptext SLIT("Interface-file parse error;"),
 910           ptext SLIT("toks="), text (show (take 10 toks))]
 911
 912 ifaceVersionErr hi_vers l toks
 913   = hsep [ppr l, ptext SLIT("Interface file version error;"),
 914           ptext SLIT("Expected"), int PROJECTVERSION,
 915           ptext SLIT(" found "), pp_version]
 916     where
 917      pp_version =
 918       case hi_vers of
 919         Nothing -> ptext SLIT("pre ghc-3.02 version")
 920         Just v  -> ptext SLIT("version") <+> integer v
 921
 922 \end{code}