ghc/compiler/reader/Lex.lhs

   1 %
   2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1996
   3 %
   4 \section[Lexical analysis]{Lexical analysis}
   5
   6 \begin{code}
   7 #include "HsVersions.h"
   8
   9 module Lex (
  10
  11         isLexCon, isLexVar, isLexId, isLexSym,
  12         isLexConId, isLexConSym, isLexVarId, isLexVarSym,
  13         mkTupNameStr, ifaceParseErr,
  14
  15         -- Monad for parser
  16         IfaceToken(..), lexIface, SYN_IE(IfM), thenIf, returnIf, happyError,
  17         StringBuffer
  18
  19     ) where
  20
  21
  22 IMPORT_1_3(Char(isDigit, isAlpha, isAlphanum, isUpper,isLower, isSpace, ord))
  23
  24 #if defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ <= 201
  25 IMPORT_DELOOPER(Ubiq)
  26 IMPORT_DELOOPER(IdLoop)    -- get the CostCentre type&constructors from here
  27 #else
  28 import {-# SOURCE #-} CostCentre
  29 # if __GLASGOW_HASKELL__ == 202
  30 import PrelBase ( Char(..) )
  31 # endif
  32 #endif
  33
  34 import CmdLineOpts      ( opt_IgnoreIfacePragmas )
  35 import Demand           ( Demand(..) {- instance Read -} )
  36 import UniqFM           ( UniqFM, listToUFM, lookupUFM)
  37 import BasicTypes       ( NewOrData(..), IfaceFlavour(..) )
  38
  39 #if __GLASGOW_HASKELL__ >= 202
  40 import Maybes           ( MaybeErr(..) )
  41 #else
  42 import Maybes           ( Maybe(..), MaybeErr(..) )
  43 #endif
  44 import Pretty
  45
  46
  47
  48 import ErrUtils         ( Error(..) )
  49 import Outputable       ( Outputable(..), PprStyle(..) )
  50 import Util             ( nOfThem, panic )
  51
  52 import FastString
  53 import StringBuffer
  54
  55 #if __GLASGOW_HASKELL__ <= 201
  56 import PreludeGlaST
  57 #else
  58 import GlaExts
  59 #endif
  60 \end{code}
  61
  62 %************************************************************************
  63 %*                                                                      *
  64 \subsection{Lexical categories}
  65 %*                                                                      *
  66 %************************************************************************
  67
  68 These functions test strings to see if they fit the lexical categories
  69 defined in the Haskell report.  Normally applied as in e.g. @isCon
  70 (getLocalName foo)@.
  71
  72 \begin{code}
  73 isLexCon, isLexVar, isLexId, isLexSym, isLexConId, isLexConSym,
  74  isLexVarId, isLexVarSym  :: FAST_STRING -> Bool
  75
  76 isLexCon cs = isLexConId  cs || isLexConSym cs
  77 isLexVar cs = isLexVarId  cs || isLexVarSym cs
  78
  79 isLexId  cs = isLexConId  cs || isLexVarId  cs
  80 isLexSym cs = isLexConSym cs || isLexVarSym cs
  81
  82 -------------
  83
  84 isLexConId cs
  85   | _NULL_ cs        = False
  86   | cs == SLIT("[]") = True
  87   | c  == '('        = True     -- (), (,), (,,), ...
  88   | otherwise        = isUpper c || isUpperISO c
  89   where
  90     c = _HEAD_ cs
  91
  92 isLexVarId cs
  93   | _NULL_ cs    = False
  94   | otherwise    = isLower c || isLowerISO c
  95   where
  96     c = _HEAD_ cs
  97
  98 isLexConSym cs
  99   | _NULL_ cs   = False
 100   | otherwise   = c  == ':'
 101                || cs == SLIT("->")
 102   where
 103     c = _HEAD_ cs
 104
 105 isLexVarSym cs
 106   | _NULL_ cs = False
 107   | otherwise = isSymbolASCII c
 108              || isSymbolISO c
 109   where
 110     c = _HEAD_ cs
 111
 112 -------------
 113 isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
 114 isSymbolISO   c = ord c `elem` (0xd7 : 0xf7 : [0xa1 .. 0xbf])
 115 isUpperISO    (C# c#) = c# `geChar#` '\xc0'# && c# `leChar#` '\xde'# && c# `neChar#` '\xd7'#
 116 --0xc0 <= oc && oc <= 0xde && oc /= 0xd7 where oc = ord c
 117 isLowerISO    (C# c#) = c# `geChar#` '\xdf'# && c# `leChar#` '\xff'# && c# `neChar#` '\xf7'#
 118 --0xdf <= oc && oc <= 0xff && oc /= 0xf7 where oc = ord c
 119 \end{code}
 120
 121
 122 %************************************************************************
 123 %*                                                                      *
 124 \subsection{Tuple strings -- ugh!}
 125 %*                                                                      *
 126 %************************************************************************
 127
 128 \begin{code}
 129 mkTupNameStr 0 = SLIT("()")
 130 mkTupNameStr 1 = panic "Name.mkTupNameStr: 1 ???"
 131 mkTupNameStr 2 = _PK_ "(,)"   -- not strictly necessary
 132 mkTupNameStr 3 = _PK_ "(,,)"  -- ditto
 133 mkTupNameStr 4 = _PK_ "(,,,)" -- ditto
 134 mkTupNameStr n = _PK_ ("(" ++ nOfThem (n-1) ',' ++ ")")
 135 \end{code}
 136
 137
 138
 139 %************************************************************************
 140 %*                                                                      *
 141 \subsection{Data types}
 142 %*                                                                      *
 143 %************************************************************************
 144
 145 The token data type, fairly un-interesting except from two constructors,
 146 @ITidinfo@ and @ITtype@, which are used to lazily lex id info (arity,
 147 strictness, unfolding etc) and types for id decls.
 148
 149 The Idea/Observation here is that the renamer needs to scan through
 150 all of an interface file before it can continue. But only a fraction
 151 of the information contained in the file turns out to be useful, so
 152 delaying as much as possible of the scanning and parsing of an
 153 interface file Makes Sense (Heap profiles of the compiler
 154 show at a reduction in heap usage by at least a factor of two,
 155 post-renamer).
 156
 157 Hence, the interface file lexer spots when value declarations are
 158 being scanned and return the @ITidinfo@ and @ITtype@ constructors
 159 for the type and any other id info for that binding (unfolding, strictness
 160 etc). These constructors are applied to the result of lexing these sub-chunks.
 161
 162 The lexing of the type and id info is all done lazily, of course, so
 163 the scanning (and subsequent parsing) will be done *only* on the ids the
 164 renamer finds out that it is interested in. The rest will just be junked.
 165 Laziness, you know it makes sense :-)
 166
 167 \begin{code}
 168 data IfaceToken
 169   = ITinterface         -- keywords
 170   | ITusages
 171   | ITversions
 172   | ITexports
 173   | ITinstance_modules
 174   | ITinstances
 175   | ITfixities
 176   | ITdeclarations
 177   | ITpragmas
 178   | ITdata
 179   | ITtype
 180   | ITnewtype
 181   | ITderiving
 182   | ITclass
 183   | ITwhere
 184   | ITinstance
 185   | ITinfixl
 186   | ITinfixr
 187   | ITinfix
 188   | ITforall
 189   | ITbang              -- magic symbols
 190   | ITvbar
 191   | ITdcolon
 192   | ITcomma
 193   | ITdarrow
 194   | ITdotdot
 195   | ITequal
 196   | ITocurly
 197   | ITobrack
 198   | IToparen
 199   | ITrarrow
 200   | ITccurly
 201   | ITcbrack
 202   | ITcparen
 203   | ITsemi
 204   | ITvarid   FAST_STRING
 205   | ITconid   FAST_STRING
 206   | ITvarsym  FAST_STRING
 207   | ITconsym  FAST_STRING
 208   | ITqvarid  (FAST_STRING,FAST_STRING,IfaceFlavour)
 209   | ITqconid  (FAST_STRING,FAST_STRING,IfaceFlavour)
 210   | ITqvarsym (FAST_STRING,FAST_STRING,IfaceFlavour)
 211   | ITqconsym (FAST_STRING,FAST_STRING,IfaceFlavour)
 212
 213   | ITidinfo [IfaceToken]  -- lazily return the stream of tokens for
 214                            -- the info attached to an id.
 215   | ITtysig [IfaceToken]   -- lazily return the stream of tokens for
 216                            -- the info attached to an id.
 217         -- Stuff for reading unfoldings
 218   | ITarity | ITstrict
 219   | ITunfold Bool               -- True <=> there's an INLINE pragma on this Id
 220   | ITdemand [Demand] | ITbottom
 221   | ITlam | ITbiglam | ITcase | ITprim_case | ITlet | ITletrec | ITin | ITof
 222   | ITcoerce_in | ITcoerce_out | ITatsign
 223   | ITccall (Bool,Bool)         -- (is_casm, may_gc)
 224   | ITscc CostCentre
 225   | ITchar Char | ITstring FAST_STRING
 226   | ITinteger Integer | ITdouble Double
 227   | ITinteger_lit | ITfloat_lit | ITrational_lit | ITaddr_lit | ITlit_lit | ITstring_lit
 228   | ITunknown String            -- Used when the lexer can't make sense of it
 229   deriving Text -- debugging
 230
 231 instance Text CostCentre -- cheat!
 232
 233 \end{code}
 234
 235 %************************************************************************
 236 %*                                                                      *
 237 \subsection{The lexical analyser}
 238 %*                                                                      *
 239 %************************************************************************
 240
 241 \begin{code}
 242 lexIface :: StringBuffer -> [IfaceToken]
 243 lexIface buf =
 244  _scc_ "Lexer"
 245 -- if bufferExhausted buf then
 246 --  []
 247 -- else
 248 --  _trace ("Lexer: "++[C# (currentChar# buf)]) $
 249   case currentChar# buf of
 250       -- whitespace and comments, ignore.
 251     ' '#  -> lexIface (stepOn buf)
 252     '\t'# -> lexIface (stepOn buf)
 253     '\n'# -> lexIface (stepOn buf)
 254
 255 -- Numbers and comments
 256     '-'#  ->
 257       case lookAhead# buf 1# of
 258         '-'# -> lex_comment (stepOnBy# buf 2#)
 259         c    ->
 260           if isDigit (C# c)
 261           then lex_num (negate) (ord# c -# ord# '0'#) (incLexeme (incLexeme buf))
 262           else lex_id buf
 263
 264 -- Leave out nested comments for now; danger of finding { and - juxtaposed by mistake?
 265 --    '{' : '-' : cs -> lex_nested_comment 1{-one seen-} cs
 266
 267     '('# ->
 268          case prefixMatch (stepOn buf) "..)" of
 269            Just buf' ->  ITdotdot : lexIface (stepOverLexeme buf')
 270            Nothing ->
 271             case lookAhead# buf 1# of
 272               ','# -> lex_tuple Nothing  (stepOnBy# buf 2#)
 273               ')'# -> ITconid SLIT("()") : lexIface (stepOnBy# buf 2#)
 274               _    -> IToparen : lexIface (stepOn buf)
 275
 276     '{'# -> ITocurly : lexIface (stepOn buf)
 277     '}'# -> ITccurly : lexIface (stepOn buf)
 278     ')'# -> ITcparen : lexIface (stepOn buf)
 279     '['# ->
 280       case lookAhead# buf 1# of
 281         ']'# -> ITconid SLIT("[]") : lexIface (stepOnBy# buf 2#)
 282         _    -> ITobrack : lexIface (stepOn buf)
 283     ']'# -> ITcbrack     : lexIface (stepOn buf)
 284     ','# -> ITcomma      : lexIface (stepOn buf)
 285     ':'# -> case lookAhead# buf 1# of
 286               ':'# -> ITdcolon  : lexIface (stepOnBy# buf 2#)
 287               _    -> lex_id (incLexeme buf)
 288     ';'#  -> ITsemi     : lexIface (stepOn buf)
 289     '\"'# -> case untilEndOfString# (stepOn buf) of
 290               buf' ->
 291                   -- the string literal does *not* include the dquotes
 292                 case lexemeToFastString buf' of
 293                  v -> ITstring v : lexIface (stepOn (stepOverLexeme buf'))
 294
 295     '\''# -> --
 296              -- untilEndOfChar# extends the current lexeme until
 297              -- it hits a non-escaped single quote. The lexeme of the
 298              -- StringBuffer returned does *not* include the closing quote,
 299              -- hence we augment the lexeme and make sure to add the
 300              -- starting quote, before `read'ing the string.
 301              --
 302              case untilEndOfChar# (stepOn buf) of
 303                buf' -> case reads ('\'':lexemeToString (incLexeme buf')) of
 304                         [  (ch, rest)] -> ITchar ch : lexIface (stepOverLexeme (incLexeme buf'))
 305
 306 -- ``thingy'' form for casm
 307     '`'# ->
 308             case lookAhead# buf 1# of
 309               '`'# -> lex_cstring (stepOnBy# buf 2#) -- remove the `` and go.
 310               _    -> lex_id (incLexeme buf)         -- add ` to lexeme and assume
 311                                                      -- scanning an id of some sort.
 312 -- Keywords
 313     '_'# ->
 314          case lookAhead# buf 1# of
 315            'S'# -> case lookAhead# buf 2# of
 316                     '_'# -> ITstrict :
 317                             lex_demand (stepOnUntil (not . isSpace)
 318                                                     (stepOnBy# buf 3#)) -- past _S_
 319            's'# -> case prefixMatch (stepOnBy# buf 2#) "cc_" of
 320                      Just buf' -> lex_scc (stepOnUntil (not . isSpace) (stepOverLexeme buf'))
 321                      Nothing   -> lex_keyword (stepOnBy# buf 1#) -- drop the '_' and assume
 322                                                                  -- it is a keyword.
 323            _    -> lex_keyword (stepOn buf)
 324
 325     '\NUL'# ->
 326             if bufferExhausted (stepOn buf) then
 327                []
 328             else
 329                lex_id buf
 330     c ->
 331         if isDigit (C# c) then
 332            lex_num (id) (ord# c -# ord# '0'#) (incLexeme buf)
 333         else
 334            lex_id buf
 335 --  where
 336 lex_comment buf =
 337 --   _trace ("comment: "++[C# (currentChar# buf)]) $
 338    case untilChar# buf '\n'# of {buf' -> lexIface (stepOverLexeme buf')}
 339
 340 ------------------
 341 lex_demand buf =
 342 -- _trace ("demand: "++[C# (currentChar# buf)]) $
 343  case read_em [] buf of { (ls,buf') -> ITdemand ls : lexIface (stepOverLexeme buf')}
 344  where
 345    -- code snatched from Demand.lhs
 346   read_em acc buf =
 347 --   _trace ("read_em: "++[C# (currentChar# buf)]) $
 348    case currentChar# buf of
 349     'L'# -> read_em (WwLazy False : acc) (stepOn buf)
 350     'A'# -> read_em (WwLazy True  : acc) (stepOn buf)
 351     'S'# -> read_em (WwStrict     : acc) (stepOn buf)
 352     'P'# -> read_em (WwPrim       : acc) (stepOn buf)
 353     'E'# -> read_em (WwEnum       : acc) (stepOn buf)
 354     ')'# -> (reverse acc, stepOn buf)
 355     'U'# -> do_unpack DataType True  acc (stepOnBy# buf 2#)
 356     'u'# -> do_unpack DataType False acc (stepOnBy# buf 2#)
 357     'N'# -> do_unpack NewType True  acc (stepOnBy# buf 2#)
 358     'n'# -> do_unpack NewType False acc (stepOnBy# buf 2#)
 359     _    -> (reverse acc, buf)
 360
 361   do_unpack new_or_data wrapper_unpacks acc buf
 362    = case read_em [] buf of
 363       (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
 364
 365 ------------------
 366 lex_scc buf =
 367 -- _trace ("scc: "++[C# (currentChar# buf)]) $
 368  case currentChar# buf of
 369   '"'# ->
 370       -- YUCK^2
 371      case prefixMatch (stepOn buf) "NO_CC\"" of
 372       Just buf' -> ITscc noCostCentre : lexIface (stepOverLexeme buf')
 373       Nothing ->
 374        case prefixMatch (stepOn buf) "CURRENT_CC\"" of
 375         Just buf' -> ITscc useCurrentCostCentre : lexIface (stepOverLexeme buf')
 376         Nothing   ->
 377          case prefixMatch (stepOn buf) "OVERHEAD\"" of
 378          Just buf' -> ITscc overheadCostCentre : lexIface (stepOverLexeme buf')
 379          Nothing   ->
 380           case prefixMatch (stepOn buf) "DONT_CARE\"" of
 381            Just buf' -> ITscc dontCareCostCentre : lexIface (stepOverLexeme buf')
 382            Nothing   ->
 383             case prefixMatch (stepOn buf) "SUBSUMED\"" of
 384              Just buf' -> ITscc subsumedCosts : lexIface (stepOverLexeme buf')
 385              Nothing ->
 386               case prefixMatch (stepOn buf) "CAFs_in_...\"" of
 387                Just buf' -> ITscc preludeCafsCostCentre : lexIface (stepOverLexeme buf')
 388                Nothing ->
 389                 case prefixMatch (stepOn buf) "CC_CAFs_in_..." of
 390                  Just buf' ->
 391                   case untilChar# (stepOverLexeme buf') '\"'# of
 392                    buf'' -> ITscc (mkAllCafsCC ({-module-}lexemeToFastString buf'') _NIL_):
 393                             lexIface (stepOn (stepOverLexeme buf''))
 394                  Nothing ->
 395                   case prefixMatch (stepOn buf) "DICTs_in_...\"" of
 396                    Just buf' -> ITscc (preludeDictsCostCentre True) : lexIface (stepOverLexeme buf')
 397                    Nothing ->
 398                     case prefixMatch (stepOn buf) "CC_DICTs_in_..." of
 399                      Just buf' ->
 400                       case untilChar# (stepOverLexeme buf') '\"'# of
 401                        buf'' -> ITscc (mkAllDictsCC (lexemeToFastString buf'') _NIL_ True):
 402                                 lexIface (stepOn (stepOverLexeme buf''))
 403                      Nothing ->
 404                       let
 405                        match_user_cc buf =
 406                         case untilChar# buf '/'# of
 407                          buf' ->
 408                           let mod_name = lexemeToFastString buf' in
 409 --                        case untilChar# (stepOn (stepOverLexeme buf')) '/'# of
 410 --                         buf'' ->
 411 --                            let grp_name = lexemeToFastString buf'' in
 412                             case untilChar# (stepOn (stepOverLexeme buf')) '\"'# of
 413                              buf'' ->
 414                                let cc_name = lexemeToFastString buf'' in
 415                                (mkUserCC cc_name mod_name _NIL_{-grp_name-},
 416                                 stepOn (stepOverLexeme buf''))
 417                       in
 418                       case prefixMatch (stepOn buf) "CAF:" of
 419                        Just buf' ->
 420                          case match_user_cc (stepOverLexeme buf') of
 421                           (cc, buf'') -> ITscc (cafifyCC cc) : lexIface buf''
 422                        Nothing ->
 423                          case match_user_cc (stepOn buf) of
 424                           (cc, buf'') -> ITscc cc : lexIface buf''
 425   c -> ITunknown [C# c] : lexIface (stepOn buf)
 426
 427
 428 -----------
 429 lex_num :: (Int -> Int) -> Int# -> StringBuffer -> [IfaceToken]
 430 lex_num minus acc# buf =
 431 -- _trace ("lex_num: "++[C# (currentChar# buf)]) $
 432  case scanNumLit (I# acc#) buf of
 433      (acc',buf') ->
 434        case currentChar# buf' of
 435          '.'# ->
 436              -- this case is not optimised at all, as the
 437              -- presence of floating point numbers in interface
 438              -- files is not that common. (ToDo)
 439             case expandWhile (isDigit) (incLexeme buf') of
 440               buf'' -> -- points to first non digit char
 441                 case reads (lexemeToString buf'') of
 442                   [(v,_)] -> ITdouble v : lexIface (stepOverLexeme buf'')
 443          _ -> ITinteger (fromInt (minus acc')) : lexIface (stepOverLexeme buf')
 444
 445 --         case reads (lexemeToString buf') of
 446 --           [(i,_)] -> ITinteger i : lexIface (stepOverLexeme buf')
 447
 448 ------------
 449 lex_keyword buf =
 450 -- _trace ("lex_keyword: "++[C# (currentChar# buf)]) $
 451  case currentChar# buf of
 452   ':'# -> case lookAhead# buf 1# of
 453             '_'# -> -- a binding, type (and other id-info) follows,
 454                     -- to make the parser ever so slightly, we push
 455                     --
 456                 lex_decl (stepOnBy# buf 2#)
 457             v# -> ITunknown (['_',':',C# v#]) : lexIface (stepOnBy# buf 2#)
 458   _ ->
 459     case expandWhile (is_kwd_char) buf of
 460      buf' ->
 461       let kw = lexemeToFastString buf' in
 462 --    _trace ("kw: "++lexemeToString buf') $
 463       case lookupUFM ifaceKeywordsFM kw of
 464        Nothing -> ITunknown (_UNPK_ kw) : -- (minor) sigh
 465                   lexIface (stepOverLexeme buf')
 466        Just xx -> xx : lexIface (stepOverLexeme buf')
 467
 468 lex_decl buf =
 469  case doDiscard False buf of -- spin until ;; is found
 470    buf' ->
 471       {- _trace (show (lexemeToString buf')) $ -}
 472       case currentChar# buf' of
 473        '\n'# -> -- newline, no id info.
 474            ITtysig (lexIface (lexemeToBuffer (decLexeme buf'))) :
 475            lexIface (stepOverLexeme buf')
 476        '\r'# -> -- just to be sure for those Win* boxes..
 477            ITtysig (lexIface (lexemeToBuffer (decLexeme buf'))) :
 478            lexIface (stepOverLexeme buf')
 479        '\NUL'# ->
 480            ITtysig (lexIface (lexemeToBuffer (decLexeme buf'))) :
 481            lexIface (stepOverLexeme buf')
 482        c     -> -- run all over the id info
 483          case doDiscard False (stepOverLexeme buf') of -- spin until ;; is found (outside a string!)
 484            buf'' ->
 485                     --_trace ((C# c):show (lexemeToString (decLexeme buf')))  $
 486                     --_trace (show (lexemeToString (decLexeme buf''))) $
 487                     ITtysig (lexIface (lexemeToBuffer (decLexeme buf'))):
 488                     let ls = lexIface (stepOverLexeme buf'') in
 489                     if opt_IgnoreIfacePragmas then
 490                         ls
 491                     else
 492                         let is = lexIface (lexemeToBuffer (decLexeme buf'')) in
 493                         --_trace (show is) $
 494                         ITidinfo is : ls
 495
 496 -- ToDo: hammer!
 497 is_kwd_char c@(C# c#) =
 498  isAlphanum c || -- OLD: c `elem` "_@/\\"
 499  (case c# of
 500    '_'#  -> True
 501    '@'#  -> True
 502    '/'#  -> True
 503    '\\'# -> True
 504    _     -> False)
 505
 506
 507
 508 -----------
 509 lex_cstring buf =
 510 -- _trace ("lex_cstring: "++[C# (currentChar# buf)]) $
 511  case expandUntilMatch buf "\'\'" of
 512    buf' -> ITstring (lexemeToFastString (setCurrentPos# buf' (negateInt# 2#))) :
 513            lexIface (stepOverLexeme buf')
 514
 515 -----------
 516 lex_tuple module_dot buf =
 517 -- _trace ("lex_tuple: "++[C# (currentChar# buf)]) $
 518   go 2 buf
 519   where
 520    go n buf =
 521     case currentChar# buf of
 522       ','# -> go (n+1) (stepOn buf)
 523       ')'# -> end_lex_id module_dot (ITconid (mkTupNameStr n)) (stepOn buf)
 524       _    -> ITunknown ("tuple " ++ show n) : lexIface buf
 525
 526 -- Similarly ' itself is ok inside an identifier, but not at the start
 527
 528 id_arr :: _ByteArray Int
 529 id_arr =
 530  unsafePerformPrimIO (
 531   newCharArray (0,255) `thenPrimIO` \ barr ->
 532   let
 533    loop 256# = returnPrimIO ()
 534    loop i# =
 535     if isAlphanum (C# (chr# i#)) || is_sym (chr# i#) then
 536        writeCharArray barr (I# i#) '\1' `seqPrimIO`
 537        loop (i# +# 1#)
 538     else
 539        writeCharArray barr (I# i#) '\0' `seqPrimIO`
 540        loop (i# +# 1#)
 541   in
 542   loop 0#                    `seqPrimIO`
 543   unsafeFreezeByteArray barr)
 544
 545 is_id_char (C# c#) =
 546  let
 547   _ByteArray _ arr# = id_arr
 548  in
 549  case ord# (indexCharArray# arr# (ord# c#)) of
 550   0# -> False
 551   1# -> True
 552
 553 --is_id_char c@(C# c#)  = isAlphanum c || is_sym c#
 554
 555 is_sym c#=
 556  case c# of {
 557    ':'# -> True; '_'#  -> True; '\''# -> True; '!'# -> True;
 558    '#'# -> True; '$'#  -> True; ':'#  -> True; '%'# -> True;
 559    '&'# -> True; '*'#  -> True; '+'#  -> True; '.'# -> True;
 560    '/'# -> True; '<'#  -> True; '='#  -> True; '>'# -> True;
 561    '?'# -> True; '\\'# -> True; '^'#  -> True; '|'# -> True;
 562    '-'# -> True; '~'#  -> True; '@'#  -> True; _    -> False }
 563
 564 --isAlphanum c || c `elem` ":_'!#$%&*+./<=>?@\\^|-~" -- ToDo: add ISOgraphic
 565
 566
 567 mod_arr :: _ByteArray Int
 568 mod_arr =
 569  unsafePerformPrimIO (
 570   newCharArray (0,255) `thenPrimIO` \ barr ->
 571   let
 572    loop 256# = returnPrimIO ()
 573    loop i# =
 574     if isAlphanum (C# (chr# i#)) || i# ==# (ord# '_'#) || i# ==# (ord# '\''#) then
 575        writeCharArray barr (I# i#) '\1' `seqPrimIO`
 576        loop (i# +# 1#)
 577     else
 578        writeCharArray barr (I# i#) '\0' `seqPrimIO`
 579        loop (i# +# 1#)
 580   in
 581   loop 0#                    `seqPrimIO`
 582   unsafeFreezeByteArray barr)
 583
 584
 585 is_mod_char (C# c#) =
 586  let
 587   _ByteArray _ arr# = mod_arr
 588  in
 589  case ord# (indexCharArray# arr# (ord# c#)) of
 590   0# -> False
 591   1# -> True
 592
 593 --isAlphanum c || c == '_' || c== '\'' --`elem` "_'"
 594
 595 {-
 596 lex_id cs =
 597  case _scc_ "lex_id.span" my_span' (is_mod_char) cs of
 598    (xs, len, cs') ->
 599     case cs' of
 600      [] -> case xs of
 601             [] -> lex_id2 Nothing cs
 602             _  -> lex_id3 Nothing len xs cs
 603
 604      '.':cs'' ->
 605         case xs of
 606           [] -> lex_id2 Nothing cs
 607           _  ->
 608            let
 609             pk_str = _PK_ (xs::String)
 610             len = lengthPS pk_str
 611            in
 612            if len==len+1 then
 613               error "Well, I never!"
 614            else
 615               lex_id2 (Just pk_str) cs''
 616      _ -> case xs of
 617             [] -> lex_id2 Nothing cs
 618             _  -> lex_id3 Nothing len xs cs'
 619
 620 -}
 621
 622 lex_id buf =
 623 -- _trace ("lex_id: "++[C# (currentChar# buf)]) $
 624  case expandWhile (is_mod_char) buf of
 625    buf' ->
 626     case currentChar# buf' of
 627      '.'# -> munch buf' HiFile
 628      '!'# -> munch buf' HiBootFile
 629      _    -> lex_id2 Nothing buf'
 630    where
 631     munch buf' hif =
 632         if not (emptyLexeme buf') then
 633 --         _trace ("lex_id: "++(C# (currentChar# (stepOverLexeme buf'))):show (lexemeToFastString buf')) $
 634            case lexemeToFastString buf' of
 635              l@(FastString u# l# ba#) -> lex_id2 (Just (FastString u# l# ba#, hif))
 636                                                  (stepOn (stepOverLexeme buf'))
 637         else
 638            lex_id2 Nothing buf'
 639
 640
 641 -- Dealt with the Module.part
 642 lex_id2 module_dot buf =
 643 -- _trace ("lex_id2: "++[C# (currentChar# buf)]) $
 644  case currentChar# buf of
 645   '['# ->
 646     case lookAhead# buf 1# of
 647      ']'# -> end_lex_id module_dot (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 648      _    -> lex_id3 module_dot buf
 649   '('# ->
 650     case lookAhead# buf 1# of
 651      ')'# -> end_lex_id module_dot (ITconid SLIT("()")) (stepOnBy# buf 2#)
 652      ','# -> lex_tuple module_dot (stepOnBy# buf 2#)
 653      _    -> lex_id3 module_dot buf
 654   ':'# -> lex_id3 module_dot (incLexeme buf)
 655   _    -> lex_id3 module_dot buf
 656
 657
 658
 659 -- Dealt with [], (), : special cases
 660
 661 lex_id3 module_dot buf =
 662 -- _trace ("lex_id3: "++[C# (currentChar# buf)]) $
 663  case expandWhile (is_id_char) buf of
 664   buf' ->
 665     case module_dot of
 666      Just _ ->
 667        end_lex_id module_dot (mk_var_token lexeme) (stepOverLexeme buf')
 668      Nothing ->
 669        case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of
 670          Just kwd_token -> kwd_token           : lexIface new_buf
 671          Nothing        -> mk_var_token lexeme : lexIface new_buf
 672     where
 673      lexeme  = lexemeToFastString buf'
 674      new_buf = stepOverLexeme buf'
 675
 676
 677 {- OLD:
 678 lex_id2 module_dot [] ('[': ']': cs) = end_lex_id module_dot (ITconid SLIT("[]")) cs
 679 lex_id2 module_dot [] ('(': ')': cs) = end_lex_id module_dot (ITconid SLIT("()")) cs
 680 lex_id2 module_dot [] ('(': ',': cs) = lex_tuple module_dot cs
 681 lex_id2 module_dot [] (':' : cs)     = lex_id3 module_dot [':'] cs
 682 lex_id2 module_dot xs cs             = lex_id3 module_dot xs cs
 683 -}
 684
 685
 686 -- Dealt with [], (), : special cases
 687
 688 {-
 689 lex_id3 module_dot len_xs xs cs =
 690  case my_span' (is_id_char) cs of
 691    (xs1,len_xs1,rest) ->
 692     case module_dot of
 693      Just m  -> end_lex_id (Just m) (mk_var_token rxs) rest --OLD:_PK_ (reverse xs))) rest
 694      Nothing ->
 695       case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM rxs of
 696        Just kwd_token -> kwd_token          : lexIface rest
 697        other          -> token : lexIface cs end_lex_id Nothing (mk_var_token rxs) rest
 698     where
 699      rxs = packNChars (len_xs+len_xs1) (xs++xs1) -- OLD: _PK_ (reverse xs)
 700 -}
 701 mk_var_token pk_str =
 702      let
 703       f = _HEAD_ pk_str
 704      in
 705      --
 706      -- These tests assume a non- ISO-8859-1 impl of isUpper&isLower,
 707      -- remove the second half of disjunction when using a 1.3 prelude.
 708      --
 709      if      isUpper f    then ITconid pk_str
 710      else if isLower f    then ITvarid pk_str
 711      else if f == ':'     then ITconsym pk_str
 712      else if isLowerISO f then ITvarid pk_str
 713      else if isUpperISO f then ITconid pk_str
 714      else ITvarsym pk_str
 715
 716 {-
 717     mk_var_token xs@(f:_) | isUpper f || isUpperISO f = ITconid n
 718                           | f == ':'              = ITconsym n
 719                           | isAlpha f             = ITvarid n
 720                           | otherwise             = ITvarsym n
 721                 where
 722                       n = _PK_ xs
 723 -}
 724
 725 end_lex_id Nothing token buf  = token : lexIface buf
 726 end_lex_id (Just (m,hif)) token buf =
 727  case token of
 728    ITconid n  -> ITqconid  (m,n,hif)         : lexIface buf
 729    ITvarid n  -> ITqvarid  (m,n,hif)         : lexIface buf
 730    ITconsym n -> ITqconsym (m,n,hif)         : lexIface buf
 731    ITvarsym n -> ITqvarsym (m,n,hif)         : lexIface buf
 732    ITbang     -> ITqvarsym (m,SLIT("!"),hif) : lexIface buf
 733    _          -> ITunknown (show token)      : lexIface buf
 734
 735 ------------
 736 ifaceKeywordsFM :: UniqFM IfaceToken
 737 ifaceKeywordsFM = listToUFM $
 738       map (\ (x,y) -> (_PK_ x,y))
 739        [("/\\_",                ITbiglam)
 740        ,("@_",                  ITatsign)
 741        ,("letrec_",             ITletrec)
 742        ,("interface_",          ITinterface)
 743        ,("usages_",             ITusages)
 744        ,("versions_",           ITversions)
 745        ,("exports_",            ITexports)
 746        ,("instance_modules_",   ITinstance_modules)
 747        ,("instances_",          ITinstances)
 748        ,("fixities_",           ITfixities)
 749        ,("declarations_",       ITdeclarations)
 750        ,("pragmas_",            ITpragmas)
 751        ,("forall_",             ITforall)
 752        ,("U_",                  ITunfold False)
 753        ,("U!_",                 ITunfold True)
 754        ,("A_",                  ITarity)
 755        ,("coerce_in_",          ITcoerce_in)
 756        ,("coerce_out_",         ITcoerce_out)
 757        ,("bot_",                ITbottom)
 758        ,("integer_",            ITinteger_lit)
 759        ,("rational_",           ITrational_lit)
 760        ,("addr_",               ITaddr_lit)
 761        ,("float_",              ITfloat_lit)
 762        ,("string_",             ITstring_lit)
 763        ,("litlit_",             ITlit_lit)
 764        ,("ccall_",              ITccall (False, False))
 765        ,("ccall_GC_",           ITccall (False, True))
 766        ,("casm_",               ITccall (True,  False))
 767        ,("casm_GC_",            ITccall (True,  True))
 768        ]
 769
 770 haskellKeywordsFM = listToUFM $
 771       map (\ (x,y) -> (_PK_ x,y))
 772       [ ("data",                ITdata)
 773        ,("type",                ITtype)
 774        ,("newtype",             ITnewtype)
 775        ,("class",               ITclass)
 776        ,("where",               ITwhere)
 777        ,("instance",            ITinstance)
 778        ,("infixl",              ITinfixl)
 779        ,("infixr",              ITinfixr)
 780        ,("infix",               ITinfix)
 781        ,("case",                ITcase)
 782        ,("case#",               ITprim_case)
 783        ,("of",                  ITof)
 784        ,("in",                  ITin)
 785        ,("let",                 ITlet)
 786        ,("deriving",            ITderiving)
 787
 788        ,("->",                  ITrarrow)
 789        ,("\\",                  ITlam)
 790        ,("|",                   ITvbar)
 791        ,("!",                   ITbang)
 792        ,("=>",                  ITdarrow)
 793        ,("=",                   ITequal)
 794        ]
 795
 796
 797 -- doDiscard rips along really fast looking for a double semicolon,
 798 -- indicating the end of the pragma we're skipping
 799 doDiscard inStr buf =
 800 -- _trace (show (C# (currentChar# buf))) $
 801  case currentChar# buf of
 802    ';'# ->
 803      if not inStr then
 804        case lookAhead# buf 1# of
 805         ';'# -> incLexeme (incLexeme buf)
 806         _    -> doDiscard inStr (incLexeme buf)
 807      else
 808        doDiscard inStr (incLexeme buf)
 809    '"'# ->
 810        let
 811         odd_slashes buf flg i# =
 812           case lookAhead# buf i# of
 813            '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
 814            _     -> flg
 815        in
 816        case lookAhead# buf (negateInt# 1#) of --backwards, actually
 817          '\\'# -> -- escaping something..
 818            if odd_slashes buf True (negateInt# 2#) then
 819                -- odd number of slashes, " is escaped.
 820               doDiscard inStr (incLexeme buf)
 821            else
 822                -- even number of slashes, \ is escaped.
 823               doDiscard (not inStr) (incLexeme buf)
 824          _ -> case inStr of -- forced to avoid build-up
 825                True  -> doDiscard False (incLexeme buf)
 826                False -> doDiscard True  (incLexeme buf)
 827    _ -> doDiscard inStr (incLexeme buf)
 828
 829 \end{code}
 830
 831 begin{code}
 832 my_span :: (a -> Bool) -> [a] -> ([a],[a])
 833 my_span p xs = go [] xs
 834   where
 835     go so_far (x:xs') | p x = go (x:so_far) xs'
 836     go so_far xs            = (reverse so_far, xs)
 837
 838 my_span' :: (a -> Bool) -> [a] -> ([a],Int,[a])
 839 my_span' p xs = go [] 0 xs
 840   where
 841     go so_far n (x:xs') | p x = go (x:so_far) (n+1) xs'
 842     go so_far n xs            = (reverse so_far,n, xs)
 843 end{code}
 844
 845
 846 %************************************************************************
 847 %*                                                                      *
 848 \subsection{Other utility functions
 849 %*                                                                      *
 850 %************************************************************************
 851
 852 \begin{code}
 853 type IfM a = MaybeErr a Error
 854
 855 returnIf   :: a -> IfM a
 856 thenIf     :: IfM a -> (a -> IfM b) -> IfM b
 857 happyError :: Int -> [IfaceToken] -> IfM a
 858
 859 returnIf a = Succeeded a
 860
 861 thenIf (Succeeded a) k = k a
 862 thenIf (Failed  err) _ = Failed err
 863
 864 happyError ln toks = Failed (ifaceParseErr ln toks)
 865
 866 -----------------------------------------------------------------
 867
 868 ifaceParseErr ln toks sty
 869   = hsep [ptext SLIT("Interface-file parse error: line"), int ln, ptext SLIT("toks="), text (show (take 10 toks))]
 870 \end{code}