ghc/compiler/reader/Lex.lhs

   1 %
   2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1996
   3 %
   4 \section[Lexical analysis]{Lexical analysis}
   5
   6 \begin{code}
   7 #include "HsVersions.h"
   8
   9 module Lex (
  10
  11         isLexCon, isLexVar, isLexId, isLexSym,
  12         isLexConId, isLexConSym, isLexVarId, isLexVarSym,
  13         mkTupNameStr, ifaceParseErr,
  14
  15         -- Monad for parser
  16         IfaceToken(..), lexIface, SYN_IE(IfM), thenIf, returnIf, happyError,
  17         StringBuffer
  18
  19     ) where
  20
  21
  22 IMPORT_1_3(Char(isDigit, isAlpha, isAlphanum, isUpper,isLower, isSpace, ord))
  23 IMPORT_DELOOPER(Ubiq)
  24 IMPORT_DELOOPER(IdLoop)    -- get the CostCentre type&constructors from here
  25
  26 import CmdLineOpts      ( opt_IgnoreIfacePragmas )
  27 import Demand           ( Demand(..) {- instance Read -} )
  28 import UniqFM           ( UniqFM, listToUFM, lookupUFM)
  29 --import FiniteMap      ( FiniteMap, listToFM, lookupFM )
  30 #if __GLASGOW_HASKELL__ >= 202
  31 import Maybes           ( MaybeErr(..) )
  32 #else
  33 import Maybes           ( Maybe(..), MaybeErr(..) )
  34 #endif
  35 import Pretty
  36
  37
  38
  39 import ErrUtils         ( Error(..) )
  40 import Outputable       ( Outputable(..) )
  41 import PprStyle         ( PprStyle(..) )
  42 import Util             ( nOfThem, panic )
  43
  44 import FastString
  45 import StringBuffer
  46
  47 #if __GLASGOW_HASKELL__ <= 201
  48 import PreludeGlaST
  49 #else
  50 import GlaExts
  51 #endif
  52 \end{code}
  53
  54 %************************************************************************
  55 %*                                                                      *
  56 \subsection{Lexical categories}
  57 %*                                                                      *
  58 %************************************************************************
  59
  60 These functions test strings to see if they fit the lexical categories
  61 defined in the Haskell report.  Normally applied as in e.g. @isCon
  62 (getLocalName foo)@.
  63
  64 \begin{code}
  65 isLexCon, isLexVar, isLexId, isLexSym, isLexConId, isLexConSym,
  66  isLexVarId, isLexVarSym  :: FAST_STRING -> Bool
  67
  68 isLexCon cs = isLexConId  cs || isLexConSym cs
  69 isLexVar cs = isLexVarId  cs || isLexVarSym cs
  70
  71 isLexId  cs = isLexConId  cs || isLexVarId  cs
  72 isLexSym cs = isLexConSym cs || isLexVarSym cs
  73
  74 -------------
  75
  76 isLexConId cs
  77   | _NULL_ cs        = False
  78   | cs == SLIT("[]") = True
  79   | c  == '('        = True     -- (), (,), (,,), ...
  80   | otherwise        = isUpper c || isUpperISO c
  81   where
  82     c = _HEAD_ cs
  83
  84 isLexVarId cs
  85   | _NULL_ cs    = False
  86   | otherwise    = isLower c || isLowerISO c
  87   where
  88     c = _HEAD_ cs
  89
  90 isLexConSym cs
  91   | _NULL_ cs   = False
  92   | otherwise   = c  == ':'
  93                || cs == SLIT("->")
  94   where
  95     c = _HEAD_ cs
  96
  97 isLexVarSym cs
  98   | _NULL_ cs = False
  99   | otherwise = isSymbolASCII c
 100              || isSymbolISO c
 101   where
 102     c = _HEAD_ cs
 103
 104 -------------
 105 isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
 106 isSymbolISO   c = ord c `elem` (0xd7 : 0xf7 : [0xa1 .. 0xbf])
 107 isUpperISO    (C# c#) = c# `geChar#` '\xc0'# && c# `leChar#` '\xde'# && c# `neChar#` '\xd7'#
 108 --0xc0 <= oc && oc <= 0xde && oc /= 0xd7 where oc = ord c
 109 isLowerISO    (C# c#) = c# `geChar#` '\xdf'# && c# `leChar#` '\xff'# && c# `neChar#` '\xf7'#
 110 --0xdf <= oc && oc <= 0xff && oc /= 0xf7 where oc = ord c
 111 \end{code}
 112
 113
 114 %************************************************************************
 115 %*                                                                      *
 116 \subsection{Tuple strings -- ugh!}
 117 %*                                                                      *
 118 %************************************************************************
 119
 120 \begin{code}
 121 mkTupNameStr 0 = SLIT("()")
 122 mkTupNameStr 1 = panic "Name.mkTupNameStr: 1 ???"
 123 mkTupNameStr 2 = _PK_ "(,)"   -- not strictly necessary
 124 mkTupNameStr 3 = _PK_ "(,,)"  -- ditto
 125 mkTupNameStr 4 = _PK_ "(,,,)" -- ditto
 126 mkTupNameStr n = _PK_ ("(" ++ nOfThem (n-1) ',' ++ ")")
 127 \end{code}
 128
 129
 130
 131 %************************************************************************
 132 %*                                                                      *
 133 \subsection{Data types}
 134 %*                                                                      *
 135 %************************************************************************
 136
 137 The token data type, fairly un-interesting except from two constructors,
 138 @ITidinfo@ and @ITtype@, which are used to lazily lex id info (arity,
 139 strictness, unfolding etc) and types for id decls.
 140
 141 The Idea/Observation here is that the renamer needs to scan through
 142 all of an interface file before it can continue. But only a fraction
 143 of the information contained in the file turns out to be useful, so
 144 delaying as much as possible of the scanning and parsing of an
 145 interface file Makes Sense (Heap profiles of the compiler
 146 show at a reduction in heap usage by at least a factor of two,
 147 post-renamer).
 148
 149 Hence, the interface file lexer spots when value declarations are
 150 being scanned and return the @ITidinfo@ and @ITtype@ constructors
 151 for the type and any other id info for that binding (unfolding, strictness
 152 etc). These constructors are applied to the result of lexing these sub-chunks.
 153
 154 The lexing of the type and id info is all done lazily, of course, so
 155 the scanning (and subsequent parsing) will be done *only* on the ids the
 156 renamer finds out that it is interested in. The rest will just be junked.
 157 Laziness, you know it makes sense :-)
 158
 159 \begin{code}
 160 data IfaceToken
 161   = ITinterface         -- keywords
 162   | ITusages
 163   | ITversions
 164   | ITexports
 165   | ITinstance_modules
 166   | ITinstances
 167   | ITfixities
 168   | ITdeclarations
 169   | ITpragmas
 170   | ITdata
 171   | ITtype
 172   | ITnewtype
 173   | ITderiving
 174   | ITclass
 175   | ITwhere
 176   | ITinstance
 177   | ITinfixl
 178   | ITinfixr
 179   | ITinfix
 180   | ITforall
 181   | ITbang              -- magic symbols
 182   | ITvbar
 183   | ITdcolon
 184   | ITcomma
 185   | ITdarrow
 186   | ITdotdot
 187   | ITequal
 188   | ITocurly
 189   | ITobrack
 190   | IToparen
 191   | ITrarrow
 192   | ITccurly
 193   | ITcbrack
 194   | ITcparen
 195   | ITsemi
 196   | ITvarid   FAST_STRING
 197   | ITconid   FAST_STRING
 198   | ITvarsym  FAST_STRING
 199   | ITconsym  FAST_STRING
 200   | ITqvarid  (FAST_STRING,FAST_STRING)
 201   | ITqconid  (FAST_STRING,FAST_STRING)
 202   | ITqvarsym (FAST_STRING,FAST_STRING)
 203   | ITqconsym (FAST_STRING,FAST_STRING)
 204
 205   | ITidinfo [IfaceToken]  -- lazily return the stream of tokens for
 206                            -- the info attached to an id.
 207   | ITtysig [IfaceToken]   -- lazily return the stream of tokens for
 208                            -- the info attached to an id.
 209         -- Stuff for reading unfoldings
 210   | ITarity | ITstrict | ITunfold
 211   | ITdemand [Demand] | ITbottom
 212   | ITlam | ITbiglam | ITcase | ITprim_case | ITlet | ITletrec | ITin | ITof
 213   | ITcoerce_in | ITcoerce_out | ITatsign
 214   | ITccall (Bool,Bool)         -- (is_casm, may_gc)
 215   | ITscc CostCentre
 216   | ITchar Char | ITstring FAST_STRING
 217   | ITinteger Integer | ITdouble Double
 218   | ITinteger_lit | ITfloat_lit | ITrational_lit | ITaddr_lit | ITlit_lit | ITstring_lit
 219   | ITunknown String            -- Used when the lexer can't make sense of it
 220   deriving Text -- debugging
 221
 222 instance Text CostCentre -- cheat!
 223
 224 \end{code}
 225
 226 %************************************************************************
 227 %*                                                                      *
 228 \subsection{The lexical analyser}
 229 %*                                                                      *
 230 %************************************************************************
 231
 232 \begin{code}
 233 lexIface :: StringBuffer -> [IfaceToken]
 234 lexIface buf =
 235  _scc_ "Lexer"
 236 -- if bufferExhausted buf then
 237 --  []
 238 -- else
 239 --  _trace ("Lexer: "++[C# (currentChar# buf)]) $
 240   case currentChar# buf of
 241       -- whitespace and comments, ignore.
 242     ' '#  -> lexIface (stepOn buf)
 243     '\t'# -> lexIface (stepOn buf)
 244     '\n'# -> lexIface (stepOn buf)
 245
 246 -- Numbers and comments
 247     '-'#  ->
 248       case lookAhead# buf 1# of
 249         '-'# -> lex_comment (stepOnBy# buf 2#)
 250         c    ->
 251           if isDigit (C# c)
 252           then lex_num (negate) (ord# c -# ord# '0'#) (incLexeme (incLexeme buf))
 253           else lex_id buf
 254
 255 -- Leave out nested comments for now; danger of finding { and - juxtaposed by mistake?
 256 --    '{' : '-' : cs -> lex_nested_comment 1{-one seen-} cs
 257
 258     '('# ->
 259          case prefixMatch (stepOn buf) "..)" of
 260            Just buf' ->  ITdotdot : lexIface (stepOverLexeme buf')
 261            Nothing ->
 262             case lookAhead# buf 1# of
 263               ','# -> lex_tuple Nothing  (stepOnBy# buf 2#)
 264               ')'# -> ITconid SLIT("()") : lexIface (stepOnBy# buf 2#)
 265               _    -> IToparen : lexIface (stepOn buf)
 266
 267     '{'# -> ITocurly : lexIface (stepOn buf)
 268     '}'# -> ITccurly : lexIface (stepOn buf)
 269     ')'# -> ITcparen : lexIface (stepOn buf)
 270     '['# ->
 271       case lookAhead# buf 1# of
 272         ']'# -> ITconid SLIT("[]") : lexIface (stepOnBy# buf 2#)
 273         _    -> ITobrack : lexIface (stepOn buf)
 274     ']'# -> ITcbrack     : lexIface (stepOn buf)
 275     ','# -> ITcomma      : lexIface (stepOn buf)
 276     ':'# -> case lookAhead# buf 1# of
 277               ':'# -> ITdcolon  : lexIface (stepOnBy# buf 2#)
 278               _    -> lex_id (incLexeme buf)
 279     ';'#  -> ITsemi     : lexIface (stepOn buf)
 280     '\"'# -> case untilEndOfString# (stepOn buf) of
 281               buf' ->
 282                   -- the string literal does *not* include the dquotes
 283                 case lexemeToFastString buf' of
 284                  v -> ITstring v : lexIface (stepOn (stepOverLexeme buf'))
 285
 286     '\''# -> --
 287              -- untilEndOfChar# extends the current lexeme until
 288              -- it hits a non-escaped single quote. The lexeme of the
 289              -- StringBuffer returned does *not* include the closing quote,
 290              -- hence we augment the lexeme and make sure to add the
 291              -- starting quote, before `read'ing the string.
 292              --
 293              case untilEndOfChar# (stepOn buf) of
 294                buf' -> case reads ('\'':lexemeToString (incLexeme buf')) of
 295                         [  (ch, rest)] -> ITchar ch : lexIface (stepOverLexeme (incLexeme buf'))
 296
 297 -- ``thingy'' form for casm
 298     '`'# ->
 299             case lookAhead# buf 1# of
 300               '`'# -> lex_cstring (stepOnBy# buf 2#) -- remove the `` and go.
 301               _    -> lex_id (incLexeme buf)         -- add ` to lexeme and assume
 302                                                      -- scanning an id of some sort.
 303 -- Keywords
 304     '_'# ->
 305          case lookAhead# buf 1# of
 306            'S'# -> case lookAhead# buf 2# of
 307                     '_'# -> ITstrict :
 308                             lex_demand (stepOnUntil (not . isSpace)
 309                                                     (stepOnBy# buf 3#)) -- past _S_
 310            's'# -> case prefixMatch (stepOnBy# buf 2#) "cc_" of
 311                      Just buf' -> lex_scc (stepOnUntil (not . isSpace) (stepOverLexeme buf'))
 312                      Nothing   -> lex_keyword (stepOnBy# buf 1#) -- drop the '_' and assume
 313                                                                  -- it is a keyword.
 314            _    -> lex_keyword (stepOn buf)
 315
 316     '\NUL'# ->
 317             if bufferExhausted (stepOn buf) then
 318                []
 319             else
 320                lex_id buf
 321     c ->
 322         if isDigit (C# c) then
 323            lex_num (id) (ord# c -# ord# '0'#) (incLexeme buf)
 324         else
 325            lex_id buf
 326 --  where
 327 lex_comment buf =
 328 --   _trace ("comment: "++[C# (currentChar# buf)]) $
 329    case untilChar# buf '\n'# of {buf' -> lexIface (stepOverLexeme buf')}
 330
 331 ------------------
 332 lex_demand buf =
 333 -- _trace ("demand: "++[C# (currentChar# buf)]) $
 334  case read_em [] buf of { (ls,buf') -> ITdemand ls : lexIface (stepOverLexeme buf')}
 335  where
 336    -- code snatched from Demand.lhs
 337   read_em acc buf =
 338 --   _trace ("read_em: "++[C# (currentChar# buf)]) $
 339    case currentChar# buf of
 340     'L'# -> read_em (WwLazy False : acc) (stepOn buf)
 341     'A'# -> read_em (WwLazy True  : acc) (stepOn buf)
 342     'S'# -> read_em (WwStrict     : acc) (stepOn buf)
 343     'P'# -> read_em (WwPrim       : acc) (stepOn buf)
 344     'E'# -> read_em (WwEnum       : acc) (stepOn buf)
 345     ')'# -> (reverse acc, stepOn buf)
 346     'U'# -> do_unpack True  acc (stepOnBy# buf 2#)
 347     'u'# -> do_unpack False acc (stepOnBy# buf 2#)
 348     _    -> (reverse acc, buf)
 349
 350   do_unpack wrapper_unpacks acc buf
 351    = case read_em [] buf of
 352       (stuff, rest) -> read_em (WwUnpack wrapper_unpacks stuff : acc) rest
 353
 354 ------------------
 355 lex_scc buf =
 356 -- _trace ("scc: "++[C# (currentChar# buf)]) $
 357  case currentChar# buf of
 358   '"'# ->
 359       -- YUCK^2
 360      case prefixMatch (stepOn buf) "NO_CC\"" of
 361       Just buf' -> ITscc noCostCentre : lexIface (stepOverLexeme buf')
 362       Nothing ->
 363        case prefixMatch (stepOn buf) "CURRENT_CC\"" of
 364         Just buf' -> ITscc useCurrentCostCentre : lexIface (stepOverLexeme buf')
 365         Nothing   ->
 366          case prefixMatch (stepOn buf) "OVERHEAD\"" of
 367          Just buf' -> ITscc overheadCostCentre : lexIface (stepOverLexeme buf')
 368          Nothing   ->
 369           case prefixMatch (stepOn buf) "DONT_CARE\"" of
 370            Just buf' -> ITscc dontCareCostCentre : lexIface (stepOverLexeme buf')
 371            Nothing   ->
 372             case prefixMatch (stepOn buf) "SUBSUMED\"" of
 373              Just buf' -> ITscc subsumedCosts : lexIface (stepOverLexeme buf')
 374              Nothing ->
 375               case prefixMatch (stepOn buf) "CAFs_in_...\"" of
 376                Just buf' -> ITscc preludeCafsCostCentre : lexIface (stepOverLexeme buf')
 377                Nothing ->
 378                 case prefixMatch (stepOn buf) "CC_CAFs_in_..." of
 379                  Just buf' ->
 380                   case untilChar# (stepOverLexeme buf') '\"'# of
 381                    buf'' -> ITscc (mkAllCafsCC ({-module-}lexemeToFastString buf'') _NIL_):
 382                             lexIface (stepOn (stepOverLexeme buf''))
 383                  Nothing ->
 384                   case prefixMatch (stepOn buf) "DICTs_in_...\"" of
 385                    Just buf' -> ITscc (preludeDictsCostCentre True) : lexIface (stepOverLexeme buf')
 386                    Nothing ->
 387                     case prefixMatch (stepOn buf) "CC_DICTs_in_..." of
 388                      Just buf' ->
 389                       case untilChar# (stepOverLexeme buf') '\"'# of
 390                        buf'' -> ITscc (mkAllDictsCC (lexemeToFastString buf'') _NIL_ True):
 391                                 lexIface (stepOn (stepOverLexeme buf''))
 392                      Nothing ->
 393                       let
 394                        match_user_cc buf =
 395                         case untilChar# buf '/'# of
 396                          buf' ->
 397                           let mod_name = lexemeToFastString buf' in
 398                           case untilChar# (stepOn (stepOverLexeme buf')) '/'# of
 399                            buf'' ->
 400                             let grp_name = lexemeToFastString buf'' in
 401                             case untilChar# (stepOn (stepOverLexeme buf'')) '\"'# of
 402                              buf''' ->
 403                                let cc_name = lexemeToFastString buf''' in
 404                                (mkUserCC cc_name mod_name grp_name,
 405                                 stepOn (stepOverLexeme buf'''))
 406                       in
 407                       case prefixMatch (stepOn buf) "CAF:" of
 408                        Just buf' ->
 409                          case match_user_cc (stepOverLexeme buf') of
 410                           (cc, buf'') -> ITscc (cafifyCC cc) : lexIface buf''
 411                        Nothing ->
 412                          case match_user_cc (stepOn buf) of
 413                           (cc, buf'') -> ITscc cc : lexIface buf''
 414   c -> ITunknown [C# c] : lexIface (stepOn buf)
 415
 416
 417 -----------
 418 lex_num :: (Int -> Int) -> Int# -> StringBuffer -> [IfaceToken]
 419 lex_num minus acc# buf =
 420 -- _trace ("lex_num: "++[C# (currentChar# buf)]) $
 421  case scanNumLit (I# acc#) buf of
 422      (acc',buf') ->
 423        case currentChar# buf' of
 424          '.'# ->
 425              -- this case is not optimised at all, as the
 426              -- presence of floating point numbers in interface
 427              -- files is not that common. (ToDo)
 428             case expandWhile (isDigit) (incLexeme buf') of
 429               buf'' -> -- points to first non digit char
 430                 case reads (lexemeToString buf'') of
 431                   [(v,_)] -> ITdouble v : lexIface (stepOverLexeme buf'')
 432          _ -> ITinteger (fromInt (minus acc')) : lexIface (stepOverLexeme buf')
 433
 434 --         case reads (lexemeToString buf') of
 435 --           [(i,_)] -> ITinteger i : lexIface (stepOverLexeme buf')
 436
 437 ------------
 438 lex_keyword buf =
 439 -- _trace ("lex_keyword: "++[C# (currentChar# buf)]) $
 440  case currentChar# buf of
 441   ':'# -> case lookAhead# buf 1# of
 442             '_'# -> -- a binding, type (and other id-info) follows,
 443                     -- to make the parser ever so slightly, we push
 444                     --
 445                 lex_decl (stepOnBy# buf 2#)
 446             v# -> ITunknown (['_',':',C# v#]) : lexIface (stepOnBy# buf 2#)
 447   _ ->
 448     case expandWhile (is_kwd_char) buf of
 449      buf' ->
 450       let kw = lexemeToFastString buf' in
 451 --    _trace ("kw: "++lexemeToString buf') $
 452       case lookupUFM ifaceKeywordsFM kw of
 453        Nothing -> ITunknown (_UNPK_ kw) : -- (minor) sigh
 454                   lexIface (stepOverLexeme buf')
 455        Just xx -> xx : lexIface (stepOverLexeme buf')
 456
 457 lex_decl buf =
 458  case doDiscard False buf of -- spin until ;; is found
 459    buf' ->
 460       {- _trace (show (lexemeToString buf')) $ -}
 461       case currentChar# buf' of
 462        '\n'# -> -- newline, no id info.
 463            ITtysig (lexIface (lexemeToBuffer (decLexeme buf'))) :
 464            lexIface (stepOverLexeme buf')
 465        '\r'# -> -- just to be sure for those Win* boxes..
 466            ITtysig (lexIface (lexemeToBuffer (decLexeme buf'))) :
 467            lexIface (stepOverLexeme buf')
 468        '\NUL'# ->
 469            ITtysig (lexIface (lexemeToBuffer (decLexeme buf'))) :
 470            lexIface (stepOverLexeme buf')
 471        c     -> -- run all over the id info
 472          case doDiscard False (stepOverLexeme buf') of -- spin until ;; is found (outside a string!)
 473            buf'' ->
 474                     --_trace ((C# c):show (lexemeToString (decLexeme buf')))  $
 475                     --_trace (show (lexemeToString (decLexeme buf''))) $
 476                     ITtysig (lexIface (lexemeToBuffer (decLexeme buf'))):
 477                     let ls = lexIface (stepOverLexeme buf'') in
 478                     if opt_IgnoreIfacePragmas then
 479                         ls
 480                     else
 481                         let is = lexIface (lexemeToBuffer (decLexeme buf'')) in
 482                         --_trace (show is) $
 483                         ITidinfo is : ls
 484
 485 -- ToDo: hammer!
 486 is_kwd_char c@(C# c#) =
 487  isAlphanum c || -- OLD: c `elem` "_@/\\"
 488  (case c# of
 489    '_'#  -> True
 490    '@'#  -> True
 491    '/'#  -> True
 492    '\\'# -> True
 493    _     -> False)
 494
 495
 496
 497 -----------
 498 lex_cstring buf =
 499 -- _trace ("lex_cstring: "++[C# (currentChar# buf)]) $
 500  case expandUntilMatch buf "\'\'" of
 501    buf' -> ITstring (lexemeToFastString (setCurrentPos# buf' (negateInt# 2#))) :
 502            lexIface (stepOverLexeme buf')
 503
 504 -----------
 505 lex_tuple module_dot buf =
 506 -- _trace ("lex_tuple: "++[C# (currentChar# buf)]) $
 507   go 2 buf
 508   where
 509    go n buf =
 510     case currentChar# buf of
 511       ','# -> go (n+1) (stepOn buf)
 512       ')'# -> end_lex_id module_dot (ITconid (mkTupNameStr n)) (stepOn buf)
 513       _    -> ITunknown ("tuple " ++ show n) : lexIface buf
 514
 515 -- Similarly ' itself is ok inside an identifier, but not at the start
 516
 517 id_arr :: _ByteArray Int
 518 id_arr =
 519  unsafePerformPrimIO (
 520   newCharArray (0,255) `thenPrimIO` \ barr ->
 521   let
 522    loop 256# = returnPrimIO ()
 523    loop i# =
 524     if isAlphanum (C# (chr# i#)) || is_sym (chr# i#) then
 525        writeCharArray barr (I# i#) '\1' `seqPrimIO`
 526        loop (i# +# 1#)
 527     else
 528        writeCharArray barr (I# i#) '\0' `seqPrimIO`
 529        loop (i# +# 1#)
 530   in
 531   loop 0#                    `seqPrimIO`
 532   unsafeFreezeByteArray barr)
 533
 534 is_id_char (C# c#) =
 535  let
 536   _ByteArray _ arr# = id_arr
 537  in
 538  case ord# (indexCharArray# arr# (ord# c#)) of
 539   0# -> False
 540   1# -> True
 541
 542 --is_id_char c@(C# c#)  = isAlphanum c || is_sym c#
 543
 544 is_sym c#=
 545  case c# of {
 546    ':'# -> True; '_'#  -> True; '\''# -> True; '!'# -> True;
 547    '#'# -> True; '$'#  -> True; ':'#  -> True; '%'# -> True;
 548    '&'# -> True; '*'#  -> True; '+'#  -> True; '.'# -> True;
 549    '/'# -> True; '<'#  -> True; '='#  -> True; '>'# -> True;
 550    '?'# -> True; '\\'# -> True; '^'#  -> True; '|'# -> True;
 551    '-'# -> True; '~'#  -> True; '@'#  -> True; _    -> False }
 552
 553 --isAlphanum c || c `elem` ":_'!#$%&*+./<=>?@\\^|-~" -- ToDo: add ISOgraphic
 554
 555
 556 mod_arr :: _ByteArray Int
 557 mod_arr =
 558  unsafePerformPrimIO (
 559   newCharArray (0,255) `thenPrimIO` \ barr ->
 560   let
 561    loop 256# = returnPrimIO ()
 562    loop i# =
 563     if isAlphanum (C# (chr# i#)) || i# ==# (ord# '_'#) || i# ==# (ord# '\''#) then
 564        writeCharArray barr (I# i#) '\1' `seqPrimIO`
 565        loop (i# +# 1#)
 566     else
 567        writeCharArray barr (I# i#) '\0' `seqPrimIO`
 568        loop (i# +# 1#)
 569   in
 570   loop 0#                    `seqPrimIO`
 571   unsafeFreezeByteArray barr)
 572
 573
 574 is_mod_char (C# c#) =
 575  let
 576   _ByteArray _ arr# = mod_arr
 577  in
 578  case ord# (indexCharArray# arr# (ord# c#)) of
 579   0# -> False
 580   1# -> True
 581
 582 --isAlphanum c || c == '_' || c== '\'' --`elem` "_'"
 583
 584 {-
 585 lex_id cs =
 586  case _scc_ "lex_id.span" my_span' (is_mod_char) cs of
 587    (xs, len, cs') ->
 588     case cs' of
 589      [] -> case xs of
 590             [] -> lex_id2 Nothing cs
 591             _  -> lex_id3 Nothing len xs cs
 592
 593      '.':cs'' ->
 594         case xs of
 595           [] -> lex_id2 Nothing cs
 596           _  ->
 597            let
 598             pk_str = _PK_ (xs::String)
 599             len = lengthPS pk_str
 600            in
 601            if len==len+1 then
 602               error "Well, I never!"
 603            else
 604               lex_id2 (Just pk_str) cs''
 605      _ -> case xs of
 606             [] -> lex_id2 Nothing cs
 607             _  -> lex_id3 Nothing len xs cs'
 608
 609 -}
 610
 611 lex_id buf =
 612 -- _trace ("lex_id: "++[C# (currentChar# buf)]) $
 613  case expandWhile (is_mod_char) buf of
 614    buf' ->
 615     case currentChar# buf' of
 616      '.'# ->
 617         if not (emptyLexeme buf') then
 618 --         _trace ("lex_id: "++(C# (currentChar# (stepOverLexeme buf'))):show (lexemeToFastString buf')) $
 619            case lexemeToFastString buf' of
 620              l@(FastString u# l# ba#) -> lex_id2 (Just (FastString u# l# ba#))
 621                                                  (stepOn (stepOverLexeme buf'))
 622         else
 623            lex_id2 Nothing buf'
 624      _  -> lex_id2 Nothing buf'
 625
 626 -- Dealt with the Module.part
 627 lex_id2 module_dot buf =
 628 -- _trace ("lex_id2: "++[C# (currentChar# buf)]) $
 629  case currentChar# buf of
 630   '['# ->
 631     case lookAhead# buf 1# of
 632      ']'# -> end_lex_id module_dot (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 633      _    -> lex_id3 module_dot buf
 634   '('# ->
 635     case lookAhead# buf 1# of
 636      ')'# -> end_lex_id module_dot (ITconid SLIT("()")) (stepOnBy# buf 2#)
 637      ','# -> lex_tuple module_dot (stepOnBy# buf 2#)
 638      _    -> lex_id3 module_dot buf
 639   ':'# -> lex_id3 module_dot (incLexeme buf)
 640   _    -> lex_id3 module_dot buf
 641
 642
 643
 644 -- Dealt with [], (), : special cases
 645
 646 lex_id3 module_dot buf =
 647 -- _trace ("lex_id3: "++[C# (currentChar# buf)]) $
 648  case expandWhile (is_id_char) buf of
 649   buf' ->
 650     case module_dot of
 651      Just _ ->
 652        end_lex_id module_dot (mk_var_token lexeme) (stepOverLexeme buf')
 653      Nothing ->
 654        case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of
 655          Just kwd_token -> kwd_token           : lexIface new_buf
 656          Nothing        -> mk_var_token lexeme : lexIface new_buf
 657     where
 658      lexeme  = lexemeToFastString buf'
 659      new_buf = stepOverLexeme buf'
 660
 661
 662 {- OLD:
 663 lex_id2 module_dot [] ('[': ']': cs) = end_lex_id module_dot (ITconid SLIT("[]")) cs
 664 lex_id2 module_dot [] ('(': ')': cs) = end_lex_id module_dot (ITconid SLIT("()")) cs
 665 lex_id2 module_dot [] ('(': ',': cs) = lex_tuple module_dot cs
 666 lex_id2 module_dot [] (':' : cs)     = lex_id3 module_dot [':'] cs
 667 lex_id2 module_dot xs cs             = lex_id3 module_dot xs cs
 668 -}
 669
 670
 671 -- Dealt with [], (), : special cases
 672
 673 {-
 674 lex_id3 module_dot len_xs xs cs =
 675  case my_span' (is_id_char) cs of
 676    (xs1,len_xs1,rest) ->
 677     case module_dot of
 678      Just m  -> end_lex_id (Just m) (mk_var_token rxs) rest --OLD:_PK_ (reverse xs))) rest
 679      Nothing ->
 680       case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM rxs of
 681        Just kwd_token -> kwd_token          : lexIface rest
 682        other          -> token : lexIface cs end_lex_id Nothing (mk_var_token rxs) rest
 683     where
 684      rxs = packNChars (len_xs+len_xs1) (xs++xs1) -- OLD: _PK_ (reverse xs)
 685 -}
 686 mk_var_token pk_str =
 687      let
 688       f = _HEAD_ pk_str
 689      in
 690      --
 691      -- These tests assume a non- ISO-8859-1 impl of isUpper&isLower,
 692      -- remove the second half of disjunction when using a 1.3 prelude.
 693      --
 694      if      isUpper f    then ITconid pk_str
 695      else if isLower f    then ITvarid pk_str
 696      else if f == ':'     then ITconsym pk_str
 697      else if isLowerISO f then ITvarid pk_str
 698      else if isUpperISO f then ITconid pk_str
 699      else ITvarsym pk_str
 700
 701 {-
 702     mk_var_token xs@(f:_) | isUpper f || isUpperISO f = ITconid n
 703                           | f == ':'              = ITconsym n
 704                           | isAlpha f             = ITvarid n
 705                           | otherwise             = ITvarsym n
 706                 where
 707                       n = _PK_ xs
 708 -}
 709
 710 end_lex_id Nothing token buf  = token : lexIface buf
 711 end_lex_id (Just m) token buf =
 712  case token of
 713    ITconid n  -> ITqconid  (m,n)         : lexIface buf
 714    ITvarid n  -> ITqvarid  (m,n)         : lexIface buf
 715    ITconsym n -> ITqconsym (m,n)         : lexIface buf
 716    ITvarsym n -> ITqvarsym (m,n)         : lexIface buf
 717    ITbang     -> ITqvarsym (m,SLIT("!")) : lexIface buf
 718    _          -> ITunknown (show token)  : lexIface buf
 719
 720 ------------
 721 ifaceKeywordsFM :: UniqFM IfaceToken
 722 ifaceKeywordsFM = listToUFM $
 723       map (\ (x,y) -> (_PK_ x,y))
 724        [("/\\_",                ITbiglam)
 725        ,("@_",                  ITatsign)
 726        ,("letrec_",             ITletrec)
 727        ,("interface_",          ITinterface)
 728        ,("usages_",             ITusages)
 729        ,("versions_",           ITversions)
 730        ,("exports_",            ITexports)
 731        ,("instance_modules_",   ITinstance_modules)
 732        ,("instances_",          ITinstances)
 733        ,("fixities_",           ITfixities)
 734        ,("declarations_",       ITdeclarations)
 735        ,("pragmas_",            ITpragmas)
 736        ,("forall_",             ITforall)
 737        ,("U_",                  ITunfold)
 738        ,("A_",                  ITarity)
 739        ,("coerce_in_",          ITcoerce_in)
 740        ,("coerce_out_",         ITcoerce_out)
 741        ,("bot_",                ITbottom)
 742        ,("integer_",            ITinteger_lit)
 743        ,("rational_",           ITrational_lit)
 744        ,("addr_",               ITaddr_lit)
 745        ,("float_",              ITfloat_lit)
 746        ,("string_",             ITstring_lit)
 747        ,("litlit_",             ITlit_lit)
 748        ,("ccall_",              ITccall (False, False))
 749        ,("ccall_GC_",           ITccall (False, True))
 750        ,("casm_",               ITccall (True,  False))
 751        ,("casm_GC_",            ITccall (True,  True))
 752        ]
 753
 754 haskellKeywordsFM = listToUFM $
 755       map (\ (x,y) -> (_PK_ x,y))
 756       [ ("data",                ITdata)
 757        ,("type",                ITtype)
 758        ,("newtype",             ITnewtype)
 759        ,("class",               ITclass)
 760        ,("where",               ITwhere)
 761        ,("instance",            ITinstance)
 762        ,("infixl",              ITinfixl)
 763        ,("infixr",              ITinfixr)
 764        ,("infix",               ITinfix)
 765        ,("case",                ITcase)
 766        ,("case#",               ITprim_case)
 767        ,("of",                  ITof)
 768        ,("in",                  ITin)
 769        ,("let",                 ITlet)
 770        ,("deriving",            ITderiving)
 771
 772        ,("->",                  ITrarrow)
 773        ,("\\",                  ITlam)
 774        ,("|",                   ITvbar)
 775        ,("!",                   ITbang)
 776        ,("=>",                  ITdarrow)
 777        ,("=",                   ITequal)
 778        ]
 779
 780
 781 -- doDiscard rips along really fast looking for a double semicolon,
 782 -- indicating the end of the pragma we're skipping
 783 doDiscard inStr buf =
 784 -- _trace (show (C# (currentChar# buf))) $
 785  case currentChar# buf of
 786    ';'# ->
 787      if not inStr then
 788        case lookAhead# buf 1# of
 789         ';'# -> incLexeme (incLexeme buf)
 790         _    -> doDiscard inStr (incLexeme buf)
 791      else
 792        doDiscard inStr (incLexeme buf)
 793    '"'# ->
 794        let
 795         odd_slashes buf flg i# =
 796           case lookAhead# buf i# of
 797            '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
 798            _     -> flg
 799        in
 800        case lookAhead# buf (negateInt# 1#) of --backwards, actually
 801          '\\'# -> -- escaping something..
 802            if odd_slashes buf True (negateInt# 2#) then
 803                -- odd number of slashes, " is escaped.
 804               doDiscard inStr (incLexeme buf)
 805            else
 806                -- even number of slashes, \ is escaped.
 807               doDiscard (not inStr) (incLexeme buf)
 808          _ -> case inStr of -- forced to avoid build-up
 809                True  -> doDiscard False (incLexeme buf)
 810                False -> doDiscard True  (incLexeme buf)
 811    _ -> doDiscard inStr (incLexeme buf)
 812
 813 \end{code}
 814
 815 begin{code}
 816 my_span :: (a -> Bool) -> [a] -> ([a],[a])
 817 my_span p xs = go [] xs
 818   where
 819     go so_far (x:xs') | p x = go (x:so_far) xs'
 820     go so_far xs            = (reverse so_far, xs)
 821
 822 my_span' :: (a -> Bool) -> [a] -> ([a],Int,[a])
 823 my_span' p xs = go [] 0 xs
 824   where
 825     go so_far n (x:xs') | p x = go (x:so_far) (n+1) xs'
 826     go so_far n xs            = (reverse so_far,n, xs)
 827 end{code}
 828
 829
 830 %************************************************************************
 831 %*                                                                      *
 832 \subsection{Other utility functions
 833 %*                                                                      *
 834 %************************************************************************
 835
 836 \begin{code}
 837 type IfM a = MaybeErr a Error
 838
 839 returnIf   :: a -> IfM a
 840 thenIf     :: IfM a -> (a -> IfM b) -> IfM b
 841 happyError :: Int -> [IfaceToken] -> IfM a
 842
 843 returnIf a = Succeeded a
 844
 845 thenIf (Succeeded a) k = k a
 846 thenIf (Failed  err) _ = Failed err
 847
 848 happyError ln toks = Failed (ifaceParseErr ln toks)
 849
 850 -----------------------------------------------------------------
 851
 852 ifaceParseErr ln toks sty
 853   = hsep [ptext SLIT("Interface-file parse error: line"), int ln, ptext SLIT("toks="), text (show (take 10 toks))]
 854 \end{code}