ghc/compiler/reader/Lex.lhs

   1 %
   2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1996
   3 %
   4 \section[Lexical analysis]{Lexical analysis}
   5
   6 \begin{code}
   7 #include "HsVersions.h"
   8
   9 module Lex (
  10
  11         isLexCon, isLexVar, isLexId, isLexSym,
  12         isLexConId, isLexConSym, isLexVarId, isLexVarSym,
  13         mkTupNameStr, ifaceParseErr,
  14
  15         -- Monad for parser
  16         IfaceToken(..), lexIface, SYN_IE(IfM), thenIf, returnIf, happyError,
  17         StringBuffer
  18
  19     ) where
  20
  21
  22 IMPORT_1_3(Char(isDigit, isAlpha, isAlphanum, isUpper,isLower, isSpace, ord))
  23 IMPORT_DELOOPER(Ubiq)
  24 IMPORT_DELOOPER(IdLoop)    -- get the CostCentre type&constructors from here
  25
  26 import CmdLineOpts      ( opt_IgnoreIfacePragmas )
  27 import Demand           ( Demand(..) {- instance Read -} )
  28 import UniqFM           ( UniqFM, listToUFM, lookupUFM)
  29 import BasicTypes       ( NewOrData(..) )
  30
  31 #if __GLASGOW_HASKELL__ >= 202
  32 import Maybes           ( MaybeErr(..) )
  33 #else
  34 import Maybes           ( Maybe(..), MaybeErr(..) )
  35 #endif
  36 import Pretty
  37
  38
  39
  40 import ErrUtils         ( Error(..) )
  41 import Outputable       ( Outputable(..), PprStyle(..) )
  42 import Util             ( nOfThem, panic )
  43
  44 import FastString
  45 import StringBuffer
  46
  47 #if __GLASGOW_HASKELL__ <= 201
  48 import PreludeGlaST
  49 #else
  50 import GlaExts
  51 #endif
  52 \end{code}
  53
  54 %************************************************************************
  55 %*                                                                      *
  56 \subsection{Lexical categories}
  57 %*                                                                      *
  58 %************************************************************************
  59
  60 These functions test strings to see if they fit the lexical categories
  61 defined in the Haskell report.  Normally applied as in e.g. @isCon
  62 (getLocalName foo)@.
  63
  64 \begin{code}
  65 isLexCon, isLexVar, isLexId, isLexSym, isLexConId, isLexConSym,
  66  isLexVarId, isLexVarSym  :: FAST_STRING -> Bool
  67
  68 isLexCon cs = isLexConId  cs || isLexConSym cs
  69 isLexVar cs = isLexVarId  cs || isLexVarSym cs
  70
  71 isLexId  cs = isLexConId  cs || isLexVarId  cs
  72 isLexSym cs = isLexConSym cs || isLexVarSym cs
  73
  74 -------------
  75
  76 isLexConId cs
  77   | _NULL_ cs        = False
  78   | cs == SLIT("[]") = True
  79   | c  == '('        = True     -- (), (,), (,,), ...
  80   | otherwise        = isUpper c || isUpperISO c
  81   where
  82     c = _HEAD_ cs
  83
  84 isLexVarId cs
  85   | _NULL_ cs    = False
  86   | otherwise    = isLower c || isLowerISO c
  87   where
  88     c = _HEAD_ cs
  89
  90 isLexConSym cs
  91   | _NULL_ cs   = False
  92   | otherwise   = c  == ':'
  93                || cs == SLIT("->")
  94   where
  95     c = _HEAD_ cs
  96
  97 isLexVarSym cs
  98   | _NULL_ cs = False
  99   | otherwise = isSymbolASCII c
 100              || isSymbolISO c
 101   where
 102     c = _HEAD_ cs
 103
 104 -------------
 105 isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
 106 isSymbolISO   c = ord c `elem` (0xd7 : 0xf7 : [0xa1 .. 0xbf])
 107 isUpperISO    (C# c#) = c# `geChar#` '\xc0'# && c# `leChar#` '\xde'# && c# `neChar#` '\xd7'#
 108 --0xc0 <= oc && oc <= 0xde && oc /= 0xd7 where oc = ord c
 109 isLowerISO    (C# c#) = c# `geChar#` '\xdf'# && c# `leChar#` '\xff'# && c# `neChar#` '\xf7'#
 110 --0xdf <= oc && oc <= 0xff && oc /= 0xf7 where oc = ord c
 111 \end{code}
 112
 113
 114 %************************************************************************
 115 %*                                                                      *
 116 \subsection{Tuple strings -- ugh!}
 117 %*                                                                      *
 118 %************************************************************************
 119
 120 \begin{code}
 121 mkTupNameStr 0 = SLIT("()")
 122 mkTupNameStr 1 = panic "Name.mkTupNameStr: 1 ???"
 123 mkTupNameStr 2 = _PK_ "(,)"   -- not strictly necessary
 124 mkTupNameStr 3 = _PK_ "(,,)"  -- ditto
 125 mkTupNameStr 4 = _PK_ "(,,,)" -- ditto
 126 mkTupNameStr n = _PK_ ("(" ++ nOfThem (n-1) ',' ++ ")")
 127 \end{code}
 128
 129
 130
 131 %************************************************************************
 132 %*                                                                      *
 133 \subsection{Data types}
 134 %*                                                                      *
 135 %************************************************************************
 136
 137 The token data type, fairly un-interesting except from two constructors,
 138 @ITidinfo@ and @ITtype@, which are used to lazily lex id info (arity,
 139 strictness, unfolding etc) and types for id decls.
 140
 141 The Idea/Observation here is that the renamer needs to scan through
 142 all of an interface file before it can continue. But only a fraction
 143 of the information contained in the file turns out to be useful, so
 144 delaying as much as possible of the scanning and parsing of an
 145 interface file Makes Sense (Heap profiles of the compiler
 146 show at a reduction in heap usage by at least a factor of two,
 147 post-renamer).
 148
 149 Hence, the interface file lexer spots when value declarations are
 150 being scanned and return the @ITidinfo@ and @ITtype@ constructors
 151 for the type and any other id info for that binding (unfolding, strictness
 152 etc). These constructors are applied to the result of lexing these sub-chunks.
 153
 154 The lexing of the type and id info is all done lazily, of course, so
 155 the scanning (and subsequent parsing) will be done *only* on the ids the
 156 renamer finds out that it is interested in. The rest will just be junked.
 157 Laziness, you know it makes sense :-)
 158
 159 \begin{code}
 160 data IfaceToken
 161   = ITinterface         -- keywords
 162   | ITusages
 163   | ITversions
 164   | ITexports
 165   | ITinstance_modules
 166   | ITinstances
 167   | ITfixities
 168   | ITdeclarations
 169   | ITpragmas
 170   | ITdata
 171   | ITtype
 172   | ITnewtype
 173   | ITderiving
 174   | ITclass
 175   | ITwhere
 176   | ITinstance
 177   | ITinfixl
 178   | ITinfixr
 179   | ITinfix
 180   | ITforall
 181   | ITbang              -- magic symbols
 182   | ITvbar
 183   | ITdcolon
 184   | ITcomma
 185   | ITdarrow
 186   | ITdotdot
 187   | ITequal
 188   | ITocurly
 189   | ITobrack
 190   | IToparen
 191   | ITrarrow
 192   | ITccurly
 193   | ITcbrack
 194   | ITcparen
 195   | ITsemi
 196   | ITvarid   FAST_STRING
 197   | ITconid   FAST_STRING
 198   | ITvarsym  FAST_STRING
 199   | ITconsym  FAST_STRING
 200   | ITqvarid  (FAST_STRING,FAST_STRING)
 201   | ITqconid  (FAST_STRING,FAST_STRING)
 202   | ITqvarsym (FAST_STRING,FAST_STRING)
 203   | ITqconsym (FAST_STRING,FAST_STRING)
 204
 205   | ITidinfo [IfaceToken]  -- lazily return the stream of tokens for
 206                            -- the info attached to an id.
 207   | ITtysig [IfaceToken]   -- lazily return the stream of tokens for
 208                            -- the info attached to an id.
 209         -- Stuff for reading unfoldings
 210   | ITarity | ITstrict
 211   | ITunfold Bool               -- True <=> there's an INLINE pragma on this Id
 212   | ITdemand [Demand] | ITbottom
 213   | ITlam | ITbiglam | ITcase | ITprim_case | ITlet | ITletrec | ITin | ITof
 214   | ITcoerce_in | ITcoerce_out | ITatsign
 215   | ITccall (Bool,Bool)         -- (is_casm, may_gc)
 216   | ITscc CostCentre
 217   | ITchar Char | ITstring FAST_STRING
 218   | ITinteger Integer | ITdouble Double
 219   | ITinteger_lit | ITfloat_lit | ITrational_lit | ITaddr_lit | ITlit_lit | ITstring_lit
 220   | ITunknown String            -- Used when the lexer can't make sense of it
 221   deriving Text -- debugging
 222
 223 instance Text CostCentre -- cheat!
 224
 225 \end{code}
 226
 227 %************************************************************************
 228 %*                                                                      *
 229 \subsection{The lexical analyser}
 230 %*                                                                      *
 231 %************************************************************************
 232
 233 \begin{code}
 234 lexIface :: StringBuffer -> [IfaceToken]
 235 lexIface buf =
 236  _scc_ "Lexer"
 237 -- if bufferExhausted buf then
 238 --  []
 239 -- else
 240 --  _trace ("Lexer: "++[C# (currentChar# buf)]) $
 241   case currentChar# buf of
 242       -- whitespace and comments, ignore.
 243     ' '#  -> lexIface (stepOn buf)
 244     '\t'# -> lexIface (stepOn buf)
 245     '\n'# -> lexIface (stepOn buf)
 246
 247 -- Numbers and comments
 248     '-'#  ->
 249       case lookAhead# buf 1# of
 250         '-'# -> lex_comment (stepOnBy# buf 2#)
 251         c    ->
 252           if isDigit (C# c)
 253           then lex_num (negate) (ord# c -# ord# '0'#) (incLexeme (incLexeme buf))
 254           else lex_id buf
 255
 256 -- Leave out nested comments for now; danger of finding { and - juxtaposed by mistake?
 257 --    '{' : '-' : cs -> lex_nested_comment 1{-one seen-} cs
 258
 259     '('# ->
 260          case prefixMatch (stepOn buf) "..)" of
 261            Just buf' ->  ITdotdot : lexIface (stepOverLexeme buf')
 262            Nothing ->
 263             case lookAhead# buf 1# of
 264               ','# -> lex_tuple Nothing  (stepOnBy# buf 2#)
 265               ')'# -> ITconid SLIT("()") : lexIface (stepOnBy# buf 2#)
 266               _    -> IToparen : lexIface (stepOn buf)
 267
 268     '{'# -> ITocurly : lexIface (stepOn buf)
 269     '}'# -> ITccurly : lexIface (stepOn buf)
 270     ')'# -> ITcparen : lexIface (stepOn buf)
 271     '['# ->
 272       case lookAhead# buf 1# of
 273         ']'# -> ITconid SLIT("[]") : lexIface (stepOnBy# buf 2#)
 274         _    -> ITobrack : lexIface (stepOn buf)
 275     ']'# -> ITcbrack     : lexIface (stepOn buf)
 276     ','# -> ITcomma      : lexIface (stepOn buf)
 277     ':'# -> case lookAhead# buf 1# of
 278               ':'# -> ITdcolon  : lexIface (stepOnBy# buf 2#)
 279               _    -> lex_id (incLexeme buf)
 280     ';'#  -> ITsemi     : lexIface (stepOn buf)
 281     '\"'# -> case untilEndOfString# (stepOn buf) of
 282               buf' ->
 283                   -- the string literal does *not* include the dquotes
 284                 case lexemeToFastString buf' of
 285                  v -> ITstring v : lexIface (stepOn (stepOverLexeme buf'))
 286
 287     '\''# -> --
 288              -- untilEndOfChar# extends the current lexeme until
 289              -- it hits a non-escaped single quote. The lexeme of the
 290              -- StringBuffer returned does *not* include the closing quote,
 291              -- hence we augment the lexeme and make sure to add the
 292              -- starting quote, before `read'ing the string.
 293              --
 294              case untilEndOfChar# (stepOn buf) of
 295                buf' -> case reads ('\'':lexemeToString (incLexeme buf')) of
 296                         [  (ch, rest)] -> ITchar ch : lexIface (stepOverLexeme (incLexeme buf'))
 297
 298 -- ``thingy'' form for casm
 299     '`'# ->
 300             case lookAhead# buf 1# of
 301               '`'# -> lex_cstring (stepOnBy# buf 2#) -- remove the `` and go.
 302               _    -> lex_id (incLexeme buf)         -- add ` to lexeme and assume
 303                                                      -- scanning an id of some sort.
 304 -- Keywords
 305     '_'# ->
 306          case lookAhead# buf 1# of
 307            'S'# -> case lookAhead# buf 2# of
 308                     '_'# -> ITstrict :
 309                             lex_demand (stepOnUntil (not . isSpace)
 310                                                     (stepOnBy# buf 3#)) -- past _S_
 311            's'# -> case prefixMatch (stepOnBy# buf 2#) "cc_" of
 312                      Just buf' -> lex_scc (stepOnUntil (not . isSpace) (stepOverLexeme buf'))
 313                      Nothing   -> lex_keyword (stepOnBy# buf 1#) -- drop the '_' and assume
 314                                                                  -- it is a keyword.
 315            _    -> lex_keyword (stepOn buf)
 316
 317     '\NUL'# ->
 318             if bufferExhausted (stepOn buf) then
 319                []
 320             else
 321                lex_id buf
 322     c ->
 323         if isDigit (C# c) then
 324            lex_num (id) (ord# c -# ord# '0'#) (incLexeme buf)
 325         else
 326            lex_id buf
 327 --  where
 328 lex_comment buf =
 329 --   _trace ("comment: "++[C# (currentChar# buf)]) $
 330    case untilChar# buf '\n'# of {buf' -> lexIface (stepOverLexeme buf')}
 331
 332 ------------------
 333 lex_demand buf =
 334 -- _trace ("demand: "++[C# (currentChar# buf)]) $
 335  case read_em [] buf of { (ls,buf') -> ITdemand ls : lexIface (stepOverLexeme buf')}
 336  where
 337    -- code snatched from Demand.lhs
 338   read_em acc buf =
 339 --   _trace ("read_em: "++[C# (currentChar# buf)]) $
 340    case currentChar# buf of
 341     'L'# -> read_em (WwLazy False : acc) (stepOn buf)
 342     'A'# -> read_em (WwLazy True  : acc) (stepOn buf)
 343     'S'# -> read_em (WwStrict     : acc) (stepOn buf)
 344     'P'# -> read_em (WwPrim       : acc) (stepOn buf)
 345     'E'# -> read_em (WwEnum       : acc) (stepOn buf)
 346     ')'# -> (reverse acc, stepOn buf)
 347     'U'# -> do_unpack DataType True  acc (stepOnBy# buf 2#)
 348     'u'# -> do_unpack DataType False acc (stepOnBy# buf 2#)
 349     'N'# -> do_unpack NewType True  acc (stepOnBy# buf 2#)
 350     'n'# -> do_unpack NewType False acc (stepOnBy# buf 2#)
 351     _    -> (reverse acc, buf)
 352
 353   do_unpack new_or_data wrapper_unpacks acc buf
 354    = case read_em [] buf of
 355       (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
 356
 357 ------------------
 358 lex_scc buf =
 359 -- _trace ("scc: "++[C# (currentChar# buf)]) $
 360  case currentChar# buf of
 361   '"'# ->
 362       -- YUCK^2
 363      case prefixMatch (stepOn buf) "NO_CC\"" of
 364       Just buf' -> ITscc noCostCentre : lexIface (stepOverLexeme buf')
 365       Nothing ->
 366        case prefixMatch (stepOn buf) "CURRENT_CC\"" of
 367         Just buf' -> ITscc useCurrentCostCentre : lexIface (stepOverLexeme buf')
 368         Nothing   ->
 369          case prefixMatch (stepOn buf) "OVERHEAD\"" of
 370          Just buf' -> ITscc overheadCostCentre : lexIface (stepOverLexeme buf')
 371          Nothing   ->
 372           case prefixMatch (stepOn buf) "DONT_CARE\"" of
 373            Just buf' -> ITscc dontCareCostCentre : lexIface (stepOverLexeme buf')
 374            Nothing   ->
 375             case prefixMatch (stepOn buf) "SUBSUMED\"" of
 376              Just buf' -> ITscc subsumedCosts : lexIface (stepOverLexeme buf')
 377              Nothing ->
 378               case prefixMatch (stepOn buf) "CAFs_in_...\"" of
 379                Just buf' -> ITscc preludeCafsCostCentre : lexIface (stepOverLexeme buf')
 380                Nothing ->
 381                 case prefixMatch (stepOn buf) "CC_CAFs_in_..." of
 382                  Just buf' ->
 383                   case untilChar# (stepOverLexeme buf') '\"'# of
 384                    buf'' -> ITscc (mkAllCafsCC ({-module-}lexemeToFastString buf'') _NIL_):
 385                             lexIface (stepOn (stepOverLexeme buf''))
 386                  Nothing ->
 387                   case prefixMatch (stepOn buf) "DICTs_in_...\"" of
 388                    Just buf' -> ITscc (preludeDictsCostCentre True) : lexIface (stepOverLexeme buf')
 389                    Nothing ->
 390                     case prefixMatch (stepOn buf) "CC_DICTs_in_..." of
 391                      Just buf' ->
 392                       case untilChar# (stepOverLexeme buf') '\"'# of
 393                        buf'' -> ITscc (mkAllDictsCC (lexemeToFastString buf'') _NIL_ True):
 394                                 lexIface (stepOn (stepOverLexeme buf''))
 395                      Nothing ->
 396                       let
 397                        match_user_cc buf =
 398                         case untilChar# buf '/'# of
 399                          buf' ->
 400                           let mod_name = lexemeToFastString buf' in
 401                           case untilChar# (stepOn (stepOverLexeme buf')) '/'# of
 402                            buf'' ->
 403                             let grp_name = lexemeToFastString buf'' in
 404                             case untilChar# (stepOn (stepOverLexeme buf'')) '\"'# of
 405                              buf''' ->
 406                                let cc_name = lexemeToFastString buf''' in
 407                                (mkUserCC cc_name mod_name grp_name,
 408                                 stepOn (stepOverLexeme buf'''))
 409                       in
 410                       case prefixMatch (stepOn buf) "CAF:" of
 411                        Just buf' ->
 412                          case match_user_cc (stepOverLexeme buf') of
 413                           (cc, buf'') -> ITscc (cafifyCC cc) : lexIface buf''
 414                        Nothing ->
 415                          case match_user_cc (stepOn buf) of
 416                           (cc, buf'') -> ITscc cc : lexIface buf''
 417   c -> ITunknown [C# c] : lexIface (stepOn buf)
 418
 419
 420 -----------
 421 lex_num :: (Int -> Int) -> Int# -> StringBuffer -> [IfaceToken]
 422 lex_num minus acc# buf =
 423 -- _trace ("lex_num: "++[C# (currentChar# buf)]) $
 424  case scanNumLit (I# acc#) buf of
 425      (acc',buf') ->
 426        case currentChar# buf' of
 427          '.'# ->
 428              -- this case is not optimised at all, as the
 429              -- presence of floating point numbers in interface
 430              -- files is not that common. (ToDo)
 431             case expandWhile (isDigit) (incLexeme buf') of
 432               buf'' -> -- points to first non digit char
 433                 case reads (lexemeToString buf'') of
 434                   [(v,_)] -> ITdouble v : lexIface (stepOverLexeme buf'')
 435          _ -> ITinteger (fromInt (minus acc')) : lexIface (stepOverLexeme buf')
 436
 437 --         case reads (lexemeToString buf') of
 438 --           [(i,_)] -> ITinteger i : lexIface (stepOverLexeme buf')
 439
 440 ------------
 441 lex_keyword buf =
 442 -- _trace ("lex_keyword: "++[C# (currentChar# buf)]) $
 443  case currentChar# buf of
 444   ':'# -> case lookAhead# buf 1# of
 445             '_'# -> -- a binding, type (and other id-info) follows,
 446                     -- to make the parser ever so slightly, we push
 447                     --
 448                 lex_decl (stepOnBy# buf 2#)
 449             v# -> ITunknown (['_',':',C# v#]) : lexIface (stepOnBy# buf 2#)
 450   _ ->
 451     case expandWhile (is_kwd_char) buf of
 452      buf' ->
 453       let kw = lexemeToFastString buf' in
 454 --    _trace ("kw: "++lexemeToString buf') $
 455       case lookupUFM ifaceKeywordsFM kw of
 456        Nothing -> ITunknown (_UNPK_ kw) : -- (minor) sigh
 457                   lexIface (stepOverLexeme buf')
 458        Just xx -> xx : lexIface (stepOverLexeme buf')
 459
 460 lex_decl buf =
 461  case doDiscard False buf of -- spin until ;; is found
 462    buf' ->
 463       {- _trace (show (lexemeToString buf')) $ -}
 464       case currentChar# buf' of
 465        '\n'# -> -- newline, no id info.
 466            ITtysig (lexIface (lexemeToBuffer (decLexeme buf'))) :
 467            lexIface (stepOverLexeme buf')
 468        '\r'# -> -- just to be sure for those Win* boxes..
 469            ITtysig (lexIface (lexemeToBuffer (decLexeme buf'))) :
 470            lexIface (stepOverLexeme buf')
 471        '\NUL'# ->
 472            ITtysig (lexIface (lexemeToBuffer (decLexeme buf'))) :
 473            lexIface (stepOverLexeme buf')
 474        c     -> -- run all over the id info
 475          case doDiscard False (stepOverLexeme buf') of -- spin until ;; is found (outside a string!)
 476            buf'' ->
 477                     --_trace ((C# c):show (lexemeToString (decLexeme buf')))  $
 478                     --_trace (show (lexemeToString (decLexeme buf''))) $
 479                     ITtysig (lexIface (lexemeToBuffer (decLexeme buf'))):
 480                     let ls = lexIface (stepOverLexeme buf'') in
 481                     if opt_IgnoreIfacePragmas then
 482                         ls
 483                     else
 484                         let is = lexIface (lexemeToBuffer (decLexeme buf'')) in
 485                         --_trace (show is) $
 486                         ITidinfo is : ls
 487
 488 -- ToDo: hammer!
 489 is_kwd_char c@(C# c#) =
 490  isAlphanum c || -- OLD: c `elem` "_@/\\"
 491  (case c# of
 492    '_'#  -> True
 493    '@'#  -> True
 494    '/'#  -> True
 495    '\\'# -> True
 496    _     -> False)
 497
 498
 499
 500 -----------
 501 lex_cstring buf =
 502 -- _trace ("lex_cstring: "++[C# (currentChar# buf)]) $
 503  case expandUntilMatch buf "\'\'" of
 504    buf' -> ITstring (lexemeToFastString (setCurrentPos# buf' (negateInt# 2#))) :
 505            lexIface (stepOverLexeme buf')
 506
 507 -----------
 508 lex_tuple module_dot buf =
 509 -- _trace ("lex_tuple: "++[C# (currentChar# buf)]) $
 510   go 2 buf
 511   where
 512    go n buf =
 513     case currentChar# buf of
 514       ','# -> go (n+1) (stepOn buf)
 515       ')'# -> end_lex_id module_dot (ITconid (mkTupNameStr n)) (stepOn buf)
 516       _    -> ITunknown ("tuple " ++ show n) : lexIface buf
 517
 518 -- Similarly ' itself is ok inside an identifier, but not at the start
 519
 520 id_arr :: _ByteArray Int
 521 id_arr =
 522  unsafePerformPrimIO (
 523   newCharArray (0,255) `thenPrimIO` \ barr ->
 524   let
 525    loop 256# = returnPrimIO ()
 526    loop i# =
 527     if isAlphanum (C# (chr# i#)) || is_sym (chr# i#) then
 528        writeCharArray barr (I# i#) '\1' `seqPrimIO`
 529        loop (i# +# 1#)
 530     else
 531        writeCharArray barr (I# i#) '\0' `seqPrimIO`
 532        loop (i# +# 1#)
 533   in
 534   loop 0#                    `seqPrimIO`
 535   unsafeFreezeByteArray barr)
 536
 537 is_id_char (C# c#) =
 538  let
 539   _ByteArray _ arr# = id_arr
 540  in
 541  case ord# (indexCharArray# arr# (ord# c#)) of
 542   0# -> False
 543   1# -> True
 544
 545 --is_id_char c@(C# c#)  = isAlphanum c || is_sym c#
 546
 547 is_sym c#=
 548  case c# of {
 549    ':'# -> True; '_'#  -> True; '\''# -> True; '!'# -> True;
 550    '#'# -> True; '$'#  -> True; ':'#  -> True; '%'# -> True;
 551    '&'# -> True; '*'#  -> True; '+'#  -> True; '.'# -> True;
 552    '/'# -> True; '<'#  -> True; '='#  -> True; '>'# -> True;
 553    '?'# -> True; '\\'# -> True; '^'#  -> True; '|'# -> True;
 554    '-'# -> True; '~'#  -> True; '@'#  -> True; _    -> False }
 555
 556 --isAlphanum c || c `elem` ":_'!#$%&*+./<=>?@\\^|-~" -- ToDo: add ISOgraphic
 557
 558
 559 mod_arr :: _ByteArray Int
 560 mod_arr =
 561  unsafePerformPrimIO (
 562   newCharArray (0,255) `thenPrimIO` \ barr ->
 563   let
 564    loop 256# = returnPrimIO ()
 565    loop i# =
 566     if isAlphanum (C# (chr# i#)) || i# ==# (ord# '_'#) || i# ==# (ord# '\''#) then
 567        writeCharArray barr (I# i#) '\1' `seqPrimIO`
 568        loop (i# +# 1#)
 569     else
 570        writeCharArray barr (I# i#) '\0' `seqPrimIO`
 571        loop (i# +# 1#)
 572   in
 573   loop 0#                    `seqPrimIO`
 574   unsafeFreezeByteArray barr)
 575
 576
 577 is_mod_char (C# c#) =
 578  let
 579   _ByteArray _ arr# = mod_arr
 580  in
 581  case ord# (indexCharArray# arr# (ord# c#)) of
 582   0# -> False
 583   1# -> True
 584
 585 --isAlphanum c || c == '_' || c== '\'' --`elem` "_'"
 586
 587 {-
 588 lex_id cs =
 589  case _scc_ "lex_id.span" my_span' (is_mod_char) cs of
 590    (xs, len, cs') ->
 591     case cs' of
 592      [] -> case xs of
 593             [] -> lex_id2 Nothing cs
 594             _  -> lex_id3 Nothing len xs cs
 595
 596      '.':cs'' ->
 597         case xs of
 598           [] -> lex_id2 Nothing cs
 599           _  ->
 600            let
 601             pk_str = _PK_ (xs::String)
 602             len = lengthPS pk_str
 603            in
 604            if len==len+1 then
 605               error "Well, I never!"
 606            else
 607               lex_id2 (Just pk_str) cs''
 608      _ -> case xs of
 609             [] -> lex_id2 Nothing cs
 610             _  -> lex_id3 Nothing len xs cs'
 611
 612 -}
 613
 614 lex_id buf =
 615 -- _trace ("lex_id: "++[C# (currentChar# buf)]) $
 616  case expandWhile (is_mod_char) buf of
 617    buf' ->
 618     case currentChar# buf' of
 619      '.'# ->
 620         if not (emptyLexeme buf') then
 621 --         _trace ("lex_id: "++(C# (currentChar# (stepOverLexeme buf'))):show (lexemeToFastString buf')) $
 622            case lexemeToFastString buf' of
 623              l@(FastString u# l# ba#) -> lex_id2 (Just (FastString u# l# ba#))
 624                                                  (stepOn (stepOverLexeme buf'))
 625         else
 626            lex_id2 Nothing buf'
 627      _  -> lex_id2 Nothing buf'
 628
 629 -- Dealt with the Module.part
 630 lex_id2 module_dot buf =
 631 -- _trace ("lex_id2: "++[C# (currentChar# buf)]) $
 632  case currentChar# buf of
 633   '['# ->
 634     case lookAhead# buf 1# of
 635      ']'# -> end_lex_id module_dot (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 636      _    -> lex_id3 module_dot buf
 637   '('# ->
 638     case lookAhead# buf 1# of
 639      ')'# -> end_lex_id module_dot (ITconid SLIT("()")) (stepOnBy# buf 2#)
 640      ','# -> lex_tuple module_dot (stepOnBy# buf 2#)
 641      _    -> lex_id3 module_dot buf
 642   ':'# -> lex_id3 module_dot (incLexeme buf)
 643   _    -> lex_id3 module_dot buf
 644
 645
 646
 647 -- Dealt with [], (), : special cases
 648
 649 lex_id3 module_dot buf =
 650 -- _trace ("lex_id3: "++[C# (currentChar# buf)]) $
 651  case expandWhile (is_id_char) buf of
 652   buf' ->
 653     case module_dot of
 654      Just _ ->
 655        end_lex_id module_dot (mk_var_token lexeme) (stepOverLexeme buf')
 656      Nothing ->
 657        case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of
 658          Just kwd_token -> kwd_token           : lexIface new_buf
 659          Nothing        -> mk_var_token lexeme : lexIface new_buf
 660     where
 661      lexeme  = lexemeToFastString buf'
 662      new_buf = stepOverLexeme buf'
 663
 664
 665 {- OLD:
 666 lex_id2 module_dot [] ('[': ']': cs) = end_lex_id module_dot (ITconid SLIT("[]")) cs
 667 lex_id2 module_dot [] ('(': ')': cs) = end_lex_id module_dot (ITconid SLIT("()")) cs
 668 lex_id2 module_dot [] ('(': ',': cs) = lex_tuple module_dot cs
 669 lex_id2 module_dot [] (':' : cs)     = lex_id3 module_dot [':'] cs
 670 lex_id2 module_dot xs cs             = lex_id3 module_dot xs cs
 671 -}
 672
 673
 674 -- Dealt with [], (), : special cases
 675
 676 {-
 677 lex_id3 module_dot len_xs xs cs =
 678  case my_span' (is_id_char) cs of
 679    (xs1,len_xs1,rest) ->
 680     case module_dot of
 681      Just m  -> end_lex_id (Just m) (mk_var_token rxs) rest --OLD:_PK_ (reverse xs))) rest
 682      Nothing ->
 683       case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM rxs of
 684        Just kwd_token -> kwd_token          : lexIface rest
 685        other          -> token : lexIface cs end_lex_id Nothing (mk_var_token rxs) rest
 686     where
 687      rxs = packNChars (len_xs+len_xs1) (xs++xs1) -- OLD: _PK_ (reverse xs)
 688 -}
 689 mk_var_token pk_str =
 690      let
 691       f = _HEAD_ pk_str
 692      in
 693      --
 694      -- These tests assume a non- ISO-8859-1 impl of isUpper&isLower,
 695      -- remove the second half of disjunction when using a 1.3 prelude.
 696      --
 697      if      isUpper f    then ITconid pk_str
 698      else if isLower f    then ITvarid pk_str
 699      else if f == ':'     then ITconsym pk_str
 700      else if isLowerISO f then ITvarid pk_str
 701      else if isUpperISO f then ITconid pk_str
 702      else ITvarsym pk_str
 703
 704 {-
 705     mk_var_token xs@(f:_) | isUpper f || isUpperISO f = ITconid n
 706                           | f == ':'              = ITconsym n
 707                           | isAlpha f             = ITvarid n
 708                           | otherwise             = ITvarsym n
 709                 where
 710                       n = _PK_ xs
 711 -}
 712
 713 end_lex_id Nothing token buf  = token : lexIface buf
 714 end_lex_id (Just m) token buf =
 715  case token of
 716    ITconid n  -> ITqconid  (m,n)         : lexIface buf
 717    ITvarid n  -> ITqvarid  (m,n)         : lexIface buf
 718    ITconsym n -> ITqconsym (m,n)         : lexIface buf
 719    ITvarsym n -> ITqvarsym (m,n)         : lexIface buf
 720    ITbang     -> ITqvarsym (m,SLIT("!")) : lexIface buf
 721    _          -> ITunknown (show token)  : lexIface buf
 722
 723 ------------
 724 ifaceKeywordsFM :: UniqFM IfaceToken
 725 ifaceKeywordsFM = listToUFM $
 726       map (\ (x,y) -> (_PK_ x,y))
 727        [("/\\_",                ITbiglam)
 728        ,("@_",                  ITatsign)
 729        ,("letrec_",             ITletrec)
 730        ,("interface_",          ITinterface)
 731        ,("usages_",             ITusages)
 732        ,("versions_",           ITversions)
 733        ,("exports_",            ITexports)
 734        ,("instance_modules_",   ITinstance_modules)
 735        ,("instances_",          ITinstances)
 736        ,("fixities_",           ITfixities)
 737        ,("declarations_",       ITdeclarations)
 738        ,("pragmas_",            ITpragmas)
 739        ,("forall_",             ITforall)
 740        ,("U_",                  ITunfold False)
 741        ,("U!_",                 ITunfold True)
 742        ,("A_",                  ITarity)
 743        ,("coerce_in_",          ITcoerce_in)
 744        ,("coerce_out_",         ITcoerce_out)
 745        ,("bot_",                ITbottom)
 746        ,("integer_",            ITinteger_lit)
 747        ,("rational_",           ITrational_lit)
 748        ,("addr_",               ITaddr_lit)
 749        ,("float_",              ITfloat_lit)
 750        ,("string_",             ITstring_lit)
 751        ,("litlit_",             ITlit_lit)
 752        ,("ccall_",              ITccall (False, False))
 753        ,("ccall_GC_",           ITccall (False, True))
 754        ,("casm_",               ITccall (True,  False))
 755        ,("casm_GC_",            ITccall (True,  True))
 756        ]
 757
 758 haskellKeywordsFM = listToUFM $
 759       map (\ (x,y) -> (_PK_ x,y))
 760       [ ("data",                ITdata)
 761        ,("type",                ITtype)
 762        ,("newtype",             ITnewtype)
 763        ,("class",               ITclass)
 764        ,("where",               ITwhere)
 765        ,("instance",            ITinstance)
 766        ,("infixl",              ITinfixl)
 767        ,("infixr",              ITinfixr)
 768        ,("infix",               ITinfix)
 769        ,("case",                ITcase)
 770        ,("case#",               ITprim_case)
 771        ,("of",                  ITof)
 772        ,("in",                  ITin)
 773        ,("let",                 ITlet)
 774        ,("deriving",            ITderiving)
 775
 776        ,("->",                  ITrarrow)
 777        ,("\\",                  ITlam)
 778        ,("|",                   ITvbar)
 779        ,("!",                   ITbang)
 780        ,("=>",                  ITdarrow)
 781        ,("=",                   ITequal)
 782        ]
 783
 784
 785 -- doDiscard rips along really fast looking for a double semicolon,
 786 -- indicating the end of the pragma we're skipping
 787 doDiscard inStr buf =
 788 -- _trace (show (C# (currentChar# buf))) $
 789  case currentChar# buf of
 790    ';'# ->
 791      if not inStr then
 792        case lookAhead# buf 1# of
 793         ';'# -> incLexeme (incLexeme buf)
 794         _    -> doDiscard inStr (incLexeme buf)
 795      else
 796        doDiscard inStr (incLexeme buf)
 797    '"'# ->
 798        let
 799         odd_slashes buf flg i# =
 800           case lookAhead# buf i# of
 801            '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
 802            _     -> flg
 803        in
 804        case lookAhead# buf (negateInt# 1#) of --backwards, actually
 805          '\\'# -> -- escaping something..
 806            if odd_slashes buf True (negateInt# 2#) then
 807                -- odd number of slashes, " is escaped.
 808               doDiscard inStr (incLexeme buf)
 809            else
 810                -- even number of slashes, \ is escaped.
 811               doDiscard (not inStr) (incLexeme buf)
 812          _ -> case inStr of -- forced to avoid build-up
 813                True  -> doDiscard False (incLexeme buf)
 814                False -> doDiscard True  (incLexeme buf)
 815    _ -> doDiscard inStr (incLexeme buf)
 816
 817 \end{code}
 818
 819 begin{code}
 820 my_span :: (a -> Bool) -> [a] -> ([a],[a])
 821 my_span p xs = go [] xs
 822   where
 823     go so_far (x:xs') | p x = go (x:so_far) xs'
 824     go so_far xs            = (reverse so_far, xs)
 825
 826 my_span' :: (a -> Bool) -> [a] -> ([a],Int,[a])
 827 my_span' p xs = go [] 0 xs
 828   where
 829     go so_far n (x:xs') | p x = go (x:so_far) (n+1) xs'
 830     go so_far n xs            = (reverse so_far,n, xs)
 831 end{code}
 832
 833
 834 %************************************************************************
 835 %*                                                                      *
 836 \subsection{Other utility functions
 837 %*                                                                      *
 838 %************************************************************************
 839
 840 \begin{code}
 841 type IfM a = MaybeErr a Error
 842
 843 returnIf   :: a -> IfM a
 844 thenIf     :: IfM a -> (a -> IfM b) -> IfM b
 845 happyError :: Int -> [IfaceToken] -> IfM a
 846
 847 returnIf a = Succeeded a
 848
 849 thenIf (Succeeded a) k = k a
 850 thenIf (Failed  err) _ = Failed err
 851
 852 happyError ln toks = Failed (ifaceParseErr ln toks)
 853
 854 -----------------------------------------------------------------
 855
 856 ifaceParseErr ln toks sty
 857   = hsep [ptext SLIT("Interface-file parse error: line"), int ln, ptext SLIT("toks="), text (show (take 10 toks))]
 858 \end{code}