ghc/compiler/basicTypes/OccName.lhs

   1
   2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998
   3 %
   4
   5 \section[OccName]{@OccName@}
   6
   7 \begin{code}
   8 module OccName (
   9         -- Modules
  10         Module,         -- Abstract, instance of Outputable
  11         mkModule, mkModuleFS, moduleString, moduleCString, pprModule,
  12
  13         -- The OccName type
  14         OccName,        -- Abstract, instance of Outputable
  15         varOcc,    tcOcc,    tvOcc,     -- Occ constructors
  16         srcVarOcc, srcTCOcc, srcTvOcc,  -- For Occs arising from source code
  17
  18         mkSuperDictSelOcc, mkDFunOcc,
  19         mkDictOcc, mkWorkerOcc, mkDefaultMethodOcc,
  20         mkClassTyConOcc, mkClassDataConOcc,
  21
  22         isTvOcc, isTCOcc, isVarOcc, isConSymOcc, isConOcc, isSymOcc,
  23         isWildCardOcc, isAnonOcc,
  24         pprOccName, occNameString, occNameFlavour,
  25
  26         -- The basic form of names
  27         isLexCon, isLexVar, isLexId, isLexSym,
  28         isLexConId, isLexConSym, isLexVarId, isLexVarSym,
  29         isLowerISO, isUpperISO,
  30
  31         -- Tidying up
  32         TidyOccEnv, emptyTidyOccEnv, tidyOccName, initTidyOccEnv,
  33
  34         -- Junk
  35         identToC
  36
  37     ) where
  38
  39 #include "HsVersions.h"
  40
  41 import Char     ( isAlpha, isUpper, isLower, ISALPHANUM, ord )
  42 import Util     ( thenCmp )
  43 import FiniteMap ( FiniteMap, emptyFM, lookupFM, addToFM, elemFM )
  44 import Outputable
  45 import GlaExts
  46 \end{code}
  47
  48
  49 %************************************************************************
  50 %*                                                                      *
  51 \subsection[Module]{The name of a module}
  52 %*                                                                      *
  53 %************************************************************************
  54
  55 \begin{code}
  56 data Module = Module FAST_STRING        -- User and interface files
  57                      FAST_STRING        -- Print this in C files
  58
  59         -- The C version has quote chars Z-encoded
  60
  61 instance Outputable Module where
  62   ppr = pprModule
  63
  64 instance Eq Module where
  65   (Module m1 _) == (Module m2 _) = m1 == m2
  66
  67 instance Ord Module where
  68   (Module m1 _) `compare` (Module m2 _) = m1 `compare` m2
  69
  70 pprModule :: Module -> SDoc
  71 pprModule (Module real code)
  72   = getPprStyle         $ \ sty ->
  73     if codeStyle sty then
  74         ptext code
  75     else
  76         ptext real
  77
  78 mkModule :: String -> Module
  79 mkModule s = Module (_PK_ s) (identToC s)
  80
  81 mkModuleFS :: FAST_STRING -> Module
  82 mkModuleFS s = Module s (identFsToC s)
  83
  84 moduleString :: Module -> String
  85 moduleString (Module mod _) = _UNPK_ mod
  86
  87 moduleCString :: Module -> String
  88 moduleCString (Module _ code) = _UNPK_ code
  89 \end{code}
  90
  91
  92 %************************************************************************
  93 %*                                                                      *
  94 \subsection[Name-pieces-datatypes]{The @OccName@ datatypes}
  95 %*                                                                      *
  96 %************************************************************************
  97
  98 \begin{code}
  99 data OccName = OccName
 100                   OccSpace
 101                   FAST_STRING   -- The 'real name'
 102                   FAST_STRING   -- Print this in interface files
 103                   FAST_STRING   -- Print this in C/asm code
 104
 105 -- The OccSpace/real-name pair define the OccName
 106 -- The iface and c/asm versions are simply derived from the
 107 -- other two.  They are cached here simply to avoid recomputing
 108 -- them repeatedly when printing
 109
 110 -- The latter two are irrelevant in RdrNames; on the other hand,
 111 -- the OccSpace field is irrelevant after RdrNames.
 112 -- So the OccName type might be refined a bit.
 113 -- It is now abstract so that's easier than before
 114
 115
 116 -- Why three print-names?
 117 --      Real    Iface   C
 118 --      ---------------------
 119 --      foo     foo     foo
 120 --
 121 --      +       +       Zp      Operators OK in interface files;
 122 --                              'Z' is the escape char for C names
 123 --
 124 --      x#      x#      xZh     Trailing # lexed ok by GHC -fglasgow-exts
 125 --
 126 --      _foo    _ufoo   _ufoo   Leading '_' is the escape char in interface files
 127 --
 128 --      _vfoo   _vfoo   _vfoo   Worker for foo
 129 --
 130 --      _wp     _wp     _wp     Worker for +
 131
 132
 133 data OccSpace = VarOcc  -- Variables and data constructors
 134               | TvOcc   -- Type variables
 135               | TCOcc   -- Type constructors and classes
 136               deriving( Eq, Ord )
 137 \end{code}
 138
 139
 140 %************************************************************************
 141 %*                                                                      *
 142 \subsection{Printing}
 143 %*                                                                      *
 144 %************************************************************************
 145
 146 \begin{code}
 147 instance Outputable OccName where
 148   ppr = pprOccName
 149
 150 pprOccName :: OccName -> SDoc
 151 pprOccName (OccName space real iface code)
 152   = getPprStyle $ \ sty ->
 153     if codeStyle sty then
 154         ptext code
 155     else if ifaceStyle sty then
 156         ptext iface
 157     else
 158         ptext real
 159 \end{code}
 160
 161
 162 %************************************************************************
 163 %*                                                                      *
 164 \subsection{Construction}
 165 %*                                                                      *
 166 %************************************************************************
 167
 168 *Source-code* things beginning with '_' are zapped to begin with '_u'
 169
 170 \begin{code}
 171 mkSrcOcc :: OccSpace -> FAST_STRING -> OccName
 172 mkSrcOcc occ_sp real
 173   = case _UNPK_ real of
 174
 175         '_' : rest -> OccName occ_sp real (_PK_ zapped_str) (identToC zapped_str)
 176                    where
 177                       zapped_str = '_' : 'u' : rest
 178
 179         other      -> OccName occ_sp real real (identFsToC real)
 180
 181 srcVarOcc, srcTCOcc, srcTvOcc :: FAST_STRING -> OccName
 182 srcVarOcc = mkSrcOcc VarOcc
 183 srcTCOcc  = mkSrcOcc TCOcc
 184 srcTvOcc  = mkSrcOcc TvOcc
 185 \end{code}
 186
 187 However, things that don't come from Haskell source code aren't
 188 treated specially.
 189
 190 \begin{code}
 191 mkOcc :: OccSpace -> String -> OccName
 192 mkOcc occ_sp str = OccName occ_sp fs fs (identToC str)
 193                  where
 194                    fs = _PK_ str
 195
 196 mkFsOcc :: OccSpace -> FAST_STRING -> OccName
 197 mkFsOcc occ_sp real = OccName occ_sp real real (identFsToC real)
 198
 199 varOcc, tcOcc, tvOcc :: FAST_STRING -> OccName
 200 varOcc = mkFsOcc VarOcc
 201 tcOcc  = mkFsOcc TCOcc
 202 tvOcc  = mkFsOcc TvOcc
 203 \end{code}
 204
 205
 206 %************************************************************************
 207 %*                                                                      *
 208 \subsection{Making system names}
 209 %*                                                                      *
 210 %************************************************************************
 211
 212 Here's our convention for splitting up the interface file name space:
 213
 214         _d...           dictionary identifiers
 215
 216         _f...           dict-fun identifiers (from inst decls)
 217         _g...           ditto, when the tycon has symbols
 218
 219         _t...           externally visible (non-user visible) names
 220
 221         _m...           default methods
 222         _n...           default methods (encoded symbols, eg. <= becomes _nle)
 223
 224         _p...           superclass selectors
 225
 226         _v...           workers
 227         _w...           workers (encoded symbols)
 228
 229         _x...           local variables
 230
 231         _u...           user-defined names that previously began with '_'
 232
 233         _T...           compiler-generated tycons for dictionaries
 234         _D..            ...ditto data cons
 235
 236         __....          keywords (__export, __letrec etc.)
 237
 238 This knowledge is encoded in the following functions.
 239
 240
 241
 242
 243 @mkDerivedOcc@ generates an @OccName@ from an existing @OccName@;
 244         eg: workers, derived methods
 245
 246 We pass a character to use as the prefix.  So, for example,
 247         "f" gets derived to "_vf", if the prefix char is 'v'
 248
 249 \begin{code}
 250 mk_deriv :: OccSpace -> Char -> String -> OccName
 251 mk_deriv occ_sp sys_ch str = mkOcc occ_sp ('_' : sys_ch : str)
 252 \end{code}
 253
 254 Things are a bit more complicated if the thing is an operator; then
 255 we must encode it into a normal identifier first.  We do this in
 256 a simple way, and use a different character prefix (one after the one
 257 suggested).  For example
 258         "<" gets derived to "_wl", if the prefix char is 'v'
 259
 260 \begin{code}
 261 mk_enc_deriv :: OccSpace
 262              -> Char    -- The system-name-space character (see list above)
 263              -> OccName -- The OccName from which we are deriving
 264              -> OccName
 265
 266 mk_enc_deriv occ_sp sys_ch occ
 267   | needs_encoding real_str = mk_deriv occ_sp sys_op_ch (encode_operator real_str)
 268   | otherwise               = mk_deriv occ_sp sys_ch    real_str
 269   where
 270     real_str  = occNameString occ
 271     sys_op_ch = succ sys_ch
 272
 273
 274 mkDictOcc, mkWorkerOcc, mkDefaultMethodOcc,
 275            mkClassTyConOcc, mkClassDataConOcc
 276    :: OccName -> OccName
 277
 278 mkWorkerOcc        = mk_enc_deriv VarOcc 'v'    -- v,w
 279 mkDefaultMethodOcc = mk_enc_deriv VarOcc 'm'    -- m,n
 280 mkClassTyConOcc    = mk_enc_deriv TCOcc  'T'    -- not U
 281 mkClassDataConOcc  = mk_enc_deriv VarOcc 'D'    -- not E
 282 mkDictOcc          = mk_enc_deriv VarOcc 'd'    -- not e
 283 \end{code}
 284
 285 \begin{code}
 286 mkSuperDictSelOcc :: Int        -- Index of superclass, eg 3
 287                   -> OccName    -- Class, eg "Ord"
 288                   -> OccName    -- eg "p3Ord"
 289 mkSuperDictSelOcc index cls_occ
 290   = mk_deriv VarOcc 'p' (show index ++ occNameString cls_occ)
 291 \end{code}
 292
 293
 294 \begin{code}
 295 mkDFunOcc :: OccName    -- class, eg "Ord"
 296           -> OccName    -- tycon (or something convenient from the instance type)
 297                         --      eg "Maybe"
 298           -> Int        -- Unique to distinguish dfuns which share the previous two
 299                         --      eg 3
 300           -> OccName    -- "dOrdMaybe3"
 301
 302 mkDFunOcc cls_occ tycon_occ index
 303   | needs_encoding tycon_str    -- Drat!  Have to encode the tycon
 304   = mk_deriv VarOcc 'g' (show_index ++ cls_str ++ encode_operator tycon_str)
 305   | otherwise                   -- Normal case
 306   = mk_deriv VarOcc 'f' (show_index ++ cls_str ++ tycon_str)
 307   where
 308     cls_str   = occNameString cls_occ
 309     tycon_str = occNameString tycon_occ
 310         -- NB: if a non-operator the tycon has a trailing # we don't encode.
 311     show_index | index == 0 = ""
 312                | otherwise  = show index
 313 \end{code}
 314
 315
 316 %************************************************************************
 317 %*                                                                      *
 318 \subsection{Lexical categories}
 319 %*                                                                      *
 320 %************************************************************************
 321
 322 These functions test strings to see if they fit the lexical categories
 323 defined in the Haskell report.
 324
 325 \begin{code}
 326 isLexCon,   isLexVar,    isLexId,    isLexSym    :: FAST_STRING -> Bool
 327 isLexConId, isLexConSym, isLexVarId, isLexVarSym :: FAST_STRING -> Bool
 328
 329 isLexCon cs = isLexConId  cs || isLexConSym cs
 330 isLexVar cs = isLexVarId  cs || isLexVarSym cs
 331
 332 isLexId  cs = isLexConId  cs || isLexVarId  cs
 333 isLexSym cs = isLexConSym cs || isLexVarSym cs
 334
 335 -------------
 336
 337 isLexConId cs                           -- Prefix type or data constructors
 338   | _NULL_ cs        = False            --      e.g. "Foo", "[]", "(,)"
 339   | cs == SLIT("[]") = True
 340   | c  == '('        = True     -- (), (,), (,,), ...
 341   | otherwise        = isUpper c || isUpperISO c
 342   where
 343     c = _HEAD_ cs
 344
 345 isLexVarId cs                           -- Ordinary prefix identifiers
 346   | _NULL_ cs    = False                --      e.g. "x", "_x"
 347   | otherwise    = isLower c || isLowerISO c || c == '_'
 348   where
 349     c = _HEAD_ cs
 350
 351 isLexConSym cs                          -- Infix type or data constructors
 352   | _NULL_ cs   = False                 --      e.g. ":-:", ":", "->"
 353   | otherwise   = c  == ':'
 354                || cs == SLIT("->")
 355   where
 356     c = _HEAD_ cs
 357
 358 isLexVarSym cs                          -- Infix identifiers
 359   | _NULL_ cs = False                   --      e.g. "+"
 360   | otherwise = isSymbolASCII c
 361              || isSymbolISO c
 362   where
 363     c = _HEAD_ cs
 364
 365 -------------
 366 isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
 367 isSymbolISO   c = ord c `elem` (0xd7 : 0xf7 : [0xa1 .. 0xbf])
 368 isUpperISO    (C# c#) = c# `geChar#` '\xc0'# && c# `leChar#` '\xde'# && c# `neChar#` '\xd7'#
 369         --0xc0 <= oc && oc <= 0xde && oc /= 0xd7 where oc = ord c
 370 isLowerISO    (C# c#) = c# `geChar#` '\xdf'# && c# `leChar#` '\xff'# && c# `neChar#` '\xf7'#
 371         --0xdf <= oc && oc <= 0xff && oc /= 0xf7 where oc = ord c
 372 \end{code}
 373
 374 %************************************************************************
 375 %*                                                                      *
 376 \subsection{Predicates and taking them apart}
 377 %*                                                                      *
 378 %************************************************************************
 379
 380 \begin{code}
 381 occNameString :: OccName -> String
 382 occNameString (OccName _ s _ _) = _UNPK_ s
 383
 384 -- occNameFlavour is used only to generate good error messages, so it doesn't matter
 385 -- that the VarOcc case isn't mega-efficient.  We could have different Occ constructors for
 386 -- data constructors and values, but that makes everything else a bit more complicated.
 387 occNameFlavour :: OccName -> String
 388 occNameFlavour (OccName VarOcc s _ _) | isLexConId s = "Data constructor"
 389                                       | otherwise    = "Value"
 390 occNameFlavour (OccName TvOcc _ _ _)                 = "Type variable"
 391 occNameFlavour (OccName TCOcc s _ _)                 = "Type constructor or class"
 392
 393 isVarOcc, isTCOcc, isTvOcc,
 394  isConSymOcc, isSymOcc, isWildCardOcc :: OccName -> Bool
 395
 396 isVarOcc (OccName VarOcc _ _ _) = True
 397 isVarOcc other                  = False
 398
 399 isTvOcc (OccName TvOcc _ _ _) = True
 400 isTvOcc other                 = False
 401
 402 isTCOcc (OccName TCOcc _ _ _) = True
 403 isTCOcc other                 = False
 404
 405 isConSymOcc (OccName _ s _ _) = isLexConSym s
 406
 407 isSymOcc (OccName _ s _ _) = isLexSym s
 408
 409 isConOcc (OccName _ s _ _) = isLexCon s
 410
 411 isWildCardOcc (OccName _ s _ _) = (_HEAD_ s) == '_' && _LENGTH_ s == 1
 412
 413 isAnonOcc (OccName _ s _ _) = (_HEAD_ s) == '_'
 414 \end{code}
 415
 416
 417 %************************************************************************
 418 %*                                                                      *
 419 \subsection{Comparison}
 420 %*                                                                      *
 421 %************************************************************************
 422
 423 Comparison is done by space and 'real' name
 424
 425 \begin{code}
 426 instance Eq OccName where
 427     a == b = case (a `compare` b) of { EQ -> True;  _ -> False }
 428     a /= b = case (a `compare` b) of { EQ -> False; _ -> True }
 429
 430 instance Ord OccName where
 431     a <= b = case (a `compare` b) of { LT -> True;  EQ -> True;  GT -> False }
 432     a <  b = case (a `compare` b) of { LT -> True;  EQ -> False; GT -> False }
 433     a >= b = case (a `compare` b) of { LT -> False; EQ -> True;  GT -> True  }
 434     a >  b = case (a `compare` b) of { LT -> False; EQ -> False; GT -> True  }
 435
 436     compare (OccName sp1 r1 _ _) (OccName sp2 r2 _ _)
 437         = (sp1 `compare` sp2) `thenCmp` (r1 `compare` r2)
 438 \end{code}
 439
 440
 441 %************************************************************************
 442 %*                                                                      *
 443 \subsection{Tidying them up}
 444 %*                                                                      *
 445 %************************************************************************
 446
 447 Before we print chunks of code we like to rename it so that
 448 we don't have to print lots of silly uniques in it.  But we mustn't
 449 accidentally introduce name clashes!  So the idea is that we leave the
 450 OccName alone unless it accidentally clashes with one that is already
 451 in scope; if so, we tack on '1' at the end and try again, then '2', and
 452 so on till we find a unique one.
 453
 454 There's a wrinkle for operators.  Consider '>>='.  We can't use '>>=1'
 455 because that isn't a single lexeme.  So we encode it to 'lle' and *then*
 456 tack on the '1', if necessary.
 457
 458 \begin{code}
 459 type TidyOccEnv = FiniteMap FAST_STRING Int     -- The in-scope OccNames
 460 emptyTidyOccEnv = emptyFM
 461
 462 initTidyOccEnv :: [OccName] -> TidyOccEnv       -- Initialise with names to avoid!
 463 initTidyOccEnv = foldl (\env (OccName _ fs _ _) -> addToFM env fs 1) emptyTidyOccEnv
 464
 465 tidyOccName :: TidyOccEnv -> OccName -> (TidyOccEnv, OccName)
 466
 467 tidyOccName in_scope occ@(OccName occ_sp real _ _)
 468   | not (real `elemFM` in_scope) &&
 469     not (isLexCon real)                 -- Hack alert!   Specialised versions of overloaded
 470                                         -- constructors end up as ordinary Ids, but we don't
 471                                         -- want them as ConIds in interface files.
 472
 473   = (addToFM in_scope real 1, occ)      -- First occurrence
 474
 475   | otherwise                           -- Already occurs
 476   =     -- First encode, to deal with
 477         --      a) operators, and
 478         --      b) trailing # signs
 479         -- so that we can then append '1', '2', etc
 480     go in_scope (encode_operator (_UNPK_ real))
 481   where
 482
 483     go in_scope str = case lookupFM in_scope pk_str of
 484                         Just n  -> go (addToFM in_scope pk_str (n+1)) (str ++ show n)
 485                                 -- Need to go round again, just in case "t3" (say)
 486                                 -- clashes with a "t3" that's already in scope
 487
 488                         Nothing -> (addToFM in_scope pk_str 1, mkFsOcc occ_sp pk_str)
 489                                 -- str is now unique
 490                     where
 491                       pk_str = _PK_ str
 492 \end{code}
 493
 494
 495 %************************************************************************
 496 %*                                                                      *
 497 \subsection{Encoding for operators in derived names}
 498 %*                                                                      *
 499 %************************************************************************
 500
 501 See comments with mk_enc_deriv
 502
 503 \begin{code}
 504 needs_encoding :: String -> Bool        -- Needs encoding when embedded in a derived name
 505                                         -- Just look at the first character
 506 needs_encoding (c:cs) = not (isAlpha c || c == '_')
 507
 508 encode_operator :: String -> String
 509 encode_operator nm = foldr tran "" nm
 510  where
 511     tran c cs = case trChar c of
 512                    '\0'  -> '_' : show (ord c) ++ cs  -- No translation
 513                    tr_c  -> tr_c : cs
 514
 515     trChar '&'  = 'a'
 516     trChar '|'  = 'b'
 517     trChar ':'  = 'c'
 518     trChar '/'  = 'd'
 519     trChar '='  = 'e'
 520     trChar '>'  = 'g'
 521     trChar '#'  = 'h'
 522     trChar '@'  = 'i'
 523     trChar '<'  = 'l'
 524     trChar '-'  = 'm'
 525     trChar '!'  = 'n'
 526     trChar '+'  = 'p'
 527     trChar '\'' = 'q'
 528     trChar '$'  = 'r'
 529     trChar '?'  = 's'
 530     trChar '*'  = 't'
 531     trChar '_'  = 'u'
 532     trChar '.'  = 'v'
 533     trChar '\\' = 'w'
 534     trChar '%'  = 'x'
 535     trChar '~'  = 'y'
 536     trChar '^'  = 'z'
 537     trChar _    = '\0'  -- No translation
 538 \end{code}
 539
 540
 541 %************************************************************************
 542 %*                                                                      *
 543 \subsection{The 'Z' encoding}
 544 %*                                                                      *
 545 %************************************************************************
 546
 547 We provide two interfaces for efficiency.
 548
 549 \begin{code}
 550 identToC :: String -> FAST_STRING
 551 identToC str
 552   | all ISALPHANUM str && not std = _PK_ str
 553   | std                           = _PK_ ("Zs" ++ encode str)
 554   | otherwise                     = _PK_ (encode str)
 555   where
 556     std = has_std_prefix str
 557
 558 identFsToC :: FAST_STRING -> FAST_STRING
 559 identFsToC fast_str
 560   | all ISALPHANUM str && not std = fast_str
 561   | std                           = _PK_ ("Zs" ++ encode str)
 562   | otherwise                     = _PK_ (encode str)
 563   where
 564     std = has_std_prefix str
 565     str = _UNPK_ fast_str
 566
 567 -- avoid "stdin", "stdout", and "stderr"...
 568 has_std_prefix ('s':'t':'d':_) = True
 569 has_std_prefix _               = False
 570
 571 encode :: String -> String
 572 encode [] = []
 573 encode (c:cs) = encode_ch c ++ encode cs
 574
 575 encode_ch :: Char -> String
 576 encode_ch c | ISALPHANUM c = [c]
 577         -- Common case first
 578 encode_ch 'Z'  = "ZZ"
 579 encode_ch '&'  = "Za"
 580 encode_ch '|'  = "Zb"
 581 encode_ch ':'  = "Zc"
 582 encode_ch '/'  = "Zd"
 583 encode_ch '='  = "Ze"
 584 encode_ch '>'  = "Zg"
 585 encode_ch '#'  = "Zh"
 586 encode_ch '<'  = "Zl"
 587 encode_ch '-'  = "Zm"
 588 encode_ch '!'  = "Zn"
 589 encode_ch '.'  = "Zs"
 590 encode_ch '\'' = "Zq"
 591 encode_ch '*'  = "Zt"
 592 encode_ch '+'  = "Zp"
 593 encode_ch '_'  = "_"
 594 encode_ch c    = 'Z':show (ord c)
 595 \end{code}
 596
 597 For \tr{modnameToC}, we really only have to worry about \tr{'}s
 598 (quote chars) in the name.  Rare.
 599
 600 \begin{code}
 601 modnameToC  :: FAST_STRING -> FAST_STRING
 602 modnameToC fast_str = identFsToC fast_str
 603 \end{code}