ghc/compiler/basicTypes/OccName.lhs

   1
   2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998
   3 %
   4
   5 \section[OccName]{@OccName@}
   6
   7 \begin{code}
   8 module OccName (
   9         -- Modules
  10         Module,         -- Abstract, instance of Outputable
  11         mkModule, mkModuleFS, moduleString, moduleCString, pprModule,
  12
  13         -- The OccName type
  14         OccName,        -- Abstract, instance of Outputable
  15         varOcc,    tcOcc,    tvOcc,     -- Occ constructors
  16         srcVarOcc, srcTCOcc, srcTvOcc,  -- For Occs arising from source code
  17
  18         mkSuperDictSelOcc, mkDFunOcc,
  19         mkDictOcc, mkWorkerOcc, mkDefaultMethodOcc,
  20         mkClassTyConOcc, mkClassDataConOcc,
  21
  22         isTvOcc, isTCOcc, isVarOcc, isConSymOcc, isConOcc, isSymOcc,
  23         isWildCardOcc, isAnonOcc,
  24         pprOccName, occNameString, occNameFlavour,
  25
  26         -- The basic form of names
  27         isLexCon, isLexVar, isLexId, isLexSym,
  28         isLexConId, isLexConSym, isLexVarId, isLexVarSym,
  29         isLowerISO, isUpperISO,
  30
  31         -- Tidying up
  32         TidyOccEnv, emptyTidyOccEnv, tidyOccName, initTidyOccEnv,
  33
  34         -- Junk
  35         identToC
  36
  37     ) where
  38
  39 #include "HsVersions.h"
  40
  41 #if __HASKELL1__ > 4
  42 #define ISALPHANUM isAlphaNum
  43 #else
  44 #define ISALPHANUM isAlphanum
  45 #endif
  46
  47 import Char     ( isAlpha, isUpper, isLower, ISALPHANUM, ord )
  48 import Util     ( thenCmp )
  49 import FiniteMap ( FiniteMap, emptyFM, lookupFM, addToFM, elemFM )
  50 import Outputable
  51 import GlaExts
  52 \end{code}
  53
  54
  55 %************************************************************************
  56 %*                                                                      *
  57 \subsection[Module]{The name of a module}
  58 %*                                                                      *
  59 %************************************************************************
  60
  61 \begin{code}
  62 data Module = Module FAST_STRING        -- User and interface files
  63                      FAST_STRING        -- Print this in C files
  64
  65         -- The C version has quote chars Z-encoded
  66
  67 instance Outputable Module where
  68   ppr = pprModule
  69
  70 instance Eq Module where
  71   (Module m1 _) == (Module m2 _) = m1 == m2
  72
  73 instance Ord Module where
  74   (Module m1 _) `compare` (Module m2 _) = m1 `compare` m2
  75
  76 pprModule :: Module -> SDoc
  77 pprModule (Module real code)
  78   = getPprStyle         $ \ sty ->
  79     if codeStyle sty then
  80         ptext code
  81     else
  82         ptext real
  83
  84 mkModule :: String -> Module
  85 mkModule s = Module (_PK_ s) (identToC s)
  86
  87 mkModuleFS :: FAST_STRING -> Module
  88 mkModuleFS s = Module s (identFsToC s)
  89
  90 moduleString :: Module -> String
  91 moduleString (Module mod _) = _UNPK_ mod
  92
  93 moduleCString :: Module -> String
  94 moduleCString (Module _ code) = _UNPK_ code
  95 \end{code}
  96
  97
  98 %************************************************************************
  99 %*                                                                      *
 100 \subsection[Name-pieces-datatypes]{The @OccName@ datatypes}
 101 %*                                                                      *
 102 %************************************************************************
 103
 104 \begin{code}
 105 data OccName = OccName
 106                   OccSpace
 107                   FAST_STRING   -- The 'real name'
 108                   FAST_STRING   -- Print this in interface files
 109                   FAST_STRING   -- Print this in C/asm code
 110
 111 -- The OccSpace/real-name pair define the OccName
 112 -- The iface and c/asm versions are simply derived from the
 113 -- other two.  They are cached here simply to avoid recomputing
 114 -- them repeatedly when printing
 115
 116 -- The latter two are irrelevant in RdrNames; on the other hand,
 117 -- the OccSpace field is irrelevant after RdrNames.
 118 -- So the OccName type might be refined a bit.
 119 -- It is now abstract so that's easier than before
 120
 121
 122 -- Why three print-names?
 123 --      Real    Iface   C
 124 --      ---------------------
 125 --      foo     foo     foo
 126 --
 127 --      +       +       Zp      Operators OK in interface files;
 128 --                              'Z' is the escape char for C names
 129 --
 130 --      x#      x#      xZh     Trailing # lexed ok by GHC -fglasgow-exts
 131 --
 132 --      _foo    _ufoo   _ufoo   Leading '_' is the escape char in interface files
 133 --
 134 --      _vfoo   _vfoo   _vfoo   Worker for foo
 135 --
 136 --      _wp     _wp     _wp     Worker for +
 137
 138
 139 data OccSpace = VarOcc  -- Variables and data constructors
 140               | TvOcc   -- Type variables
 141               | TCOcc   -- Type constructors and classes
 142               deriving( Eq, Ord )
 143 \end{code}
 144
 145
 146 %************************************************************************
 147 %*                                                                      *
 148 \subsection{Printing}
 149 %*                                                                      *
 150 %************************************************************************
 151
 152 \begin{code}
 153 instance Outputable OccName where
 154   ppr = pprOccName
 155
 156 pprOccName :: OccName -> SDoc
 157 pprOccName (OccName space real iface code)
 158   = getPprStyle $ \ sty ->
 159     if codeStyle sty then
 160         ptext code
 161     else if ifaceStyle sty then
 162         ptext iface
 163     else
 164         ptext real
 165 \end{code}
 166
 167
 168 %************************************************************************
 169 %*                                                                      *
 170 \subsection{Construction}
 171 %*                                                                      *
 172 %************************************************************************
 173
 174 *Source-code* things beginning with '_' are zapped to begin with '_u'
 175
 176 \begin{code}
 177 mkSrcOcc :: OccSpace -> FAST_STRING -> OccName
 178 mkSrcOcc occ_sp real
 179   = case _UNPK_ real of
 180
 181         '_' : rest -> OccName occ_sp real (_PK_ zapped_str) (identToC zapped_str)
 182                    where
 183                       zapped_str = '_' : 'u' : rest
 184
 185         other      -> OccName occ_sp real real (identFsToC real)
 186
 187 srcVarOcc, srcTCOcc, srcTvOcc :: FAST_STRING -> OccName
 188 srcVarOcc = mkSrcOcc VarOcc
 189 srcTCOcc  = mkSrcOcc TCOcc
 190 srcTvOcc  = mkSrcOcc TvOcc
 191 \end{code}
 192
 193 However, things that don't come from Haskell source code aren't
 194 treated specially.
 195
 196 \begin{code}
 197 mkOcc :: OccSpace -> String -> OccName
 198 mkOcc occ_sp str = OccName occ_sp fs fs (identToC str)
 199                  where
 200                    fs = _PK_ str
 201
 202 mkFsOcc :: OccSpace -> FAST_STRING -> OccName
 203 mkFsOcc occ_sp real = OccName occ_sp real real (identFsToC real)
 204
 205 varOcc, tcOcc, tvOcc :: FAST_STRING -> OccName
 206 varOcc = mkFsOcc VarOcc
 207 tcOcc  = mkFsOcc TCOcc
 208 tvOcc  = mkFsOcc TvOcc
 209 \end{code}
 210
 211
 212 %************************************************************************
 213 %*                                                                      *
 214 \subsection{Making system names}
 215 %*                                                                      *
 216 %************************************************************************
 217
 218 Here's our convention for splitting up the interface file name space:
 219
 220         _d...           dictionary identifiers
 221
 222         _f...           dict-fun identifiers (from inst decls)
 223         _g...           ditto, when the tycon has symbols
 224
 225         _t...           externally visible (non-user visible) names
 226
 227         _m...           default methods
 228         _n...           default methods (encoded symbols, eg. <= becomes _nle)
 229
 230         _p...           superclass selectors
 231
 232         _v...           workers
 233         _w...           workers (encoded symbols)
 234
 235         _x...           local variables
 236
 237         _u...           user-defined names that previously began with '_'
 238
 239         _T...           compiler-generated tycons for dictionaries
 240         _D..            ...ditto data cons
 241
 242         __....          keywords (__export, __letrec etc.)
 243
 244 This knowledge is encoded in the following functions.
 245
 246
 247
 248
 249 @mkDerivedOcc@ generates an @OccName@ from an existing @OccName@;
 250         eg: workers, derived methods
 251
 252 We pass a character to use as the prefix.  So, for example,
 253         "f" gets derived to "_vf", if the prefix char is 'v'
 254
 255 \begin{code}
 256 mk_deriv :: OccSpace -> Char -> String -> OccName
 257 mk_deriv occ_sp sys_ch str = mkOcc occ_sp ('_' : sys_ch : str)
 258 \end{code}
 259
 260 Things are a bit more complicated if the thing is an operator; then
 261 we must encode it into a normal identifier first.  We do this in
 262 a simple way, and use a different character prefix (one after the one
 263 suggested).  For example
 264         "<" gets derived to "_wl", if the prefix char is 'v'
 265
 266 \begin{code}
 267 mk_enc_deriv :: OccSpace
 268              -> Char    -- The system-name-space character (see list above)
 269              -> OccName -- The OccName from which we are deriving
 270              -> OccName
 271
 272 mk_enc_deriv occ_sp sys_ch occ
 273   | needs_encoding real_str = mk_deriv occ_sp sys_op_ch (encode_operator real_str)
 274   | otherwise               = mk_deriv occ_sp sys_ch    real_str
 275   where
 276     real_str  = occNameString occ
 277     sys_op_ch = succ sys_ch
 278
 279
 280 mkDictOcc, mkWorkerOcc, mkDefaultMethodOcc,
 281            mkClassTyConOcc, mkClassDataConOcc
 282    :: OccName -> OccName
 283
 284 mkWorkerOcc        = mk_enc_deriv VarOcc 'v'    -- v,w
 285 mkDefaultMethodOcc = mk_enc_deriv VarOcc 'm'    -- m,n
 286 mkClassTyConOcc    = mk_enc_deriv TCOcc  'T'    -- not U
 287 mkClassDataConOcc  = mk_enc_deriv VarOcc 'D'    -- not E
 288 mkDictOcc          = mk_enc_deriv VarOcc 'd'    -- not e
 289 \end{code}
 290
 291 \begin{code}
 292 mkSuperDictSelOcc :: Int        -- Index of superclass, eg 3
 293                   -> OccName    -- Class, eg "Ord"
 294                   -> OccName    -- eg "p3Ord"
 295 mkSuperDictSelOcc index cls_occ
 296   = mk_deriv VarOcc 'p' (show index ++ occNameString cls_occ)
 297 \end{code}
 298
 299
 300 \begin{code}
 301 mkDFunOcc :: OccName    -- class, eg "Ord"
 302           -> OccName    -- tycon (or something convenient from the instance type)
 303                         --      eg "Maybe"
 304           -> Int        -- Unique to distinguish dfuns which share the previous two
 305                         --      eg 3
 306           -> OccName    -- "dOrdMaybe3"
 307
 308 mkDFunOcc cls_occ tycon_occ index
 309   | needs_encoding tycon_str    -- Drat!  Have to encode the tycon
 310   = mk_deriv VarOcc 'g' (show_index ++ cls_str ++ encode_operator tycon_str)
 311   | otherwise                   -- Normal case
 312   = mk_deriv VarOcc 'f' (show_index ++ cls_str ++ tycon_str)
 313   where
 314     cls_str   = occNameString cls_occ
 315     tycon_str = occNameString tycon_occ
 316         -- NB: if a non-operator the tycon has a trailing # we don't encode.
 317     show_index | index == 0 = ""
 318                | otherwise  = show index
 319 \end{code}
 320
 321
 322 %************************************************************************
 323 %*                                                                      *
 324 \subsection{Lexical categories}
 325 %*                                                                      *
 326 %************************************************************************
 327
 328 These functions test strings to see if they fit the lexical categories
 329 defined in the Haskell report.
 330
 331 \begin{code}
 332 isLexCon,   isLexVar,    isLexId,    isLexSym    :: FAST_STRING -> Bool
 333 isLexConId, isLexConSym, isLexVarId, isLexVarSym :: FAST_STRING -> Bool
 334
 335 isLexCon cs = isLexConId  cs || isLexConSym cs
 336 isLexVar cs = isLexVarId  cs || isLexVarSym cs
 337
 338 isLexId  cs = isLexConId  cs || isLexVarId  cs
 339 isLexSym cs = isLexConSym cs || isLexVarSym cs
 340
 341 -------------
 342
 343 isLexConId cs                           -- Prefix type or data constructors
 344   | _NULL_ cs        = False            --      e.g. "Foo", "[]", "(,)"
 345   | cs == SLIT("[]") = True
 346   | c  == '('        = True     -- (), (,), (,,), ...
 347   | otherwise        = isUpper c || isUpperISO c
 348   where
 349     c = _HEAD_ cs
 350
 351 isLexVarId cs                           -- Ordinary prefix identifiers
 352   | _NULL_ cs    = False                --      e.g. "x", "_x"
 353   | otherwise    = isLower c || isLowerISO c || c == '_'
 354   where
 355     c = _HEAD_ cs
 356
 357 isLexConSym cs                          -- Infix type or data constructors
 358   | _NULL_ cs   = False                 --      e.g. ":-:", ":", "->"
 359   | otherwise   = c  == ':'
 360                || cs == SLIT("->")
 361   where
 362     c = _HEAD_ cs
 363
 364 isLexVarSym cs                          -- Infix identifiers
 365   | _NULL_ cs = False                   --      e.g. "+"
 366   | otherwise = isSymbolASCII c
 367              || isSymbolISO c
 368   where
 369     c = _HEAD_ cs
 370
 371 -------------
 372 isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
 373 isSymbolISO   c = ord c `elem` (0xd7 : 0xf7 : [0xa1 .. 0xbf])
 374 isUpperISO    (C# c#) = c# `geChar#` '\xc0'# && c# `leChar#` '\xde'# && c# `neChar#` '\xd7'#
 375         --0xc0 <= oc && oc <= 0xde && oc /= 0xd7 where oc = ord c
 376 isLowerISO    (C# c#) = c# `geChar#` '\xdf'# && c# `leChar#` '\xff'# && c# `neChar#` '\xf7'#
 377         --0xdf <= oc && oc <= 0xff && oc /= 0xf7 where oc = ord c
 378 \end{code}
 379
 380 %************************************************************************
 381 %*                                                                      *
 382 \subsection{Predicates and taking them apart}
 383 %*                                                                      *
 384 %************************************************************************
 385
 386 \begin{code}
 387 occNameString :: OccName -> String
 388 occNameString (OccName _ s _ _) = _UNPK_ s
 389
 390 -- occNameFlavour is used only to generate good error messages, so it doesn't matter
 391 -- that the VarOcc case isn't mega-efficient.  We could have different Occ constructors for
 392 -- data constructors and values, but that makes everything else a bit more complicated.
 393 occNameFlavour :: OccName -> String
 394 occNameFlavour (OccName VarOcc s _ _) | isLexConId s = "Data constructor"
 395                                       | otherwise    = "Value"
 396 occNameFlavour (OccName TvOcc _ _ _)                 = "Type variable"
 397 occNameFlavour (OccName TCOcc s _ _)                 = "Type constructor or class"
 398
 399 isVarOcc, isTCOcc, isTvOcc,
 400  isConSymOcc, isSymOcc, isWildCardOcc :: OccName -> Bool
 401
 402 isVarOcc (OccName VarOcc _ _ _) = True
 403 isVarOcc other                  = False
 404
 405 isTvOcc (OccName TvOcc _ _ _) = True
 406 isTvOcc other                 = False
 407
 408 isTCOcc (OccName TCOcc _ _ _) = True
 409 isTCOcc other                 = False
 410
 411 isConSymOcc (OccName _ s _ _) = isLexConSym s
 412
 413 isSymOcc (OccName _ s _ _) = isLexSym s
 414
 415 isConOcc (OccName _ s _ _) = isLexCon s
 416
 417 isWildCardOcc (OccName _ s _ _) = (_HEAD_ s) == '_' && _LENGTH_ s == 1
 418
 419 isAnonOcc (OccName _ s _ _) = (_HEAD_ s) == '_'
 420 \end{code}
 421
 422
 423 %************************************************************************
 424 %*                                                                      *
 425 \subsection{Comparison}
 426 %*                                                                      *
 427 %************************************************************************
 428
 429 Comparison is done by space and 'real' name
 430
 431 \begin{code}
 432 instance Eq OccName where
 433     a == b = case (a `compare` b) of { EQ -> True;  _ -> False }
 434     a /= b = case (a `compare` b) of { EQ -> False; _ -> True }
 435
 436 instance Ord OccName where
 437     a <= b = case (a `compare` b) of { LT -> True;  EQ -> True;  GT -> False }
 438     a <  b = case (a `compare` b) of { LT -> True;  EQ -> False; GT -> False }
 439     a >= b = case (a `compare` b) of { LT -> False; EQ -> True;  GT -> True  }
 440     a >  b = case (a `compare` b) of { LT -> False; EQ -> False; GT -> True  }
 441
 442     compare (OccName sp1 r1 _ _) (OccName sp2 r2 _ _)
 443         = (sp1 `compare` sp2) `thenCmp` (r1 `compare` r2)
 444 \end{code}
 445
 446
 447 %************************************************************************
 448 %*                                                                      *
 449 \subsection{Tidying them up}
 450 %*                                                                      *
 451 %************************************************************************
 452
 453 Before we print chunks of code we like to rename it so that
 454 we don't have to print lots of silly uniques in it.  But we mustn't
 455 accidentally introduce name clashes!  So the idea is that we leave the
 456 OccName alone unless it accidentally clashes with one that is already
 457 in scope; if so, we tack on '1' at the end and try again, then '2', and
 458 so on till we find a unique one.
 459
 460 There's a wrinkle for operators.  Consider '>>='.  We can't use '>>=1'
 461 because that isn't a single lexeme.  So we encode it to 'lle' and *then*
 462 tack on the '1', if necessary.
 463
 464 \begin{code}
 465 type TidyOccEnv = FiniteMap FAST_STRING Int     -- The in-scope OccNames
 466 emptyTidyOccEnv = emptyFM
 467
 468 initTidyOccEnv :: [OccName] -> TidyOccEnv       -- Initialise with names to avoid!
 469 initTidyOccEnv = foldl (\env (OccName _ fs _ _) -> addToFM env fs 1) emptyTidyOccEnv
 470
 471 tidyOccName :: TidyOccEnv -> OccName -> (TidyOccEnv, OccName)
 472
 473 tidyOccName in_scope occ@(OccName occ_sp real _ _)
 474   | not (real `elemFM` in_scope) &&
 475     not (isLexCon real)                 -- Hack alert!   Specialised versions of overloaded
 476                                         -- constructors end up as ordinary Ids, but we don't
 477                                         -- want them as ConIds in interface files.
 478
 479   = (addToFM in_scope real 1, occ)      -- First occurrence
 480
 481   | otherwise                           -- Already occurs
 482   =     -- First encode, to deal with
 483         --      a) operators, and
 484         --      b) trailing # signs
 485         -- so that we can then append '1', '2', etc
 486     go in_scope (encode_operator (_UNPK_ real))
 487   where
 488
 489     go in_scope str = case lookupFM in_scope pk_str of
 490                         Just n  -> go (addToFM in_scope pk_str (n+1)) (str ++ show n)
 491                                 -- Need to go round again, just in case "t3" (say)
 492                                 -- clashes with a "t3" that's already in scope
 493
 494                         Nothing -> (addToFM in_scope pk_str 1, mkFsOcc occ_sp pk_str)
 495                                 -- str is now unique
 496                     where
 497                       pk_str = _PK_ str
 498 \end{code}
 499
 500
 501 %************************************************************************
 502 %*                                                                      *
 503 \subsection{Encoding for operators in derived names}
 504 %*                                                                      *
 505 %************************************************************************
 506
 507 See comments with mk_enc_deriv
 508
 509 \begin{code}
 510 needs_encoding :: String -> Bool        -- Needs encoding when embedded in a derived name
 511                                         -- Just look at the first character
 512 needs_encoding (c:cs) = not (isAlpha c || c == '_')
 513
 514 encode_operator :: String -> String
 515 encode_operator nm = foldr tran "" nm
 516  where
 517     tran c cs = case trChar c of
 518                    '\0'  -> '_' : show (ord c) ++ cs  -- No translation
 519                    tr_c  -> tr_c : cs
 520
 521     trChar '&'  = 'a'
 522     trChar '|'  = 'b'
 523     trChar ':'  = 'c'
 524     trChar '/'  = 'd'
 525     trChar '='  = 'e'
 526     trChar '>'  = 'g'
 527     trChar '#'  = 'h'
 528     trChar '@'  = 'i'
 529     trChar '<'  = 'l'
 530     trChar '-'  = 'm'
 531     trChar '!'  = 'n'
 532     trChar '+'  = 'p'
 533     trChar '\'' = 'q'
 534     trChar '$'  = 'r'
 535     trChar '?'  = 's'
 536     trChar '*'  = 't'
 537     trChar '_'  = 'u'
 538     trChar '.'  = 'v'
 539     trChar '\\' = 'w'
 540     trChar '%'  = 'x'
 541     trChar '~'  = 'y'
 542     trChar '^'  = 'z'
 543     trChar _    = '\0'  -- No translation
 544 \end{code}
 545
 546
 547 %************************************************************************
 548 %*                                                                      *
 549 \subsection{The 'Z' encoding}
 550 %*                                                                      *
 551 %************************************************************************
 552
 553 We provide two interfaces for efficiency.
 554
 555 \begin{code}
 556 identToC :: String -> FAST_STRING
 557 identToC str
 558   | all ISALPHANUM str && not std = _PK_ str
 559   | std                           = _PK_ ("Zs" ++ encode str)
 560   | otherwise                     = _PK_ (encode str)
 561   where
 562     std = has_std_prefix str
 563
 564 identFsToC :: FAST_STRING -> FAST_STRING
 565 identFsToC fast_str
 566   | all ISALPHANUM str && not std = fast_str
 567   | std                           = _PK_ ("Zs" ++ encode str)
 568   | otherwise                     = _PK_ (encode str)
 569   where
 570     std = has_std_prefix str
 571     str = _UNPK_ fast_str
 572
 573 -- avoid "stdin", "stdout", and "stderr"...
 574 has_std_prefix ('s':'t':'d':_) = True
 575 has_std_prefix _               = False
 576
 577 encode :: String -> String
 578 encode [] = []
 579 encode (c:cs) = encode_ch c ++ encode cs
 580
 581 encode_ch :: Char -> String
 582 encode_ch c | ISALPHANUM c = [c]
 583         -- Common case first
 584 encode_ch 'Z'  = "ZZ"
 585 encode_ch '&'  = "Za"
 586 encode_ch '|'  = "Zb"
 587 encode_ch ':'  = "Zc"
 588 encode_ch '/'  = "Zd"
 589 encode_ch '='  = "Ze"
 590 encode_ch '>'  = "Zg"
 591 encode_ch '#'  = "Zh"
 592 encode_ch '<'  = "Zl"
 593 encode_ch '-'  = "Zm"
 594 encode_ch '!'  = "Zn"
 595 encode_ch '.'  = "Zs"
 596 encode_ch '\'' = "Zq"
 597 encode_ch '*'  = "Zt"
 598 encode_ch '+'  = "Zp"
 599 encode_ch '_'  = "_"
 600 encode_ch c    = 'Z':show (ord c)
 601 \end{code}
 602
 603 For \tr{modnameToC}, we really only have to worry about \tr{'}s
 604 (quote chars) in the name.  Rare.
 605
 606 \begin{code}
 607 modnameToC  :: FAST_STRING -> FAST_STRING
 608 modnameToC fast_str = identFsToC fast_str
 609 \end{code}