ghc/compiler/basicTypes/OccName.lhs

   1
   2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998
   3 %
   4
   5 \section[OccName]{@OccName@}
   6
   7 \begin{code}
   8 module OccName (
   9         -- Modules
  10         Module,         -- Abstract, instance of Outputable
  11         mkModule, mkModuleFS, moduleString, moduleCString, pprModule,
  12
  13         -- The OccName type
  14         OccName,        -- Abstract, instance of Outputable
  15         varOcc,    tcOcc,    tvOcc,     -- Occ constructors
  16         srcVarOcc, srcTCOcc, srcTvOcc,  -- For Occs arising from source code
  17
  18         mkSuperDictSelOcc, mkDFunOcc,
  19         mkDictOcc, mkWorkerOcc, mkDefaultMethodOcc,
  20         mkClassTyConOcc, mkClassDataConOcc,
  21
  22         isTvOcc, isTCOcc, isVarOcc, isConSymOcc, isConOcc, isSymOcc,
  23         isWildCardOcc, isAnonOcc,
  24         pprOccName, occNameString, occNameFlavour,
  25
  26         -- The basic form of names
  27         isLexCon, isLexVar, isLexId, isLexSym,
  28         isLexConId, isLexConSym, isLexVarId, isLexVarSym,
  29         isLowerISO, isUpperISO,
  30
  31         -- Tidying up
  32         TidyOccEnv, emptyTidyOccEnv, tidyOccName, initTidyOccEnv,
  33
  34         -- Junk
  35         identToC
  36
  37     ) where
  38
  39 #include "HsVersions.h"
  40
  41 #if __HASKELL1__ > 4
  42 import Char     ( isAlpha, isUpper, isLower, isAlphaNum{-sigh-}, ord )
  43 #else
  44 import Char     ( isAlpha, isUpper, isLower, isAlphanum, ord )
  45 #endif
  46 import Util     ( thenCmp )
  47 import FiniteMap ( FiniteMap, emptyFM, lookupFM, addToFM, elemFM )
  48 import Outputable
  49 import GlaExts
  50 \end{code}
  51
  52
  53 %************************************************************************
  54 %*                                                                      *
  55 \subsection[Module]{The name of a module}
  56 %*                                                                      *
  57 %************************************************************************
  58
  59 \begin{code}
  60 data Module = Module FAST_STRING        -- User and interface files
  61                      FAST_STRING        -- Print this in C files
  62
  63         -- The C version has quote chars Z-encoded
  64
  65 instance Outputable Module where
  66   ppr = pprModule
  67
  68 instance Eq Module where
  69   (Module m1 _) == (Module m2 _) = m1 == m2
  70
  71 instance Ord Module where
  72   (Module m1 _) `compare` (Module m2 _) = m1 `compare` m2
  73
  74 pprModule :: Module -> SDoc
  75 pprModule (Module real code)
  76   = getPprStyle         $ \ sty ->
  77     if codeStyle sty then
  78         ptext code
  79     else
  80         ptext real
  81
  82 mkModule :: String -> Module
  83 mkModule s = Module (_PK_ s) (identToC s)
  84
  85 mkModuleFS :: FAST_STRING -> Module
  86 mkModuleFS s = Module s (identFsToC s)
  87
  88 moduleString :: Module -> String
  89 moduleString (Module mod _) = _UNPK_ mod
  90
  91 moduleCString :: Module -> String
  92 moduleCString (Module _ code) = _UNPK_ code
  93 \end{code}
  94
  95
  96 %************************************************************************
  97 %*                                                                      *
  98 \subsection[Name-pieces-datatypes]{The @OccName@ datatypes}
  99 %*                                                                      *
 100 %************************************************************************
 101
 102 \begin{code}
 103 data OccName = OccName
 104                   OccSpace
 105                   FAST_STRING   -- The 'real name'
 106                   FAST_STRING   -- Print this in interface files
 107                   FAST_STRING   -- Print this in C/asm code
 108
 109 -- The OccSpace/real-name pair define the OccName
 110 -- The iface and c/asm versions are simply derived from the
 111 -- other two.  They are cached here simply to avoid recomputing
 112 -- them repeatedly when printing
 113
 114 -- The latter two are irrelevant in RdrNames; on the other hand,
 115 -- the OccSpace field is irrelevant after RdrNames.
 116 -- So the OccName type might be refined a bit.
 117 -- It is now abstract so that's easier than before
 118
 119
 120 -- Why three print-names?
 121 --      Real    Iface   C
 122 --      ---------------------
 123 --      foo     foo     foo
 124 --
 125 --      +       +       Zp      Operators OK in interface files;
 126 --                              'Z' is the escape char for C names
 127 --
 128 --      x#      x#      xZh     Trailing # lexed ok by GHC -fglasgow-exts
 129 --
 130 --      _foo    _ufoo   _ufoo   Leading '_' is the escape char in interface files
 131 --
 132 --      _vfoo   _vfoo   _vfoo   Worker for foo
 133 --
 134 --      _wp     _wp     _wp     Worker for +
 135
 136
 137 data OccSpace = VarOcc  -- Variables and data constructors
 138               | TvOcc   -- Type variables
 139               | TCOcc   -- Type constructors and classes
 140               deriving( Eq, Ord )
 141 \end{code}
 142
 143
 144 %************************************************************************
 145 %*                                                                      *
 146 \subsection{Printing}
 147 %*                                                                      *
 148 %************************************************************************
 149
 150 \begin{code}
 151 instance Outputable OccName where
 152   ppr = pprOccName
 153
 154 pprOccName :: OccName -> SDoc
 155 pprOccName (OccName space real iface code)
 156   = getPprStyle $ \ sty ->
 157     if codeStyle sty then
 158         ptext code
 159     else if ifaceStyle sty then
 160         ptext iface
 161     else
 162         ptext real
 163 \end{code}
 164
 165
 166 %************************************************************************
 167 %*                                                                      *
 168 \subsection{Construction}
 169 %*                                                                      *
 170 %************************************************************************
 171
 172 *Source-code* things beginning with '_' are zapped to begin with '_u'
 173
 174 \begin{code}
 175 mkSrcOcc :: OccSpace -> FAST_STRING -> OccName
 176 mkSrcOcc occ_sp real
 177   = case _UNPK_ real of
 178
 179         '_' : rest -> OccName occ_sp real (_PK_ zapped_str) (identToC zapped_str)
 180                    where
 181                       zapped_str = '_' : 'u' : rest
 182
 183         other      -> OccName occ_sp real real (identFsToC real)
 184
 185 srcVarOcc, srcTCOcc, srcTvOcc :: FAST_STRING -> OccName
 186 srcVarOcc = mkSrcOcc VarOcc
 187 srcTCOcc  = mkSrcOcc TCOcc
 188 srcTvOcc  = mkSrcOcc TvOcc
 189 \end{code}
 190
 191 However, things that don't come from Haskell source code aren't
 192 treated specially.
 193
 194 \begin{code}
 195 mkOcc :: OccSpace -> String -> OccName
 196 mkOcc occ_sp str = OccName occ_sp fs fs (identToC str)
 197                  where
 198                    fs = _PK_ str
 199
 200 mkFsOcc :: OccSpace -> FAST_STRING -> OccName
 201 mkFsOcc occ_sp real = OccName occ_sp real real (identFsToC real)
 202
 203 varOcc, tcOcc, tvOcc :: FAST_STRING -> OccName
 204 varOcc = mkFsOcc VarOcc
 205 tcOcc  = mkFsOcc TCOcc
 206 tvOcc  = mkFsOcc TvOcc
 207 \end{code}
 208
 209
 210 %************************************************************************
 211 %*                                                                      *
 212 \subsection{Making system names}
 213 %*                                                                      *
 214 %************************************************************************
 215
 216 Here's our convention for splitting up the interface file name space:
 217
 218         _d...           dictionary identifiers
 219
 220         _f...           dict-fun identifiers (from inst decls)
 221         _g...           ditto, when the tycon has symbols
 222
 223         _t...           externally visible (non-user visible) names
 224
 225         _m...           default methods
 226         _n...           default methods (encoded symbols, eg. <= becomes _nle)
 227
 228         _p...           superclass selectors
 229
 230         _v...           workers
 231         _w...           workers (encoded symbols)
 232
 233         _x...           local variables
 234
 235         _u...           user-defined names that previously began with '_'
 236
 237         _T...           compiler-generated tycons for dictionaries
 238         _D..            ...ditto data cons
 239
 240         __....          keywords (__export, __letrec etc.)
 241
 242 This knowledge is encoded in the following functions.
 243
 244
 245
 246
 247 @mkDerivedOcc@ generates an @OccName@ from an existing @OccName@;
 248         eg: workers, derived methods
 249
 250 We pass a character to use as the prefix.  So, for example,
 251         "f" gets derived to "_vf", if the prefix char is 'v'
 252
 253 \begin{code}
 254 mk_deriv :: OccSpace -> Char -> String -> OccName
 255 mk_deriv occ_sp sys_ch str = mkOcc occ_sp ('_' : sys_ch : str)
 256 \end{code}
 257
 258 Things are a bit more complicated if the thing is an operator; then
 259 we must encode it into a normal identifier first.  We do this in
 260 a simple way, and use a different character prefix (one after the one
 261 suggested).  For example
 262         "<" gets derived to "_wl", if the prefix char is 'v'
 263
 264 \begin{code}
 265 mk_enc_deriv :: OccSpace
 266              -> Char    -- The system-name-space character (see list above)
 267              -> OccName -- The OccName from which we are deriving
 268              -> OccName
 269
 270 mk_enc_deriv occ_sp sys_ch occ
 271   | needs_encoding real_str = mk_deriv occ_sp sys_op_ch (encode_operator real_str)
 272   | otherwise               = mk_deriv occ_sp sys_ch    real_str
 273   where
 274     real_str  = occNameString occ
 275     sys_op_ch = succ sys_ch
 276
 277
 278 mkDictOcc, mkWorkerOcc, mkDefaultMethodOcc,
 279            mkClassTyConOcc, mkClassDataConOcc
 280    :: OccName -> OccName
 281
 282 mkWorkerOcc        = mk_enc_deriv VarOcc 'v'    -- v,w
 283 mkDefaultMethodOcc = mk_enc_deriv VarOcc 'm'    -- m,n
 284 mkClassTyConOcc    = mk_enc_deriv TCOcc  'T'    -- not U
 285 mkClassDataConOcc  = mk_enc_deriv VarOcc 'D'    -- not E
 286 mkDictOcc          = mk_enc_deriv VarOcc 'd'    -- not e
 287 \end{code}
 288
 289 \begin{code}
 290 mkSuperDictSelOcc :: Int        -- Index of superclass, eg 3
 291                   -> OccName    -- Class, eg "Ord"
 292                   -> OccName    -- eg "p3Ord"
 293 mkSuperDictSelOcc index cls_occ
 294   = mk_deriv VarOcc 'p' (show index ++ occNameString cls_occ)
 295 \end{code}
 296
 297
 298 \begin{code}
 299 mkDFunOcc :: OccName    -- class, eg "Ord"
 300           -> OccName    -- tycon (or something convenient from the instance type)
 301                         --      eg "Maybe"
 302           -> Int        -- Unique to distinguish dfuns which share the previous two
 303                         --      eg 3
 304           -> OccName    -- "dOrdMaybe3"
 305
 306 mkDFunOcc cls_occ tycon_occ index
 307   | needs_encoding tycon_str    -- Drat!  Have to encode the tycon
 308   = mk_deriv VarOcc 'g' (show_index ++ cls_str ++ encode_operator tycon_str)
 309   | otherwise                   -- Normal case
 310   = mk_deriv VarOcc 'f' (show_index ++ cls_str ++ tycon_str)
 311   where
 312     cls_str   = occNameString cls_occ
 313     tycon_str = occNameString tycon_occ
 314         -- NB: if a non-operator the tycon has a trailing # we don't encode.
 315     show_index | index == 0 = ""
 316                | otherwise  = show index
 317 \end{code}
 318
 319
 320 %************************************************************************
 321 %*                                                                      *
 322 \subsection{Lexical categories}
 323 %*                                                                      *
 324 %************************************************************************
 325
 326 These functions test strings to see if they fit the lexical categories
 327 defined in the Haskell report.
 328
 329 \begin{code}
 330 isLexCon,   isLexVar,    isLexId,    isLexSym    :: FAST_STRING -> Bool
 331 isLexConId, isLexConSym, isLexVarId, isLexVarSym :: FAST_STRING -> Bool
 332
 333 isLexCon cs = isLexConId  cs || isLexConSym cs
 334 isLexVar cs = isLexVarId  cs || isLexVarSym cs
 335
 336 isLexId  cs = isLexConId  cs || isLexVarId  cs
 337 isLexSym cs = isLexConSym cs || isLexVarSym cs
 338
 339 -------------
 340
 341 isLexConId cs                           -- Prefix type or data constructors
 342   | _NULL_ cs        = False            --      e.g. "Foo", "[]", "(,)"
 343   | cs == SLIT("[]") = True
 344   | c  == '('        = True     -- (), (,), (,,), ...
 345   | otherwise        = isUpper c || isUpperISO c
 346   where
 347     c = _HEAD_ cs
 348
 349 isLexVarId cs                           -- Ordinary prefix identifiers
 350   | _NULL_ cs    = False                --      e.g. "x", "_x"
 351   | otherwise    = isLower c || isLowerISO c || c == '_'
 352   where
 353     c = _HEAD_ cs
 354
 355 isLexConSym cs                          -- Infix type or data constructors
 356   | _NULL_ cs   = False                 --      e.g. ":-:", ":", "->"
 357   | otherwise   = c  == ':'
 358                || cs == SLIT("->")
 359   where
 360     c = _HEAD_ cs
 361
 362 isLexVarSym cs                          -- Infix identifiers
 363   | _NULL_ cs = False                   --      e.g. "+"
 364   | otherwise = isSymbolASCII c
 365              || isSymbolISO c
 366   where
 367     c = _HEAD_ cs
 368
 369 -------------
 370 isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
 371 isSymbolISO   c = ord c `elem` (0xd7 : 0xf7 : [0xa1 .. 0xbf])
 372 isUpperISO    (C# c#) = c# `geChar#` '\xc0'# && c# `leChar#` '\xde'# && c# `neChar#` '\xd7'#
 373         --0xc0 <= oc && oc <= 0xde && oc /= 0xd7 where oc = ord c
 374 isLowerISO    (C# c#) = c# `geChar#` '\xdf'# && c# `leChar#` '\xff'# && c# `neChar#` '\xf7'#
 375         --0xdf <= oc && oc <= 0xff && oc /= 0xf7 where oc = ord c
 376 \end{code}
 377
 378 %************************************************************************
 379 %*                                                                      *
 380 \subsection{Predicates and taking them apart}
 381 %*                                                                      *
 382 %************************************************************************
 383
 384 \begin{code}
 385 occNameString :: OccName -> String
 386 occNameString (OccName _ s _ _) = _UNPK_ s
 387
 388 -- occNameFlavour is used only to generate good error messages, so it doesn't matter
 389 -- that the VarOcc case isn't mega-efficient.  We could have different Occ constructors for
 390 -- data constructors and values, but that makes everything else a bit more complicated.
 391 occNameFlavour :: OccName -> String
 392 occNameFlavour (OccName VarOcc s _ _) | isLexConId s = "Data constructor"
 393                                       | otherwise    = "Value"
 394 occNameFlavour (OccName TvOcc _ _ _)                 = "Type variable"
 395 occNameFlavour (OccName TCOcc s _ _)                 = "Type constructor or class"
 396
 397 isVarOcc, isTCOcc, isTvOcc,
 398  isConSymOcc, isSymOcc, isWildCardOcc :: OccName -> Bool
 399
 400 isVarOcc (OccName VarOcc _ _ _) = True
 401 isVarOcc other                  = False
 402
 403 isTvOcc (OccName TvOcc _ _ _) = True
 404 isTvOcc other                 = False
 405
 406 isTCOcc (OccName TCOcc _ _ _) = True
 407 isTCOcc other                 = False
 408
 409 isConSymOcc (OccName _ s _ _) = isLexConSym s
 410
 411 isSymOcc (OccName _ s _ _) = isLexSym s
 412
 413 isConOcc (OccName _ s _ _) = isLexCon s
 414
 415 isWildCardOcc (OccName _ s _ _) = (_HEAD_ s) == '_' && _LENGTH_ s == 1
 416
 417 isAnonOcc (OccName _ s _ _) = (_HEAD_ s) == '_'
 418 \end{code}
 419
 420
 421 %************************************************************************
 422 %*                                                                      *
 423 \subsection{Comparison}
 424 %*                                                                      *
 425 %************************************************************************
 426
 427 Comparison is done by space and 'real' name
 428
 429 \begin{code}
 430 instance Eq OccName where
 431     a == b = case (a `compare` b) of { EQ -> True;  _ -> False }
 432     a /= b = case (a `compare` b) of { EQ -> False; _ -> True }
 433
 434 instance Ord OccName where
 435     a <= b = case (a `compare` b) of { LT -> True;  EQ -> True;  GT -> False }
 436     a <  b = case (a `compare` b) of { LT -> True;  EQ -> False; GT -> False }
 437     a >= b = case (a `compare` b) of { LT -> False; EQ -> True;  GT -> True  }
 438     a >  b = case (a `compare` b) of { LT -> False; EQ -> False; GT -> True  }
 439
 440     compare (OccName sp1 r1 _ _) (OccName sp2 r2 _ _)
 441         = (sp1 `compare` sp2) `thenCmp` (r1 `compare` r2)
 442 \end{code}
 443
 444
 445 %************************************************************************
 446 %*                                                                      *
 447 \subsection{Tidying them up}
 448 %*                                                                      *
 449 %************************************************************************
 450
 451 Before we print chunks of code we like to rename it so that
 452 we don't have to print lots of silly uniques in it.  But we mustn't
 453 accidentally introduce name clashes!  So the idea is that we leave the
 454 OccName alone unless it accidentally clashes with one that is already
 455 in scope; if so, we tack on '1' at the end and try again, then '2', and
 456 so on till we find a unique one.
 457
 458 There's a wrinkle for operators.  Consider '>>='.  We can't use '>>=1'
 459 because that isn't a single lexeme.  So we encode it to 'lle' and *then*
 460 tack on the '1', if necessary.
 461
 462 \begin{code}
 463 type TidyOccEnv = FiniteMap FAST_STRING Int     -- The in-scope OccNames
 464 emptyTidyOccEnv = emptyFM
 465
 466 initTidyOccEnv :: [OccName] -> TidyOccEnv       -- Initialise with names to avoid!
 467 initTidyOccEnv = foldl (\env (OccName _ fs _ _) -> addToFM env fs 1) emptyTidyOccEnv
 468
 469 tidyOccName :: TidyOccEnv -> OccName -> (TidyOccEnv, OccName)
 470
 471 tidyOccName in_scope occ@(OccName occ_sp real _ _)
 472   | not (real `elemFM` in_scope) &&
 473     not (isLexCon real)                 -- Hack alert!   Specialised versions of overloaded
 474                                         -- constructors end up as ordinary Ids, but we don't
 475                                         -- want them as ConIds in interface files.
 476
 477   = (addToFM in_scope real 1, occ)      -- First occurrence
 478
 479   | otherwise                           -- Already occurs
 480   =     -- First encode, to deal with
 481         --      a) operators, and
 482         --      b) trailing # signs
 483         -- so that we can then append '1', '2', etc
 484     go in_scope (encode_operator (_UNPK_ real))
 485   where
 486
 487     go in_scope str = case lookupFM in_scope pk_str of
 488                         Just n  -> go (addToFM in_scope pk_str (n+1)) (str ++ show n)
 489                                 -- Need to go round again, just in case "t3" (say)
 490                                 -- clashes with a "t3" that's already in scope
 491
 492                         Nothing -> (addToFM in_scope pk_str 1, mkFsOcc occ_sp pk_str)
 493                                 -- str is now unique
 494                     where
 495                       pk_str = _PK_ str
 496 \end{code}
 497
 498
 499 %************************************************************************
 500 %*                                                                      *
 501 \subsection{Encoding for operators in derived names}
 502 %*                                                                      *
 503 %************************************************************************
 504
 505 See comments with mk_enc_deriv
 506
 507 \begin{code}
 508 needs_encoding :: String -> Bool        -- Needs encoding when embedded in a derived name
 509                                         -- Just look at the first character
 510 needs_encoding (c:cs) = not (isAlpha c || c == '_')
 511
 512 encode_operator :: String -> String
 513 encode_operator nm = foldr tran "" nm
 514  where
 515     tran c cs = case trChar c of
 516                    '\0'  -> '_' : show (ord c) ++ cs  -- No translation
 517                    tr_c  -> tr_c : cs
 518
 519     trChar '&'  = 'a'
 520     trChar '|'  = 'b'
 521     trChar ':'  = 'c'
 522     trChar '/'  = 'd'
 523     trChar '='  = 'e'
 524     trChar '>'  = 'g'
 525     trChar '#'  = 'h'
 526     trChar '@'  = 'i'
 527     trChar '<'  = 'l'
 528     trChar '-'  = 'm'
 529     trChar '!'  = 'n'
 530     trChar '+'  = 'p'
 531     trChar '\'' = 'q'
 532     trChar '$'  = 'r'
 533     trChar '?'  = 's'
 534     trChar '*'  = 't'
 535     trChar '_'  = 'u'
 536     trChar '.'  = 'v'
 537     trChar '\\' = 'w'
 538     trChar '%'  = 'x'
 539     trChar '~'  = 'y'
 540     trChar '^'  = 'z'
 541     trChar _    = '\0'  -- No translation
 542 \end{code}
 543
 544
 545 %************************************************************************
 546 %*                                                                      *
 547 \subsection{The 'Z' encoding}
 548 %*                                                                      *
 549 %************************************************************************
 550
 551 We provide two interfaces for efficiency.
 552
 553 \begin{code}
 554 identToC :: String -> FAST_STRING
 555 identToC str
 556   | all isAlphanum str && not std = _PK_ str
 557   | std                           = _PK_ ("Zs" ++ encode str)
 558   | otherwise                     = _PK_ (encode str)
 559   where
 560     std = has_std_prefix str
 561
 562 identFsToC :: FAST_STRING -> FAST_STRING
 563 identFsToC fast_str
 564   | all isAlphanum str && not std = fast_str
 565   | std                           = _PK_ ("Zs" ++ encode str)
 566   | otherwise                     = _PK_ (encode str)
 567   where
 568     std = has_std_prefix str
 569     str = _UNPK_ fast_str
 570
 571 -- avoid "stdin", "stdout", and "stderr"...
 572 has_std_prefix ('s':'t':'d':_) = True
 573 has_std_prefix _               = False
 574
 575 encode :: String -> String
 576 encode [] = []
 577 encode (c:cs) = encode_ch c ++ encode cs
 578
 579 encode_ch :: Char -> String
 580 encode_ch c | isAlphanum c = [c]
 581         -- Common case first
 582 encode_ch 'Z'  = "ZZ"
 583 encode_ch '&'  = "Za"
 584 encode_ch '|'  = "Zb"
 585 encode_ch ':'  = "Zc"
 586 encode_ch '/'  = "Zd"
 587 encode_ch '='  = "Ze"
 588 encode_ch '>'  = "Zg"
 589 encode_ch '#'  = "Zh"
 590 encode_ch '<'  = "Zl"
 591 encode_ch '-'  = "Zm"
 592 encode_ch '!'  = "Zn"
 593 encode_ch '.'  = "Zs"
 594 encode_ch '\'' = "Zq"
 595 encode_ch '*'  = "Zt"
 596 encode_ch '+'  = "Zp"
 597 encode_ch '_'  = "_"
 598 encode_ch c    = 'Z':show (ord c)
 599 \end{code}
 600
 601 For \tr{modnameToC}, we really only have to worry about \tr{'}s
 602 (quote chars) in the name.  Rare.
 603
 604 \begin{code}
 605 modnameToC  :: FAST_STRING -> FAST_STRING
 606 modnameToC fast_str = identFsToC fast_str
 607 \end{code}