Data/HashTable.hs

   1 {-# OPTIONS_GHC -fno-implicit-prelude #-}
   2
   3 -----------------------------------------------------------------------------
   4 -- |
   5 -- Module      :  Data.HashTable
   6 -- Copyright   :  (c) The University of Glasgow 2003
   7 -- License     :  BSD-style (see the file libraries/base/LICENSE)
   8 --
   9 -- Maintainer  :  libraries@haskell.org
  10 -- Stability   :  provisional
  11 -- Portability :  portable
  12 --
  13 -- An implementation of extensible hash tables, as described in
  14 -- Per-Ake Larson, /Dynamic Hash Tables/, CACM 31(4), April 1988,
  15 -- pp. 446--457.  The implementation is also derived from the one
  16 -- in GHC's runtime system (@ghc\/rts\/Hash.{c,h}@).
  17 --
  18 -----------------------------------------------------------------------------
  19
  20 module Data.HashTable (
  21         -- * Basic hash table operations
  22         HashTable, new, insert, delete, lookup, update,
  23         -- * Converting to and from lists
  24         fromList, toList,
  25         -- * Hash functions
  26         -- $hash_functions
  27         hashInt, hashString,
  28         prime,
  29         -- * Diagnostics
  30         longestChain
  31  ) where
  32
  33 -- This module is imported by Data.Dynamic, which is pretty low down in the
  34 -- module hierarchy, so don't import "high-level" modules
  35
  36 #ifdef __GLASGOW_HASKELL__
  37 import GHC.Base
  38 #else
  39 import Prelude  hiding  ( lookup )
  40 #endif
  41 import Data.Tuple       ( fst )
  42 import Data.Bits
  43 import Data.Maybe
  44 import Data.List        ( maximumBy, length, concat, foldl', partition )
  45 import Data.Int         ( Int32 )
  46
  47 #if defined(__GLASGOW_HASKELL__)
  48 import GHC.Num
  49 import GHC.Real         ( fromIntegral )
  50 import GHC.Show         ( Show(..) )
  51 import GHC.Int          ( Int64 )
  52
  53 import GHC.IOBase       ( IO, IOArray, newIOArray,
  54                           unsafeReadIOArray, unsafeWriteIOArray, unsafePerformIO,
  55                           IORef, newIORef, readIORef, writeIORef )
  56 #else
  57 import Data.Char        ( ord )
  58 import Data.IORef       ( IORef, newIORef, readIORef, writeIORef )
  59 import System.IO.Unsafe ( unsafePerformIO )
  60 import Data.Int         ( Int64 )
  61 #  if defined(__HUGS__)
  62 import Hugs.IOArray     ( IOArray, newIOArray,
  63                           unsafeReadIOArray, unsafeWriteIOArray )
  64 #  elif defined(__NHC__)
  65 import NHC.IOExtras     ( IOArray, newIOArray, readIOArray, writeIOArray )
  66 #  endif
  67 #endif
  68 import Control.Monad    ( mapM, mapM_, sequence_ )
  69
  70
  71 -----------------------------------------------------------------------
  72
  73 iNSTRUMENTED :: Bool
  74 iNSTRUMENTED = False
  75
  76 -----------------------------------------------------------------------
  77
  78 readHTArray  :: HTArray a -> Int32 -> IO a
  79 writeMutArray :: MutArray a -> Int32 -> a -> IO ()
  80 freezeArray  :: MutArray a -> IO (HTArray a)
  81 thawArray    :: HTArray a -> IO (MutArray a)
  82 newMutArray   :: (Int32, Int32) -> a -> IO (MutArray a)
  83 #if defined(DEBUG) || defined(__NHC__)
  84 type MutArray a = IOArray Int32 a
  85 type HTArray a = MutArray a
  86 newMutArray = newIOArray
  87 readHTArray  = readIOArray
  88 writeMutArray = writeIOArray
  89 freezeArray = return
  90 thawArray = return
  91 #else
  92 type MutArray a = IOArray Int32 a
  93 type HTArray a = MutArray a -- Array Int32 a
  94 newMutArray = newIOArray
  95 readHTArray arr i = readMutArray arr i -- return $! (unsafeAt arr (fromIntegral i))
  96 readMutArray  :: MutArray a -> Int32 -> IO a
  97 readMutArray arr i = unsafeReadIOArray arr (fromIntegral i)
  98 writeMutArray arr i x = unsafeWriteIOArray arr (fromIntegral i) x
  99 freezeArray = return -- unsafeFreeze
 100 thawArray = return -- unsafeThaw
 101 #endif
 102
 103 data HashTable key val = HashTable {
 104                                      cmp     :: !(key -> key -> Bool),
 105                                      hash_fn :: !(key -> Int32),
 106                                      tab     :: !(IORef (HT key val))
 107                                    }
 108 -- TODO: the IORef should really be an MVar.
 109
 110 data HT key val
 111   = HT {
 112         kcount  :: !Int32,              -- Total number of keys.
 113         bmask   :: !Int32,
 114         buckets :: !(HTArray [(key,val)])
 115        }
 116
 117 -- ------------------------------------------------------------
 118 -- Instrumentation for performance tuning
 119
 120 -- This ought to be roundly ignored after optimization when
 121 -- iNSTRUMENTED=False.
 122
 123 -- STRICT version of modifyIORef!
 124 modifyIORef :: IORef a -> (a -> a) -> IO ()
 125 modifyIORef r f = do
 126   v <- readIORef r
 127   let z = f v in z `seq` writeIORef r z
 128
 129 data HashData = HD {
 130   tables :: !Integer,
 131   insertions :: !Integer,
 132   lookups :: !Integer,
 133   totBuckets :: !Integer,
 134   maxEntries :: !Int32,
 135   maxChain :: !Int,
 136   maxBuckets :: !Int32
 137 } deriving (Eq, Show)
 138
 139 {-# NOINLINE hashData #-}
 140 hashData :: IORef HashData
 141 hashData =  unsafePerformIO (newIORef (HD { tables=0, insertions=0, lookups=0,
 142                                             totBuckets=0, maxEntries=0,
 143                                             maxChain=0, maxBuckets=tABLE_MIN } ))
 144
 145 instrument :: (HashData -> HashData) -> IO ()
 146 instrument i | iNSTRUMENTED = modifyIORef hashData i
 147              | otherwise    = return ()
 148
 149 recordNew :: IO ()
 150 recordNew = instrument rec
 151   where rec hd@HD{ tables=t, totBuckets=b } =
 152                hd{ tables=t+1, totBuckets=b+fromIntegral tABLE_MIN }
 153
 154 recordIns :: Int32 -> Int32 -> [a] -> IO ()
 155 recordIns i sz bkt = instrument rec
 156   where rec hd@HD{ insertions=ins, maxEntries=mx, maxChain=mc } =
 157                hd{ insertions=ins+fromIntegral i, maxEntries=mx `max` sz,
 158                    maxChain=mc `max` length bkt }
 159
 160 recordResize :: Int32 -> Int32 -> IO ()
 161 recordResize older newer = instrument rec
 162   where rec hd@HD{ totBuckets=b, maxBuckets=mx } =
 163                hd{ totBuckets=b+fromIntegral (newer-older),
 164                    maxBuckets=mx `max` newer }
 165
 166 recordLookup :: IO ()
 167 recordLookup = instrument lkup
 168   where lkup hd@HD{ lookups=l } = hd{ lookups=l+1 }
 169
 170 -- stats :: IO String
 171 -- stats =  fmap show $ readIORef hashData
 172
 173 -- -----------------------------------------------------------------------------
 174 -- Sample hash functions
 175
 176 -- $hash_functions
 177 --
 178 -- This implementation of hash tables uses the low-order /n/ bits of the hash
 179 -- value for a key, where /n/ varies as the hash table grows.  A good hash
 180 -- function therefore will give an even distribution regardless of /n/.
 181 --
 182 -- If your keyspace is integrals such that the low-order bits between
 183 -- keys are highly variable, then you could get away with using 'id'
 184 -- as the hash function.
 185 --
 186 -- We provide some sample hash functions for 'Int' and 'String' below.
 187
 188 golden :: Int32
 189 golden = -1640531527
 190
 191 -- | A sample (and useful) hash function for Int and Int32,
 192 -- implemented by extracting the uppermost 32 bits of the 64-bit
 193 -- result of multiplying by a 32-bit constant.  The constant is from
 194 -- Knuth, derived from the golden ratio:
 195 --
 196 -- > golden = round ((sqrt 5 - 1) * 2^31) :: Int
 197 hashInt :: Int -> Int32
 198 hashInt x = mulHi (fromIntegral x) golden
 199
 200 -- hi 32 bits of a x-bit * 32 bit -> 64-bit multiply
 201 mulHi :: Int32 -> Int32 -> Int32
 202 mulHi a b = fromIntegral (r `shiftR` 32)
 203   where r :: Int64
 204         r = fromIntegral a * fromIntegral b :: Int64
 205
 206 -- | A sample hash function for Strings.  We keep multiplying by the
 207 -- golden ratio and adding.  The implementation is:
 208 --
 209 -- > hashString = foldl' f 0
 210 -- >   where f m c = fromIntegral (ord c) + mulHi m golden
 211 --
 212 -- Note that this has not been extensively tested for reasonability,
 213 -- but Knuth argues that repeated multiplication by the golden ratio
 214 -- will minimize gaps in the hash space.
 215 hashString :: String -> Int32
 216 hashString = foldl' f 0
 217   where f m c = fromIntegral (ord c) + mulHi m golden
 218
 219 -- | A prime larger than the maximum hash table size
 220 prime :: Int32
 221 prime = 33554467
 222
 223 -- -----------------------------------------------------------------------------
 224 -- Parameters
 225
 226 tABLE_MAX :: Int32
 227 tABLE_MAX  = 32 * 1024 * 1024   -- Maximum size of hash table
 228 tABLE_MIN :: Int32
 229 tABLE_MIN  = 8
 230
 231 hLOAD :: Int32
 232 hLOAD = 7                       -- Maximum average load of a single hash bucket
 233
 234 hYSTERESIS :: Int32
 235 hYSTERESIS = 64                 -- entries to ignore in load computation
 236
 237 {- Hysteresis favors long association-list-like behavior for small tables. -}
 238
 239 -- -----------------------------------------------------------------------------
 240 -- Creating a new hash table
 241
 242 -- | Creates a new hash table.  The following property should hold for the @eq@
 243 -- and @hash@ functions passed to 'new':
 244 --
 245 -- >   eq A B  =>  hash A == hash B
 246 --
 247 new
 248   :: (key -> key -> Bool)    -- ^ @eq@: An equality comparison on keys
 249   -> (key -> Int32)          -- ^ @hash@: A hash function on keys
 250   -> IO (HashTable key val)  -- ^ Returns: an empty hash table
 251
 252 new cmpr hash = do
 253   recordNew
 254   -- make a new hash table with a single, empty, segment
 255   let mask = tABLE_MIN-1
 256   bkts'  <- newMutArray (0,mask) []
 257   bkts   <- freezeArray bkts'
 258
 259   let
 260     kcnt = 0
 261     ht = HT {  buckets=bkts, kcount=kcnt, bmask=mask }
 262
 263   table <- newIORef ht
 264   return (HashTable { tab=table, hash_fn=hash, cmp=cmpr })
 265
 266 -- -----------------------------------------------------------------------------
 267 -- Inserting a key\/value pair into the hash table
 268
 269 -- | Inserts a key\/value mapping into the hash table.
 270 --
 271 -- Note that 'insert' doesn't remove the old entry from the table -
 272 -- the behaviour is like an association list, where 'lookup' returns
 273 -- the most-recently-inserted mapping for a key in the table.  The
 274 -- reason for this is to keep 'insert' as efficient as possible.  If
 275 -- you need to update a mapping, then we provide 'update'.
 276 --
 277 insert :: HashTable key val -> key -> val -> IO ()
 278
 279 insert ht key val =
 280   updatingBucket CanInsert (\bucket -> ((key,val):bucket, 1, ())) ht key
 281
 282
 283 -- ------------------------------------------------------------
 284 -- The core of the implementation is lurking down here, in findBucket,
 285 -- updatingBucket, and expandHashTable.
 286
 287 tooBig :: Int32 -> Int32 -> Bool
 288 tooBig k b = k-hYSTERESIS > hLOAD * b
 289
 290 -- index of bucket within table.
 291 bucketIndex :: Int32 -> Int32 -> Int32
 292 bucketIndex mask h = h .&. mask
 293
 294 -- find the bucket in which the key belongs.
 295 -- returns (key equality, bucket index, bucket)
 296 --
 297 -- This rather grab-bag approach gives enough power to do pretty much
 298 -- any bucket-finding thing you might want to do.  We rely on inlining
 299 -- to throw away the stuff we don't want.  I'm proud to say that this
 300 -- plus updatingBucket below reduce most of the other definitions to a
 301 -- few lines of code, while actually speeding up the hashtable
 302 -- implementation when compared with a version which does everything
 303 -- from scratch.
 304 {-# INLINE findBucket #-}
 305 findBucket :: HashTable key val -> key -> IO (HT key val, Int32, [(key,val)])
 306 findBucket HashTable{ tab=ref, hash_fn=hash} key = do
 307   table@HT{ buckets=bkts, bmask=b } <- readIORef ref
 308   let indx = bucketIndex b (hash key)
 309   bucket <- readHTArray bkts indx
 310   return (table, indx, bucket)
 311
 312 data Inserts = CanInsert
 313              | Can'tInsert
 314              deriving (Eq)
 315
 316 -- updatingBucket is the real workhorse of all single-element table
 317 -- updates.  It takes a hashtable and a key, along with a function
 318 -- describing what to do with the bucket in which that key belongs.  A
 319 -- flag indicates whether this function may perform table insertions.
 320 -- The function returns the new contents of the bucket, the number of
 321 -- bucket entries inserted (negative if entries were deleted), and a
 322 -- value which becomes the return value for the function as a whole.
 323 -- The table sizing is enforced here, calling out to expandSubTable as
 324 -- necessary.
 325
 326 -- This function is intended to be inlined and specialized for every
 327 -- calling context (eg every provided bucketFn).
 328 {-# INLINE updatingBucket #-}
 329
 330 updatingBucket :: Inserts -> ([(key,val)] -> ([(key,val)], Int32, a)) ->
 331                   HashTable key val -> key ->
 332                   IO a
 333 updatingBucket canEnlarge bucketFn
 334                ht@HashTable{ tab=ref, hash_fn=hash } key = do
 335   (table@HT{ kcount=k, buckets=bkts, bmask=b },
 336    indx, bckt) <- findBucket ht key
 337   (bckt', inserts, result) <- return $ bucketFn bckt
 338   let k' = k + inserts
 339       table1 = table { kcount=k' }
 340   bkts' <- thawArray bkts
 341   writeMutArray bkts' indx bckt'
 342   freezeArray bkts'
 343   table2 <- if canEnlarge == CanInsert && inserts > 0 then do
 344                recordIns inserts k' bckt'
 345                if tooBig k' b
 346                   then expandHashTable hash table1
 347                   else return table1
 348             else return table1
 349   writeIORef ref table2
 350   return result
 351
 352 expandHashTable :: (key -> Int32) -> HT key val -> IO (HT key val)
 353 expandHashTable hash table@HT{ buckets=bkts, bmask=mask } = do
 354    let
 355       oldsize = mask + 1
 356       newmask = mask + mask + 1
 357    recordResize oldsize (newmask+1)
 358    --
 359    if newmask > tABLE_MAX-1
 360       then return table
 361       else do
 362    --
 363    newbkts' <- newMutArray (0,newmask) []
 364
 365    let
 366     splitBucket oldindex = do
 367       bucket <- readHTArray bkts oldindex
 368       let (oldb,newb) =
 369               partition ((oldindex==). bucketIndex newmask . hash . fst) bucket
 370       writeMutArray newbkts' oldindex oldb
 371       writeMutArray newbkts' (oldindex + oldsize) newb
 372    mapM_ splitBucket [0..mask]
 373
 374    newbkts <- freezeArray newbkts'
 375
 376    return ( table{ buckets=newbkts, bmask=newmask } )
 377
 378 -- -----------------------------------------------------------------------------
 379 -- Deleting a mapping from the hash table
 380
 381 -- Remove a key from a bucket
 382 deleteBucket :: (key -> Bool) -> [(key,val)] -> ([(key, val)], Int32, ())
 383 deleteBucket _   [] = ([],0,())
 384 deleteBucket del (pair@(k,_):bucket) =
 385   case deleteBucket del bucket of
 386     (bucket', dels, _) | del k     -> dels' `seq` (bucket', dels', ())
 387                        | otherwise -> (pair:bucket', dels, ())
 388       where dels' = dels - 1
 389
 390 -- | Remove an entry from the hash table.
 391 delete :: HashTable key val -> key -> IO ()
 392
 393 delete ht@HashTable{ cmp=eq } key =
 394   updatingBucket Can'tInsert (deleteBucket (eq key)) ht key
 395
 396 -- -----------------------------------------------------------------------------
 397 -- Updating a mapping in the hash table
 398
 399 -- | Updates an entry in the hash table, returning 'True' if there was
 400 -- already an entry for this key, or 'False' otherwise.  After 'update'
 401 -- there will always be exactly one entry for the given key in the table.
 402 --
 403 -- 'insert' is more efficient than 'update' if you don't care about
 404 -- multiple entries, or you know for sure that multiple entries can't
 405 -- occur.  However, 'update' is more efficient than 'delete' followed
 406 -- by 'insert'.
 407 update :: HashTable key val -> key -> val -> IO Bool
 408
 409 update ht@HashTable{ cmp=eq } key val =
 410   updatingBucket CanInsert
 411     (\bucket -> let (bucket', dels, _) = deleteBucket (eq key) bucket
 412                 in  ((key,val):bucket', 1+dels, dels/=0))
 413     ht key
 414
 415 -- -----------------------------------------------------------------------------
 416 -- Looking up an entry in the hash table
 417
 418 -- | Looks up the value of a key in the hash table.
 419 lookup :: HashTable key val -> key -> IO (Maybe val)
 420
 421 lookup ht@HashTable{ cmp=eq } key = do
 422   recordLookup
 423   (_, _, bucket) <- findBucket ht key
 424   let firstHit (k,v) r | eq key k  = Just v
 425                        | otherwise = r
 426   return (foldr firstHit Nothing bucket)
 427
 428 -- -----------------------------------------------------------------------------
 429 -- Converting to/from lists
 430
 431 -- | Convert a list of key\/value pairs into a hash table.  Equality on keys
 432 -- is taken from the Eq instance for the key type.
 433 --
 434 fromList :: (Eq key) => (key -> Int32) -> [(key,val)] -> IO (HashTable key val)
 435 fromList hash list = do
 436   table <- new (==) hash
 437   sequence_ [ insert table k v | (k,v) <- list ]
 438   return table
 439
 440 -- | Converts a hash table to a list of key\/value pairs.
 441 --
 442 toList :: HashTable key val -> IO [(key,val)]
 443 toList = mapReduce id concat
 444
 445 {-# INLINE mapReduce #-}
 446 mapReduce :: ([(key,val)] -> r) -> ([r] -> r) -> HashTable key val -> IO r
 447 mapReduce m r HashTable{ tab=ref } = do
 448   HT{ buckets=bckts, bmask=b } <- readIORef ref
 449   fmap r (mapM (fmap m . readHTArray bckts) [0..b])
 450
 451 -- -----------------------------------------------------------------------------
 452 -- Diagnostics
 453
 454 -- | This function is useful for determining whether your hash
 455 -- function is working well for your data set.  It returns the longest
 456 -- chain of key\/value pairs in the hash table for which all the keys
 457 -- hash to the same bucket.  If this chain is particularly long (say,
 458 -- longer than 14 elements or so), then it might be a good idea to try
 459 -- a different hash function.
 460 --
 461 longestChain :: HashTable key val -> IO [(key,val)]
 462 longestChain = mapReduce id (maximumBy lengthCmp)
 463   where lengthCmp (_:x)(_:y) = lengthCmp x y
 464         lengthCmp []   []    = EQ
 465         lengthCmp []   _     = LT
 466         lengthCmp _    []    = GT